From f1beec1b3240e23e026b29050f14d9632a6708b1 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Sat, 18 Aug 2012 23:45:48 +0800 Subject: [PATCH 01/14] vmalloc(): adjust gfp mask passed on nested vmalloc() invocation --- mm/vmalloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) mode change 100644 => 100755 mm/vmalloc.c diff --git a/mm/vmalloc.c b/mm/vmalloc.c old mode 100644 new mode 100755 index c2287313..b689e2ae --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1470,6 +1470,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, { struct page **pages; unsigned int nr_pages, array_size, i; + gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1477,13 +1478,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, + pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, PAGE_KERNEL, node, caller); area->flags |= VM_VPAGES; } else { - pages = kmalloc_node(array_size, - (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, - node); + pages = kmalloc_node(array_size, nested_gfp, node); } area->pages = pages; area->caller = caller; From e0c9143ea1ec510a41b347be043e98034eedf5c8 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 00:49:43 +0800 Subject: [PATCH 02/14] mm: cleancache core ops functions and config --- Documentation/vm/cleancache.txt | 279 ++++++++++++++++++++++++++++++++ include/linux/cleancache.h | 122 ++++++++++++++ mm/Kconfig | 22 +++ mm/Makefile | 1 + mm/cleancache.c | 244 ++++++++++++++++++++++++++++ 5 files changed, 668 insertions(+) create mode 100755 Documentation/vm/cleancache.txt create mode 100755 include/linux/cleancache.h mode change 100644 => 100755 mm/Kconfig mode change 100644 => 100755 
mm/Makefile create mode 100755 mm/cleancache.c diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt new file mode 100755 index 00000000..e0a53567 --- /dev/null +++ b/Documentation/vm/cleancache.txt @@ -0,0 +1,279 @@ +MOTIVATION + +Cleancache is a new optional feature provided by the VFS layer that +potentially dramatically increases page cache effectiveness for +many workloads in many environments at a negligible cost. + +Cleancache can be thought of as a page-granularity victim cache for clean +pages that the kernel's pageframe replacement algorithm (PFRA) would like +to keep around, but can't since there isn't enough memory. So when the +PFRA "evicts" a page, it first attempts to use cleancache code to +put the data contained in that page into "transcendent memory", memory +that is not directly accessible or addressable by the kernel and is +of unknown and possibly time-varying size. + +Later, when a cleancache-enabled filesystem wishes to access a page +in a file on disk, it first checks cleancache to see if it already +contains it; if it does, the page of data is copied into the kernel +and a disk access is avoided. + +Transcendent memory "drivers" for cleancache are currently implemented +in Xen (using hypervisor memory) and zcache (using in-kernel compressed +memory) and other implementations are in development. + +FAQs are included below. + +IMPLEMENTATION OVERVIEW + +A cleancache "backend" that provides transcendent memory registers itself +to the kernel's cleancache "frontend" by calling cleancache_register_ops, +passing a pointer to a cleancache_ops structure with funcs set appropriately. +Note that cleancache_register_ops returns the previous settings so that +chaining can be performed if desired. The functions provided must conform to +certain semantics as follows: + +Most important, cleancache is "ephemeral". 
Pages which are copied into +cleancache have an indefinite lifetime which is completely unknowable +by the kernel and so may or may not still be in cleancache at any later time. +Thus, as its name implies, cleancache is not suitable for dirty pages. +Cleancache has complete discretion over what pages to preserve and what +pages to discard and when. + +Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a +pool id which, if positive, must be saved in the filesystem's superblock; +a negative return value indicates failure. A "put_page" will copy a +(presumably about-to-be-evicted) page into cleancache and associate it with +the pool id, a file key, and a page index into the file. (The combination +of a pool id, a file key, and an index is sometimes called a "handle".) +A "get_page" will copy the page, if found, from cleancache into kernel memory. +An "invalidate_page" will ensure the page no longer is present in cleancache; +an "invalidate_inode" will invalidate all pages associated with the specified +file; and, when a filesystem is unmounted, an "invalidate_fs" will invalidate +all pages in all files specified by the given pool id and also surrender +the pool id. + +An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache +to treat the pool as shared using a 128-bit UUID as a key. On systems +that may run multiple kernels (such as hard partitioned or virtualized +systems) that may share a clustered filesystem, and where cleancache +may be shared among those kernels, calls to init_shared_fs that specify the +same UUID will receive the same pool id, thus allowing the pages to +be shared. Note that any security requirements must be imposed outside +of the kernel (e.g. by "tools" that control cleancache). Or a +cleancache implementation can simply disable shared_init by always +returning a negative value. + +If a get_page is successful on a non-shared pool, the page is invalidated +(thus making cleancache an "exclusive" cache). 
On a shared pool, the page +is NOT invalidated on a successful get_page so that it remains accessible to +other sharers. The kernel is responsible for ensuring coherency between +cleancache (shared or not), the page cache, and the filesystem, using +cleancache invalidate operations as required. + +Note that cleancache must enforce put-put-get coherency and get-get +coherency. For the former, if two puts are made to the same handle but +with different data, say AAA by the first put and BBB by the second, a +subsequent get can never return the stale data (AAA). For get-get coherency, +if a get for a given handle fails, subsequent gets for that handle will +never succeed unless preceded by a successful put with that handle. + +Last, cleancache provides no SMP serialization guarantees; if two +different Linux threads are simultaneously putting and invalidating a page +with the same handle, the results are indeterminate. Callers must +lock the page to ensure serial behavior. + +CLEANCACHE PERFORMANCE METRICS + +Cleancache monitoring is done by sysfs files in the +/sys/kernel/mm/cleancache directory. The effectiveness of cleancache +can be measured (across all filesystems) with: + +succ_gets - number of gets that were successful +failed_gets - number of gets that failed +puts - number of puts attempted (all "succeed") +flushes - number of flushes (invalidates) attempted + +A backend implementation may provide additional metrics. + +FAQ + +1) Where's the value? (Andrew Morton) + +Cleancache provides a significant performance benefit to many workloads +in many environments with negligible overhead by improving the +effectiveness of the pagecache. Clean pagecache pages are +saved in transcendent memory (RAM that is otherwise not directly +addressable to the kernel); fetching those pages later avoids "refaults" +and thus disk reads. 
+ +Cleancache (and its sister code "frontswap") provide interfaces for +this transcendent memory (aka "tmem"), which conceptually lies between +fast kernel-directly-addressable RAM and slower DMA/asynchronous devices. +Disallowing direct kernel or userland reads/writes to tmem +is ideal when data is transformed to a different form and size (such +as with compression) or secretly moved (as might be useful for write- +balancing for some RAM-like devices). Evicted page-cache pages (and +swap pages) are a great use for this kind of slower-than-RAM-but-much- +faster-than-disk transcendent memory, and the cleancache (and frontswap) +"page-object-oriented" specification provides a nice way to read and +write -- and indirectly "name" -- the pages. + +In the virtual case, the whole point of virtualization is to statistically +multiplex physical resources across the varying demands of multiple +virtual machines. This is really hard to do with RAM and efforts to +do it well with no kernel change have essentially failed (except in some +well-publicized special-case workloads). Cleancache -- and frontswap -- +with a fairly small impact on the kernel, provide a huge amount +of flexibility for more dynamic, flexible RAM multiplexing. +Specifically, the Xen Transcendent Memory backend allows otherwise +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple +virtual machines, but the pages can be compressed and deduplicated to +optimize RAM utilization. And when guest OS's are induced to surrender +underutilized RAM (e.g. with "self-ballooning"), page cache pages +are the first to go, and cleancache allows those pages to be +saved and reclaimed if overall host system memory conditions allow. + +And the identical interface used for cleancache can be used in +physical systems as well. The zcache driver acts as a memory-hungry +device that stores pages of data in a compressed state. And +the proposed "RAMster" driver shares RAM across multiple physical +systems. 
+ +2) Why does cleancache have its sticky fingers so deep inside the + filesystems and VFS? (Andrew Morton and Christoph Hellwig) + +The core hooks for cleancache in VFS are in most cases a single line +and the minimum set are placed precisely where needed to maintain +coherency (via cleancache_invalidate operations) between cleancache, +the page cache, and disk. All hooks compile into nothingness if +cleancache is config'ed off and turn into a function-pointer- +compare-to-NULL if config'ed on but no backend claims the ops +functions, or to a compare-struct-element-to-negative if a +backend claims the ops functions but a filesystem doesn't enable +cleancache. + +Some filesystems are built entirely on top of VFS and the hooks +in VFS are sufficient, so don't require an "init_fs" hook; the +initial implementation of cleancache didn't provide this hook. +But for some filesystems (such as btrfs), the VFS hooks are +incomplete and one or more hooks in fs-specific code are required. +And for some other filesystems, such as tmpfs, cleancache may +be counterproductive. So it seemed prudent to require a filesystem +to "opt in" to use cleancache, which requires adding a hook in +each filesystem. Not all filesystems are supported by cleancache +only because they haven't been tested. The existing set should +be sufficient to validate the concept, the opt-in approach means +that untested filesystems are not affected, and the hooks in the +existing filesystems should make it very easy to add more +filesystems in the future. + +The total impact of the hooks to existing fs and mm files is only +about 40 lines added (not counting comments and blank lines). + +3) Why not make cleancache asynchronous and batched so it can + more easily interface with real devices with DMA instead + of copying each individual page? 
(Minchan Kim) + +The one-page-at-a-time copy semantics simplifies the implementation +on both the frontend and backend and also allows the backend to +do fancy things on-the-fly like page compression and +page deduplication. And since the data is "gone" (copied into/out +of the pageframe) before the cleancache get/put call returns, +a great deal of race conditions and potential coherency issues +are avoided. While the interface seems odd for a "real device" +or for real kernel-addressable RAM, it makes perfect sense for +transcendent memory. + +4) Why is non-shared cleancache "exclusive"? And where is the + page "invalidated" after a "get"? (Minchan Kim) + +The main reason is to free up space in transcendent memory and +to avoid unnecessary cleancache_invalidate calls. If you want inclusive, +the page can be "put" immediately following the "get". If +put-after-get for inclusive becomes common, the interface could +be easily extended to add a "get_no_invalidate" call. + +The invalidate is done by the cleancache backend implementation. + +5) What's the performance impact? + +Performance analysis has been presented at OLS'09 and LCA'10. +Briefly, performance gains can be significant on most workloads, +especially when memory pressure is high (e.g. when RAM is +overcommitted in a virtual workload); and because the hooks are +invoked primarily in place of or in addition to a disk read/write, +overhead is negligible even in worst case workloads. Basically +cleancache replaces I/O with memory-copy-CPU-overhead; on older +single-core systems with slow memory-copy speeds, cleancache +has little value, but in newer multicore machines, especially +consolidated/virtualized machines, it has great value. + +6) How do I add cleancache support for filesystem X? (Boaz Harrash) + +Filesystems that are well-behaved and conform to certain +restrictions can utilize cleancache simply by making a call to +cleancache_init_fs at mount time. 
Unusual, misbehaving, or +poorly layered filesystems must either add additional hooks +and/or undergo extensive additional testing... or should just +not enable the optional cleancache. + +Some points for a filesystem to consider: + +- The FS should be block-device-based (e.g. a ram-based FS such + as tmpfs should not enable cleancache) +- To ensure coherency/correctness, the FS must ensure that all + file removal or truncation operations either go through VFS or + add hooks to do the equivalent cleancache "invalidate" operations +- To ensure coherency/correctness, either inode numbers must + be unique across the lifetime of the on-disk file OR the + FS must provide an "encode_fh" function. +- The FS must call the VFS superblock alloc and deactivate routines + or add hooks to do the equivalent cleancache calls done there. +- To maximize performance, all pages fetched from the FS should + go through the do_mpage_readpage routine or the FS should add + hooks to do the equivalent (cf. btrfs) +- Currently, the FS blocksize must be the same as PAGESIZE. This + is not an architectural restriction, but no backends currently + support anything different. +- A clustered FS should invoke the "init_shared_fs" cleancache + hook to get best performance for some backends. + +7) Why not use the KVA of the inode as the key? (Christoph Hellwig) + +If cleancache would use the inode virtual address instead of +inode/filehandle, the pool id could be eliminated. But, this +won't work because cleancache retains pagecache data pages +persistently even when the inode has been pruned from the +inode unused list, and only invalidates the data page if the file +gets removed/truncated. So if cleancache used the inode kva, +there would be potential coherency issues if/when the inode +kva is reused for a different file. 
Alternately, if cleancache +invalidated the pages when the inode kva was freed, much of the value +of cleancache would be lost because the cache of pages in cleancache +is potentially much larger than the kernel pagecache and is most +useful if the pages survive inode cache removal. + +8) Why is a global variable required? + +The cleancache_enabled flag is checked in all of the frequently-used +cleancache hooks. The alternative is a function call to check a static +variable. Since cleancache is enabled dynamically at runtime, systems +that don't enable cleancache would suffer thousands (possibly +tens-of-thousands) of unnecessary function calls per second. So the +global variable allows cleancache to be enabled by default at compile +time, but have insignificant performance impact when cleancache remains +disabled at runtime. + +9) Does cleancache work with KVM? + +The memory model of KVM is sufficiently different that a cleancache +backend may have less value for KVM. This remains to be tested, +especially in an overcommitted system. + +10) Does cleancache work in userspace? It sounds useful for + memory hungry caches like web browsers. (Jamie Lokier) + +No plans yet, though we agree it sounds useful, at least for +apps that bypass the page cache (e.g. O_DIRECT). + +Last updated: Dan Magenheimer, April 13 2011 diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h new file mode 100755 index 00000000..04ffb2e6 --- /dev/null +++ b/include/linux/cleancache.h @@ -0,0 +1,122 @@ +#ifndef _LINUX_CLEANCACHE_H +#define _LINUX_CLEANCACHE_H + +#include +#include +#include + +#define CLEANCACHE_KEY_MAX 6 + +/* + * cleancache requires every file with a page in cleancache to have a + * unique key unless/until the file is removed/truncated. 
For some + * filesystems, the inode number is unique, but for "modern" filesystems + * an exportable filehandle is required (see exportfs.h) + */ +struct cleancache_filekey { + union { + ino_t ino; /* key when the inode number is used */ + __u32 fh[CLEANCACHE_KEY_MAX]; /* key when a filehandle is used */ + u32 key[CLEANCACHE_KEY_MAX]; /* raw view, used to zero-initialize the key */ + } u; +}; + +struct cleancache_ops { + int (*init_fs)(size_t); /* returns a pool id, negative on failure */ + int (*init_shared_fs)(char *uuid, size_t); /* as init_fs, with a uuid for the pool */ + int (*get_page)(int, struct cleancache_filekey, + pgoff_t, struct page *); /* 0 is counted as a successful get */ + void (*put_page)(int, struct cleancache_filekey, + pgoff_t, struct page *); + void (*flush_page)(int, struct cleancache_filekey, pgoff_t); + void (*flush_inode)(int, struct cleancache_filekey); + void (*flush_fs)(int); /* takes the pool id being surrendered */ +}; + +extern struct cleancache_ops + cleancache_register_ops(struct cleancache_ops *ops); +extern void __cleancache_init_fs(struct super_block *); +extern void __cleancache_init_shared_fs(char *, struct super_block *); +extern int __cleancache_get_page(struct page *); +extern void __cleancache_put_page(struct page *); +extern void __cleancache_flush_page(struct address_space *, struct page *); +extern void __cleancache_flush_inode(struct address_space *); +extern void __cleancache_flush_fs(struct super_block *); +extern int cleancache_enabled; + +#ifdef CONFIG_CLEANCACHE +static inline bool cleancache_fs_enabled(struct page *page) +{ + return page->mapping->host->i_sb->cleancache_poolid >= 0; /* dereferences page->mapping, which must be non-NULL here */ +} +static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) +{ + return mapping->host->i_sb->cleancache_poolid >= 0; +} +#else /* !CONFIG_CLEANCACHE: every check below folds to the constant 0 */ +#define cleancache_enabled (0) +#define cleancache_fs_enabled(_page) (0) +#define cleancache_fs_enabled_mapping(_page) (0) +#endif + +/* + * The shim layer provided by these inline functions allows the compiler + * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE + * is disabled, to a single global variable check if CONFIG_CLEANCACHE + * is enabled but no cleancache "backend" has dynamically enabled it, + * and, for the most frequent cleancache ops, to a 
single global variable + * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled + * and a cleancache backend has dynamically enabled cleancache, but the + * filesystem referenced by that cleancache op has not enabled cleancache. + * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially + * no measurable performance impact. + */ + +static inline void cleancache_init_fs(struct super_block *sb) +{ + if (cleancache_enabled) /* skipped entirely while the global flag is 0 */ + __cleancache_init_fs(sb); +} + +static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb) +{ + if (cleancache_enabled) + __cleancache_init_shared_fs(uuid, sb); +} + +static inline int cleancache_get_page(struct page *page) +{ + int ret = -1; /* -1 is returned when the get is not attempted */ + + if (cleancache_enabled && cleancache_fs_enabled(page)) + ret = __cleancache_get_page(page); + return ret; +} + +static inline void cleancache_put_page(struct page *page) +{ + if (cleancache_enabled && cleancache_fs_enabled(page)) + __cleancache_put_page(page); +} + +static inline void cleancache_flush_page(struct address_space *mapping, + struct page *page) +{ + /* careful... page->mapping is NULL sometimes when this is called, so the check uses the mapping argument */ + if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) + __cleancache_flush_page(mapping, page); +} + +static inline void cleancache_flush_inode(struct address_space *mapping) +{ + if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) + __cleancache_flush_inode(mapping); +} + +static inline void cleancache_flush_fs(struct super_block *sb) +{ + if (cleancache_enabled) /* the per-sb poolid check is done inside __cleancache_flush_fs */ + __cleancache_flush_fs(sb); +} + +#endif /* _LINUX_CLEANCACHE_H */ diff --git a/mm/Kconfig b/mm/Kconfig old mode 100644 new mode 100755 index 2c19c0ba..f86e0d29 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -288,3 +288,25 @@ config NOMMU_INITIAL_TRIM_EXCESS of 1 says that all excess pages should be trimmed. See Documentation/nommu-mmap.txt for more information. 
+config CLEANCACHE + bool "Enable cleancache driver to cache clean pages if tmem is present" + default n + help + Cleancache can be thought of as a page-granularity victim cache + for clean pages that the kernel's pageframe replacement algorithm + (PFRA) would like to keep around, but can't since there isn't enough + memory. So when the PFRA "evicts" a page, it first attempts to use + cleancache code to put the data contained in that page into + "transcendent memory", memory that is not directly accessible or + addressable by the kernel and is of unknown and possibly + time-varying size. And when a cleancache-enabled + filesystem wishes to access a page in a file on disk, it first + checks cleancache to see if it already contains it; if it does, + the page is copied into the kernel and a disk access is avoided. + When a transcendent memory driver is available (such as zcache or + Xen transcendent memory), a significant I/O reduction + may be achieved. When none is available, all cleancache calls + are reduced to a single pointer-compare-against-NULL resulting + in a negligible performance hit. + + If unsure, say Y to enable cleancache \ No newline at end of file diff --git a/mm/Makefile b/mm/Makefile old mode 100644 new mode 100755 index 66f54865..82a734fd --- a/mm/Makefile +++ b/mm/Makefile @@ -46,3 +46,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o +obj-$(CONFIG_CLEANCACHE) += cleancache.o diff --git a/mm/cleancache.c b/mm/cleancache.c new file mode 100755 index 00000000..bcaae4c2 --- /dev/null +++ b/mm/cleancache.c @@ -0,0 +1,244 @@ +/* + * Cleancache frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of cleancache. See + * Documentation/vm/cleancache.txt for more information. + * + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. 
+ * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include +#include +#include +#include +#include + +/* + * This global enablement flag may be read thousands of times per second + * by cleancache_get/put/flush even on systems where cleancache_ops + * is not claimed (e.g. cleancache is config'ed on but remains + * disabled), so is preferred to the slower alternative: a function + * call that checks a non-global. + */ +int cleancache_enabled; +EXPORT_SYMBOL(cleancache_enabled); + +/* + * cleancache_ops is set by cleancache_register_ops to contain the pointers + * to the cleancache "backend" implementation functions. + */ +static struct cleancache_ops cleancache_ops; + +/* useful stats available in /sys/kernel/mm/cleancache */ +static unsigned long cleancache_succ_gets; +static unsigned long cleancache_failed_gets; +static unsigned long cleancache_puts; +static unsigned long cleancache_flushes; + +/* + * register operations for cleancache, returning previous thus allowing + * detection of multiple backends and possible nesting + */ +struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) +{ + struct cleancache_ops old = cleancache_ops; /* previous ops, returned by value */ + + cleancache_ops = *ops; /* the ops table is copied, not referenced */ + cleancache_enabled = 1; /* mm/cleancache.c never clears this again */ + return old; +} +EXPORT_SYMBOL(cleancache_register_ops); + +/* Called by a cleancache-enabled filesystem at time of mount */ +void __cleancache_init_fs(struct super_block *sb) +{ + sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); /* a negative poolid keeps cleancache off for this sb */ +} +EXPORT_SYMBOL(__cleancache_init_fs); + +/* Called by a cleancache-enabled clustered filesystem at time of mount */ +void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) +{ + sb->cleancache_poolid = + (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); +} +EXPORT_SYMBOL(__cleancache_init_shared_fs); + +/* + * If the filesystem uses exportable filehandles, use the filehandle as + * the key, else use the inode number. 
+ */ +static int cleancache_get_key(struct inode *inode, + struct cleancache_filekey *key) +{ + int (*fhfn)(struct dentry *, __u32 *fh, int *, int); + int len = 0, maxlen = CLEANCACHE_KEY_MAX; + struct super_block *sb = inode->i_sb; + + key->u.ino = inode->i_ino; /* default key; u.fh shares this storage and may overwrite it below */ + if (sb->s_export_op != NULL) { + fhfn = sb->s_export_op->encode_fh; + if (fhfn) { + struct dentry d; /* NOTE(review): on-stack dentry, only d_inode is initialized */ + d.d_inode = inode; + len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); + if (len <= 0 || len == 255) /* presumably 255 is an encode_fh failure value - TODO confirm */ + return -1; + if (maxlen > CLEANCACHE_KEY_MAX) /* reported handle length exceeds the key */ + return -1; + } + } + return 0; +} + +/* + * "Get" data from cleancache associated with the poolid/inode/index + * that were specified when the data was put to cleancache and, if + * successful, use it to fill the specified page with data and return 0. + * The pageframe is unchanged and returns -1 if the get fails. + * Page must be locked by caller. + */ +int __cleancache_get_page(struct page *page) +{ + int ret = -1; + int pool_id; + struct cleancache_filekey key = { .u.key = { 0 } }; + + VM_BUG_ON(!PageLocked(page)); + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id < 0) /* filesystem has no pool: neither counter is touched */ + goto out; + + if (cleancache_get_key(page->mapping->host, &key) < 0) + goto out; + + ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); + if (ret == 0) + cleancache_succ_gets++; + else + cleancache_failed_gets++; +out: + return ret; +} +EXPORT_SYMBOL(__cleancache_get_page); + +/* + * "Put" data from a page to cleancache and associate it with the + * (previously-obtained per-filesystem) poolid and the page's, + * inode and page index. Page must be locked. Note that a put_page + * always "succeeds", though a subsequent get_page may succeed or fail. 
+ */ +void __cleancache_put_page(struct page *page) +{ + int pool_id; + struct cleancache_filekey key = { .u.key = { 0 } }; + + VM_BUG_ON(!PageLocked(page)); /* caller must hold the page lock */ + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id >= 0 && + cleancache_get_key(page->mapping->host, &key) >= 0) { + (*cleancache_ops.put_page)(pool_id, key, page->index, page); + cleancache_puts++; /* counts puts handed to the backend; put_page returns nothing */ + } +} +EXPORT_SYMBOL(__cleancache_put_page); + +/* + * Flush any data from cleancache associated with the poolid and the + * page's inode and page index so that a subsequent "get" will fail. + */ +void __cleancache_flush_page(struct address_space *mapping, struct page *page) +{ + /* careful... page->mapping is NULL sometimes when this is called, so the mapping argument is used instead */ + int pool_id = mapping->host->i_sb->cleancache_poolid; + struct cleancache_filekey key = { .u.key = { 0 } }; + + if (pool_id >= 0) { + VM_BUG_ON(!PageLocked(page)); /* lock is only asserted when the fs has a pool */ + if (cleancache_get_key(mapping->host, &key) >= 0) { + (*cleancache_ops.flush_page)(pool_id, key, page->index); + cleancache_flushes++; + } + } +} +EXPORT_SYMBOL(__cleancache_flush_page); + +/* + * Flush all data from cleancache associated with the poolid and the + * mappings's inode so that all subsequent gets to this poolid/inode + * will fail. 
+ */ +void __cleancache_flush_inode(struct address_space *mapping) +{ + int pool_id = mapping->host->i_sb->cleancache_poolid; + struct cleancache_filekey key = { .u.key = { 0 } }; + + if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) + (*cleancache_ops.flush_inode)(pool_id, key); /* not counted in cleancache_flushes */ +} +EXPORT_SYMBOL(__cleancache_flush_inode); + +/* + * Called by any cleancache-enabled filesystem at time of unmount; + * note that pool_id is surrendered and may be returned by a subsequent + * cleancache_init_fs or cleancache_init_shared_fs + */ +void __cleancache_flush_fs(struct super_block *sb) +{ + if (sb->cleancache_poolid >= 0) { + int old_poolid = sb->cleancache_poolid; + sb->cleancache_poolid = -1; /* sb is marked as having no pool before the backend call */ + (*cleancache_ops.flush_fs)(old_poolid); + } +} +EXPORT_SYMBOL(__cleancache_flush_fs); + +#ifdef CONFIG_SYSFS + +/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */ + +#define CLEANCACHE_SYSFS_RO(_name) \ + static ssize_t cleancache_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ + { \ + return sprintf(buf, "%lu\n", cleancache_##_name); \ + } \ + static struct kobj_attribute cleancache_##_name##_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = cleancache_##_name##_show, \ + } + +CLEANCACHE_SYSFS_RO(succ_gets); +CLEANCACHE_SYSFS_RO(failed_gets); +CLEANCACHE_SYSFS_RO(puts); +CLEANCACHE_SYSFS_RO(flushes); + +static struct attribute *cleancache_attrs[] = { + &cleancache_succ_gets_attr.attr, + &cleancache_failed_gets_attr.attr, + &cleancache_puts_attr.attr, + &cleancache_flushes_attr.attr, + NULL, +}; + +static struct attribute_group cleancache_attr_group = { + .attrs = cleancache_attrs, + .name = "cleancache", /* attributes appear under the cleancache directory of mm_kobj */ +}; + +#endif /* CONFIG_SYSFS */ + +static int __init init_cleancache(void) +{ +#ifdef CONFIG_SYSFS + int err; + + err = sysfs_create_group(mm_kobj, &cleancache_attr_group); /* NOTE(review): err is assigned but never checked; init always returns 0 */ +#endif /* CONFIG_SYSFS */ + return 0; +} +module_init(init_cleancache) From 8eb6724dbfb99bb1f17f3192483fafc1f9eb73fe Mon 
Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 00:51:06 +0800 Subject: [PATCH 03/14] add zcache --- drivers/staging/Kconfig | 1 + drivers/staging/Makefile | 1 + drivers/staging/zcache/Kconfig | 13 + drivers/staging/zcache/Makefile | 3 + drivers/staging/zcache/tmem.c | 710 +++++++++++++ drivers/staging/zcache/tmem.h | 195 ++++ drivers/staging/zcache/zcache.c | 1658 +++++++++++++++++++++++++++++++ 7 files changed, 2581 insertions(+) create mode 100755 drivers/staging/zcache/Kconfig create mode 100755 drivers/staging/zcache/Makefile create mode 100755 drivers/staging/zcache/tmem.c create mode 100755 drivers/staging/zcache/tmem.h create mode 100755 drivers/staging/zcache/zcache.c diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 8ee4bfa6..e4c3c9dd 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -125,5 +125,6 @@ source "drivers/staging/iio/Kconfig" source "drivers/staging/zram/Kconfig" +source "drivers/staging/zcache/Kconfig" endif # !STAGING_EXCLUDE_BUILD endif # STAGING diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 5a1b7341..5f0f554b 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -45,4 +45,5 @@ obj-$(CONFIG_DX_SEP) += sep/ obj-$(CONFIG_IIO) += iio/ obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_XVMALLOC) += zram/ +obj-$(CONFIG_ZCACHE) += zcache/ diff --git a/drivers/staging/zcache/Kconfig b/drivers/staging/zcache/Kconfig new file mode 100755 index 00000000..7fabcb2b --- /dev/null +++ b/drivers/staging/zcache/Kconfig @@ -0,0 +1,13 @@ +config ZCACHE + tristate "Dynamic compression of swap pages and clean pagecache pages" + depends on CLEANCACHE || FRONTSWAP + select XVMALLOC + select LZO_COMPRESS + select LZO_DECOMPRESS + default n + help + Zcache doubles RAM efficiency while providing a significant + performance boosts on many workloads. 
Zcache uses lzo1x + compression and an in-kernel implementation of transcendent + memory to store clean page cache pages and swap in RAM, + providing a noticeable reduction in disk I/O. diff --git a/drivers/staging/zcache/Makefile b/drivers/staging/zcache/Makefile new file mode 100755 index 00000000..f5ec64f9 --- /dev/null +++ b/drivers/staging/zcache/Makefile @@ -0,0 +1,3 @@ +zcache-y := tmem.o + +obj-$(CONFIG_ZCACHE) += zcache.o diff --git a/drivers/staging/zcache/tmem.c b/drivers/staging/zcache/tmem.c new file mode 100755 index 00000000..e954d405 --- /dev/null +++ b/drivers/staging/zcache/tmem.c @@ -0,0 +1,710 @@ +/* + * In-kernel transcendent memory (generic implementation) + * + * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. + * + * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented + * "handles" (triples containing a pool id, an object id, and an index), to + * pages in a page-accessible memory (PAM). Tmem references the PAM pages via + * an abstract "pampd" (PAM page-descriptor), which can be operated on by a + * set of functions (pamops). Each pampd contains some representation of + * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of + * pages and must be able to insert, find, and delete these pages at a + * potential frequency of thousands per second concurrently across many CPUs, + * (and, if used with KVM, across many vcpus across many guests). + * Tmem is tracked with a hierarchy of data structures, organized by + * the elements in a handle-tuple: pool_id, object_id, and page index. + * One or more "clients" (e.g. guests) each provide one or more tmem_pools. + * Each pool contains a hash table of rb_trees of tmem_objs. Each + * tmem_obj contains a radix-tree-like tree of pointers, with intermediate + * nodes called tmem_objnodes. 
Each leaf pointer in this tree points to + * a pampd, which is accessible only through a small set of callbacks + * registered by the PAM implementation (see tmem_register_pamops). Tmem + * does all memory allocation via a set of callbacks registered by the tmem + * host implementation (e.g. see tmem_register_hostops). + */ + +#include +#include +#include + +#include "tmem.h" + +/* data structure sentinels used for debugging... see tmem.h */ +#define POOL_SENTINEL 0x87658765 +#define OBJ_SENTINEL 0x12345678 +#define OBJNODE_SENTINEL 0xfedcba09 + +/* + * A tmem host implementation must use this function to register callbacks + * for memory allocation. + */ +static struct tmem_hostops tmem_hostops; + +static void tmem_objnode_tree_init(void); + +void tmem_register_hostops(struct tmem_hostops *m) +{ + tmem_objnode_tree_init(); + tmem_hostops = *m; +} + +/* + * A tmem host implementation must use this function to register + * callbacks for a page-accessible memory (PAM) implementation + */ +static struct tmem_pamops tmem_pamops; + +void tmem_register_pamops(struct tmem_pamops *m) +{ + tmem_pamops = *m; +} + +/* + * Oid's are potentially very sparse and tmem_objs may have an indeterminately + * short life, being added and deleted at a relatively high frequency. + * So an rb_tree is an ideal data structure to manage tmem_objs. But because + * of the potentially huge number of tmem_objs, each pool manages a hashtable + * of rb_trees to reduce search, insert, delete, and rebalancing time. + * Each hashbucket also has a lock to manage concurrent access. + * + * The following routines manage tmem_objs. When any tmem_obj is accessed, + * the hashbucket lock must be held. 
+ */ + +/* searches for object==oid in pool, returns locked object if found */ +static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb, + struct tmem_oid *oidp) +{ + struct rb_node *rbnode; + struct tmem_obj *obj; + + rbnode = hb->obj_rb_root.rb_node; + while (rbnode) { + BUG_ON(RB_EMPTY_NODE(rbnode)); + obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); + switch (tmem_oid_compare(oidp, &obj->oid)) { + case 0: /* equal */ + goto out; + case -1: + rbnode = rbnode->rb_left; + break; + case 1: + rbnode = rbnode->rb_right; + break; + } + } + obj = NULL; +out: + return obj; +} + +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *); + +/* free an object that has no more pampds in it */ +static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb) +{ + struct tmem_pool *pool; + + BUG_ON(obj == NULL); + ASSERT_SENTINEL(obj, OBJ); + BUG_ON(obj->pampd_count > 0); + pool = obj->pool; + BUG_ON(pool == NULL); + if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */ + tmem_pampd_destroy_all_in_obj(obj); + BUG_ON(obj->objnode_tree_root != NULL); + BUG_ON((long)obj->objnode_count != 0); + atomic_dec(&pool->obj_count); + BUG_ON(atomic_read(&pool->obj_count) < 0); + INVERT_SENTINEL(obj, OBJ); + obj->pool = NULL; + tmem_oid_set_invalid(&obj->oid); + rb_erase(&obj->rb_tree_node, &hb->obj_rb_root); +} + +/* + * initialize, and insert an tmem_object_root (called only if find failed) + */ +static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb, + struct tmem_pool *pool, + struct tmem_oid *oidp) +{ + struct rb_root *root = &hb->obj_rb_root; + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct tmem_obj *this; + + BUG_ON(pool == NULL); + atomic_inc(&pool->obj_count); + obj->objnode_tree_height = 0; + obj->objnode_tree_root = NULL; + obj->pool = pool; + obj->oid = *oidp; + obj->objnode_count = 0; + obj->pampd_count = 0; + SET_SENTINEL(obj, OBJ); + while (*new) { + BUG_ON(RB_EMPTY_NODE(*new)); + this = 
rb_entry(*new, struct tmem_obj, rb_tree_node); + parent = *new; + switch (tmem_oid_compare(oidp, &this->oid)) { + case 0: + BUG(); /* already present; should never happen! */ + break; + case -1: + new = &(*new)->rb_left; + break; + case 1: + new = &(*new)->rb_right; + break; + } + } + rb_link_node(&obj->rb_tree_node, parent, new); + rb_insert_color(&obj->rb_tree_node, root); +} + +/* + * Tmem is managed as a set of tmem_pools with certain attributes, such as + * "ephemeral" vs "persistent". These attributes apply to all tmem_objs + * and all pampds that belong to a tmem_pool. A tmem_pool is created + * or deleted relatively rarely (for example, when a filesystem is + * mounted or unmounted). + */ + +/* + * Flush all data from a pool: for every hash bucket, under that bucket's + * lock, destroy the pampds of each tmem_obj, unlink the object from the + * bucket's rb_tree and hand it back through hostops.obj_free. If @destroy + * is set, the pool is also unlinked from the list it sits on via pool_list; + * the tmem_pool structure itself is not freed here. + */ +static void tmem_pool_flush(struct tmem_pool *pool, bool destroy) +{ + struct rb_node *rbnode; + struct tmem_obj *obj; + struct tmem_hashbucket *hb; + int i; + + BUG_ON(pool == NULL); + /* only form the bucket pointer once pool is known to be non-NULL */ + hb = &pool->hashbucket[0]; + for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { + spin_lock(&hb->lock); + rbnode = rb_first(&hb->obj_rb_root); + while (rbnode != NULL) { + obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); + /* fetch the successor first: obj is erased and freed below */ + rbnode = rb_next(rbnode); + tmem_pampd_destroy_all_in_obj(obj); + tmem_obj_free(obj, hb); + (*tmem_hostops.obj_free)(obj, pool); + } + spin_unlock(&hb->lock); + } + if (destroy) + list_del(&pool->pool_list); +} + +/* + * A tmem_obj contains a radix-tree-like tree in which the intermediate + * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation + * is very specialized and tuned for specific uses and is not particularly + * suited for use from this code, though some code from the core algorithms has + * been reused, thus the copyright notices below). Each tmem_objnode contains + * a set of pointers which point to either a set of intermediate tmem_objnodes + * or a set of pampds.
+ * + * Portions Copyright (C) 2001 Momchil Velikov + * Portions Copyright (C) 2001 Christoph Hellwig + * Portions Copyright (C) 2005 SGI, Christoph Lameter + */ + +struct tmem_objnode_tree_path { + struct tmem_objnode *objnode; + int offset; +}; + +/* objnode height_to_maxindex translation */ +static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1]; + +static void tmem_objnode_tree_init(void) +{ + unsigned int ht, tmp; + + for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) { + tmp = ht * OBJNODE_TREE_MAP_SHIFT; + if (tmp >= OBJNODE_TREE_INDEX_BITS) + tmem_objnode_tree_h2max[ht] = ~0UL; + else + tmem_objnode_tree_h2max[ht] = + (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1; + } +} + +static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj) +{ + struct tmem_objnode *objnode; + + ASSERT_SENTINEL(obj, OBJ); + BUG_ON(obj->pool == NULL); + ASSERT_SENTINEL(obj->pool, POOL); + objnode = (*tmem_hostops.objnode_alloc)(obj->pool); + if (unlikely(objnode == NULL)) + goto out; + objnode->obj = obj; + SET_SENTINEL(objnode, OBJNODE); + memset(&objnode->slots, 0, sizeof(objnode->slots)); + objnode->slots_in_use = 0; + obj->objnode_count++; +out: + return objnode; +} + +static void tmem_objnode_free(struct tmem_objnode *objnode) +{ + struct tmem_pool *pool; + int i; + + BUG_ON(objnode == NULL); + for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) + BUG_ON(objnode->slots[i] != NULL); + ASSERT_SENTINEL(objnode, OBJNODE); + INVERT_SENTINEL(objnode, OBJNODE); + BUG_ON(objnode->obj == NULL); + ASSERT_SENTINEL(objnode->obj, OBJ); + pool = objnode->obj->pool; + BUG_ON(pool == NULL); + ASSERT_SENTINEL(pool, POOL); + objnode->obj->objnode_count--; + objnode->obj = NULL; + (*tmem_hostops.objnode_free)(objnode, pool); +} + +/* + * lookup index in object and return associated pampd (or NULL if not found) + */ +static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) +{ + unsigned int height, shift; + struct tmem_objnode **slot = NULL; + + 
BUG_ON(obj == NULL); + ASSERT_SENTINEL(obj, OBJ); + BUG_ON(obj->pool == NULL); + ASSERT_SENTINEL(obj->pool, POOL); + + height = obj->objnode_tree_height; + if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) + goto out; + if (height == 0 && obj->objnode_tree_root) { + slot = &obj->objnode_tree_root; + goto out; + } + shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; + slot = &obj->objnode_tree_root; + while (height > 0) { + if (*slot == NULL) + goto out; + slot = (struct tmem_objnode **) + ((*slot)->slots + + ((index >> shift) & OBJNODE_TREE_MAP_MASK)); + shift -= OBJNODE_TREE_MAP_SHIFT; + height--; + } +out: + return slot != NULL ? *slot : NULL; +} + +static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index, + void *pampd) +{ + int ret = 0; + struct tmem_objnode *objnode = NULL, *newnode, *slot; + unsigned int height, shift; + int offset = 0; + + /* if necessary, extend the tree to be higher */ + if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) { + height = obj->objnode_tree_height + 1; + if (index > tmem_objnode_tree_h2max[height]) + while (index > tmem_objnode_tree_h2max[height]) + height++; + if (obj->objnode_tree_root == NULL) { + obj->objnode_tree_height = height; + goto insert; + } + do { + newnode = tmem_objnode_alloc(obj); + if (!newnode) { + ret = -ENOMEM; + goto out; + } + newnode->slots[0] = obj->objnode_tree_root; + newnode->slots_in_use = 1; + obj->objnode_tree_root = newnode; + obj->objnode_tree_height++; + } while (height > obj->objnode_tree_height); + } +insert: + slot = obj->objnode_tree_root; + height = obj->objnode_tree_height; + shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; + while (height > 0) { + if (slot == NULL) { + /* add a child objnode. 
*/ + slot = tmem_objnode_alloc(obj); + if (!slot) { + ret = -ENOMEM; + goto out; + } + if (objnode) { + + objnode->slots[offset] = slot; + objnode->slots_in_use++; + } else + obj->objnode_tree_root = slot; + } + /* go down a level */ + offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; + objnode = slot; + slot = objnode->slots[offset]; + shift -= OBJNODE_TREE_MAP_SHIFT; + height--; + } + BUG_ON(slot != NULL); + if (objnode) { + objnode->slots_in_use++; + objnode->slots[offset] = pampd; + } else + obj->objnode_tree_root = pampd; + obj->pampd_count++; +out: + return ret; +} + +static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index) +{ + struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1]; + struct tmem_objnode_tree_path *pathp = path; + struct tmem_objnode *slot = NULL; + unsigned int height, shift; + int offset; + + BUG_ON(obj == NULL); + ASSERT_SENTINEL(obj, OBJ); + BUG_ON(obj->pool == NULL); + ASSERT_SENTINEL(obj->pool, POOL); + height = obj->objnode_tree_height; + if (index > tmem_objnode_tree_h2max[height]) + goto out; + slot = obj->objnode_tree_root; + if (height == 0 && obj->objnode_tree_root) { + obj->objnode_tree_root = NULL; + goto out; + } + shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT; + pathp->objnode = NULL; + do { + if (slot == NULL) + goto out; + pathp++; + offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; + pathp->offset = offset; + pathp->objnode = slot; + slot = slot->slots[offset]; + shift -= OBJNODE_TREE_MAP_SHIFT; + height--; + } while (height > 0); + if (slot == NULL) + goto out; + while (pathp->objnode) { + pathp->objnode->slots[pathp->offset] = NULL; + pathp->objnode->slots_in_use--; + if (pathp->objnode->slots_in_use) { + if (pathp->objnode == obj->objnode_tree_root) { + while (obj->objnode_tree_height > 0 && + obj->objnode_tree_root->slots_in_use == 1 && + obj->objnode_tree_root->slots[0]) { + struct tmem_objnode *to_free = + obj->objnode_tree_root; + + obj->objnode_tree_root = + to_free->slots[0]; + 
obj->objnode_tree_height--; + to_free->slots[0] = NULL; + to_free->slots_in_use = 0; + tmem_objnode_free(to_free); + } + } + goto out; + } + tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */ + pathp--; + } + obj->objnode_tree_height = 0; + obj->objnode_tree_root = NULL; + +out: + if (slot != NULL) + obj->pampd_count--; + BUG_ON(obj->pampd_count < 0); + return slot; +} + +/* recursively walk the objnode_tree destroying pampds and objnodes */ +static void tmem_objnode_node_destroy(struct tmem_obj *obj, + struct tmem_objnode *objnode, + unsigned int ht) +{ + int i; + + if (ht == 0) + return; + for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) { + if (objnode->slots[i]) { + if (ht == 1) { + obj->pampd_count--; + (*tmem_pamops.free)(objnode->slots[i], + obj->pool); + objnode->slots[i] = NULL; + continue; + } + tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1); + tmem_objnode_free(objnode->slots[i]); + objnode->slots[i] = NULL; + } + } +} + +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) +{ + if (obj->objnode_tree_root == NULL) + return; + if (obj->objnode_tree_height == 0) { + obj->pampd_count--; + (*tmem_pamops.free)(obj->objnode_tree_root, obj->pool); + } else { + tmem_objnode_node_destroy(obj, obj->objnode_tree_root, + obj->objnode_tree_height); + tmem_objnode_free(obj->objnode_tree_root); + obj->objnode_tree_height = 0; + } + obj->objnode_tree_root = NULL; +} + +/* + * Tmem is operated on by a set of well-defined actions: + * "put", "get", "flush", "flush_object", "new pool" and "destroy pool". + * (The tmem ABI allows for subpages and exchanges but these operations + * are not included in this implementation.) + * + * These "tmem core" operations are implemented in the following functions. + */ + +/* + * "Put" a page, e.g. copy a page from the kernel into newly allocated + * PAM space (if such space is available). Tmem_put is complicated by + * a corner case: What if a page with matching handle already exists in + * tmem? 
To guarantee coherency, one of two actions is necessary: Either + * the data for the page must be overwritten, or the page must be + * "flushed" so that the data is not accessible to a subsequent "get". + * Since these "duplicate puts" are relatively rare, this implementation + * always flushes for simplicity. + */ +int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, + struct page *page) +{ + struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL; + void *pampd = NULL, *pampd_del = NULL; + int ret = -ENOMEM; + bool ephemeral; + struct tmem_hashbucket *hb; + + ephemeral = is_ephemeral(pool); + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = objfound = tmem_obj_find(hb, oidp); + if (obj != NULL) { + pampd = tmem_pampd_lookup_in_obj(objfound, index); + if (pampd != NULL) { + /* if found, is a dup put, flush the old one */ + pampd_del = tmem_pampd_delete_from_obj(obj, index); + BUG_ON(pampd_del != pampd); + (*tmem_pamops.free)(pampd, pool); + if (obj->pampd_count == 0) { + objnew = obj; + objfound = NULL; + } + pampd = NULL; + } + } else { + obj = objnew = (*tmem_hostops.obj_alloc)(pool); + if (unlikely(obj == NULL)) { + ret = -ENOMEM; + goto out; + } + tmem_obj_init(obj, hb, pool, oidp); + } + BUG_ON(obj == NULL); + BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound)); + pampd = (*tmem_pamops.create)(obj->pool, &obj->oid, index, page); + if (unlikely(pampd == NULL)) + goto free; + ret = tmem_pampd_add_to_obj(obj, index, pampd); + if (unlikely(ret == -ENOMEM)) + /* may have partially built objnode tree ("stump") */ + goto delete_and_free; + goto out; + +delete_and_free: + (void)tmem_pampd_delete_from_obj(obj, index); +free: + if (pampd) + (*tmem_pamops.free)(pampd, pool); + if (objnew) { + tmem_obj_free(objnew, hb); + (*tmem_hostops.obj_free)(objnew, pool); + } +out: + spin_unlock(&hb->lock); + return ret; +} + +/* + * "Get" a page, e.g. 
if one can be found, copy the tmem page with the + * matching handle from PAM space to the kernel. By tmem definition, + * when a "get" is successful on an ephemeral page, the page is "flushed", + * and when a "get" is successful on a persistent page, the page is retained + * in tmem. Note that to preserve + * coherency, "get" can never be skipped if tmem contains the data. + * That is, if a get is done with a certain handle and fails, any + * subsequent "get" must also fail (unless of course there is a + * "put" done with the same handle). + + */ +int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, + uint32_t index, struct page *page) +{ + struct tmem_obj *obj; + void *pampd; + bool ephemeral = is_ephemeral(pool); + uint32_t ret = -1; + struct tmem_hashbucket *hb; + + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; + ephemeral = is_ephemeral(pool); + if (ephemeral) + pampd = tmem_pampd_delete_from_obj(obj, index); + else + pampd = tmem_pampd_lookup_in_obj(obj, index); + if (pampd == NULL) + goto out; + ret = (*tmem_pamops.get_data)(page, pampd, pool); + if (ret < 0) + goto out; + if (ephemeral) { + (*tmem_pamops.free)(pampd, pool); + if (obj->pampd_count == 0) { + tmem_obj_free(obj, hb); + (*tmem_hostops.obj_free)(obj, pool); + obj = NULL; + } + } + ret = 0; +out: + spin_unlock(&hb->lock); + return ret; +} + +/* + * If a page in tmem matches the handle, "flush" this page from tmem such + * that any subsequent "get" does not succeed (unless, of course, there + * was another "put" with the same handle). 
+ */ +int tmem_flush_page(struct tmem_pool *pool, + struct tmem_oid *oidp, uint32_t index) +{ + struct tmem_obj *obj; + void *pampd; + int ret = -1; + struct tmem_hashbucket *hb; + + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; + pampd = tmem_pampd_delete_from_obj(obj, index); + if (pampd == NULL) + goto out; + (*tmem_pamops.free)(pampd, pool); + if (obj->pampd_count == 0) { + tmem_obj_free(obj, hb); + (*tmem_hostops.obj_free)(obj, pool); + } + ret = 0; + +out: + spin_unlock(&hb->lock); + return ret; +} + +/* + * "Flush" all pages in tmem matching this oid. + */ +int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp) +{ + struct tmem_obj *obj; + struct tmem_hashbucket *hb; + int ret = -1; + + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; + tmem_pampd_destroy_all_in_obj(obj); + tmem_obj_free(obj, hb); + (*tmem_hostops.obj_free)(obj, pool); + ret = 0; + +out: + spin_unlock(&hb->lock); + return ret; +} + +/* + * "Flush" all pages (and tmem_objs) from this tmem_pool and disable + * all subsequent access to this tmem_pool. + */ +int tmem_destroy_pool(struct tmem_pool *pool) +{ + int ret = -1; + + if (pool == NULL) + goto out; + tmem_pool_flush(pool, 1); + ret = 0; +out: + return ret; +} + +static LIST_HEAD(tmem_global_pool_list); + +/* + * Create a new tmem_pool with the provided flag and return + * a pool id provided by the tmem host implementation. 
+ */ +void tmem_new_pool(struct tmem_pool *pool, uint32_t flags) +{ + int persistent = flags & TMEM_POOL_PERSIST; + int shared = flags & TMEM_POOL_SHARED; + struct tmem_hashbucket *hb = &pool->hashbucket[0]; + int i; + + for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { + hb->obj_rb_root = RB_ROOT; + spin_lock_init(&hb->lock); + } + INIT_LIST_HEAD(&pool->pool_list); + atomic_set(&pool->obj_count, 0); + SET_SENTINEL(pool, POOL); + list_add_tail(&pool->pool_list, &tmem_global_pool_list); + pool->persistent = persistent; + pool->shared = shared; +} diff --git a/drivers/staging/zcache/tmem.h b/drivers/staging/zcache/tmem.h new file mode 100755 index 00000000..2e07e217 --- /dev/null +++ b/drivers/staging/zcache/tmem.h @@ -0,0 +1,195 @@ +/* + * tmem.h + * + * Transcendent memory + * + * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. + */ + +#ifndef _TMEM_H_ +#define _TMEM_H_ + +#include +#include +#include +#include + +/* + * These are pre-defined by the Xen<->Linux ABI + */ +#define TMEM_PUT_PAGE 4 +#define TMEM_GET_PAGE 5 +#define TMEM_FLUSH_PAGE 6 +#define TMEM_FLUSH_OBJECT 7 +#define TMEM_POOL_PERSIST 1 +#define TMEM_POOL_SHARED 2 +#define TMEM_POOL_PRECOMPRESSED 4 +#define TMEM_POOL_PAGESIZE_SHIFT 4 +#define TMEM_POOL_PAGESIZE_MASK 0xf +#define TMEM_POOL_RESERVED_BITS 0x00ffff00 + +/* + * sentinels have proven very useful for debugging but can be removed + * or disabled before final merge. 
+ */ +#define SENTINELS +#ifdef SENTINELS +#define DECL_SENTINEL uint32_t sentinel; +#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL) +#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL) +#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL) +#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL) +#else +#define DECL_SENTINEL +#define SET_SENTINEL(_x, _y) do { } while (0) +#define INVERT_SENTINEL(_x, _y) do { } while (0) +#define ASSERT_SENTINEL(_x, _y) do { } while (0) +#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0) +#endif + +#define ASSERT_SPINLOCK(_l) WARN_ON(!spin_is_locked(_l)) + +/* + * A pool is the highest-level data structure managed by tmem and + * usually corresponds to a large independent set of pages such as + * a filesystem. Each pool has an id, and certain attributes and counters. + * It also contains a set of hash buckets, each of which contains an rbtree + * of objects and a lock to manage concurrency within the pool. + */ + +#define TMEM_HASH_BUCKET_BITS 8 +#define TMEM_HASH_BUCKETS (1<persistent) +#define is_ephemeral(_p) (!(_p->persistent)) + +/* + * An object id ("oid") is large: 192-bits (to ensure, for example, files + * in a modern filesystem can be uniquely identified). 
+ */ + +struct tmem_oid { + uint64_t oid[3]; +}; + +/* + * Mark an oid as invalid by setting every bit of all three 64-bit words. + * The constant is built as a uint64_t so that the pattern is the same + * whether unsigned long is 32 or 64 bits wide. + */ +static inline void tmem_oid_set_invalid(struct tmem_oid *oidp) +{ + oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = ~(uint64_t)0; +} + +/* an oid is valid unless all three words carry the invalid pattern */ +static inline bool tmem_oid_valid(struct tmem_oid *oidp) +{ + return oidp->oid[0] != ~(uint64_t)0 || oidp->oid[1] != ~(uint64_t)0 || + oidp->oid[2] != ~(uint64_t)0; +} + +/* + * Three-way compare of two oids, treating oid[2] as the most significant + * word and oid[0] as the least. Returns -1, 0 or 1 when left is less + * than, equal to or greater than right. + */ +static inline int tmem_oid_compare(struct tmem_oid *left, + struct tmem_oid *right) +{ + int i; + + for (i = 2; i >= 0; i--) { + if (left->oid[i] < right->oid[i]) + return -1; + if (left->oid[i] > right->oid[i]) + return 1; + } + return 0; +} + +/* + * Hash an oid: hash_long() of the XOR of its three words, using + * TMEM_HASH_BUCKET_BITS bits. Callers use the result to index a pool's + * hashbucket array. + */ +static inline unsigned tmem_oid_hash(struct tmem_oid *oidp) +{ + return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2], + TMEM_HASH_BUCKET_BITS); +} + +/* + * A tmem_obj contains an identifier (oid), pointers to the parent + * pool and the rb_tree to which it belongs, counters, and an ordered + * set of pampds, structured in a radix-tree-like tree. The intermediate + * nodes of the tree are called tmem_objnodes.
+ */ + +struct tmem_objnode; + +struct tmem_obj { + struct tmem_oid oid; + struct tmem_pool *pool; + struct rb_node rb_tree_node; + struct tmem_objnode *objnode_tree_root; + unsigned int objnode_tree_height; + unsigned long objnode_count; + long pampd_count; + DECL_SENTINEL +}; + +#define OBJNODE_TREE_MAP_SHIFT 6 +#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT) +#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1) +#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +#define OBJNODE_TREE_MAX_PATH \ + (OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2) + +struct tmem_objnode { + struct tmem_obj *obj; + DECL_SENTINEL + void *slots[OBJNODE_TREE_MAP_SIZE]; + unsigned int slots_in_use; +}; + +/* pampd abstract datatype methods provided by the PAM implementation */ +struct tmem_pamops { + void *(*create)(struct tmem_pool *, struct tmem_oid *, uint32_t, + struct page *); + int (*get_data)(struct page *, void *, struct tmem_pool *); + void (*free)(void *, struct tmem_pool *); +}; +extern void tmem_register_pamops(struct tmem_pamops *m); + +/* memory allocation methods provided by the host implementation */ +struct tmem_hostops { + struct tmem_obj *(*obj_alloc)(struct tmem_pool *); + void (*obj_free)(struct tmem_obj *, struct tmem_pool *); + struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *); + void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *); +}; +extern void tmem_register_hostops(struct tmem_hostops *m); + +/* core tmem accessor functions */ +extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index, + struct page *page); +extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index, + struct page *page); +extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *, + uint32_t index); +extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *); +extern int tmem_destroy_pool(struct tmem_pool *); +extern void tmem_new_pool(struct tmem_pool *, uint32_t); +#endif /* 
_TMEM_H */ diff --git a/drivers/staging/zcache/zcache.c b/drivers/staging/zcache/zcache.c new file mode 100755 index 00000000..b8a2b30a --- /dev/null +++ b/drivers/staging/zcache/zcache.c @@ -0,0 +1,1658 @@ +/* + * zcache.c + * + * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp. + * Copyright (c) 2010,2011, Nitin Gupta + * + * Zcache provides an in-kernel "host implementation" for transcendent memory + * and, thus indirectly, for cleancache and frontswap. Zcache includes two + * page-accessible memory [1] interfaces, both utilizing lzo1x compression: + * 1) "compression buddies" ("zbud") is used for ephemeral pages + * 2) xvmalloc is used for persistent pages. + * Xvmalloc (based on the TLSF allocator) has very low fragmentation + * so maximizes space efficiency, while zbud allows pairs (and potentially, + * in the future, more than a pair of) compressed pages to be closely linked + * so that reclaiming can be done via the kernel's physical-page-oriented + * "shrinker" interface. + * + * [1] For a definition of page-accessible memory (aka PAM), see: + * http://marc.info/?l=linux-mm&m=127811271605009 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "tmem.h" + +#include "../zram/xvmalloc.h" /* if built in drivers/staging */ + +#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP)) +#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP" +#endif +#ifdef CONFIG_CLEANCACHE +#include +#endif +#ifdef CONFIG_FRONTSWAP +#include +#endif + +#if 0 +/* this is more aggressive but may cause other problems? 
*/ +#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN) +#else +#define ZCACHE_GFP_MASK \ + (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC) +#endif + +/********** + * Compression buddies ("zbud") provides for packing two (or, possibly + * in the future, more) compressed ephemeral pages into a single "raw" + * (physical) page and tracking them with data structures so that + * the raw pages can be easily reclaimed. + * + * A zbud page ("zbpg") is an aligned page containing a list_head, + * a lock, and two "zbud headers". The remainder of the physical + * page is divided up into aligned 64-byte "chunks" which contain + * the compressed data for zero, one, or two zbuds. Each zbpg + * resides on: (1) an "unused list" if it has no zbuds; (2) a + * "buddied" list if it is fully populated with two zbuds; or + * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks + * the one unbuddied zbud uses. The data inside a zbpg cannot be + * read or written unless the zbpg's lock is held. 
+ */ + +#define ZBH_SENTINEL 0x43214321 +#define ZBPG_SENTINEL 0xdeadbeef + +#define ZBUD_MAX_BUDS 2 + +struct zbud_hdr { + uint32_t pool_id; + struct tmem_oid oid; + uint32_t index; + uint16_t size; /* compressed size in bytes, zero means unused */ + DECL_SENTINEL +}; + +struct zbud_page { + struct list_head bud_list; + spinlock_t lock; + struct zbud_hdr buddy[ZBUD_MAX_BUDS]; + DECL_SENTINEL + /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */ +}; + +#define CHUNK_SHIFT 6 +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define CHUNK_MASK (~(CHUNK_SIZE-1)) +#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \ + CHUNK_MASK) >> CHUNK_SHIFT) +#define MAX_CHUNK (NCHUNKS-1) + +static struct { + struct list_head list; + unsigned count; +} zbud_unbuddied[NCHUNKS]; +/* list N contains pages with N chunks USED and NCHUNKS-N unused */ +/* element 0 is never used but optimizing that isn't worth it */ +static unsigned long zbud_cumul_chunk_counts[NCHUNKS]; + +struct list_head zbud_buddied_list; +static unsigned long zcache_zbud_buddied_count; + +/* protects the buddied list and all unbuddied lists */ +static DEFINE_SPINLOCK(zbud_budlists_spinlock); + +static LIST_HEAD(zbpg_unused_list); +static unsigned long zcache_zbpg_unused_list_count; + +/* protects the unused page list */ +static DEFINE_SPINLOCK(zbpg_unused_list_spinlock); + +static atomic_t zcache_zbud_curr_raw_pages; +static atomic_t zcache_zbud_curr_zpages; +static unsigned long zcache_zbud_curr_zbytes; +static unsigned long zcache_zbud_cumul_zpages; +static unsigned long zcache_zbud_cumul_zbytes; +static unsigned long zcache_compress_poor; + +/* forward references */ +static void *zcache_get_free_page(void); +static void zcache_free_page(void *p); + +/* + * zbud helper functions + */ + +static inline unsigned zbud_max_buddy_size(void) +{ + return MAX_CHUNK << CHUNK_SHIFT; +} + +static inline unsigned zbud_size_to_chunks(unsigned size) +{ + BUG_ON(size == 0 || size > zbud_max_buddy_size()); + return (size + 
CHUNK_SIZE - 1) >> CHUNK_SHIFT; +} + +static inline int zbud_budnum(struct zbud_hdr *zh) +{ + unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1); + struct zbud_page *zbpg = NULL; + unsigned budnum = -1U; + int i; + + for (i = 0; i < ZBUD_MAX_BUDS; i++) + if (offset == offsetof(typeof(*zbpg), buddy[i])) { + budnum = i; + break; + } + BUG_ON(budnum == -1U); + return budnum; +} + +static char *zbud_data(struct zbud_hdr *zh, unsigned size) +{ + struct zbud_page *zbpg; + char *p; + unsigned budnum; + + ASSERT_SENTINEL(zh, ZBH); + budnum = zbud_budnum(zh); + BUG_ON(size == 0 || size > zbud_max_buddy_size()); + zbpg = container_of(zh, struct zbud_page, buddy[budnum]); + ASSERT_SPINLOCK(&zbpg->lock); + p = (char *)zbpg; + if (budnum == 0) + p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) & + CHUNK_MASK); + else if (budnum == 1) + p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK); + return p; +} + +/* + * zbud raw page management + */ + +static struct zbud_page *zbud_alloc_raw_page(void) +{ + struct zbud_page *zbpg = NULL; + struct zbud_hdr *zh0, *zh1; + bool recycled = 0; + + /* if any pages on the zbpg list, use one */ + spin_lock(&zbpg_unused_list_spinlock); + if (!list_empty(&zbpg_unused_list)) { + zbpg = list_first_entry(&zbpg_unused_list, + struct zbud_page, bud_list); + list_del_init(&zbpg->bud_list); + zcache_zbpg_unused_list_count--; + recycled = 1; + } + spin_unlock(&zbpg_unused_list_spinlock); + if (zbpg == NULL) + /* none on zbpg list, try to get a kernel page */ + zbpg = zcache_get_free_page(); + if (likely(zbpg != NULL)) { + INIT_LIST_HEAD(&zbpg->bud_list); + zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; + spin_lock_init(&zbpg->lock); + if (recycled) { + ASSERT_INVERTED_SENTINEL(zbpg, ZBPG); + SET_SENTINEL(zbpg, ZBPG); + BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); + BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); + } else { + atomic_inc(&zcache_zbud_curr_raw_pages); + INIT_LIST_HEAD(&zbpg->bud_list); + SET_SENTINEL(zbpg, ZBPG); + 
zh0->size = 0; zh1->size = 0; + tmem_oid_set_invalid(&zh0->oid); + tmem_oid_set_invalid(&zh1->oid); + } + } + return zbpg; +} + +static void zbud_free_raw_page(struct zbud_page *zbpg) +{ + struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1]; + + ASSERT_SENTINEL(zbpg, ZBPG); + BUG_ON(!list_empty(&zbpg->bud_list)); + ASSERT_SPINLOCK(&zbpg->lock); + BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); + BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); + INVERT_SENTINEL(zbpg, ZBPG); + spin_unlock(&zbpg->lock); + spin_lock(&zbpg_unused_list_spinlock); + list_add(&zbpg->bud_list, &zbpg_unused_list); + zcache_zbpg_unused_list_count++; + spin_unlock(&zbpg_unused_list_spinlock); +} + +/* + * core zbud handling routines + */ + +static unsigned zbud_free(struct zbud_hdr *zh) +{ + unsigned size; + + ASSERT_SENTINEL(zh, ZBH); + BUG_ON(!tmem_oid_valid(&zh->oid)); + size = zh->size; + BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); + zh->size = 0; + tmem_oid_set_invalid(&zh->oid); + INVERT_SENTINEL(zh, ZBH); + zcache_zbud_curr_zbytes -= size; + atomic_dec(&zcache_zbud_curr_zpages); + return size; +} + +static void zbud_free_and_delist(struct zbud_hdr *zh) +{ + unsigned chunks; + struct zbud_hdr *zh_other; + unsigned budnum = zbud_budnum(zh), size; + struct zbud_page *zbpg = + container_of(zh, struct zbud_page, buddy[budnum]); + + spin_lock(&zbpg->lock); + if (list_empty(&zbpg->bud_list)) { + /* ignore zombie page... see zbud_evict_pages() */ + spin_unlock(&zbpg->lock); + return; + } + size = zbud_free(zh); + ASSERT_SPINLOCK(&zbpg->lock); + zh_other = &zbpg->buddy[(budnum == 0) ? 
1 : 0]; + if (zh_other->size == 0) { /* was unbuddied: unlist and free */ + chunks = zbud_size_to_chunks(size) ; + spin_lock(&zbud_budlists_spinlock); + BUG_ON(list_empty(&zbud_unbuddied[chunks].list)); + list_del_init(&zbpg->bud_list); + zbud_unbuddied[chunks].count--; + spin_unlock(&zbud_budlists_spinlock); + zbud_free_raw_page(zbpg); + } else { /* was buddied: move remaining buddy to unbuddied list */ + chunks = zbud_size_to_chunks(zh_other->size) ; + spin_lock(&zbud_budlists_spinlock); + list_del_init(&zbpg->bud_list); + zcache_zbud_buddied_count--; + list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list); + zbud_unbuddied[chunks].count++; + spin_unlock(&zbud_budlists_spinlock); + spin_unlock(&zbpg->lock); + } +} + +static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid, + uint32_t index, struct page *page, + void *cdata, unsigned size) +{ + struct zbud_hdr *zh0, *zh1, *zh = NULL; + struct zbud_page *zbpg = NULL, *ztmp; + unsigned nchunks; + char *to; + int i, found_good_buddy = 0; + + nchunks = zbud_size_to_chunks(size) ; + for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) { + spin_lock(&zbud_budlists_spinlock); + if (!list_empty(&zbud_unbuddied[i].list)) { + list_for_each_entry_safe(zbpg, ztmp, + &zbud_unbuddied[i].list, bud_list) { + if (spin_trylock(&zbpg->lock)) { + found_good_buddy = i; + goto found_unbuddied; + } + } + } + spin_unlock(&zbud_budlists_spinlock); + } + /* didn't find a good buddy, try allocating a new page */ + zbpg = zbud_alloc_raw_page(); + if (unlikely(zbpg == NULL)) + goto out; + /* ok, have a page, now compress the data before taking locks */ + spin_lock(&zbpg->lock); + spin_lock(&zbud_budlists_spinlock); + list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list); + zbud_unbuddied[nchunks].count++; + zh = &zbpg->buddy[0]; + goto init_zh; + +found_unbuddied: + ASSERT_SPINLOCK(&zbpg->lock); + zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; + BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0))); + if (zh0->size != 0) { 
/* buddy0 in use, buddy1 is vacant */ + ASSERT_SENTINEL(zh0, ZBH); + zh = zh1; + } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */ + ASSERT_SENTINEL(zh1, ZBH); + zh = zh0; + } else + BUG(); + list_del_init(&zbpg->bud_list); + zbud_unbuddied[found_good_buddy].count--; + list_add_tail(&zbpg->bud_list, &zbud_buddied_list); + zcache_zbud_buddied_count++; + +init_zh: + SET_SENTINEL(zh, ZBH); + zh->size = size; + zh->index = index; + zh->oid = *oid; + zh->pool_id = pool_id; + /* can wait to copy the data until the list locks are dropped */ + spin_unlock(&zbud_budlists_spinlock); + + to = zbud_data(zh, size); + memcpy(to, cdata, size); + spin_unlock(&zbpg->lock); + zbud_cumul_chunk_counts[nchunks]++; + atomic_inc(&zcache_zbud_curr_zpages); + zcache_zbud_cumul_zpages++; + zcache_zbud_curr_zbytes += size; + zcache_zbud_cumul_zbytes += size; +out: + return zh; +} + +static int zbud_decompress(struct page *page, struct zbud_hdr *zh) +{ + struct zbud_page *zbpg; + unsigned budnum = zbud_budnum(zh); + size_t out_len = PAGE_SIZE; + char *to_va, *from_va; + unsigned size; + int ret = 0; + + zbpg = container_of(zh, struct zbud_page, buddy[budnum]); + spin_lock(&zbpg->lock); + if (list_empty(&zbpg->bud_list)) { + /* ignore zombie page... see zbud_evict_pages() */ + ret = -EINVAL; + goto out; + } + ASSERT_SENTINEL(zh, ZBH); + BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); + to_va = kmap_atomic(page, KM_USER0); + size = zh->size; + from_va = zbud_data(zh, size); + ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len); + BUG_ON(ret != LZO_E_OK); + BUG_ON(out_len != PAGE_SIZE); + kunmap_atomic(to_va, KM_USER0); +out: + spin_unlock(&zbpg->lock); + return ret; +} + +/* + * The following routines handle shrinking of ephemeral pages by evicting + * pages "least valuable" first. 
+ */ + +static unsigned long zcache_evicted_raw_pages; +static unsigned long zcache_evicted_buddied_pages; +static unsigned long zcache_evicted_unbuddied_pages; + +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid); +static void zcache_put_pool(struct tmem_pool *pool); + +/* + * Flush and free all zbuds in a zbpg, then free the pageframe + */ +static void zbud_evict_zbpg(struct zbud_page *zbpg) +{ + struct zbud_hdr *zh; + int i, j; + uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS]; + struct tmem_oid oid[ZBUD_MAX_BUDS]; + struct tmem_pool *pool; + + ASSERT_SPINLOCK(&zbpg->lock); + BUG_ON(!list_empty(&zbpg->bud_list)); + for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) { + zh = &zbpg->buddy[i]; + if (zh->size) { + pool_id[j] = zh->pool_id; + oid[j] = zh->oid; + index[j] = zh->index; + j++; + zbud_free(zh); + } + } + spin_unlock(&zbpg->lock); + for (i = 0; i < j; i++) { + pool = zcache_get_pool_by_id(pool_id[i]); + if (pool != NULL) { + tmem_flush_page(pool, &oid[i], index[i]); + zcache_put_pool(pool); + } + } + ASSERT_SENTINEL(zbpg, ZBPG); + spin_lock(&zbpg->lock); + zbud_free_raw_page(zbpg); +} + +/* + * Free nr pages. This code is funky because we want to hold the locks + * protecting various lists for as short a time as possible, and in some + * circumstances the list may change asynchronously when the list lock is + * not held. In some cases we also trylock not only to avoid waiting on a + * page in use by another cpu, but also to avoid potential deadlock due to + * lock inversion. 
+ */ +static void zbud_evict_pages(int nr) +{ + struct zbud_page *zbpg; + int i; + + /* first try freeing any pages on unused list */ +retry_unused_list: + spin_lock_bh(&zbpg_unused_list_spinlock); + if (!list_empty(&zbpg_unused_list)) { + /* can't walk list here, since it may change when unlocked */ + zbpg = list_first_entry(&zbpg_unused_list, + struct zbud_page, bud_list); + list_del_init(&zbpg->bud_list); + zcache_zbpg_unused_list_count--; + atomic_dec(&zcache_zbud_curr_raw_pages); + spin_unlock_bh(&zbpg_unused_list_spinlock); + zcache_free_page(zbpg); + zcache_evicted_raw_pages++; + if (--nr <= 0) + goto out; + goto retry_unused_list; + } + spin_unlock_bh(&zbpg_unused_list_spinlock); + + /* now try freeing unbuddied pages, starting with least space avail */ + for (i = 0; i < MAX_CHUNK; i++) { +retry_unbud_list_i: + spin_lock_bh(&zbud_budlists_spinlock); + if (list_empty(&zbud_unbuddied[i].list)) { + spin_unlock_bh(&zbud_budlists_spinlock); + continue; + } + list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) { + if (unlikely(!spin_trylock(&zbpg->lock))) + continue; + list_del_init(&zbpg->bud_list); + zbud_unbuddied[i].count--; + spin_unlock(&zbud_budlists_spinlock); + zcache_evicted_unbuddied_pages++; + /* want budlists unlocked when doing zbpg eviction */ + zbud_evict_zbpg(zbpg); + local_bh_enable(); + if (--nr <= 0) + goto out; + goto retry_unbud_list_i; + } + spin_unlock_bh(&zbud_budlists_spinlock); + } + + /* as a last resort, free buddied pages */ +retry_bud_list: + spin_lock_bh(&zbud_budlists_spinlock); + if (list_empty(&zbud_buddied_list)) { + spin_unlock_bh(&zbud_budlists_spinlock); + goto out; + } + list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) { + if (unlikely(!spin_trylock(&zbpg->lock))) + continue; + list_del_init(&zbpg->bud_list); + zcache_zbud_buddied_count--; + spin_unlock(&zbud_budlists_spinlock); + zcache_evicted_buddied_pages++; + /* want budlists unlocked when doing zbpg eviction */ + zbud_evict_zbpg(zbpg); + 
local_bh_enable(); + if (--nr <= 0) + goto out; + goto retry_bud_list; + } + spin_unlock_bh(&zbud_budlists_spinlock); +out: + return; +} + +static void zbud_init(void) +{ + int i; + + INIT_LIST_HEAD(&zbud_buddied_list); + zcache_zbud_buddied_count = 0; + for (i = 0; i < NCHUNKS; i++) { + INIT_LIST_HEAD(&zbud_unbuddied[i].list); + zbud_unbuddied[i].count = 0; + } +} + +#ifdef CONFIG_SYSFS +/* + * These sysfs routines show a nice distribution of how many zbpg's are + * currently (and have ever been placed) in each unbuddied list. It's fun + * to watch but can probably go away before final merge. + */ +static int zbud_show_unbuddied_list_counts(char *buf) +{ + int i; + char *p = buf; + + for (i = 0; i < NCHUNKS - 1; i++) + p += sprintf(p, "%u ", zbud_unbuddied[i].count); + p += sprintf(p, "%d\n", zbud_unbuddied[i].count); + return p - buf; +} + +static int zbud_show_cumul_chunk_counts(char *buf) +{ + unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0; + unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0; + unsigned long total_chunks_lte_42 = 0; + char *p = buf; + + for (i = 0; i < NCHUNKS; i++) { + p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]); + chunks += zbud_cumul_chunk_counts[i]; + total_chunks += zbud_cumul_chunk_counts[i]; + sum_total_chunks += i * zbud_cumul_chunk_counts[i]; + if (i == 21) + total_chunks_lte_21 = total_chunks; + if (i == 32) + total_chunks_lte_32 = total_chunks; + if (i == 42) + total_chunks_lte_42 = total_chunks; + } + p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n", + total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42, + chunks == 0 ? 0 : sum_total_chunks / chunks); + return p - buf; +} +#endif + +/********** + * This "zv" PAM implementation combines the TLSF-based xvMalloc + * with lzo1x compression to maximize the amount of data that can + * be packed into a physical page. 
+ * + * Zv represents a PAM page with the index and object (plus a "size" value + * necessary for decompression) immediately preceding the compressed data. + */ + +#define ZVH_SENTINEL 0x43214321 + +struct zv_hdr { + uint32_t pool_id; + struct tmem_oid oid; + uint32_t index; + DECL_SENTINEL +}; + +static const int zv_max_page_size = (PAGE_SIZE / 8) * 7; + +static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id, + struct tmem_oid *oid, uint32_t index, + void *cdata, unsigned clen) +{ + struct page *page; + struct zv_hdr *zv = NULL; + uint32_t offset; + int ret; + + BUG_ON(!irqs_disabled()); + ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr), + &page, &offset, ZCACHE_GFP_MASK); + if (unlikely(ret)) + goto out; + zv = kmap_atomic(page, KM_USER0) + offset; + zv->index = index; + zv->oid = *oid; + zv->pool_id = pool_id; + SET_SENTINEL(zv, ZVH); + memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen); + kunmap_atomic(zv, KM_USER0); +out: + return zv; +} + +static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv) +{ + unsigned long flags; + struct page *page; + uint32_t offset; + uint16_t size; + + ASSERT_SENTINEL(zv, ZVH); + size = xv_get_object_size(zv) - sizeof(*zv); + BUG_ON(size == 0 || size > zv_max_page_size); + INVERT_SENTINEL(zv, ZVH); + page = virt_to_page(zv); + offset = (unsigned long)zv & ~PAGE_MASK; + local_irq_save(flags); + xv_free(xvpool, page, offset); + local_irq_restore(flags); +} + +static void zv_decompress(struct page *page, struct zv_hdr *zv) +{ + size_t clen = PAGE_SIZE; + char *to_va; + unsigned size; + int ret; + + ASSERT_SENTINEL(zv, ZVH); + size = xv_get_object_size(zv) - sizeof(*zv); + BUG_ON(size == 0 || size > zv_max_page_size); + to_va = kmap_atomic(page, KM_USER0); + ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv), + size, to_va, &clen); + kunmap_atomic(to_va, KM_USER0); + BUG_ON(ret != LZO_E_OK); + BUG_ON(clen != PAGE_SIZE); +} + +/* + * zcache core code starts here + */ + +/* useful stats not collected 
by cleancache or frontswap */ +static unsigned long zcache_flush_total; +static unsigned long zcache_flush_found; +static unsigned long zcache_flobj_total; +static unsigned long zcache_flobj_found; +static unsigned long zcache_failed_eph_puts; +static unsigned long zcache_failed_pers_puts; + +#define MAX_POOLS_PER_CLIENT 16 + +static struct { + struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT]; + struct xv_pool *xvpool; +} zcache_client; + +/* + * Tmem operations assume the poolid implies the invoking client. + * Zcache only has one client (the kernel itself), so translate + * the poolid into the tmem_pool allocated for it. A KVM version + * of zcache would have one client per guest and each client might + * have a poolid==N. + */ +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid) +{ + struct tmem_pool *pool = NULL; + + if (poolid >= 0) { + pool = zcache_client.tmem_pools[poolid]; + if (pool != NULL) + atomic_inc(&pool->refcount); + } + return pool; +} + +static void zcache_put_pool(struct tmem_pool *pool) +{ + if (pool != NULL) + atomic_dec(&pool->refcount); +} + +/* counters for debugging */ +static unsigned long zcache_failed_get_free_pages; +static unsigned long zcache_failed_alloc; +static unsigned long zcache_put_to_flush; +static unsigned long zcache_aborted_preload; +static unsigned long zcache_aborted_shrink; + +/* + * Ensure that memory allocation requests in zcache don't result + * in direct reclaim requests via the shrinker, which would cause + * an infinite loop. Maybe a GFP flag would be better? 
+ */ +static DEFINE_SPINLOCK(zcache_direct_reclaim_lock); + +/* + * for now, used named slabs so can easily track usage; later can + * either just use kmalloc, or perhaps add a slab-like allocator + * to more carefully manage total memory utilization + */ +static struct kmem_cache *zcache_objnode_cache; +static struct kmem_cache *zcache_obj_cache; +static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0); +static unsigned long zcache_curr_obj_count_max; +static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0); +static unsigned long zcache_curr_objnode_count_max; + +/* + * to avoid memory allocation recursion (e.g. due to direct reclaim), we + * preload all necessary data structures so the hostops callbacks never + * actually do a malloc + */ +struct zcache_preload { + void *page; + struct tmem_obj *obj; + int nr; + struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH]; +}; +static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, }; + +static int zcache_do_preload(struct tmem_pool *pool) +{ + struct zcache_preload *kp; + struct tmem_objnode *objnode; + struct tmem_obj *obj; + void *page; + int ret = -ENOMEM; + + if (unlikely(zcache_objnode_cache == NULL)) + goto out; + if (unlikely(zcache_obj_cache == NULL)) + goto out; + if (!spin_trylock(&zcache_direct_reclaim_lock)) { + zcache_aborted_preload++; + goto out; + } + preempt_disable(); + kp = &__get_cpu_var(zcache_preloads); + while (kp->nr < ARRAY_SIZE(kp->objnodes)) { + preempt_enable_no_resched(); + objnode = kmem_cache_alloc(zcache_objnode_cache, + ZCACHE_GFP_MASK); + if (unlikely(objnode == NULL)) { + zcache_failed_alloc++; + goto unlock_out; + } + preempt_disable(); + kp = &__get_cpu_var(zcache_preloads); + if (kp->nr < ARRAY_SIZE(kp->objnodes)) + kp->objnodes[kp->nr++] = objnode; + else + kmem_cache_free(zcache_objnode_cache, objnode); + } + preempt_enable_no_resched(); + obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK); + if (unlikely(obj == NULL)) { + zcache_failed_alloc++; + goto 
unlock_out; + } + page = (void *)__get_free_page(ZCACHE_GFP_MASK); + if (unlikely(page == NULL)) { + zcache_failed_get_free_pages++; + kmem_cache_free(zcache_obj_cache, obj); + goto unlock_out; + } + preempt_disable(); + kp = &__get_cpu_var(zcache_preloads); + if (kp->obj == NULL) + kp->obj = obj; + else + kmem_cache_free(zcache_obj_cache, obj); + if (kp->page == NULL) + kp->page = page; + else + free_page((unsigned long)page); + ret = 0; +unlock_out: + spin_unlock(&zcache_direct_reclaim_lock); +out: + return ret; +} + +static void *zcache_get_free_page(void) +{ + struct zcache_preload *kp; + void *page; + + kp = &__get_cpu_var(zcache_preloads); + page = kp->page; + BUG_ON(page == NULL); + kp->page = NULL; + return page; +} + +static void zcache_free_page(void *p) +{ + free_page((unsigned long)p); +} + +/* + * zcache implementation for tmem host ops + */ + +static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool) +{ + struct tmem_objnode *objnode = NULL; + unsigned long count; + struct zcache_preload *kp; + + kp = &__get_cpu_var(zcache_preloads); + if (kp->nr <= 0) + goto out; + objnode = kp->objnodes[kp->nr - 1]; + BUG_ON(objnode == NULL); + kp->objnodes[kp->nr - 1] = NULL; + kp->nr--; + count = atomic_inc_return(&zcache_curr_objnode_count); + if (count > zcache_curr_objnode_count_max) + zcache_curr_objnode_count_max = count; +out: + return objnode; +} + +static void zcache_objnode_free(struct tmem_objnode *objnode, + struct tmem_pool *pool) +{ + atomic_dec(&zcache_curr_objnode_count); + BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0); + kmem_cache_free(zcache_objnode_cache, objnode); +} + +static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool) +{ + struct tmem_obj *obj = NULL; + unsigned long count; + struct zcache_preload *kp; + + kp = &__get_cpu_var(zcache_preloads); + obj = kp->obj; + BUG_ON(obj == NULL); + kp->obj = NULL; + count = atomic_inc_return(&zcache_curr_obj_count); + if (count > zcache_curr_obj_count_max) + 
zcache_curr_obj_count_max = count; + return obj; +} + +static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool) +{ + atomic_dec(&zcache_curr_obj_count); + BUG_ON(atomic_read(&zcache_curr_obj_count) < 0); + kmem_cache_free(zcache_obj_cache, obj); +} + +static struct tmem_hostops zcache_hostops = { + .obj_alloc = zcache_obj_alloc, + .obj_free = zcache_obj_free, + .objnode_alloc = zcache_objnode_alloc, + .objnode_free = zcache_objnode_free, +}; + +/* + * zcache implementations for PAM page descriptor ops + */ + +static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0); +static unsigned long zcache_curr_eph_pampd_count_max; +static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0); +static unsigned long zcache_curr_pers_pampd_count_max; + +/* forward reference */ +static int zcache_compress(struct page *from, void **out_va, size_t *out_len); + +static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid, + uint32_t index, struct page *page) +{ + void *pampd = NULL, *cdata; + size_t clen; + int ret; + bool ephemeral = is_ephemeral(pool); + unsigned long count; + + if (ephemeral) { + ret = zcache_compress(page, &cdata, &clen); + if (ret == 0) + + goto out; + if (clen == 0 || clen > zbud_max_buddy_size()) { + zcache_compress_poor++; + goto out; + } + pampd = (void *)zbud_create(pool->pool_id, oid, index, + page, cdata, clen); + if (pampd != NULL) { + count = atomic_inc_return(&zcache_curr_eph_pampd_count); + if (count > zcache_curr_eph_pampd_count_max) + zcache_curr_eph_pampd_count_max = count; + } + } else { + /* + * FIXME: This is all the "policy" there is for now. 
+ * 3/4 totpages should allow ~37% of RAM to be filled with + * compressed frontswap pages + */ + if (atomic_read(&zcache_curr_pers_pampd_count) > + 3 * totalram_pages / 4) + goto out; + ret = zcache_compress(page, &cdata, &clen); + if (ret == 0) + goto out; + if (clen > zv_max_page_size) { + zcache_compress_poor++; + goto out; + } + pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id, + oid, index, cdata, clen); + if (pampd == NULL) + goto out; + count = atomic_inc_return(&zcache_curr_pers_pampd_count); + if (count > zcache_curr_pers_pampd_count_max) + zcache_curr_pers_pampd_count_max = count; + } +out: + return pampd; +} + +/* + * fill the pageframe corresponding to the struct page with the data + * from the passed pampd + */ +static int zcache_pampd_get_data(struct page *page, void *pampd, + struct tmem_pool *pool) +{ + int ret = 0; + + if (is_ephemeral(pool)) + ret = zbud_decompress(page, pampd); + else + zv_decompress(page, pampd); + return ret; +} + +/* + * free the pampd and remove it from any zcache lists + * pampd must no longer be pointed to from any tmem data structures! 
+ */ +static void zcache_pampd_free(void *pampd, struct tmem_pool *pool) +{ + if (is_ephemeral(pool)) { + zbud_free_and_delist((struct zbud_hdr *)pampd); + atomic_dec(&zcache_curr_eph_pampd_count); + BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0); + } else { + zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd); + atomic_dec(&zcache_curr_pers_pampd_count); + BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0); + } +} + +static struct tmem_pamops zcache_pamops = { + .create = zcache_pampd_create, + .get_data = zcache_pampd_get_data, + .free = zcache_pampd_free, +}; + +/* + * zcache compression/decompression and related per-cpu stuff + */ + +#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS +#define LZO_DSTMEM_PAGE_ORDER 1 +static DEFINE_PER_CPU(unsigned char *, zcache_workmem); +static DEFINE_PER_CPU(unsigned char *, zcache_dstmem); + +static int zcache_compress(struct page *from, void **out_va, size_t *out_len) +{ + int ret = 0; + unsigned char *dmem = __get_cpu_var(zcache_dstmem); + unsigned char *wmem = __get_cpu_var(zcache_workmem); + char *from_va; + + BUG_ON(!irqs_disabled()); + if (unlikely(dmem == NULL || wmem == NULL)) + goto out; /* no buffer, so can't compress */ + from_va = kmap_atomic(from, KM_USER0); + mb(); + ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem); + BUG_ON(ret != LZO_E_OK); + *out_va = dmem; + kunmap_atomic(from_va, KM_USER0); + ret = 1; +out: + return ret; +} + + +static int zcache_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + int cpu = (long)pcpu; + struct zcache_preload *kp; + + switch (action) { + case CPU_UP_PREPARE: + per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages( + GFP_KERNEL | __GFP_REPEAT, + LZO_DSTMEM_PAGE_ORDER), + per_cpu(zcache_workmem, cpu) = + kzalloc(LZO1X_MEM_COMPRESS, + GFP_KERNEL | __GFP_REPEAT); + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + free_pages((unsigned long)per_cpu(zcache_dstmem, cpu), + LZO_DSTMEM_PAGE_ORDER); + per_cpu(zcache_dstmem, 
cpu) = NULL; + kfree(per_cpu(zcache_workmem, cpu)); + per_cpu(zcache_workmem, cpu) = NULL; + kp = &per_cpu(zcache_preloads, cpu); + while (kp->nr) { + kmem_cache_free(zcache_objnode_cache, + kp->objnodes[kp->nr - 1]); + kp->objnodes[kp->nr - 1] = NULL; + kp->nr--; + } + kmem_cache_free(zcache_obj_cache, kp->obj); + free_page((unsigned long)kp->page); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block zcache_cpu_notifier_block = { + .notifier_call = zcache_cpu_notifier +}; + +#ifdef CONFIG_SYSFS +#define ZCACHE_SYSFS_RO(_name) \ + static ssize_t zcache_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ + { \ + return sprintf(buf, "%lu\n", zcache_##_name); \ + } \ + static struct kobj_attribute zcache_##_name##_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = zcache_##_name##_show, \ + } + +#define ZCACHE_SYSFS_RO_ATOMIC(_name) \ + static ssize_t zcache_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ + { \ + return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \ + } \ + static struct kobj_attribute zcache_##_name##_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = zcache_##_name##_show, \ + } + +#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \ + static ssize_t zcache_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ + { \ + return _func(buf); \ + } \ + static struct kobj_attribute zcache_##_name##_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = zcache_##_name##_show, \ + } + +ZCACHE_SYSFS_RO(curr_obj_count_max); +ZCACHE_SYSFS_RO(curr_objnode_count_max); +ZCACHE_SYSFS_RO(flush_total); +ZCACHE_SYSFS_RO(flush_found); +ZCACHE_SYSFS_RO(flobj_total); +ZCACHE_SYSFS_RO(flobj_found); +ZCACHE_SYSFS_RO(failed_eph_puts); +ZCACHE_SYSFS_RO(failed_pers_puts); +ZCACHE_SYSFS_RO(zbud_curr_zbytes); +ZCACHE_SYSFS_RO(zbud_cumul_zpages); 
+ZCACHE_SYSFS_RO(zbud_cumul_zbytes); +ZCACHE_SYSFS_RO(zbud_buddied_count); +ZCACHE_SYSFS_RO(zbpg_unused_list_count); +ZCACHE_SYSFS_RO(evicted_raw_pages); +ZCACHE_SYSFS_RO(evicted_unbuddied_pages); +ZCACHE_SYSFS_RO(evicted_buddied_pages); +ZCACHE_SYSFS_RO(failed_get_free_pages); +ZCACHE_SYSFS_RO(failed_alloc); +ZCACHE_SYSFS_RO(put_to_flush); +ZCACHE_SYSFS_RO(aborted_preload); +ZCACHE_SYSFS_RO(aborted_shrink); +ZCACHE_SYSFS_RO(compress_poor); +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages); +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages); +ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count); +ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count); +ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts, + zbud_show_unbuddied_list_counts); +ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts, + zbud_show_cumul_chunk_counts); + +static struct attribute *zcache_attrs[] = { + &zcache_curr_obj_count_attr.attr, + &zcache_curr_obj_count_max_attr.attr, + &zcache_curr_objnode_count_attr.attr, + &zcache_curr_objnode_count_max_attr.attr, + &zcache_flush_total_attr.attr, + &zcache_flobj_total_attr.attr, + &zcache_flush_found_attr.attr, + &zcache_flobj_found_attr.attr, + &zcache_failed_eph_puts_attr.attr, + &zcache_failed_pers_puts_attr.attr, + &zcache_compress_poor_attr.attr, + &zcache_zbud_curr_raw_pages_attr.attr, + &zcache_zbud_curr_zpages_attr.attr, + &zcache_zbud_curr_zbytes_attr.attr, + &zcache_zbud_cumul_zpages_attr.attr, + &zcache_zbud_cumul_zbytes_attr.attr, + &zcache_zbud_buddied_count_attr.attr, + &zcache_zbpg_unused_list_count_attr.attr, + &zcache_evicted_raw_pages_attr.attr, + &zcache_evicted_unbuddied_pages_attr.attr, + &zcache_evicted_buddied_pages_attr.attr, + &zcache_failed_get_free_pages_attr.attr, + &zcache_failed_alloc_attr.attr, + &zcache_put_to_flush_attr.attr, + &zcache_aborted_preload_attr.attr, + &zcache_aborted_shrink_attr.attr, + &zcache_zbud_unbuddied_list_counts_attr.attr, + &zcache_zbud_cumul_chunk_counts_attr.attr, + NULL, +}; + +static struct attribute_group zcache_attr_group = { + .attrs = 
zcache_attrs, + .name = "zcache", +}; + +#endif /* CONFIG_SYSFS */ +/* + * When zcache is disabled ("frozen"), pools can be created and destroyed, + * but all puts (and thus all other operations that require memory allocation) + * must fail. If zcache is unfrozen, accepts puts, then frozen again, + * data consistency requires all puts while frozen to be converted into + * flushes. + */ +static bool zcache_freeze; + +/* + * zcache shrinker interface (only useful for ephemeral pages, so zbud only) + */ +static int shrink_zcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) +{ + int ret = -1; + + if (nr >= 0) { + if (!(gfp_mask & __GFP_FS)) + /* does this case really need to be skipped? */ + goto out; + if (spin_trylock(&zcache_direct_reclaim_lock)) { + zbud_evict_pages(nr); + spin_unlock(&zcache_direct_reclaim_lock); + } else + zcache_aborted_shrink++; + } + ret = (int)atomic_read(&zcache_zbud_curr_raw_pages); +out: + return ret; +} + +static struct shrinker zcache_shrinker = { + .shrink = shrink_zcache_memory, + .seeks = DEFAULT_SEEKS, +}; + +/* + * zcache shims between cleancache/frontswap ops and tmem + */ + +static int zcache_put_page(int pool_id, struct tmem_oid *oidp, + uint32_t index, struct page *page) +{ + struct tmem_pool *pool; + int ret = -1; + + BUG_ON(!irqs_disabled()); + pool = zcache_get_pool_by_id(pool_id); + if (unlikely(pool == NULL)) + goto out; + if (!zcache_freeze && zcache_do_preload(pool) == 0) { + /* preload does preempt_disable on success */ + ret = tmem_put(pool, oidp, index, page); + if (ret < 0) { + if (is_ephemeral(pool)) + zcache_failed_eph_puts++; + else + zcache_failed_pers_puts++; + } + zcache_put_pool(pool); + preempt_enable_no_resched(); + } else { + zcache_put_to_flush++; + if (atomic_read(&pool->obj_count) > 0) + /* the put fails whether the flush succeeds or not */ + (void)tmem_flush_page(pool, oidp, index); + zcache_put_pool(pool); + } +out: + return ret; +} + +static int zcache_get_page(int pool_id, struct tmem_oid 
*oidp, + uint32_t index, struct page *page) +{ + struct tmem_pool *pool; + int ret = -1; + unsigned long flags; + + local_irq_save(flags); + pool = zcache_get_pool_by_id(pool_id); + if (likely(pool != NULL)) { + if (atomic_read(&pool->obj_count) > 0) + ret = tmem_get(pool, oidp, index, page); + zcache_put_pool(pool); + } + local_irq_restore(flags); + return ret; +} + +static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index) +{ + struct tmem_pool *pool; + int ret = -1; + unsigned long flags; + + local_irq_save(flags); + zcache_flush_total++; + pool = zcache_get_pool_by_id(pool_id); + if (likely(pool != NULL)) { + if (atomic_read(&pool->obj_count) > 0) + ret = tmem_flush_page(pool, oidp, index); + zcache_put_pool(pool); + } + if (ret >= 0) + zcache_flush_found++; + local_irq_restore(flags); + return ret; +} + +static int zcache_flush_object(int pool_id, struct tmem_oid *oidp) +{ + struct tmem_pool *pool; + int ret = -1; + unsigned long flags; + + local_irq_save(flags); + zcache_flobj_total++; + pool = zcache_get_pool_by_id(pool_id); + if (likely(pool != NULL)) { + if (atomic_read(&pool->obj_count) > 0) + ret = tmem_flush_object(pool, oidp); + zcache_put_pool(pool); + } + if (ret >= 0) + zcache_flobj_found++; + local_irq_restore(flags); + return ret; +} + +static int zcache_destroy_pool(int pool_id) +{ + struct tmem_pool *pool = NULL; + int ret = -1; + + if (pool_id < 0) + goto out; + pool = zcache_client.tmem_pools[pool_id]; + if (pool == NULL) + goto out; + zcache_client.tmem_pools[pool_id] = NULL; + /* wait for pool activity on other cpus to quiesce */ + while (atomic_read(&pool->refcount) != 0) + ; + local_bh_disable(); + ret = tmem_destroy_pool(pool); + local_bh_enable(); + kfree(pool); + pr_info("zcache: destroyed pool id=%d\n", pool_id); +out: + return ret; +} + +static int zcache_new_pool(uint32_t flags) +{ + int poolid = -1; + struct tmem_pool *pool; + + pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL); + if (pool == NULL) { + 
pr_info("zcache: pool creation failed: out of memory\n"); + goto out; + } + + for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++) + if (zcache_client.tmem_pools[poolid] == NULL) + break; + if (poolid >= MAX_POOLS_PER_CLIENT) { + pr_info("zcache: pool creation failed: max exceeded\n"); + kfree(pool); + poolid = -1; + goto out; + } + atomic_set(&pool->refcount, 0); + pool->client = &zcache_client; + pool->pool_id = poolid; + tmem_new_pool(pool, flags); + zcache_client.tmem_pools[poolid] = pool; + pr_info("zcache: created %s tmem pool, id=%d\n", + flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", + poolid); +out: + return poolid; +} + +/********** + * Two kernel functionalities currently can be layered on top of tmem. + * These are "cleancache" which is used as a second-chance cache for clean + * page cache pages; and "frontswap" which is used for swap pages + * to avoid writes to disk. A generic "shim" is provided here for each + * to translate in-kernel semantics to zcache semantics. 
+ */ + +#ifdef CONFIG_CLEANCACHE +static void zcache_cleancache_put_page(int pool_id, + struct cleancache_filekey key, + pgoff_t index, struct page *page) +{ + u32 ind = (u32) index; + struct tmem_oid oid = *(struct tmem_oid *)&key; + + if (likely(ind == index)) + (void)zcache_put_page(pool_id, &oid, index, page); +} + +static int zcache_cleancache_get_page(int pool_id, + struct cleancache_filekey key, + pgoff_t index, struct page *page) +{ + u32 ind = (u32) index; + struct tmem_oid oid = *(struct tmem_oid *)&key; + int ret = -1; + + if (likely(ind == index)) + ret = zcache_get_page(pool_id, &oid, index, page); + return ret; +} + +static void zcache_cleancache_flush_page(int pool_id, + struct cleancache_filekey key, + pgoff_t index) +{ + u32 ind = (u32) index; + struct tmem_oid oid = *(struct tmem_oid *)&key; + + if (likely(ind == index)) + (void)zcache_flush_page(pool_id, &oid, ind); +} + +static void zcache_cleancache_flush_inode(int pool_id, + struct cleancache_filekey key) +{ + struct tmem_oid oid = *(struct tmem_oid *)&key; + + (void)zcache_flush_object(pool_id, &oid); +} + +static void zcache_cleancache_flush_fs(int pool_id) +{ + if (pool_id >= 0) + (void)zcache_destroy_pool(pool_id); +} + +static int zcache_cleancache_init_fs(size_t pagesize) +{ + BUG_ON(sizeof(struct cleancache_filekey) != + sizeof(struct tmem_oid)); + BUG_ON(pagesize != PAGE_SIZE); + return zcache_new_pool(0); +} + +static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize) +{ + /* shared pools are unsupported and map to private */ + BUG_ON(sizeof(struct cleancache_filekey) != + sizeof(struct tmem_oid)); + BUG_ON(pagesize != PAGE_SIZE); + return zcache_new_pool(0); +} + +static struct cleancache_ops zcache_cleancache_ops = { + .put_page = zcache_cleancache_put_page, + .get_page = zcache_cleancache_get_page, + .flush_page = zcache_cleancache_flush_page, + .flush_inode = zcache_cleancache_flush_inode, + .flush_fs = zcache_cleancache_flush_fs, + .init_shared_fs = 
zcache_cleancache_init_shared_fs, + .init_fs = zcache_cleancache_init_fs +}; + +struct cleancache_ops zcache_cleancache_register_ops(void) +{ + struct cleancache_ops old_ops = + cleancache_register_ops(&zcache_cleancache_ops); + + return old_ops; +} +#endif + +#ifdef CONFIG_FRONTSWAP +/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ +static int zcache_frontswap_poolid = -1; + +/* + * Swizzling increases objects per swaptype, increasing tmem concurrency + * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS + */ +#define SWIZ_BITS 4 +#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) +#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) +#define iswiz(_ind) (_ind >> SWIZ_BITS) + +static inline struct tmem_oid oswiz(unsigned type, u32 ind) +{ + struct tmem_oid oid = { .oid = { 0 } }; + oid.oid[0] = _oswiz(type, ind); + return oid; +} + +static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, + struct page *page) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + struct tmem_oid oid = oswiz(type, ind); + int ret = -1; + unsigned long flags; + + BUG_ON(!PageLocked(page)); + if (likely(ind64 == ind)) { + local_irq_save(flags); + ret = zcache_put_page(zcache_frontswap_poolid, &oid, + iswiz(ind), page); + local_irq_restore(flags); + } + return ret; +} + +/* returns 0 if the page was successfully gotten from frontswap, -1 if + * was not present (should never happen!) 
*/ +static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, + struct page *page) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + struct tmem_oid oid = oswiz(type, ind); + int ret = -1; + + BUG_ON(!PageLocked(page)); + if (likely(ind64 == ind)) + ret = zcache_get_page(zcache_frontswap_poolid, &oid, + iswiz(ind), page); + return ret; +} + +/* flush a single page from frontswap */ +static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + struct tmem_oid oid = oswiz(type, ind); + + if (likely(ind64 == ind)) + (void)zcache_flush_page(zcache_frontswap_poolid, &oid, + iswiz(ind)); +} + +/* flush all pages from the passed swaptype */ +static void zcache_frontswap_flush_area(unsigned type) +{ + struct tmem_oid oid; + int ind; + + for (ind = SWIZ_MASK; ind >= 0; ind--) { + oid = oswiz(type, ind); + (void)zcache_flush_object(zcache_frontswap_poolid, &oid); + } +} + +static void zcache_frontswap_init(unsigned ignored) +{ + /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ + if (zcache_frontswap_poolid < 0) + zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST); +} + +static struct frontswap_ops zcache_frontswap_ops = { + .put_page = zcache_frontswap_put_page, + .get_page = zcache_frontswap_get_page, + .flush_page = zcache_frontswap_flush_page, + .flush_area = zcache_frontswap_flush_area, + .init = zcache_frontswap_init +}; + +struct frontswap_ops zcache_frontswap_register_ops(void) +{ + struct frontswap_ops old_ops = + frontswap_register_ops(&zcache_frontswap_ops); + + return old_ops; +} +#endif + +/* + * zcache initialization + * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR + * NOTHING HAPPENS! 
+ */ + +static int zcache_enabled; + +static int __init enable_zcache(char *s) +{ + zcache_enabled = 1; + return 1; +} +__setup("zcache", enable_zcache); + +/* allow independent dynamic disabling of cleancache and frontswap */ + +static int use_cleancache = 1; + +static int __init no_cleancache(char *s) +{ + use_cleancache = 0; + return 1; +} + +__setup("nocleancache", no_cleancache); + +static int use_frontswap = 1; + +static int __init no_frontswap(char *s) +{ + use_frontswap = 0; + return 1; +} + +__setup("nofrontswap", no_frontswap); + +static int __init zcache_init(void) +{ +#ifdef CONFIG_SYSFS + int ret = 0; + + ret = sysfs_create_group(mm_kobj, &zcache_attr_group); + if (ret) { + pr_err("zcache: can't create sysfs\n"); + goto out; + } +#endif /* CONFIG_SYSFS */ +#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP) + if (zcache_enabled) { + unsigned int cpu; + + tmem_register_hostops(&zcache_hostops); + tmem_register_pamops(&zcache_pamops); + ret = register_cpu_notifier(&zcache_cpu_notifier_block); + if (ret) { + pr_err("zcache: can't register cpu notifier\n"); + goto out; + } + for_each_online_cpu(cpu) { + void *pcpu = (void *)(long)cpu; + zcache_cpu_notifier(&zcache_cpu_notifier_block, + CPU_UP_PREPARE, pcpu); + } + } + zcache_objnode_cache = kmem_cache_create("zcache_objnode", + sizeof(struct tmem_objnode), 0, 0, NULL); + zcache_obj_cache = kmem_cache_create("zcache_obj", + sizeof(struct tmem_obj), 0, 0, NULL); +#endif +#ifdef CONFIG_CLEANCACHE + if (zcache_enabled && use_cleancache) { + struct cleancache_ops old_ops; + + zbud_init(); + register_shrinker(&zcache_shrinker); + old_ops = zcache_cleancache_register_ops(); + pr_info("zcache: cleancache enabled using kernel " + "transcendent memory and compression buddies\n"); + if (old_ops.init_fs != NULL) + pr_warning("zcache: cleancache_ops overridden"); + } +#endif +#ifdef CONFIG_FRONTSWAP + if (zcache_enabled && use_frontswap) { + struct frontswap_ops old_ops; + + zcache_client.xvpool = 
xv_create_pool(); + if (zcache_client.xvpool == NULL) { + pr_err("zcache: can't create xvpool\n"); + goto out; + } + old_ops = zcache_frontswap_register_ops(); + pr_info("zcache: frontswap enabled using kernel " + "transcendent memory and xvmalloc\n"); + if (old_ops.init != NULL) + pr_warning("ktmem: frontswap_ops overridden"); + } +#endif +out: + return ret; +} + +module_init(zcache_init) From 1abd4f495eaa84e73b597f238cce704f06c54dc4 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 00:51:37 +0800 Subject: [PATCH 04/14] fs: add field to superblock to support cleancache --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9b678052..925a431e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1383,6 +1383,11 @@ struct super_block { * generic_show_options() */ char *s_options; + + /* + * Saved pool identifier for cleancache (-1 means none) + */ + int cleancache_poolid; }; extern struct timespec current_fs_time(struct super_block *sb); From 8b62d33820d84cc081745802f9a62425151047f4 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 00:52:57 +0800 Subject: [PATCH 05/14] enable zcache & cleancache --- arch/arm/mach-msm/board-htcleo.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-msm/board-htcleo.h b/arch/arm/mach-msm/board-htcleo.h index 8d633974..f20da79f 100755 --- a/arch/arm/mach-msm/board-htcleo.h +++ b/arch/arm/mach-msm/board-htcleo.h @@ -40,7 +40,7 @@ #define MSM_FB_SIZE 0x00600000 #define MSM_PMEM_MDP_BASE 0x3B700000 -#define MSM_PMEM_MDP_SIZE 0x02000000 +#define MSM_PMEM_MDP_SIZE 0x03000000 #define MSM_PMEM_ADSP_BASE 0x3D700000 #define MSM_PMEM_ADSP_SIZE 0x01800000 @@ -59,7 +59,7 @@ /* Begin EBI region */ #define PMEM_KERNEL_EBI1_SIZE 0x00028000 -#define MSM_PMEM_SF_SIZE 0x02000000 +#define MSM_PMEM_SF_SIZE 0x03000000 /* MSM_RAM_CONSOLE uses the last 0x00040000 of EBI memory, defined in msm_iomap.h #define 
MSM_RAM_CONSOLE_SIZE 0x00040000 From c2ff7098d4011efb6143305f62b71c0f95e38108 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 00:53:53 +0800 Subject: [PATCH 06/14] enable zcache & cleancache --- arch/arm/configs/htcleo_defconfig | 6 ++++++ 1 file changed, 6 insertions(+) mode change 100644 => 100755 arch/arm/configs/htcleo_defconfig diff --git a/arch/arm/configs/htcleo_defconfig b/arch/arm/configs/htcleo_defconfig old mode 100644 new mode 100755 index 12f408f2..a26de0ce --- a/arch/arm/configs/htcleo_defconfig +++ b/arch/arm/configs/htcleo_defconfig @@ -406,6 +406,7 @@ CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 CONFIG_ALIGNMENT_TRAP=y CONFIG_ALLOW_CPU_ALIGNMENT=y # CONFIG_UACCESS_WITH_MEMCPY is not set +CONFIG_CLEANCACHE=y # # Boot options @@ -1688,6 +1689,11 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y # CONFIG_IIO is not set # CONFIG_BTPORT is not set +# +# ZCACHE +# +CONFIG_ZCACHE=y + # # ZRAM # From b6c1977f351a6095b33a667546ec0496cbe8ca20 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 00:57:38 +0800 Subject: [PATCH 07/14] revert #8b62d33 --- arch/arm/mach-msm/board-htcleo.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-msm/board-htcleo.h b/arch/arm/mach-msm/board-htcleo.h index f20da79f..8d633974 100755 --- a/arch/arm/mach-msm/board-htcleo.h +++ b/arch/arm/mach-msm/board-htcleo.h @@ -40,7 +40,7 @@ #define MSM_FB_SIZE 0x00600000 #define MSM_PMEM_MDP_BASE 0x3B700000 -#define MSM_PMEM_MDP_SIZE 0x03000000 +#define MSM_PMEM_MDP_SIZE 0x02000000 #define MSM_PMEM_ADSP_BASE 0x3D700000 #define MSM_PMEM_ADSP_SIZE 0x01800000 @@ -59,7 +59,7 @@ /* Begin EBI region */ #define PMEM_KERNEL_EBI1_SIZE 0x00028000 -#define MSM_PMEM_SF_SIZE 0x03000000 +#define MSM_PMEM_SF_SIZE 0x02000000 /* MSM_RAM_CONSOLE uses the last 0x00040000 of EBI memory, defined in msm_iomap.h #define MSM_RAM_CONSOLE_SIZE 0x00040000 From 3d343ac32afc6eb933d98eb2dc4b3ad532de7f3a Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 12:10:10 
+0800 Subject: [PATCH 08/14] mm/fs: add hooks to support cleancache --- fs/buffer.c | 5 +++++ fs/mpage.c | 7 +++++++ fs/super.c | 3 +++ mm/filemap.c | 11 +++++++++++ mm/truncate.c | 8 +++++++- 5 files changed, 33 insertions(+), 1 deletion(-) mode change 100644 => 100755 fs/buffer.c mode change 100644 => 100755 fs/mpage.c mode change 100644 => 100755 fs/super.c mode change 100644 => 100755 mm/filemap.c mode change 100644 => 100755 mm/truncate.c diff --git a/fs/buffer.c b/fs/buffer.c old mode 100644 new mode 100755 index 6fa53025..abf188f3 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -41,6 +41,7 @@ #include #include #include +#include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -276,6 +277,10 @@ void invalidate_bdev(struct block_device *bdev) invalidate_bh_lrus(); invalidate_mapping_pages(mapping, 0, -1); + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ + cleancache_flush_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); diff --git a/fs/mpage.c b/fs/mpage.c old mode 100644 new mode 100755 index 42381bd6..b5677aba --- a/fs/mpage.c +++ b/fs/mpage.c @@ -26,6 +26,7 @@ #include #include #include +#include /* * I/O completion handler for multipage BIOs. @@ -284,6 +285,12 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, } else if (fully_mapped) { SetPageMappedToDisk(page); } + + if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && + cleancache_get_page(page) == 0) { + SetPageUptodate(page); + goto confused; + } /* * This page will go to BIO. Do we need to send this BIO off first? 
diff --git a/fs/super.c b/fs/super.c old mode 100644 new mode 100755 index aff046b0..a0ba74eb --- a/fs/super.c +++ b/fs/super.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "internal.h" @@ -104,6 +105,7 @@ static struct super_block *alloc_super(struct file_system_type *type) s->s_qcop = sb_quotactl_ops; s->s_op = &default_op; s->s_time_gran = 1000000000; + s->cleancache_poolid = -1; } out: return s; @@ -219,6 +221,7 @@ void deactivate_locked_super(struct super_block *s) s->s_count -= S_BIAS-1; spin_unlock(&sb_lock); vfs_dq_off(s, 0); + cleancache_flush_fs(s); fs->kill_sb(s); put_filesystem(fs); put_super(s); diff --git a/mm/filemap.c b/mm/filemap.c old mode 100644 new mode 100755 index 8e96c907..a4399ff2 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -34,6 +34,7 @@ #include /* for BUG_ON(!in_atomic()) only */ #include #include /* for page_is_file_cache() */ +#include #include "internal.h" /* @@ -119,6 +120,16 @@ void __remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + /* + * if we're uptodate, flush out into the cleancache, otherwise + * invalidate any existing cleancache entries. We can't leave + * stale data around in the cleancache once our page is gone + */ + if (PageUptodate(page) && PageMappedToDisk(page)) + cleancache_put_page(page); + else + cleancache_flush_page(mapping, page); + radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; diff --git a/mm/truncate.c b/mm/truncate.c old mode 100644 new mode 100755 index 258bda7e..31c639ea --- a/mm/truncate.c +++ b/mm/truncate.c @@ -18,6 +18,7 @@ #include #include /* grr. 
try_to_release_page, do_invalidatepage */ +#include #include "internal.h" @@ -50,6 +51,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { zero_user_segment(page, partial, PAGE_CACHE_SIZE); + cleancache_flush_page(page->mapping, page); if (page_has_private(page)) do_invalidatepage(page, partial); } @@ -213,7 +215,8 @@ void truncate_inode_pages_range(struct address_space *mapping, struct pagevec pvec; pgoff_t next; int i; - + + cleancache_flush_inode(mapping); if (mapping->nrpages == 0) return; @@ -287,6 +290,7 @@ void truncate_inode_pages_range(struct address_space *mapping, } pagevec_release(&pvec); } + cleancache_flush_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -423,6 +427,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int did_range_unmap = 0; int wrapped = 0; + cleancache_flush_inode(mapping); pagevec_init(&pvec, 0); next = start; while (next <= end && !wrapped && @@ -479,6 +484,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_release(&pvec); cond_resched(); } + cleancache_flush_inode(mapping); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); From 7c50bd921f08bf56b04e171115dd7e3f52896f51 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 15:24:37 +0800 Subject: [PATCH 09/14] staging:lowmemkiller add Fudgeswap fudgeswap acts as follows: If set to non zero (default is 512k): Check for the amount of SWAP_FREE space available If > 0KB is available: if fudgeswap > swapfree: other_file += swapfree else: other_file += fudgeswap In short: we will add in fudgeswap as long as it is less than the free swap Setting this to a very large positive number will indicate swap ought to be fully used as free (and will slow the system down) smaller numbers will allow you to put some pressure on SWAP without slowing the system down as much.
Small negative numbers will allow the system to be faster at the same minfree level. The default is 512, to give a very little bit of pressure to use some swap, but this can be modified at runtime via: /sys/module/lowmemorykiller/parameters/fudgeswap Originally by ezterry. Please enter the commit message for your changes. Lines starting --- drivers/staging/android/lowmemorykiller.c | 24 +++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index 05ebece0..5930a813 100755 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -41,6 +41,11 @@ #include #include +#ifdef CONFIG_SWAP +#include +#include +#endif + static uint32_t lowmem_debug_level = 2; static int lowmem_adj[6] = { 0, @@ -64,6 +69,10 @@ static struct task_struct *lowmem_deathpending; static unsigned long lowmem_deathpending_timeout; static struct kobject *lowmem_kobj; +#ifdef CONFIG_SWAP +static int fudgeswap = 512; +#endif + #define lowmem_print(level, x...)
\ do { \ if (lowmem_debug_level >= (level)) \ @@ -122,7 +131,19 @@ static inline void get_free_ram(int *other_free, int *other_file) *other_free = global_page_state(NR_FREE_PAGES); *other_file = global_page_state(NR_FILE_PAGES) - global_page_state(NR_SHMEM); +#ifdef CONFIG_SWAP + if(fudgeswap != 0){ + struct sysinfo si; + si_swapinfo(&si); + if(si.freeswap > 0){ + if(fudgeswap > 0 && (unsigned long)fudgeswap > si.freeswap) + *other_file += si.freeswap; + else + *other_file += fudgeswap; + } + } +#endif if (offlining) { /* Discount all free space in the section being offlined */ for_each_zone(zone) { @@ -347,6 +368,9 @@ module_param_named(debug_level, lowmem_debug_level, uint, S_IRUGO | S_IWUSR); module_param_named(notify_trigger, lowmem_minfree_notif_trigger, uint, S_IRUGO | S_IWUSR); +#ifdef CONFIG_SWAP +module_param_named(fudgeswap, fudgeswap, int, S_IRUGO | S_IWUSR); +#endif module_init(lowmem_init); module_exit(lowmem_exit); From f3a9b636b38756c67fc9a3edf2db3088dfd7fd3c Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 23:10:46 +0800 Subject: [PATCH 10/14] Revert "mm/fs: add hooks to support cleancache" This reverts commit 3d343ac32afc6eb933d98eb2dc4b3ad532de7f3a.
--- fs/buffer.c | 5 ----- fs/mpage.c | 7 ------- fs/super.c | 3 --- mm/filemap.c | 11 ----------- mm/truncate.c | 8 +------- 5 files changed, 1 insertion(+), 33 deletions(-) mode change 100755 => 100644 fs/buffer.c mode change 100755 => 100644 fs/mpage.c mode change 100755 => 100644 fs/super.c mode change 100755 => 100644 mm/filemap.c mode change 100755 => 100644 mm/truncate.c diff --git a/fs/buffer.c b/fs/buffer.c old mode 100755 new mode 100644 index abf188f3..6fa53025 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -41,7 +41,6 @@ #include #include #include -#include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -277,10 +276,6 @@ void invalidate_bdev(struct block_device *bdev) invalidate_bh_lrus(); invalidate_mapping_pages(mapping, 0, -1); - /* 99% of the time, we don't need to flush the cleancache on the bdev. - * But, for the strange corners, lets be cautious - */ - cleancache_flush_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); diff --git a/fs/mpage.c b/fs/mpage.c old mode 100755 new mode 100644 index b5677aba..42381bd6 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -26,7 +26,6 @@ #include #include #include -#include /* * I/O completion handler for multipage BIOs. @@ -285,12 +284,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, } else if (fully_mapped) { SetPageMappedToDisk(page); } - - if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && - cleancache_get_page(page) == 0) { - SetPageUptodate(page); - goto confused; - } /* * This page will go to BIO. Do we need to send this BIO off first? 
diff --git a/fs/super.c b/fs/super.c old mode 100755 new mode 100644 index a0ba74eb..aff046b0 --- a/fs/super.c +++ b/fs/super.c @@ -38,7 +38,6 @@ #include #include #include -#include #include "internal.h" @@ -105,7 +104,6 @@ static struct super_block *alloc_super(struct file_system_type *type) s->s_qcop = sb_quotactl_ops; s->s_op = &default_op; s->s_time_gran = 1000000000; - s->cleancache_poolid = -1; } out: return s; @@ -221,7 +219,6 @@ void deactivate_locked_super(struct super_block *s) s->s_count -= S_BIAS-1; spin_unlock(&sb_lock); vfs_dq_off(s, 0); - cleancache_flush_fs(s); fs->kill_sb(s); put_filesystem(fs); put_super(s); diff --git a/mm/filemap.c b/mm/filemap.c old mode 100755 new mode 100644 index a4399ff2..8e96c907 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -34,7 +34,6 @@ #include /* for BUG_ON(!in_atomic()) only */ #include #include /* for page_is_file_cache() */ -#include #include "internal.h" /* @@ -120,16 +119,6 @@ void __remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - /* - * if we're uptodate, flush out into the cleancache, otherwise - * invalidate any existing cleancache entries. We can't leave - * stale data around in the cleancache once our page is gone - */ - if (PageUptodate(page) && PageMappedToDisk(page)) - cleancache_put_page(page); - else - cleancache_flush_page(mapping, page); - radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; diff --git a/mm/truncate.c b/mm/truncate.c old mode 100755 new mode 100644 index 31c639ea..258bda7e --- a/mm/truncate.c +++ b/mm/truncate.c @@ -18,7 +18,6 @@ #include #include /* grr. 
try_to_release_page, do_invalidatepage */ -#include #include "internal.h" @@ -51,7 +50,6 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { zero_user_segment(page, partial, PAGE_CACHE_SIZE); - cleancache_flush_page(page->mapping, page); if (page_has_private(page)) do_invalidatepage(page, partial); } @@ -215,8 +213,7 @@ void truncate_inode_pages_range(struct address_space *mapping, struct pagevec pvec; pgoff_t next; int i; - - cleancache_flush_inode(mapping); + if (mapping->nrpages == 0) return; @@ -290,7 +287,6 @@ void truncate_inode_pages_range(struct address_space *mapping, } pagevec_release(&pvec); } - cleancache_flush_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -427,7 +423,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int did_range_unmap = 0; int wrapped = 0; - cleancache_flush_inode(mapping); pagevec_init(&pvec, 0); next = start; while (next <= end && !wrapped && @@ -484,7 +479,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_release(&pvec); cond_resched(); } - cleancache_flush_inode(mapping); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); From 0f794ead76c1db9f60edb03bdd2632c15b936401 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 23:12:10 +0800 Subject: [PATCH 11/14] Revert "enable zcache & cleancache" This reverts commit c2ff7098d4011efb6143305f62b71c0f95e38108. 
--- arch/arm/configs/htcleo_defconfig | 6 ------ 1 file changed, 6 deletions(-) mode change 100755 => 100644 arch/arm/configs/htcleo_defconfig diff --git a/arch/arm/configs/htcleo_defconfig b/arch/arm/configs/htcleo_defconfig old mode 100755 new mode 100644 index a26de0ce..12f408f2 --- a/arch/arm/configs/htcleo_defconfig +++ b/arch/arm/configs/htcleo_defconfig @@ -406,7 +406,6 @@ CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 CONFIG_ALIGNMENT_TRAP=y CONFIG_ALLOW_CPU_ALIGNMENT=y # CONFIG_UACCESS_WITH_MEMCPY is not set -CONFIG_CLEANCACHE=y # # Boot options @@ -1689,11 +1688,6 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y # CONFIG_IIO is not set # CONFIG_BTPORT is not set -# -# ZCACHE -# -CONFIG_ZCACHE=y - # # ZRAM # From 4cecd4ccb2ef21842d6ab0fb62a15d3b1a1801aa Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 23:12:22 +0800 Subject: [PATCH 12/14] Revert "fs: add field to superblock to support cleancache" This reverts commit 1abd4f495eaa84e73b597f238cce704f06c54dc4. --- include/linux/fs.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 925a431e..9b678052 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1383,11 +1383,6 @@ struct super_block { * generic_show_options() */ char *s_options; - - /* - * Saved pool identifier for cleancache (-1 means none) - */ - int cleancache_poolid; }; extern struct timespec current_fs_time(struct super_block *sb); From 61eb7c5296af5ca7b86f8d8075f3dd248b3534f0 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 23:14:51 +0800 Subject: [PATCH 13/14] Revert "add zcache" This reverts commit 8eb6724dbfb99bb1f17f3192483fafc1f9eb73fe. 
--- drivers/staging/Kconfig | 1 - drivers/staging/Makefile | 1 - drivers/staging/zcache/Kconfig | 13 - drivers/staging/zcache/Makefile | 3 - drivers/staging/zcache/tmem.c | 710 ------------- drivers/staging/zcache/tmem.h | 195 ---- drivers/staging/zcache/zcache.c | 1658 ------------------------------- 7 files changed, 2581 deletions(-) delete mode 100755 drivers/staging/zcache/Kconfig delete mode 100755 drivers/staging/zcache/Makefile delete mode 100755 drivers/staging/zcache/tmem.c delete mode 100755 drivers/staging/zcache/tmem.h delete mode 100755 drivers/staging/zcache/zcache.c diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index e4c3c9dd..8ee4bfa6 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -125,6 +125,5 @@ source "drivers/staging/iio/Kconfig" source "drivers/staging/zram/Kconfig" -source "drivers/staging/zcache/Kconfig" endif # !STAGING_EXCLUDE_BUILD endif # STAGING diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 5f0f554b..5a1b7341 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -45,5 +45,4 @@ obj-$(CONFIG_DX_SEP) += sep/ obj-$(CONFIG_IIO) += iio/ obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_XVMALLOC) += zram/ -obj-$(CONFIG_ZCACHE) += zcache/ diff --git a/drivers/staging/zcache/Kconfig b/drivers/staging/zcache/Kconfig deleted file mode 100755 index 7fabcb2b..00000000 --- a/drivers/staging/zcache/Kconfig +++ /dev/null @@ -1,13 +0,0 @@ -config ZCACHE - tristate "Dynamic compression of swap pages and clean pagecache pages" - depends on CLEANCACHE || FRONTSWAP - select XVMALLOC - select LZO_COMPRESS - select LZO_DECOMPRESS - default n - help - Zcache doubles RAM efficiency while providing a significant - performance boosts on many workloads. Zcache uses lzo1x - compression and an in-kernel implementation of transcendent - memory to store clean page cache pages and swap in RAM, - providing a noticeable reduction in disk I/O. 
diff --git a/drivers/staging/zcache/Makefile b/drivers/staging/zcache/Makefile deleted file mode 100755 index f5ec64f9..00000000 --- a/drivers/staging/zcache/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -zcache-y := tmem.o - -obj-$(CONFIG_ZCACHE) += zcache.o diff --git a/drivers/staging/zcache/tmem.c b/drivers/staging/zcache/tmem.c deleted file mode 100755 index e954d405..00000000 --- a/drivers/staging/zcache/tmem.c +++ /dev/null @@ -1,710 +0,0 @@ -/* - * In-kernel transcendent memory (generic implementation) - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - * - * The primary purpose of Transcedent Memory ("tmem") is to map object-oriented - * "handles" (triples containing a pool id, and object id, and an index), to - * pages in a page-accessible memory (PAM). Tmem references the PAM pages via - * an abstract "pampd" (PAM page-descriptor), which can be operated on by a - * set of functions (pamops). Each pampd contains some representation of - * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of - * pages and must be able to insert, find, and delete these pages at a - * potential frequency of thousands per second concurrently across many CPUs, - * (and, if used with KVM, across many vcpus across many guests). - * Tmem is tracked with a hierarchy of data structures, organized by - * the elements in a handle-tuple: pool_id, object_id, and page index. - * One or more "clients" (e.g. guests) each provide one or more tmem_pools. - * Each pool, contains a hash table of rb_trees of tmem_objs. Each - * tmem_obj contains a radix-tree-like tree of pointers, with intermediate - * nodes called tmem_objnodes. Each leaf pointer in this tree points to - * a pampd, which is accessible only through a small set of callbacks - * registered by the PAM implementation (see tmem_register_pamops). Tmem - * does all memory allocation via a set of callbacks registered by the tmem - * host implementation (e.g. see tmem_register_hostops). 
- */ - -#include -#include -#include - -#include "tmem.h" - -/* data structure sentinels used for debugging... see tmem.h */ -#define POOL_SENTINEL 0x87658765 -#define OBJ_SENTINEL 0x12345678 -#define OBJNODE_SENTINEL 0xfedcba09 - -/* - * A tmem host implementation must use this function to register callbacks - * for memory allocation. - */ -static struct tmem_hostops tmem_hostops; - -static void tmem_objnode_tree_init(void); - -void tmem_register_hostops(struct tmem_hostops *m) -{ - tmem_objnode_tree_init(); - tmem_hostops = *m; -} - -/* - * A tmem host implementation must use this function to register - * callbacks for a page-accessible memory (PAM) implementation - */ -static struct tmem_pamops tmem_pamops; - -void tmem_register_pamops(struct tmem_pamops *m) -{ - tmem_pamops = *m; -} - -/* - * Oid's are potentially very sparse and tmem_objs may have an indeterminately - * short life, being added and deleted at a relatively high frequency. - * So an rb_tree is an ideal data structure to manage tmem_objs. But because - * of the potentially huge number of tmem_objs, each pool manages a hashtable - * of rb_trees to reduce search, insert, delete, and rebalancing time. - * Each hashbucket also has a lock to manage concurrent access. - * - * The following routines manage tmem_objs. When any tmem_obj is accessed, - * the hashbucket lock must be held. 
- */ - -/* searches for object==oid in pool, returns locked object if found */ -static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb, - struct tmem_oid *oidp) -{ - struct rb_node *rbnode; - struct tmem_obj *obj; - - rbnode = hb->obj_rb_root.rb_node; - while (rbnode) { - BUG_ON(RB_EMPTY_NODE(rbnode)); - obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); - switch (tmem_oid_compare(oidp, &obj->oid)) { - case 0: /* equal */ - goto out; - case -1: - rbnode = rbnode->rb_left; - break; - case 1: - rbnode = rbnode->rb_right; - break; - } - } - obj = NULL; -out: - return obj; -} - -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *); - -/* free an object that has no more pampds in it */ -static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb) -{ - struct tmem_pool *pool; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pampd_count > 0); - pool = obj->pool; - BUG_ON(pool == NULL); - if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */ - tmem_pampd_destroy_all_in_obj(obj); - BUG_ON(obj->objnode_tree_root != NULL); - BUG_ON((long)obj->objnode_count != 0); - atomic_dec(&pool->obj_count); - BUG_ON(atomic_read(&pool->obj_count) < 0); - INVERT_SENTINEL(obj, OBJ); - obj->pool = NULL; - tmem_oid_set_invalid(&obj->oid); - rb_erase(&obj->rb_tree_node, &hb->obj_rb_root); -} - -/* - * initialize, and insert an tmem_object_root (called only if find failed) - */ -static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb, - struct tmem_pool *pool, - struct tmem_oid *oidp) -{ - struct rb_root *root = &hb->obj_rb_root; - struct rb_node **new = &(root->rb_node), *parent = NULL; - struct tmem_obj *this; - - BUG_ON(pool == NULL); - atomic_inc(&pool->obj_count); - obj->objnode_tree_height = 0; - obj->objnode_tree_root = NULL; - obj->pool = pool; - obj->oid = *oidp; - obj->objnode_count = 0; - obj->pampd_count = 0; - SET_SENTINEL(obj, OBJ); - while (*new) { - BUG_ON(RB_EMPTY_NODE(*new)); - this = 
rb_entry(*new, struct tmem_obj, rb_tree_node); - parent = *new; - switch (tmem_oid_compare(oidp, &this->oid)) { - case 0: - BUG(); /* already present; should never happen! */ - break; - case -1: - new = &(*new)->rb_left; - break; - case 1: - new = &(*new)->rb_right; - break; - } - } - rb_link_node(&obj->rb_tree_node, parent, new); - rb_insert_color(&obj->rb_tree_node, root); -} - -/* - * Tmem is managed as a set of tmem_pools with certain attributes, such as - * "ephemeral" vs "persistent". These attributes apply to all tmem_objs - * and all pampds that belong to a tmem_pool. A tmem_pool is created - * or deleted relatively rarely (for example, when a filesystem is - * mounted or unmounted. - */ - -/* flush all data from a pool and, optionally, free it */ -static void tmem_pool_flush(struct tmem_pool *pool, bool destroy) -{ - struct rb_node *rbnode; - struct tmem_obj *obj; - struct tmem_hashbucket *hb = &pool->hashbucket[0]; - int i; - - BUG_ON(pool == NULL); - for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { - spin_lock(&hb->lock); - rbnode = rb_first(&hb->obj_rb_root); - while (rbnode != NULL) { - obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); - rbnode = rb_next(rbnode); - tmem_pampd_destroy_all_in_obj(obj); - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - } - spin_unlock(&hb->lock); - } - if (destroy) - list_del(&pool->pool_list); -} - -/* - * A tmem_obj contains a radix-tree-like tree in which the intermediate - * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation - * is very specialized and tuned for specific uses and is not particularly - * suited for use from this code, though some code from the core algorithms has - * been reused, thus the copyright notices below). Each tmem_objnode contains - * a set of pointers which point to either a set of intermediate tmem_objnodes - * or a set of of pampds. 
- * - * Portions Copyright (C) 2001 Momchil Velikov - * Portions Copyright (C) 2001 Christoph Hellwig - * Portions Copyright (C) 2005 SGI, Christoph Lameter - */ - -struct tmem_objnode_tree_path { - struct tmem_objnode *objnode; - int offset; -}; - -/* objnode height_to_maxindex translation */ -static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1]; - -static void tmem_objnode_tree_init(void) -{ - unsigned int ht, tmp; - - for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) { - tmp = ht * OBJNODE_TREE_MAP_SHIFT; - if (tmp >= OBJNODE_TREE_INDEX_BITS) - tmem_objnode_tree_h2max[ht] = ~0UL; - else - tmem_objnode_tree_h2max[ht] = - (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1; - } -} - -static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj) -{ - struct tmem_objnode *objnode; - - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - objnode = (*tmem_hostops.objnode_alloc)(obj->pool); - if (unlikely(objnode == NULL)) - goto out; - objnode->obj = obj; - SET_SENTINEL(objnode, OBJNODE); - memset(&objnode->slots, 0, sizeof(objnode->slots)); - objnode->slots_in_use = 0; - obj->objnode_count++; -out: - return objnode; -} - -static void tmem_objnode_free(struct tmem_objnode *objnode) -{ - struct tmem_pool *pool; - int i; - - BUG_ON(objnode == NULL); - for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) - BUG_ON(objnode->slots[i] != NULL); - ASSERT_SENTINEL(objnode, OBJNODE); - INVERT_SENTINEL(objnode, OBJNODE); - BUG_ON(objnode->obj == NULL); - ASSERT_SENTINEL(objnode->obj, OBJ); - pool = objnode->obj->pool; - BUG_ON(pool == NULL); - ASSERT_SENTINEL(pool, POOL); - objnode->obj->objnode_count--; - objnode->obj = NULL; - (*tmem_hostops.objnode_free)(objnode, pool); -} - -/* - * lookup index in object and return associated pampd (or NULL if not found) - */ -static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) -{ - unsigned int height, shift; - struct tmem_objnode **slot = NULL; - - 
BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - - height = obj->objnode_tree_height; - if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) - goto out; - if (height == 0 && obj->objnode_tree_root) { - slot = &obj->objnode_tree_root; - goto out; - } - shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; - slot = &obj->objnode_tree_root; - while (height > 0) { - if (*slot == NULL) - goto out; - slot = (struct tmem_objnode **) - ((*slot)->slots + - ((index >> shift) & OBJNODE_TREE_MAP_MASK)); - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } -out: - return slot != NULL ? *slot : NULL; -} - -static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index, - void *pampd) -{ - int ret = 0; - struct tmem_objnode *objnode = NULL, *newnode, *slot; - unsigned int height, shift; - int offset = 0; - - /* if necessary, extend the tree to be higher */ - if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) { - height = obj->objnode_tree_height + 1; - if (index > tmem_objnode_tree_h2max[height]) - while (index > tmem_objnode_tree_h2max[height]) - height++; - if (obj->objnode_tree_root == NULL) { - obj->objnode_tree_height = height; - goto insert; - } - do { - newnode = tmem_objnode_alloc(obj); - if (!newnode) { - ret = -ENOMEM; - goto out; - } - newnode->slots[0] = obj->objnode_tree_root; - newnode->slots_in_use = 1; - obj->objnode_tree_root = newnode; - obj->objnode_tree_height++; - } while (height > obj->objnode_tree_height); - } -insert: - slot = obj->objnode_tree_root; - height = obj->objnode_tree_height; - shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; - while (height > 0) { - if (slot == NULL) { - /* add a child objnode. 
*/ - slot = tmem_objnode_alloc(obj); - if (!slot) { - ret = -ENOMEM; - goto out; - } - if (objnode) { - - objnode->slots[offset] = slot; - objnode->slots_in_use++; - } else - obj->objnode_tree_root = slot; - } - /* go down a level */ - offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; - objnode = slot; - slot = objnode->slots[offset]; - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } - BUG_ON(slot != NULL); - if (objnode) { - objnode->slots_in_use++; - objnode->slots[offset] = pampd; - } else - obj->objnode_tree_root = pampd; - obj->pampd_count++; -out: - return ret; -} - -static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index) -{ - struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1]; - struct tmem_objnode_tree_path *pathp = path; - struct tmem_objnode *slot = NULL; - unsigned int height, shift; - int offset; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - height = obj->objnode_tree_height; - if (index > tmem_objnode_tree_h2max[height]) - goto out; - slot = obj->objnode_tree_root; - if (height == 0 && obj->objnode_tree_root) { - obj->objnode_tree_root = NULL; - goto out; - } - shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT; - pathp->objnode = NULL; - do { - if (slot == NULL) - goto out; - pathp++; - offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; - pathp->offset = offset; - pathp->objnode = slot; - slot = slot->slots[offset]; - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } while (height > 0); - if (slot == NULL) - goto out; - while (pathp->objnode) { - pathp->objnode->slots[pathp->offset] = NULL; - pathp->objnode->slots_in_use--; - if (pathp->objnode->slots_in_use) { - if (pathp->objnode == obj->objnode_tree_root) { - while (obj->objnode_tree_height > 0 && - obj->objnode_tree_root->slots_in_use == 1 && - obj->objnode_tree_root->slots[0]) { - struct tmem_objnode *to_free = - obj->objnode_tree_root; - - obj->objnode_tree_root = - to_free->slots[0]; - 
obj->objnode_tree_height--; - to_free->slots[0] = NULL; - to_free->slots_in_use = 0; - tmem_objnode_free(to_free); - } - } - goto out; - } - tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */ - pathp--; - } - obj->objnode_tree_height = 0; - obj->objnode_tree_root = NULL; - -out: - if (slot != NULL) - obj->pampd_count--; - BUG_ON(obj->pampd_count < 0); - return slot; -} - -/* recursively walk the objnode_tree destroying pampds and objnodes */ -static void tmem_objnode_node_destroy(struct tmem_obj *obj, - struct tmem_objnode *objnode, - unsigned int ht) -{ - int i; - - if (ht == 0) - return; - for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) { - if (objnode->slots[i]) { - if (ht == 1) { - obj->pampd_count--; - (*tmem_pamops.free)(objnode->slots[i], - obj->pool); - objnode->slots[i] = NULL; - continue; - } - tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1); - tmem_objnode_free(objnode->slots[i]); - objnode->slots[i] = NULL; - } - } -} - -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) -{ - if (obj->objnode_tree_root == NULL) - return; - if (obj->objnode_tree_height == 0) { - obj->pampd_count--; - (*tmem_pamops.free)(obj->objnode_tree_root, obj->pool); - } else { - tmem_objnode_node_destroy(obj, obj->objnode_tree_root, - obj->objnode_tree_height); - tmem_objnode_free(obj->objnode_tree_root); - obj->objnode_tree_height = 0; - } - obj->objnode_tree_root = NULL; -} - -/* - * Tmem is operated on by a set of well-defined actions: - * "put", "get", "flush", "flush_object", "new pool" and "destroy pool". - * (The tmem ABI allows for subpages and exchanges but these operations - * are not included in this implementation.) - * - * These "tmem core" operations are implemented in the following functions. - */ - -/* - * "Put" a page, e.g. copy a page from the kernel into newly allocated - * PAM space (if such space is available). Tmem_put is complicated by - * a corner case: What if a page with matching handle already exists in - * tmem? 
To guarantee coherency, one of two actions is necessary: Either - * the data for the page must be overwritten, or the page must be - * "flushed" so that the data is not accessible to a subsequent "get". - * Since these "duplicate puts" are relatively rare, this implementation - * always flushes for simplicity. - */ -int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, - struct page *page) -{ - struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL; - void *pampd = NULL, *pampd_del = NULL; - int ret = -ENOMEM; - bool ephemeral; - struct tmem_hashbucket *hb; - - ephemeral = is_ephemeral(pool); - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = objfound = tmem_obj_find(hb, oidp); - if (obj != NULL) { - pampd = tmem_pampd_lookup_in_obj(objfound, index); - if (pampd != NULL) { - /* if found, is a dup put, flush the old one */ - pampd_del = tmem_pampd_delete_from_obj(obj, index); - BUG_ON(pampd_del != pampd); - (*tmem_pamops.free)(pampd, pool); - if (obj->pampd_count == 0) { - objnew = obj; - objfound = NULL; - } - pampd = NULL; - } - } else { - obj = objnew = (*tmem_hostops.obj_alloc)(pool); - if (unlikely(obj == NULL)) { - ret = -ENOMEM; - goto out; - } - tmem_obj_init(obj, hb, pool, oidp); - } - BUG_ON(obj == NULL); - BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound)); - pampd = (*tmem_pamops.create)(obj->pool, &obj->oid, index, page); - if (unlikely(pampd == NULL)) - goto free; - ret = tmem_pampd_add_to_obj(obj, index, pampd); - if (unlikely(ret == -ENOMEM)) - /* may have partially built objnode tree ("stump") */ - goto delete_and_free; - goto out; - -delete_and_free: - (void)tmem_pampd_delete_from_obj(obj, index); -free: - if (pampd) - (*tmem_pamops.free)(pampd, pool); - if (objnew) { - tmem_obj_free(objnew, hb); - (*tmem_hostops.obj_free)(objnew, pool); - } -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Get" a page, e.g. 
if one can be found, copy the tmem page with the - * matching handle from PAM space to the kernel. By tmem definition, - * when a "get" is successful on an ephemeral page, the page is "flushed", - * and when a "get" is successful on a persistent page, the page is retained - * in tmem. Note that to preserve - * coherency, "get" can never be skipped if tmem contains the data. - * That is, if a get is done with a certain handle and fails, any - * subsequent "get" must also fail (unless of course there is a - * "put" done with the same handle). - - */ -int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, - uint32_t index, struct page *page) -{ - struct tmem_obj *obj; - void *pampd; - bool ephemeral = is_ephemeral(pool); - uint32_t ret = -1; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - ephemeral = is_ephemeral(pool); - if (ephemeral) - pampd = tmem_pampd_delete_from_obj(obj, index); - else - pampd = tmem_pampd_lookup_in_obj(obj, index); - if (pampd == NULL) - goto out; - ret = (*tmem_pamops.get_data)(page, pampd, pool); - if (ret < 0) - goto out; - if (ephemeral) { - (*tmem_pamops.free)(pampd, pool); - if (obj->pampd_count == 0) { - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - obj = NULL; - } - } - ret = 0; -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * If a page in tmem matches the handle, "flush" this page from tmem such - * that any subsequent "get" does not succeed (unless, of course, there - * was another "put" with the same handle). 
- */ -int tmem_flush_page(struct tmem_pool *pool, - struct tmem_oid *oidp, uint32_t index) -{ - struct tmem_obj *obj; - void *pampd; - int ret = -1; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - pampd = tmem_pampd_delete_from_obj(obj, index); - if (pampd == NULL) - goto out; - (*tmem_pamops.free)(pampd, pool); - if (obj->pampd_count == 0) { - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - } - ret = 0; - -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Flush" all pages in tmem matching this oid. - */ -int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp) -{ - struct tmem_obj *obj; - struct tmem_hashbucket *hb; - int ret = -1; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - tmem_pampd_destroy_all_in_obj(obj); - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - ret = 0; - -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Flush" all pages (and tmem_objs) from this tmem_pool and disable - * all subsequent access to this tmem_pool. - */ -int tmem_destroy_pool(struct tmem_pool *pool) -{ - int ret = -1; - - if (pool == NULL) - goto out; - tmem_pool_flush(pool, 1); - ret = 0; -out: - return ret; -} - -static LIST_HEAD(tmem_global_pool_list); - -/* - * Create a new tmem_pool with the provided flag and return - * a pool id provided by the tmem host implementation. 
- */ -void tmem_new_pool(struct tmem_pool *pool, uint32_t flags) -{ - int persistent = flags & TMEM_POOL_PERSIST; - int shared = flags & TMEM_POOL_SHARED; - struct tmem_hashbucket *hb = &pool->hashbucket[0]; - int i; - - for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { - hb->obj_rb_root = RB_ROOT; - spin_lock_init(&hb->lock); - } - INIT_LIST_HEAD(&pool->pool_list); - atomic_set(&pool->obj_count, 0); - SET_SENTINEL(pool, POOL); - list_add_tail(&pool->pool_list, &tmem_global_pool_list); - pool->persistent = persistent; - pool->shared = shared; -} diff --git a/drivers/staging/zcache/tmem.h b/drivers/staging/zcache/tmem.h deleted file mode 100755 index 2e07e217..00000000 --- a/drivers/staging/zcache/tmem.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * tmem.h - * - * Transcendent memory - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - */ - -#ifndef _TMEM_H_ -#define _TMEM_H_ - -#include -#include -#include -#include - -/* - * These are pre-defined by the Xen<->Linux ABI - */ -#define TMEM_PUT_PAGE 4 -#define TMEM_GET_PAGE 5 -#define TMEM_FLUSH_PAGE 6 -#define TMEM_FLUSH_OBJECT 7 -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 -#define TMEM_POOL_PRECOMPRESSED 4 -#define TMEM_POOL_PAGESIZE_SHIFT 4 -#define TMEM_POOL_PAGESIZE_MASK 0xf -#define TMEM_POOL_RESERVED_BITS 0x00ffff00 - -/* - * sentinels have proven very useful for debugging but can be removed - * or disabled before final merge. 
- */ -#define SENTINELS -#ifdef SENTINELS -#define DECL_SENTINEL uint32_t sentinel; -#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL) -#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL) -#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL) -#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL) -#else -#define DECL_SENTINEL -#define SET_SENTINEL(_x, _y) do { } while (0) -#define INVERT_SENTINEL(_x, _y) do { } while (0) -#define ASSERT_SENTINEL(_x, _y) do { } while (0) -#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0) -#endif - -#define ASSERT_SPINLOCK(_l) WARN_ON(!spin_is_locked(_l)) - -/* - * A pool is the highest-level data structure managed by tmem and - * usually corresponds to a large independent set of pages such as - * a filesystem. Each pool has an id, and certain attributes and counters. - * It also contains a set of hash buckets, each of which contains an rbtree - * of objects and a lock to manage concurrency within the pool. - */ - -#define TMEM_HASH_BUCKET_BITS 8 -#define TMEM_HASH_BUCKETS (1<persistent) -#define is_ephemeral(_p) (!(_p->persistent)) - -/* - * An object id ("oid") is large: 192-bits (to ensure, for example, files - * in a modern filesystem can be uniquely identified). 
- */ - -struct tmem_oid { - uint64_t oid[3]; -}; - -static inline void tmem_oid_set_invalid(struct tmem_oid *oidp) -{ - oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL; -} - -static inline bool tmem_oid_valid(struct tmem_oid *oidp) -{ - return oidp->oid[0] != -1UL || oidp->oid[1] != -1UL || - oidp->oid[2] != -1UL; -} - -static inline int tmem_oid_compare(struct tmem_oid *left, - struct tmem_oid *right) -{ - int ret; - - if (left->oid[2] == right->oid[2]) { - if (left->oid[1] == right->oid[1]) { - if (left->oid[0] == right->oid[0]) - ret = 0; - else if (left->oid[0] < right->oid[0]) - ret = -1; - else - return 1; - } else if (left->oid[1] < right->oid[1]) - ret = -1; - else - ret = 1; - } else if (left->oid[2] < right->oid[2]) - ret = -1; - else - ret = 1; - return ret; -} - -static inline unsigned tmem_oid_hash(struct tmem_oid *oidp) -{ - return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2], - TMEM_HASH_BUCKET_BITS); -} - -/* - * A tmem_obj contains an identifier (oid), pointers to the parent - * pool and the rb_tree to which it belongs, counters, and an ordered - * set of pampds, structured in a radix-tree-like tree. The intermediate - * nodes of the tree are called tmem_objnodes. 
- */ - -struct tmem_objnode; - -struct tmem_obj { - struct tmem_oid oid; - struct tmem_pool *pool; - struct rb_node rb_tree_node; - struct tmem_objnode *objnode_tree_root; - unsigned int objnode_tree_height; - unsigned long objnode_count; - long pampd_count; - DECL_SENTINEL -}; - -#define OBJNODE_TREE_MAP_SHIFT 6 -#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT) -#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1) -#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define OBJNODE_TREE_MAX_PATH \ - (OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2) - -struct tmem_objnode { - struct tmem_obj *obj; - DECL_SENTINEL - void *slots[OBJNODE_TREE_MAP_SIZE]; - unsigned int slots_in_use; -}; - -/* pampd abstract datatype methods provided by the PAM implementation */ -struct tmem_pamops { - void *(*create)(struct tmem_pool *, struct tmem_oid *, uint32_t, - struct page *); - int (*get_data)(struct page *, void *, struct tmem_pool *); - void (*free)(void *, struct tmem_pool *); -}; -extern void tmem_register_pamops(struct tmem_pamops *m); - -/* memory allocation methods provided by the host implementation */ -struct tmem_hostops { - struct tmem_obj *(*obj_alloc)(struct tmem_pool *); - void (*obj_free)(struct tmem_obj *, struct tmem_pool *); - struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *); - void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *); -}; -extern void tmem_register_hostops(struct tmem_hostops *m); - -/* core tmem accessor functions */ -extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index, - struct page *page); -extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index, - struct page *page); -extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *, - uint32_t index); -extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *); -extern int tmem_destroy_pool(struct tmem_pool *); -extern void tmem_new_pool(struct tmem_pool *, uint32_t); -#endif /* 
_TMEM_H */ diff --git a/drivers/staging/zcache/zcache.c b/drivers/staging/zcache/zcache.c deleted file mode 100755 index b8a2b30a..00000000 --- a/drivers/staging/zcache/zcache.c +++ /dev/null @@ -1,1658 +0,0 @@ -/* - * zcache.c - * - * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp. - * Copyright (c) 2010,2011, Nitin Gupta - * - * Zcache provides an in-kernel "host implementation" for transcendent memory - * and, thus indirectly, for cleancache and frontswap. Zcache includes two - * page-accessible memory [1] interfaces, both utilizing lzo1x compression: - * 1) "compression buddies" ("zbud") is used for ephemeral pages - * 2) xvmalloc is used for persistent pages. - * Xvmalloc (based on the TLSF allocator) has very low fragmentation - * so maximizes space efficiency, while zbud allows pairs (and potentially, - * in the future, more than a pair of) compressed pages to be closely linked - * so that reclaiming can be done via the kernel's physical-page-oriented - * "shrinker" interface. - * - * [1] For a definition of page-accessible memory (aka PAM), see: - * http://marc.info/?l=linux-mm&m=127811271605009 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "tmem.h" - -#include "../zram/xvmalloc.h" /* if built in drivers/staging */ - -#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP)) -#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP" -#endif -#ifdef CONFIG_CLEANCACHE -#include -#endif -#ifdef CONFIG_FRONTSWAP -#include -#endif - -#if 0 -/* this is more aggressive but may cause other problems? 
*/ -#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN) -#else -#define ZCACHE_GFP_MASK \ - (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC) -#endif - -/********** - * Compression buddies ("zbud") provides for packing two (or, possibly - * in the future, more) compressed ephemeral pages into a single "raw" - * (physical) page and tracking them with data structures so that - * the raw pages can be easily reclaimed. - * - * A zbud page ("zbpg") is an aligned page containing a list_head, - * a lock, and two "zbud headers". The remainder of the physical - * page is divided up into aligned 64-byte "chunks" which contain - * the compressed data for zero, one, or two zbuds. Each zbpg - * resides on: (1) an "unused list" if it has no zbuds; (2) a - * "buddied" list if it is fully populated with two zbuds; or - * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks - * the one unbuddied zbud uses. The data inside a zbpg cannot be - * read or written unless the zbpg's lock is held. 
- */ - -#define ZBH_SENTINEL 0x43214321 -#define ZBPG_SENTINEL 0xdeadbeef - -#define ZBUD_MAX_BUDS 2 - -struct zbud_hdr { - uint32_t pool_id; - struct tmem_oid oid; - uint32_t index; - uint16_t size; /* compressed size in bytes, zero means unused */ - DECL_SENTINEL -}; - -struct zbud_page { - struct list_head bud_list; - spinlock_t lock; - struct zbud_hdr buddy[ZBUD_MAX_BUDS]; - DECL_SENTINEL - /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */ -}; - -#define CHUNK_SHIFT 6 -#define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define CHUNK_MASK (~(CHUNK_SIZE-1)) -#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \ - CHUNK_MASK) >> CHUNK_SHIFT) -#define MAX_CHUNK (NCHUNKS-1) - -static struct { - struct list_head list; - unsigned count; -} zbud_unbuddied[NCHUNKS]; -/* list N contains pages with N chunks USED and NCHUNKS-N unused */ -/* element 0 is never used but optimizing that isn't worth it */ -static unsigned long zbud_cumul_chunk_counts[NCHUNKS]; - -struct list_head zbud_buddied_list; -static unsigned long zcache_zbud_buddied_count; - -/* protects the buddied list and all unbuddied lists */ -static DEFINE_SPINLOCK(zbud_budlists_spinlock); - -static LIST_HEAD(zbpg_unused_list); -static unsigned long zcache_zbpg_unused_list_count; - -/* protects the unused page list */ -static DEFINE_SPINLOCK(zbpg_unused_list_spinlock); - -static atomic_t zcache_zbud_curr_raw_pages; -static atomic_t zcache_zbud_curr_zpages; -static unsigned long zcache_zbud_curr_zbytes; -static unsigned long zcache_zbud_cumul_zpages; -static unsigned long zcache_zbud_cumul_zbytes; -static unsigned long zcache_compress_poor; - -/* forward references */ -static void *zcache_get_free_page(void); -static void zcache_free_page(void *p); - -/* - * zbud helper functions - */ - -static inline unsigned zbud_max_buddy_size(void) -{ - return MAX_CHUNK << CHUNK_SHIFT; -} - -static inline unsigned zbud_size_to_chunks(unsigned size) -{ - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - return (size + 
CHUNK_SIZE - 1) >> CHUNK_SHIFT; -} - -static inline int zbud_budnum(struct zbud_hdr *zh) -{ - unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1); - struct zbud_page *zbpg = NULL; - unsigned budnum = -1U; - int i; - - for (i = 0; i < ZBUD_MAX_BUDS; i++) - if (offset == offsetof(typeof(*zbpg), buddy[i])) { - budnum = i; - break; - } - BUG_ON(budnum == -1U); - return budnum; -} - -static char *zbud_data(struct zbud_hdr *zh, unsigned size) -{ - struct zbud_page *zbpg; - char *p; - unsigned budnum; - - ASSERT_SENTINEL(zh, ZBH); - budnum = zbud_budnum(zh); - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - ASSERT_SPINLOCK(&zbpg->lock); - p = (char *)zbpg; - if (budnum == 0) - p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) & - CHUNK_MASK); - else if (budnum == 1) - p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK); - return p; -} - -/* - * zbud raw page management - */ - -static struct zbud_page *zbud_alloc_raw_page(void) -{ - struct zbud_page *zbpg = NULL; - struct zbud_hdr *zh0, *zh1; - bool recycled = 0; - - /* if any pages on the zbpg list, use one */ - spin_lock(&zbpg_unused_list_spinlock); - if (!list_empty(&zbpg_unused_list)) { - zbpg = list_first_entry(&zbpg_unused_list, - struct zbud_page, bud_list); - list_del_init(&zbpg->bud_list); - zcache_zbpg_unused_list_count--; - recycled = 1; - } - spin_unlock(&zbpg_unused_list_spinlock); - if (zbpg == NULL) - /* none on zbpg list, try to get a kernel page */ - zbpg = zcache_get_free_page(); - if (likely(zbpg != NULL)) { - INIT_LIST_HEAD(&zbpg->bud_list); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - spin_lock_init(&zbpg->lock); - if (recycled) { - ASSERT_INVERTED_SENTINEL(zbpg, ZBPG); - SET_SENTINEL(zbpg, ZBPG); - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); - } else { - atomic_inc(&zcache_zbud_curr_raw_pages); - INIT_LIST_HEAD(&zbpg->bud_list); - SET_SENTINEL(zbpg, ZBPG); - 
zh0->size = 0; zh1->size = 0; - tmem_oid_set_invalid(&zh0->oid); - tmem_oid_set_invalid(&zh1->oid); - } - } - return zbpg; -} - -static void zbud_free_raw_page(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1]; - - ASSERT_SENTINEL(zbpg, ZBPG); - BUG_ON(!list_empty(&zbpg->bud_list)); - ASSERT_SPINLOCK(&zbpg->lock); - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); - INVERT_SENTINEL(zbpg, ZBPG); - spin_unlock(&zbpg->lock); - spin_lock(&zbpg_unused_list_spinlock); - list_add(&zbpg->bud_list, &zbpg_unused_list); - zcache_zbpg_unused_list_count++; - spin_unlock(&zbpg_unused_list_spinlock); -} - -/* - * core zbud handling routines - */ - -static unsigned zbud_free(struct zbud_hdr *zh) -{ - unsigned size; - - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(!tmem_oid_valid(&zh->oid)); - size = zh->size; - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - zh->size = 0; - tmem_oid_set_invalid(&zh->oid); - INVERT_SENTINEL(zh, ZBH); - zcache_zbud_curr_zbytes -= size; - atomic_dec(&zcache_zbud_curr_zpages); - return size; -} - -static void zbud_free_and_delist(struct zbud_hdr *zh) -{ - unsigned chunks; - struct zbud_hdr *zh_other; - unsigned budnum = zbud_budnum(zh), size; - struct zbud_page *zbpg = - container_of(zh, struct zbud_page, buddy[budnum]); - - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... see zbud_evict_pages() */ - spin_unlock(&zbpg->lock); - return; - } - size = zbud_free(zh); - ASSERT_SPINLOCK(&zbpg->lock); - zh_other = &zbpg->buddy[(budnum == 0) ? 
1 : 0]; - if (zh_other->size == 0) { /* was unbuddied: unlist and free */ - chunks = zbud_size_to_chunks(size) ; - spin_lock(&zbud_budlists_spinlock); - BUG_ON(list_empty(&zbud_unbuddied[chunks].list)); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[chunks].count--; - spin_unlock(&zbud_budlists_spinlock); - zbud_free_raw_page(zbpg); - } else { /* was buddied: move remaining buddy to unbuddied list */ - chunks = zbud_size_to_chunks(zh_other->size) ; - spin_lock(&zbud_budlists_spinlock); - list_del_init(&zbpg->bud_list); - zcache_zbud_buddied_count--; - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list); - zbud_unbuddied[chunks].count++; - spin_unlock(&zbud_budlists_spinlock); - spin_unlock(&zbpg->lock); - } -} - -static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid, - uint32_t index, struct page *page, - void *cdata, unsigned size) -{ - struct zbud_hdr *zh0, *zh1, *zh = NULL; - struct zbud_page *zbpg = NULL, *ztmp; - unsigned nchunks; - char *to; - int i, found_good_buddy = 0; - - nchunks = zbud_size_to_chunks(size) ; - for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) { - spin_lock(&zbud_budlists_spinlock); - if (!list_empty(&zbud_unbuddied[i].list)) { - list_for_each_entry_safe(zbpg, ztmp, - &zbud_unbuddied[i].list, bud_list) { - if (spin_trylock(&zbpg->lock)) { - found_good_buddy = i; - goto found_unbuddied; - } - } - } - spin_unlock(&zbud_budlists_spinlock); - } - /* didn't find a good buddy, try allocating a new page */ - zbpg = zbud_alloc_raw_page(); - if (unlikely(zbpg == NULL)) - goto out; - /* ok, have a page, now compress the data before taking locks */ - spin_lock(&zbpg->lock); - spin_lock(&zbud_budlists_spinlock); - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list); - zbud_unbuddied[nchunks].count++; - zh = &zbpg->buddy[0]; - goto init_zh; - -found_unbuddied: - ASSERT_SPINLOCK(&zbpg->lock); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0))); - if (zh0->size != 0) { 
/* buddy0 in use, buddy1 is vacant */ - ASSERT_SENTINEL(zh0, ZBH); - zh = zh1; - } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */ - ASSERT_SENTINEL(zh1, ZBH); - zh = zh0; - } else - BUG(); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[found_good_buddy].count--; - list_add_tail(&zbpg->bud_list, &zbud_buddied_list); - zcache_zbud_buddied_count++; - -init_zh: - SET_SENTINEL(zh, ZBH); - zh->size = size; - zh->index = index; - zh->oid = *oid; - zh->pool_id = pool_id; - /* can wait to copy the data until the list locks are dropped */ - spin_unlock(&zbud_budlists_spinlock); - - to = zbud_data(zh, size); - memcpy(to, cdata, size); - spin_unlock(&zbpg->lock); - zbud_cumul_chunk_counts[nchunks]++; - atomic_inc(&zcache_zbud_curr_zpages); - zcache_zbud_cumul_zpages++; - zcache_zbud_curr_zbytes += size; - zcache_zbud_cumul_zbytes += size; -out: - return zh; -} - -static int zbud_decompress(struct page *page, struct zbud_hdr *zh) -{ - struct zbud_page *zbpg; - unsigned budnum = zbud_budnum(zh); - size_t out_len = PAGE_SIZE; - char *to_va, *from_va; - unsigned size; - int ret = 0; - - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... see zbud_evict_pages() */ - ret = -EINVAL; - goto out; - } - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - to_va = kmap_atomic(page, KM_USER0); - size = zh->size; - from_va = zbud_data(zh, size); - ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len); - BUG_ON(ret != LZO_E_OK); - BUG_ON(out_len != PAGE_SIZE); - kunmap_atomic(to_va, KM_USER0); -out: - spin_unlock(&zbpg->lock); - return ret; -} - -/* - * The following routines handle shrinking of ephemeral pages by evicting - * pages "least valuable" first. 
- */ - -static unsigned long zcache_evicted_raw_pages; -static unsigned long zcache_evicted_buddied_pages; -static unsigned long zcache_evicted_unbuddied_pages; - -static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid); -static void zcache_put_pool(struct tmem_pool *pool); - -/* - * Flush and free all zbuds in a zbpg, then free the pageframe - */ -static void zbud_evict_zbpg(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh; - int i, j; - uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS]; - struct tmem_oid oid[ZBUD_MAX_BUDS]; - struct tmem_pool *pool; - - ASSERT_SPINLOCK(&zbpg->lock); - BUG_ON(!list_empty(&zbpg->bud_list)); - for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) { - zh = &zbpg->buddy[i]; - if (zh->size) { - pool_id[j] = zh->pool_id; - oid[j] = zh->oid; - index[j] = zh->index; - j++; - zbud_free(zh); - } - } - spin_unlock(&zbpg->lock); - for (i = 0; i < j; i++) { - pool = zcache_get_pool_by_id(pool_id[i]); - if (pool != NULL) { - tmem_flush_page(pool, &oid[i], index[i]); - zcache_put_pool(pool); - } - } - ASSERT_SENTINEL(zbpg, ZBPG); - spin_lock(&zbpg->lock); - zbud_free_raw_page(zbpg); -} - -/* - * Free nr pages. This code is funky because we want to hold the locks - * protecting various lists for as short a time as possible, and in some - * circumstances the list may change asynchronously when the list lock is - * not held. In some cases we also trylock not only to avoid waiting on a - * page in use by another cpu, but also to avoid potential deadlock due to - * lock inversion. 
- */ -static void zbud_evict_pages(int nr) -{ - struct zbud_page *zbpg; - int i; - - /* first try freeing any pages on unused list */ -retry_unused_list: - spin_lock_bh(&zbpg_unused_list_spinlock); - if (!list_empty(&zbpg_unused_list)) { - /* can't walk list here, since it may change when unlocked */ - zbpg = list_first_entry(&zbpg_unused_list, - struct zbud_page, bud_list); - list_del_init(&zbpg->bud_list); - zcache_zbpg_unused_list_count--; - atomic_dec(&zcache_zbud_curr_raw_pages); - spin_unlock_bh(&zbpg_unused_list_spinlock); - zcache_free_page(zbpg); - zcache_evicted_raw_pages++; - if (--nr <= 0) - goto out; - goto retry_unused_list; - } - spin_unlock_bh(&zbpg_unused_list_spinlock); - - /* now try freeing unbuddied pages, starting with least space avail */ - for (i = 0; i < MAX_CHUNK; i++) { -retry_unbud_list_i: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_unbuddied[i].list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - continue; - } - list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - list_del_init(&zbpg->bud_list); - zbud_unbuddied[i].count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_unbuddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - local_bh_enable(); - if (--nr <= 0) - goto out; - goto retry_unbud_list_i; - } - spin_unlock_bh(&zbud_budlists_spinlock); - } - - /* as a last resort, free buddied pages */ -retry_bud_list: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_buddied_list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - goto out; - } - list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - list_del_init(&zbpg->bud_list); - zcache_zbud_buddied_count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_buddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - 
local_bh_enable(); - if (--nr <= 0) - goto out; - goto retry_bud_list; - } - spin_unlock_bh(&zbud_budlists_spinlock); -out: - return; -} - -static void zbud_init(void) -{ - int i; - - INIT_LIST_HEAD(&zbud_buddied_list); - zcache_zbud_buddied_count = 0; - for (i = 0; i < NCHUNKS; i++) { - INIT_LIST_HEAD(&zbud_unbuddied[i].list); - zbud_unbuddied[i].count = 0; - } -} - -#ifdef CONFIG_SYSFS -/* - * These sysfs routines show a nice distribution of how many zbpg's are - * currently (and have ever been placed) in each unbuddied list. It's fun - * to watch but can probably go away before final merge. - */ -static int zbud_show_unbuddied_list_counts(char *buf) -{ - int i; - char *p = buf; - - for (i = 0; i < NCHUNKS - 1; i++) - p += sprintf(p, "%u ", zbud_unbuddied[i].count); - p += sprintf(p, "%d\n", zbud_unbuddied[i].count); - return p - buf; -} - -static int zbud_show_cumul_chunk_counts(char *buf) -{ - unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0; - unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0; - unsigned long total_chunks_lte_42 = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]); - chunks += zbud_cumul_chunk_counts[i]; - total_chunks += zbud_cumul_chunk_counts[i]; - sum_total_chunks += i * zbud_cumul_chunk_counts[i]; - if (i == 21) - total_chunks_lte_21 = total_chunks; - if (i == 32) - total_chunks_lte_32 = total_chunks; - if (i == 42) - total_chunks_lte_42 = total_chunks; - } - p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n", - total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42, - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} -#endif - -/********** - * This "zv" PAM implementation combines the TLSF-based xvMalloc - * with lzo1x compression to maximize the amount of data that can - * be packed into a physical page. 
- * - * Zv represents a PAM page with the index and object (plus a "size" value - * necessary for decompression) immediately preceding the compressed data. - */ - -#define ZVH_SENTINEL 0x43214321 - -struct zv_hdr { - uint32_t pool_id; - struct tmem_oid oid; - uint32_t index; - DECL_SENTINEL -}; - -static const int zv_max_page_size = (PAGE_SIZE / 8) * 7; - -static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id, - struct tmem_oid *oid, uint32_t index, - void *cdata, unsigned clen) -{ - struct page *page; - struct zv_hdr *zv = NULL; - uint32_t offset; - int ret; - - BUG_ON(!irqs_disabled()); - ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr), - &page, &offset, ZCACHE_GFP_MASK); - if (unlikely(ret)) - goto out; - zv = kmap_atomic(page, KM_USER0) + offset; - zv->index = index; - zv->oid = *oid; - zv->pool_id = pool_id; - SET_SENTINEL(zv, ZVH); - memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen); - kunmap_atomic(zv, KM_USER0); -out: - return zv; -} - -static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv) -{ - unsigned long flags; - struct page *page; - uint32_t offset; - uint16_t size; - - ASSERT_SENTINEL(zv, ZVH); - size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(size == 0 || size > zv_max_page_size); - INVERT_SENTINEL(zv, ZVH); - page = virt_to_page(zv); - offset = (unsigned long)zv & ~PAGE_MASK; - local_irq_save(flags); - xv_free(xvpool, page, offset); - local_irq_restore(flags); -} - -static void zv_decompress(struct page *page, struct zv_hdr *zv) -{ - size_t clen = PAGE_SIZE; - char *to_va; - unsigned size; - int ret; - - ASSERT_SENTINEL(zv, ZVH); - size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(size == 0 || size > zv_max_page_size); - to_va = kmap_atomic(page, KM_USER0); - ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv), - size, to_va, &clen); - kunmap_atomic(to_va, KM_USER0); - BUG_ON(ret != LZO_E_OK); - BUG_ON(clen != PAGE_SIZE); -} - -/* - * zcache core code starts here - */ - -/* useful stats not collected 
by cleancache or frontswap */ -static unsigned long zcache_flush_total; -static unsigned long zcache_flush_found; -static unsigned long zcache_flobj_total; -static unsigned long zcache_flobj_found; -static unsigned long zcache_failed_eph_puts; -static unsigned long zcache_failed_pers_puts; - -#define MAX_POOLS_PER_CLIENT 16 - -static struct { - struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT]; - struct xv_pool *xvpool; -} zcache_client; - -/* - * Tmem operations assume the poolid implies the invoking client. - * Zcache only has one client (the kernel itself), so translate - * the poolid into the tmem_pool allocated for it. A KVM version - * of zcache would have one client per guest and each client might - * have a poolid==N. - */ -static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid) -{ - struct tmem_pool *pool = NULL; - - if (poolid >= 0) { - pool = zcache_client.tmem_pools[poolid]; - if (pool != NULL) - atomic_inc(&pool->refcount); - } - return pool; -} - -static void zcache_put_pool(struct tmem_pool *pool) -{ - if (pool != NULL) - atomic_dec(&pool->refcount); -} - -/* counters for debugging */ -static unsigned long zcache_failed_get_free_pages; -static unsigned long zcache_failed_alloc; -static unsigned long zcache_put_to_flush; -static unsigned long zcache_aborted_preload; -static unsigned long zcache_aborted_shrink; - -/* - * Ensure that memory allocation requests in zcache don't result - * in direct reclaim requests via the shrinker, which would cause - * an infinite loop. Maybe a GFP flag would be better? 
- */ -static DEFINE_SPINLOCK(zcache_direct_reclaim_lock); - -/* - * for now, used named slabs so can easily track usage; later can - * either just use kmalloc, or perhaps add a slab-like allocator - * to more carefully manage total memory utilization - */ -static struct kmem_cache *zcache_objnode_cache; -static struct kmem_cache *zcache_obj_cache; -static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_obj_count_max; -static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_objnode_count_max; - -/* - * to avoid memory allocation recursion (e.g. due to direct reclaim), we - * preload all necessary data structures so the hostops callbacks never - * actually do a malloc - */ -struct zcache_preload { - void *page; - struct tmem_obj *obj; - int nr; - struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH]; -}; -static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, }; - -static int zcache_do_preload(struct tmem_pool *pool) -{ - struct zcache_preload *kp; - struct tmem_objnode *objnode; - struct tmem_obj *obj; - void *page; - int ret = -ENOMEM; - - if (unlikely(zcache_objnode_cache == NULL)) - goto out; - if (unlikely(zcache_obj_cache == NULL)) - goto out; - if (!spin_trylock(&zcache_direct_reclaim_lock)) { - zcache_aborted_preload++; - goto out; - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - while (kp->nr < ARRAY_SIZE(kp->objnodes)) { - preempt_enable_no_resched(); - objnode = kmem_cache_alloc(zcache_objnode_cache, - ZCACHE_GFP_MASK); - if (unlikely(objnode == NULL)) { - zcache_failed_alloc++; - goto unlock_out; - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - if (kp->nr < ARRAY_SIZE(kp->objnodes)) - kp->objnodes[kp->nr++] = objnode; - else - kmem_cache_free(zcache_objnode_cache, objnode); - } - preempt_enable_no_resched(); - obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK); - if (unlikely(obj == NULL)) { - zcache_failed_alloc++; - goto 
unlock_out; - } - page = (void *)__get_free_page(ZCACHE_GFP_MASK); - if (unlikely(page == NULL)) { - zcache_failed_get_free_pages++; - kmem_cache_free(zcache_obj_cache, obj); - goto unlock_out; - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - if (kp->obj == NULL) - kp->obj = obj; - else - kmem_cache_free(zcache_obj_cache, obj); - if (kp->page == NULL) - kp->page = page; - else - free_page((unsigned long)page); - ret = 0; -unlock_out: - spin_unlock(&zcache_direct_reclaim_lock); -out: - return ret; -} - -static void *zcache_get_free_page(void) -{ - struct zcache_preload *kp; - void *page; - - kp = &__get_cpu_var(zcache_preloads); - page = kp->page; - BUG_ON(page == NULL); - kp->page = NULL; - return page; -} - -static void zcache_free_page(void *p) -{ - free_page((unsigned long)p); -} - -/* - * zcache implementation for tmem host ops - */ - -static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool) -{ - struct tmem_objnode *objnode = NULL; - unsigned long count; - struct zcache_preload *kp; - - kp = &__get_cpu_var(zcache_preloads); - if (kp->nr <= 0) - goto out; - objnode = kp->objnodes[kp->nr - 1]; - BUG_ON(objnode == NULL); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; - count = atomic_inc_return(&zcache_curr_objnode_count); - if (count > zcache_curr_objnode_count_max) - zcache_curr_objnode_count_max = count; -out: - return objnode; -} - -static void zcache_objnode_free(struct tmem_objnode *objnode, - struct tmem_pool *pool) -{ - atomic_dec(&zcache_curr_objnode_count); - BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0); - kmem_cache_free(zcache_objnode_cache, objnode); -} - -static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool) -{ - struct tmem_obj *obj = NULL; - unsigned long count; - struct zcache_preload *kp; - - kp = &__get_cpu_var(zcache_preloads); - obj = kp->obj; - BUG_ON(obj == NULL); - kp->obj = NULL; - count = atomic_inc_return(&zcache_curr_obj_count); - if (count > zcache_curr_obj_count_max) - 
zcache_curr_obj_count_max = count; - return obj; -} - -static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool) -{ - atomic_dec(&zcache_curr_obj_count); - BUG_ON(atomic_read(&zcache_curr_obj_count) < 0); - kmem_cache_free(zcache_obj_cache, obj); -} - -static struct tmem_hostops zcache_hostops = { - .obj_alloc = zcache_obj_alloc, - .obj_free = zcache_obj_free, - .objnode_alloc = zcache_objnode_alloc, - .objnode_free = zcache_objnode_free, -}; - -/* - * zcache implementations for PAM page descriptor ops - */ - -static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_eph_pampd_count_max; -static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_pers_pampd_count_max; - -/* forward reference */ -static int zcache_compress(struct page *from, void **out_va, size_t *out_len); - -static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid, - uint32_t index, struct page *page) -{ - void *pampd = NULL, *cdata; - size_t clen; - int ret; - bool ephemeral = is_ephemeral(pool); - unsigned long count; - - if (ephemeral) { - ret = zcache_compress(page, &cdata, &clen); - if (ret == 0) - - goto out; - if (clen == 0 || clen > zbud_max_buddy_size()) { - zcache_compress_poor++; - goto out; - } - pampd = (void *)zbud_create(pool->pool_id, oid, index, - page, cdata, clen); - if (pampd != NULL) { - count = atomic_inc_return(&zcache_curr_eph_pampd_count); - if (count > zcache_curr_eph_pampd_count_max) - zcache_curr_eph_pampd_count_max = count; - } - } else { - /* - * FIXME: This is all the "policy" there is for now. 
- * 3/4 totpages should allow ~37% of RAM to be filled with - * compressed frontswap pages - */ - if (atomic_read(&zcache_curr_pers_pampd_count) > - 3 * totalram_pages / 4) - goto out; - ret = zcache_compress(page, &cdata, &clen); - if (ret == 0) - goto out; - if (clen > zv_max_page_size) { - zcache_compress_poor++; - goto out; - } - pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id, - oid, index, cdata, clen); - if (pampd == NULL) - goto out; - count = atomic_inc_return(&zcache_curr_pers_pampd_count); - if (count > zcache_curr_pers_pampd_count_max) - zcache_curr_pers_pampd_count_max = count; - } -out: - return pampd; -} - -/* - * fill the pageframe corresponding to the struct page with the data - * from the passed pampd - */ -static int zcache_pampd_get_data(struct page *page, void *pampd, - struct tmem_pool *pool) -{ - int ret = 0; - - if (is_ephemeral(pool)) - ret = zbud_decompress(page, pampd); - else - zv_decompress(page, pampd); - return ret; -} - -/* - * free the pampd and remove it from any zcache lists - * pampd must no longer be pointed to from any tmem data structures! 
- */ -static void zcache_pampd_free(void *pampd, struct tmem_pool *pool) -{ - if (is_ephemeral(pool)) { - zbud_free_and_delist((struct zbud_hdr *)pampd); - atomic_dec(&zcache_curr_eph_pampd_count); - BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0); - } else { - zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd); - atomic_dec(&zcache_curr_pers_pampd_count); - BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0); - } -} - -static struct tmem_pamops zcache_pamops = { - .create = zcache_pampd_create, - .get_data = zcache_pampd_get_data, - .free = zcache_pampd_free, -}; - -/* - * zcache compression/decompression and related per-cpu stuff - */ - -#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS -#define LZO_DSTMEM_PAGE_ORDER 1 -static DEFINE_PER_CPU(unsigned char *, zcache_workmem); -static DEFINE_PER_CPU(unsigned char *, zcache_dstmem); - -static int zcache_compress(struct page *from, void **out_va, size_t *out_len) -{ - int ret = 0; - unsigned char *dmem = __get_cpu_var(zcache_dstmem); - unsigned char *wmem = __get_cpu_var(zcache_workmem); - char *from_va; - - BUG_ON(!irqs_disabled()); - if (unlikely(dmem == NULL || wmem == NULL)) - goto out; /* no buffer, so can't compress */ - from_va = kmap_atomic(from, KM_USER0); - mb(); - ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem); - BUG_ON(ret != LZO_E_OK); - *out_va = dmem; - kunmap_atomic(from_va, KM_USER0); - ret = 1; -out: - return ret; -} - - -static int zcache_cpu_notifier(struct notifier_block *nb, - unsigned long action, void *pcpu) -{ - int cpu = (long)pcpu; - struct zcache_preload *kp; - - switch (action) { - case CPU_UP_PREPARE: - per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages( - GFP_KERNEL | __GFP_REPEAT, - LZO_DSTMEM_PAGE_ORDER), - per_cpu(zcache_workmem, cpu) = - kzalloc(LZO1X_MEM_COMPRESS, - GFP_KERNEL | __GFP_REPEAT); - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - free_pages((unsigned long)per_cpu(zcache_dstmem, cpu), - LZO_DSTMEM_PAGE_ORDER); - per_cpu(zcache_dstmem, 
cpu) = NULL; - kfree(per_cpu(zcache_workmem, cpu)); - per_cpu(zcache_workmem, cpu) = NULL; - kp = &per_cpu(zcache_preloads, cpu); - while (kp->nr) { - kmem_cache_free(zcache_objnode_cache, - kp->objnodes[kp->nr - 1]); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; - } - kmem_cache_free(zcache_obj_cache, kp->obj); - free_page((unsigned long)kp->page); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block zcache_cpu_notifier_block = { - .notifier_call = zcache_cpu_notifier -}; - -#ifdef CONFIG_SYSFS -#define ZCACHE_SYSFS_RO(_name) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", zcache_##_name); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -#define ZCACHE_SYSFS_RO_ATOMIC(_name) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return _func(buf); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -ZCACHE_SYSFS_RO(curr_obj_count_max); -ZCACHE_SYSFS_RO(curr_objnode_count_max); -ZCACHE_SYSFS_RO(flush_total); -ZCACHE_SYSFS_RO(flush_found); -ZCACHE_SYSFS_RO(flobj_total); -ZCACHE_SYSFS_RO(flobj_found); -ZCACHE_SYSFS_RO(failed_eph_puts); -ZCACHE_SYSFS_RO(failed_pers_puts); -ZCACHE_SYSFS_RO(zbud_curr_zbytes); -ZCACHE_SYSFS_RO(zbud_cumul_zpages); 
-ZCACHE_SYSFS_RO(zbud_cumul_zbytes); -ZCACHE_SYSFS_RO(zbud_buddied_count); -ZCACHE_SYSFS_RO(zbpg_unused_list_count); -ZCACHE_SYSFS_RO(evicted_raw_pages); -ZCACHE_SYSFS_RO(evicted_unbuddied_pages); -ZCACHE_SYSFS_RO(evicted_buddied_pages); -ZCACHE_SYSFS_RO(failed_get_free_pages); -ZCACHE_SYSFS_RO(failed_alloc); -ZCACHE_SYSFS_RO(put_to_flush); -ZCACHE_SYSFS_RO(aborted_preload); -ZCACHE_SYSFS_RO(aborted_shrink); -ZCACHE_SYSFS_RO(compress_poor); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages); -ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count); -ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count); -ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts, - zbud_show_unbuddied_list_counts); -ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts, - zbud_show_cumul_chunk_counts); - -static struct attribute *zcache_attrs[] = { - &zcache_curr_obj_count_attr.attr, - &zcache_curr_obj_count_max_attr.attr, - &zcache_curr_objnode_count_attr.attr, - &zcache_curr_objnode_count_max_attr.attr, - &zcache_flush_total_attr.attr, - &zcache_flobj_total_attr.attr, - &zcache_flush_found_attr.attr, - &zcache_flobj_found_attr.attr, - &zcache_failed_eph_puts_attr.attr, - &zcache_failed_pers_puts_attr.attr, - &zcache_compress_poor_attr.attr, - &zcache_zbud_curr_raw_pages_attr.attr, - &zcache_zbud_curr_zpages_attr.attr, - &zcache_zbud_curr_zbytes_attr.attr, - &zcache_zbud_cumul_zpages_attr.attr, - &zcache_zbud_cumul_zbytes_attr.attr, - &zcache_zbud_buddied_count_attr.attr, - &zcache_zbpg_unused_list_count_attr.attr, - &zcache_evicted_raw_pages_attr.attr, - &zcache_evicted_unbuddied_pages_attr.attr, - &zcache_evicted_buddied_pages_attr.attr, - &zcache_failed_get_free_pages_attr.attr, - &zcache_failed_alloc_attr.attr, - &zcache_put_to_flush_attr.attr, - &zcache_aborted_preload_attr.attr, - &zcache_aborted_shrink_attr.attr, - &zcache_zbud_unbuddied_list_counts_attr.attr, - &zcache_zbud_cumul_chunk_counts_attr.attr, - NULL, -}; - -static struct attribute_group zcache_attr_group = { - .attrs = 
zcache_attrs, - .name = "zcache", -}; - -#endif /* CONFIG_SYSFS */ -/* - * When zcache is disabled ("frozen"), pools can be created and destroyed, - * but all puts (and thus all other operations that require memory allocation) - * must fail. If zcache is unfrozen, accepts puts, then frozen again, - * data consistency requires all puts while frozen to be converted into - * flushes. - */ -static bool zcache_freeze; - -/* - * zcache shrinker interface (only useful for ephemeral pages, so zbud only) - */ -static int shrink_zcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) -{ - int ret = -1; - - if (nr >= 0) { - if (!(gfp_mask & __GFP_FS)) - /* does this case really need to be skipped? */ - goto out; - if (spin_trylock(&zcache_direct_reclaim_lock)) { - zbud_evict_pages(nr); - spin_unlock(&zcache_direct_reclaim_lock); - } else - zcache_aborted_shrink++; - } - ret = (int)atomic_read(&zcache_zbud_curr_raw_pages); -out: - return ret; -} - -static struct shrinker zcache_shrinker = { - .shrink = shrink_zcache_memory, - .seeks = DEFAULT_SEEKS, -}; - -/* - * zcache shims between cleancache/frontswap ops and tmem - */ - -static int zcache_put_page(int pool_id, struct tmem_oid *oidp, - uint32_t index, struct page *page) -{ - struct tmem_pool *pool; - int ret = -1; - - BUG_ON(!irqs_disabled()); - pool = zcache_get_pool_by_id(pool_id); - if (unlikely(pool == NULL)) - goto out; - if (!zcache_freeze && zcache_do_preload(pool) == 0) { - /* preload does preempt_disable on success */ - ret = tmem_put(pool, oidp, index, page); - if (ret < 0) { - if (is_ephemeral(pool)) - zcache_failed_eph_puts++; - else - zcache_failed_pers_puts++; - } - zcache_put_pool(pool); - preempt_enable_no_resched(); - } else { - zcache_put_to_flush++; - if (atomic_read(&pool->obj_count) > 0) - /* the put fails whether the flush succeeds or not */ - (void)tmem_flush_page(pool, oidp, index); - zcache_put_pool(pool); - } -out: - return ret; -} - -static int zcache_get_page(int pool_id, struct tmem_oid 
*oidp, - uint32_t index, struct page *page) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - pool = zcache_get_pool_by_id(pool_id); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_get(pool, oidp, index, page); - zcache_put_pool(pool); - } - local_irq_restore(flags); - return ret; -} - -static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - zcache_flush_total++; - pool = zcache_get_pool_by_id(pool_id); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_flush_page(pool, oidp, index); - zcache_put_pool(pool); - } - if (ret >= 0) - zcache_flush_found++; - local_irq_restore(flags); - return ret; -} - -static int zcache_flush_object(int pool_id, struct tmem_oid *oidp) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - zcache_flobj_total++; - pool = zcache_get_pool_by_id(pool_id); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_flush_object(pool, oidp); - zcache_put_pool(pool); - } - if (ret >= 0) - zcache_flobj_found++; - local_irq_restore(flags); - return ret; -} - -static int zcache_destroy_pool(int pool_id) -{ - struct tmem_pool *pool = NULL; - int ret = -1; - - if (pool_id < 0) - goto out; - pool = zcache_client.tmem_pools[pool_id]; - if (pool == NULL) - goto out; - zcache_client.tmem_pools[pool_id] = NULL; - /* wait for pool activity on other cpus to quiesce */ - while (atomic_read(&pool->refcount) != 0) - ; - local_bh_disable(); - ret = tmem_destroy_pool(pool); - local_bh_enable(); - kfree(pool); - pr_info("zcache: destroyed pool id=%d\n", pool_id); -out: - return ret; -} - -static int zcache_new_pool(uint32_t flags) -{ - int poolid = -1; - struct tmem_pool *pool; - - pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL); - if (pool == NULL) { - 
pr_info("zcache: pool creation failed: out of memory\n"); - goto out; - } - - for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++) - if (zcache_client.tmem_pools[poolid] == NULL) - break; - if (poolid >= MAX_POOLS_PER_CLIENT) { - pr_info("zcache: pool creation failed: max exceeded\n"); - kfree(pool); - poolid = -1; - goto out; - } - atomic_set(&pool->refcount, 0); - pool->client = &zcache_client; - pool->pool_id = poolid; - tmem_new_pool(pool, flags); - zcache_client.tmem_pools[poolid] = pool; - pr_info("zcache: created %s tmem pool, id=%d\n", - flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", - poolid); -out: - return poolid; -} - -/********** - * Two kernel functionalities currently can be layered on top of tmem. - * These are "cleancache" which is used as a second-chance cache for clean - * page cache pages; and "frontswap" which is used for swap pages - * to avoid writes to disk. A generic "shim" is provided here for each - * to translate in-kernel semantics to zcache semantics. 
- */ - -#ifdef CONFIG_CLEANCACHE -static void zcache_cleancache_put_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (likely(ind == index)) - (void)zcache_put_page(pool_id, &oid, index, page); -} - -static int zcache_cleancache_get_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - int ret = -1; - - if (likely(ind == index)) - ret = zcache_get_page(pool_id, &oid, index, page); - return ret; -} - -static void zcache_cleancache_flush_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (likely(ind == index)) - (void)zcache_flush_page(pool_id, &oid, ind); -} - -static void zcache_cleancache_flush_inode(int pool_id, - struct cleancache_filekey key) -{ - struct tmem_oid oid = *(struct tmem_oid *)&key; - - (void)zcache_flush_object(pool_id, &oid); -} - -static void zcache_cleancache_flush_fs(int pool_id) -{ - if (pool_id >= 0) - (void)zcache_destroy_pool(pool_id); -} - -static int zcache_cleancache_init_fs(size_t pagesize) -{ - BUG_ON(sizeof(struct cleancache_filekey) != - sizeof(struct tmem_oid)); - BUG_ON(pagesize != PAGE_SIZE); - return zcache_new_pool(0); -} - -static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize) -{ - /* shared pools are unsupported and map to private */ - BUG_ON(sizeof(struct cleancache_filekey) != - sizeof(struct tmem_oid)); - BUG_ON(pagesize != PAGE_SIZE); - return zcache_new_pool(0); -} - -static struct cleancache_ops zcache_cleancache_ops = { - .put_page = zcache_cleancache_put_page, - .get_page = zcache_cleancache_get_page, - .flush_page = zcache_cleancache_flush_page, - .flush_inode = zcache_cleancache_flush_inode, - .flush_fs = zcache_cleancache_flush_fs, - .init_shared_fs = 
zcache_cleancache_init_shared_fs, - .init_fs = zcache_cleancache_init_fs -}; - -struct cleancache_ops zcache_cleancache_register_ops(void) -{ - struct cleancache_ops old_ops = - cleancache_register_ops(&zcache_cleancache_ops); - - return old_ops; -} -#endif - -#ifdef CONFIG_FRONTSWAP -/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ -static int zcache_frontswap_poolid = -1; - -/* - * Swizzling increases objects per swaptype, increasing tmem concurrency - * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS - */ -#define SWIZ_BITS 4 -#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) -#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) -#define iswiz(_ind) (_ind >> SWIZ_BITS) - -static inline struct tmem_oid oswiz(unsigned type, u32 ind) -{ - struct tmem_oid oid = { .oid = { 0 } }; - oid.oid[0] = _oswiz(type, ind); - return oid; -} - -static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - int ret = -1; - unsigned long flags; - - BUG_ON(!PageLocked(page)); - if (likely(ind64 == ind)) { - local_irq_save(flags); - ret = zcache_put_page(zcache_frontswap_poolid, &oid, - iswiz(ind), page); - local_irq_restore(flags); - } - return ret; -} - -/* returns 0 if the page was successfully gotten from frontswap, -1 if - * was not present (should never happen!) 
*/ -static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - int ret = -1; - - BUG_ON(!PageLocked(page)); - if (likely(ind64 == ind)) - ret = zcache_get_page(zcache_frontswap_poolid, &oid, - iswiz(ind), page); - return ret; -} - -/* flush a single page from frontswap */ -static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - - if (likely(ind64 == ind)) - (void)zcache_flush_page(zcache_frontswap_poolid, &oid, - iswiz(ind)); -} - -/* flush all pages from the passed swaptype */ -static void zcache_frontswap_flush_area(unsigned type) -{ - struct tmem_oid oid; - int ind; - - for (ind = SWIZ_MASK; ind >= 0; ind--) { - oid = oswiz(type, ind); - (void)zcache_flush_object(zcache_frontswap_poolid, &oid); - } -} - -static void zcache_frontswap_init(unsigned ignored) -{ - /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ - if (zcache_frontswap_poolid < 0) - zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST); -} - -static struct frontswap_ops zcache_frontswap_ops = { - .put_page = zcache_frontswap_put_page, - .get_page = zcache_frontswap_get_page, - .flush_page = zcache_frontswap_flush_page, - .flush_area = zcache_frontswap_flush_area, - .init = zcache_frontswap_init -}; - -struct frontswap_ops zcache_frontswap_register_ops(void) -{ - struct frontswap_ops old_ops = - frontswap_register_ops(&zcache_frontswap_ops); - - return old_ops; -} -#endif - -/* - * zcache initialization - * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR - * NOTHING HAPPENS! 
- */ - -static int zcache_enabled; - -static int __init enable_zcache(char *s) -{ - zcache_enabled = 1; - return 1; -} -__setup("zcache", enable_zcache); - -/* allow independent dynamic disabling of cleancache and frontswap */ - -static int use_cleancache = 1; - -static int __init no_cleancache(char *s) -{ - use_cleancache = 0; - return 1; -} - -__setup("nocleancache", no_cleancache); - -static int use_frontswap = 1; - -static int __init no_frontswap(char *s) -{ - use_frontswap = 0; - return 1; -} - -__setup("nofrontswap", no_frontswap); - -static int __init zcache_init(void) -{ -#ifdef CONFIG_SYSFS - int ret = 0; - - ret = sysfs_create_group(mm_kobj, &zcache_attr_group); - if (ret) { - pr_err("zcache: can't create sysfs\n"); - goto out; - } -#endif /* CONFIG_SYSFS */ -#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP) - if (zcache_enabled) { - unsigned int cpu; - - tmem_register_hostops(&zcache_hostops); - tmem_register_pamops(&zcache_pamops); - ret = register_cpu_notifier(&zcache_cpu_notifier_block); - if (ret) { - pr_err("zcache: can't register cpu notifier\n"); - goto out; - } - for_each_online_cpu(cpu) { - void *pcpu = (void *)(long)cpu; - zcache_cpu_notifier(&zcache_cpu_notifier_block, - CPU_UP_PREPARE, pcpu); - } - } - zcache_objnode_cache = kmem_cache_create("zcache_objnode", - sizeof(struct tmem_objnode), 0, 0, NULL); - zcache_obj_cache = kmem_cache_create("zcache_obj", - sizeof(struct tmem_obj), 0, 0, NULL); -#endif -#ifdef CONFIG_CLEANCACHE - if (zcache_enabled && use_cleancache) { - struct cleancache_ops old_ops; - - zbud_init(); - register_shrinker(&zcache_shrinker); - old_ops = zcache_cleancache_register_ops(); - pr_info("zcache: cleancache enabled using kernel " - "transcendent memory and compression buddies\n"); - if (old_ops.init_fs != NULL) - pr_warning("zcache: cleancache_ops overridden"); - } -#endif -#ifdef CONFIG_FRONTSWAP - if (zcache_enabled && use_frontswap) { - struct frontswap_ops old_ops; - - zcache_client.xvpool = 
xv_create_pool(); - if (zcache_client.xvpool == NULL) { - pr_err("zcache: can't create xvpool\n"); - goto out; - } - old_ops = zcache_frontswap_register_ops(); - pr_info("zcache: frontswap enabled using kernel " - "transcendent memory and xvmalloc\n"); - if (old_ops.init != NULL) - pr_warning("ktmem: frontswap_ops overridden"); - } -#endif -out: - return ret; -} - -module_init(zcache_init) From 5c726afbb6f0983468eec87d86147c91d94b4430 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 23:15:00 +0800 Subject: [PATCH 14/14] Revert "mm: cleancache core ops functions and config" This reverts commit e0c9143ea1ec510a41b347be043e98034eedf5c8. --- Documentation/vm/cleancache.txt | 279 -------------------------------- include/linux/cleancache.h | 122 -------------- mm/Kconfig | 22 --- mm/Makefile | 1 - mm/cleancache.c | 244 ---------------------------- 5 files changed, 668 deletions(-) delete mode 100755 Documentation/vm/cleancache.txt delete mode 100755 include/linux/cleancache.h mode change 100755 => 100644 mm/Kconfig mode change 100755 => 100644 mm/Makefile delete mode 100755 mm/cleancache.c diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt deleted file mode 100755 index e0a53567..00000000 --- a/Documentation/vm/cleancache.txt +++ /dev/null @@ -1,279 +0,0 @@ -MOTIVATION - -Cleancache is a new optional feature provided by the VFS layer that -potentially dramatically increases page cache effectiveness for -many workloads in many environments at a negligible cost. - -Cleancache can be thought of as a page-granularity victim cache for clean -pages that the kernel's pageframe replacement algorithm (PFRA) would like -to keep around, but can't since there isn't enough memory. So when the -PFRA "evicts" a page, it first attempts to use cleancache code to -put the data contained in that page into "transcendent memory", memory -that is not directly accessible or addressable by the kernel and is -of unknown and possibly time-varying size. 
- -Later, when a cleancache-enabled filesystem wishes to access a page -in a file on disk, it first checks cleancache to see if it already -contains it; if it does, the page of data is copied into the kernel -and a disk access is avoided. - -Transcendent memory "drivers" for cleancache are currently implemented -in Xen (using hypervisor memory) and zcache (using in-kernel compressed -memory) and other implementations are in development. - -FAQs are included below. - -IMPLEMENTATION OVERVIEW - -A cleancache "backend" that provides transcendent memory registers itself -to the kernel's cleancache "frontend" by calling cleancache_register_ops, -passing a pointer to a cleancache_ops structure with funcs set appropriately. -Note that cleancache_register_ops returns the previous settings so that -chaining can be performed if desired. The functions provided must conform to -certain semantics as follows: - -Most important, cleancache is "ephemeral". Pages which are copied into -cleancache have an indefinite lifetime which is completely unknowable -by the kernel and so may or may not still be in cleancache at any later time. -Thus, as its name implies, cleancache is not suitable for dirty pages. -Cleancache has complete discretion over what pages to preserve and what -pages to discard and when. - -Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a -pool id which, if positive, must be saved in the filesystem's superblock; -a negative return value indicates failure. A "put_page" will copy a -(presumably about-to-be-evicted) page into cleancache and associate it with -the pool id, a file key, and a page index into the file. (The combination -of a pool id, a file key, and an index is sometimes called a "handle".) -A "get_page" will copy the page, if found, from cleancache into kernel memory. 
-An "invalidate_page" will ensure the page no longer is present in cleancache; -an "invalidate_inode" will invalidate all pages associated with the specified -file; and, when a filesystem is unmounted, an "invalidate_fs" will invalidate -all pages in all files specified by the given pool id and also surrender -the pool id. - -An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache -to treat the pool as shared using a 128-bit UUID as a key. On systems -that may run multiple kernels (such as hard partitioned or virtualized -systems) that may share a clustered filesystem, and where cleancache -may be shared among those kernels, calls to init_shared_fs that specify the -same UUID will receive the same pool id, thus allowing the pages to -be shared. Note that any security requirements must be imposed outside -of the kernel (e.g. by "tools" that control cleancache). Or a -cleancache implementation can simply disable shared_init by always -returning a negative value. - -If a get_page is successful on a non-shared pool, the page is invalidated -(thus making cleancache an "exclusive" cache). On a shared pool, the page -is NOT invalidated on a successful get_page so that it remains accessible to -other sharers. The kernel is responsible for ensuring coherency between -cleancache (shared or not), the page cache, and the filesystem, using -cleancache invalidate operations as required. - -Note that cleancache must enforce put-put-get coherency and get-get -coherency. For the former, if two puts are made to the same handle but -with different data, say AAA by the first put and BBB by the second, a -subsequent get can never return the stale data (AAA). For get-get coherency, -if a get for a given handle fails, subsequent gets for that handle will -never succeed unless preceded by a successful put with that handle. 
- -Last, cleancache provides no SMP serialization guarantees; if two -different Linux threads are simultaneously putting and invalidating a page -with the same handle, the results are indeterminate. Callers must -lock the page to ensure serial behavior. - -CLEANCACHE PERFORMANCE METRICS - -Cleancache monitoring is done by sysfs files in the -/sys/kernel/mm/cleancache directory. The effectiveness of cleancache -can be measured (across all filesystems) with: - -succ_gets - number of gets that were successful -failed_gets - number of gets that failed -puts - number of puts attempted (all "succeed") -invalidates - number of invalidates attempted - -A backend implementatation may provide additional metrics. - -FAQ - -1) Where's the value? (Andrew Morton) - -Cleancache provides a significant performance benefit to many workloads -in many environments with negligible overhead by improving the -effectiveness of the pagecache. Clean pagecache pages are -saved in transcendent memory (RAM that is otherwise not directly -addressable to the kernel); fetching those pages later avoids "refaults" -and thus disk reads. - -Cleancache (and its sister code "frontswap") provide interfaces for -this transcendent memory (aka "tmem"), which conceptually lies between -fast kernel-directly-addressable RAM and slower DMA/asynchronous devices. -Disallowing direct kernel or userland reads/writes to tmem -is ideal when data is transformed to a different form and size (such -as with compression) or secretly moved (as might be useful for write- -balancing for some RAM-like devices). Evicted page-cache pages (and -swap pages) are a great use for this kind of slower-than-RAM-but-much- -faster-than-disk transcendent memory, and the cleancache (and frontswap) -"page-object-oriented" specification provides a nice way to read and -write -- and indirectly "name" -- the pages. 
- -In the virtual case, the whole point of virtualization is to statistically -multiplex physical resources across the varying demands of multiple -virtual machines. This is really hard to do with RAM and efforts to -do it well with no kernel change have essentially failed (except in some -well-publicized special-case workloads). Cleancache -- and frontswap -- -with a fairly small impact on the kernel, provide a huge amount -of flexibility for more dynamic, flexible RAM multiplexing. -Specifically, the Xen Transcendent Memory backend allows otherwise -"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple -virtual machines, but the pages can be compressed and deduplicated to -optimize RAM utilization. And when guest OS's are induced to surrender -underutilized RAM (e.g. with "self-ballooning"), page cache pages -are the first to go, and cleancache allows those pages to be -saved and reclaimed if overall host system memory conditions allow. - -And the identical interface used for cleancache can be used in -physical systems as well. The zcache driver acts as a memory-hungry -device that stores pages of data in a compressed state. And -the proposed "RAMster" driver shares RAM across multiple physical -systems. - -2) Why does cleancache have its sticky fingers so deep inside the - filesystems and VFS? (Andrew Morton and Christoph Hellwig) - -The core hooks for cleancache in VFS are in most cases a single line -and the minimum set are placed precisely where needed to maintain -coherency (via cleancache_invalidate operations) between cleancache, -the page cache, and disk. All hooks compile into nothingness if -cleancache is config'ed off and turn into a function-pointer- -compare-to-NULL if config'ed on but no backend claims the ops -functions, or to a compare-struct-element-to-negative if a -backend claims the ops functions but a filesystem doesn't enable -cleancache. 
- -Some filesystems are built entirely on top of VFS and the hooks -in VFS are sufficient, so don't require an "init_fs" hook; the -initial implementation of cleancache didn't provide this hook. -But for some filesystems (such as btrfs), the VFS hooks are -incomplete and one or more hooks in fs-specific code are required. -And for some other filesystems, such as tmpfs, cleancache may -be counterproductive. So it seemed prudent to require a filesystem -to "opt in" to use cleancache, which requires adding a hook in -each filesystem. Not all filesystems are supported by cleancache -only because they haven't been tested. The existing set should -be sufficient to validate the concept, the opt-in approach means -that untested filesystems are not affected, and the hooks in the -existing filesystems should make it very easy to add more -filesystems in the future. - -The total impact of the hooks to existing fs and mm files is only -about 40 lines added (not counting comments and blank lines). - -3) Why not make cleancache asynchronous and batched so it can - more easily interface with real devices with DMA instead - of copying each individual page? (Minchan Kim) - -The one-page-at-a-time copy semantics simplifies the implementation -on both the frontend and backend and also allows the backend to -do fancy things on-the-fly like page compression and -page deduplication. And since the data is "gone" (copied into/out -of the pageframe) before the cleancache get/put call returns, -a great deal of race conditions and potential coherency issues -are avoided. While the interface seems odd for a "real device" -or for real kernel-addressable RAM, it makes perfect sense for -transcendent memory. - -4) Why is non-shared cleancache "exclusive"? And where is the - page "invalidated" after a "get"? (Minchan Kim) - -The main reason is to free up space in transcendent memory and -to avoid unnecessary cleancache_invalidate calls. 
If you want inclusive, -the page can be "put" immediately following the "get". If -put-after-get for inclusive becomes common, the interface could -be easily extended to add a "get_no_invalidate" call. - -The invalidate is done by the cleancache backend implementation. - -5) What's the performance impact? - -Performance analysis has been presented at OLS'09 and LCA'10. -Briefly, performance gains can be significant on most workloads, -especially when memory pressure is high (e.g. when RAM is -overcommitted in a virtual workload); and because the hooks are -invoked primarily in place of or in addition to a disk read/write, -overhead is negligible even in worst case workloads. Basically -cleancache replaces I/O with memory-copy-CPU-overhead; on older -single-core systems with slow memory-copy speeds, cleancache -has little value, but in newer multicore machines, especially -consolidated/virtualized machines, it has great value. - -6) How do I add cleancache support for filesystem X? (Boaz Harrash) - -Filesystems that are well-behaved and conform to certain -restrictions can utilize cleancache simply by making a call to -cleancache_init_fs at mount time. Unusual, misbehaving, or -poorly layered filesystems must either add additional hooks -and/or undergo extensive additional testing... or should just -not enable the optional cleancache. - -Some points for a filesystem to consider: - -- The FS should be block-device-based (e.g. a ram-based FS such - as tmpfs should not enable cleancache) -- To ensure coherency/correctness, the FS must ensure that all - file removal or truncation operations either go through VFS or - add hooks to do the equivalent cleancache "invalidate" operations -- To ensure coherency/correctness, either inode numbers must - be unique across the lifetime of the on-disk file OR the - FS must provide an "encode_fh" function. 
-- The FS must call the VFS superblock alloc and deactivate routines - or add hooks to do the equivalent cleancache calls done there. -- To maximize performance, all pages fetched from the FS should - go through the do_mpag_readpage routine or the FS should add - hooks to do the equivalent (cf. btrfs) -- Currently, the FS blocksize must be the same as PAGESIZE. This - is not an architectural restriction, but no backends currently - support anything different. -- A clustered FS should invoke the "shared_init_fs" cleancache - hook to get best performance for some backends. - -7) Why not use the KVA of the inode as the key? (Christoph Hellwig) - -If cleancache would use the inode virtual address instead of -inode/filehandle, the pool id could be eliminated. But, this -won't work because cleancache retains pagecache data pages -persistently even when the inode has been pruned from the -inode unused list, and only invalidates the data page if the file -gets removed/truncated. So if cleancache used the inode kva, -there would be potential coherency issues if/when the inode -kva is reused for a different file. Alternately, if cleancache -invalidated the pages when the inode kva was freed, much of the value -of cleancache would be lost because the cache of pages in cleanache -is potentially much larger than the kernel pagecache and is most -useful if the pages survive inode cache removal. - -8) Why is a global variable required? - -The cleancache_enabled flag is checked in all of the frequently-used -cleancache hooks. The alternative is a function call to check a static -variable. Since cleancache is enabled dynamically at runtime, systems -that don't enable cleancache would suffer thousands (possibly -tens-of-thousands) of unnecessary function calls per second. So the -global variable allows cleancache to be enabled by default at compile -time, but have insignificant performance impact when cleancache remains -disabled at runtime. - -9) Does cleanache work with KVM? 
- -The memory model of KVM is sufficiently different that a cleancache -backend may have less value for KVM. This remains to be tested, -especially in an overcommitted system. - -10) Does cleancache work in userspace? It sounds useful for - memory hungry caches like web browsers. (Jamie Lokier) - -No plans yet, though we agree it sounds useful, at least for -apps that bypass the page cache (e.g. O_DIRECT). - -Last updated: Dan Magenheimer, April 13 2011 diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h deleted file mode 100755 index 04ffb2e6..00000000 --- a/include/linux/cleancache.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef _LINUX_CLEANCACHE_H -#define _LINUX_CLEANCACHE_H - -#include -#include -#include - -#define CLEANCACHE_KEY_MAX 6 - -/* - * cleancache requires every file with a page in cleancache to have a - * unique key unless/until the file is removed/truncated. For some - * filesystems, the inode number is unique, but for "modern" filesystems - * an exportable filehandle is required (see exportfs.h) - */ -struct cleancache_filekey { - union { - ino_t ino; - __u32 fh[CLEANCACHE_KEY_MAX]; - u32 key[CLEANCACHE_KEY_MAX]; - } u; -}; - -struct cleancache_ops { - int (*init_fs)(size_t); - int (*init_shared_fs)(char *uuid, size_t); - int (*get_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*put_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*flush_page)(int, struct cleancache_filekey, pgoff_t); - void (*flush_inode)(int, struct cleancache_filekey); - void (*flush_fs)(int); -}; - -extern struct cleancache_ops - cleancache_register_ops(struct cleancache_ops *ops); -extern void __cleancache_init_fs(struct super_block *); -extern void __cleancache_init_shared_fs(char *, struct super_block *); -extern int __cleancache_get_page(struct page *); -extern void __cleancache_put_page(struct page *); -extern void __cleancache_flush_page(struct address_space *, struct page *); -extern void 
__cleancache_flush_inode(struct address_space *); -extern void __cleancache_flush_fs(struct super_block *); -extern int cleancache_enabled; - -#ifdef CONFIG_CLEANCACHE -static inline bool cleancache_fs_enabled(struct page *page) -{ - return page->mapping->host->i_sb->cleancache_poolid >= 0; -} -static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) -{ - return mapping->host->i_sb->cleancache_poolid >= 0; -} -#else -#define cleancache_enabled (0) -#define cleancache_fs_enabled(_page) (0) -#define cleancache_fs_enabled_mapping(_page) (0) -#endif - -/* - * The shim layer provided by these inline functions allows the compiler - * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE - * is disabled, to a single global variable check if CONFIG_CLEANCACHE - * is enabled but no cleancache "backend" has dynamically enabled it, - * and, for the most frequent cleancache ops, to a single global variable - * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled - * and a cleancache backend has dynamically enabled cleancache, but the - * filesystem referenced by that cleancache op has not enabled cleancache. - * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially - * no measurable performance impact. 
- */ - -static inline void cleancache_init_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_fs(sb); -} - -static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_shared_fs(uuid, sb); -} - -static inline int cleancache_get_page(struct page *page) -{ - int ret = -1; - - if (cleancache_enabled && cleancache_fs_enabled(page)) - ret = __cleancache_get_page(page); - return ret; -} - -static inline void cleancache_put_page(struct page *page) -{ - if (cleancache_enabled && cleancache_fs_enabled(page)) - __cleancache_put_page(page); -} - -static inline void cleancache_flush_page(struct address_space *mapping, - struct page *page) -{ - /* careful... page->mapping is NULL sometimes when this is called */ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_flush_page(mapping, page); -} - -static inline void cleancache_flush_inode(struct address_space *mapping) -{ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_flush_inode(mapping); -} - -static inline void cleancache_flush_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_flush_fs(sb); -} - -#endif /* _LINUX_CLEANCACHE_H */ diff --git a/mm/Kconfig b/mm/Kconfig old mode 100755 new mode 100644 index f86e0d29..2c19c0ba --- a/mm/Kconfig +++ b/mm/Kconfig @@ -288,25 +288,3 @@ config NOMMU_INITIAL_TRIM_EXCESS of 1 says that all excess pages should be trimmed. See Documentation/nommu-mmap.txt for more information. -config CLEANCACHE - bool "Enable cleancache driver to cache clean pages if tmem is present" - default n - help - Cleancache can be thought of as a page-granularity victim cache - for clean pages that the kernel's pageframe replacement algorithm - (PFRA) would like to keep around, but can't since there isn't enough - memory. 
So when the PFRA "evicts" a page, it first attempts to use - cleancacne code to put the data contained in that page into - "transcendent memory", memory that is not directly accessible or - addressable by the kernel and is of unknown and possibly - time-varying size. And when a cleancache-enabled - filesystem wishes to access a page in a file on disk, it first - checks cleancache to see if it already contains it; if it does, - the page is copied into the kernel and a disk access is avoided. - When a transcendent memory driver is available (such as zcache or - Xen transcendent memory), a significant I/O reduction - may be achieved. When none is available, all cleancache calls - are reduced to a single pointer-compare-against-NULL resulting - in a negligible performance hit. - - If unsure, say Y to enable cleancache \ No newline at end of file diff --git a/mm/Makefile b/mm/Makefile old mode 100755 new mode 100644 index 82a734fd..66f54865 --- a/mm/Makefile +++ b/mm/Makefile @@ -46,4 +46,3 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o -obj-$(CONFIG_CLEANCACHE) += cleancache.o diff --git a/mm/cleancache.c b/mm/cleancache.c deleted file mode 100755 index bcaae4c2..00000000 --- a/mm/cleancache.c +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Cleancache frontend - * - * This code provides the generic "frontend" layer to call a matching - * "backend" driver implementation of cleancache. See - * Documentation/vm/cleancache.txt for more information. - * - * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. - * Author: Dan Magenheimer - * - * This work is licensed under the terms of the GNU GPL, version 2. 
- */ - -#include -#include -#include -#include -#include - -/* - * This global enablement flag may be read thousands of times per second - * by cleancache_get/put/flush even on systems where cleancache_ops - * is not claimed (e.g. cleancache is config'ed on but remains - * disabled), so is preferred to the slower alternative: a function - * call that checks a non-global. - */ -int cleancache_enabled; -EXPORT_SYMBOL(cleancache_enabled); - -/* - * cleancache_ops is set by cleancache_ops_register to contain the pointers - * to the cleancache "backend" implementation functions. - */ -static struct cleancache_ops cleancache_ops; - -/* useful stats available in /sys/kernel/mm/cleancache */ -static unsigned long cleancache_succ_gets; -static unsigned long cleancache_failed_gets; -static unsigned long cleancache_puts; -static unsigned long cleancache_flushes; - -/* - * register operations for cleancache, returning previous thus allowing - * detection of multiple backends and possible nesting - */ -struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) -{ - struct cleancache_ops old = cleancache_ops; - - cleancache_ops = *ops; - cleancache_enabled = 1; - return old; -} -EXPORT_SYMBOL(cleancache_register_ops); - -/* Called by a cleancache-enabled filesystem at time of mount */ -void __cleancache_init_fs(struct super_block *sb) -{ - sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); -} -EXPORT_SYMBOL(__cleancache_init_fs); - -/* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) -{ - sb->cleancache_poolid = - (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); -} -EXPORT_SYMBOL(__cleancache_init_shared_fs); - -/* - * If the filesystem uses exportable filehandles, use the filehandle as - * the key, else use the inode number. 
- */ -static int cleancache_get_key(struct inode *inode, - struct cleancache_filekey *key) -{ - int (*fhfn)(struct dentry *, __u32 *fh, int *, int); - int len = 0, maxlen = CLEANCACHE_KEY_MAX; - struct super_block *sb = inode->i_sb; - - key->u.ino = inode->i_ino; - if (sb->s_export_op != NULL) { - fhfn = sb->s_export_op->encode_fh; - if (fhfn) { - struct dentry d; - d.d_inode = inode; - len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); - if (len <= 0 || len == 255) - return -1; - if (maxlen > CLEANCACHE_KEY_MAX) - return -1; - } - } - return 0; -} - -/* - * "Get" data from cleancache associated with the poolid/inode/index - * that were specified when the data was put to cleanache and, if - * successful, use it to fill the specified page with data and return 0. - * The pageframe is unchanged and returns -1 if the get fails. - * Page must be locked by caller. - */ -int __cleancache_get_page(struct page *page) -{ - int ret = -1; - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id < 0) - goto out; - - if (cleancache_get_key(page->mapping->host, &key) < 0) - goto out; - - ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); - if (ret == 0) - cleancache_succ_gets++; - else - cleancache_failed_gets++; -out: - return ret; -} -EXPORT_SYMBOL(__cleancache_get_page); - -/* - * "Put" data from a page to cleancache and associate it with the - * (previously-obtained per-filesystem) poolid and the page's, - * inode and page index. Page must be locked. Note that a put_page - * always "succeeds", though a subsequent get_page may succeed or fail. 
- */ -void __cleancache_put_page(struct page *page) -{ - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id >= 0 && - cleancache_get_key(page->mapping->host, &key) >= 0) { - (*cleancache_ops.put_page)(pool_id, key, page->index, page); - cleancache_puts++; - } -} -EXPORT_SYMBOL(__cleancache_put_page); - -/* - * Flush any data from cleancache associated with the poolid and the - * page's inode and page index so that a subsequent "get" will fail. - */ -void __cleancache_flush_page(struct address_space *mapping, struct page *page) -{ - /* careful... page->mapping is NULL sometimes when this is called */ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (pool_id >= 0) { - VM_BUG_ON(!PageLocked(page)); - if (cleancache_get_key(mapping->host, &key) >= 0) { - (*cleancache_ops.flush_page)(pool_id, key, page->index); - cleancache_flushes++; - } - } -} -EXPORT_SYMBOL(__cleancache_flush_page); - -/* - * Flush all data from cleancache associated with the poolid and the - * mappings's inode so that all subsequent gets to this poolid/inode - * will fail. 
- */ -void __cleancache_flush_inode(struct address_space *mapping) -{ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) - (*cleancache_ops.flush_inode)(pool_id, key); -} -EXPORT_SYMBOL(__cleancache_flush_inode); - -/* - * Called by any cleancache-enabled filesystem at time of unmount; - * note that pool_id is surrendered and may be reutrned by a subsequent - * cleancache_init_fs or cleancache_init_shared_fs - */ -void __cleancache_flush_fs(struct super_block *sb) -{ - if (sb->cleancache_poolid >= 0) { - int old_poolid = sb->cleancache_poolid; - sb->cleancache_poolid = -1; - (*cleancache_ops.flush_fs)(old_poolid); - } -} -EXPORT_SYMBOL(__cleancache_flush_fs); - -#ifdef CONFIG_SYSFS - -/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */ - -#define CLEANCACHE_SYSFS_RO(_name) \ - static ssize_t cleancache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", cleancache_##_name); \ - } \ - static struct kobj_attribute cleancache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = cleancache_##_name##_show, \ - } - -CLEANCACHE_SYSFS_RO(succ_gets); -CLEANCACHE_SYSFS_RO(failed_gets); -CLEANCACHE_SYSFS_RO(puts); -CLEANCACHE_SYSFS_RO(flushes); - -static struct attribute *cleancache_attrs[] = { - &cleancache_succ_gets_attr.attr, - &cleancache_failed_gets_attr.attr, - &cleancache_puts_attr.attr, - &cleancache_flushes_attr.attr, - NULL, -}; - -static struct attribute_group cleancache_attr_group = { - .attrs = cleancache_attrs, - .name = "cleancache", -}; - -#endif /* CONFIG_SYSFS */ - -static int __init init_cleancache(void) -{ -#ifdef CONFIG_SYSFS - int err; - - err = sysfs_create_group(mm_kobj, &cleancache_attr_group); -#endif /* CONFIG_SYSFS */ - return 0; -} -module_init(init_cleancache)