From 8bdae42d959e8aa197fcc9571d12d7adaa2d6610 Mon Sep 17 00:00:00 2001 From: securecrt Date: Wed, 1 Aug 2012 10:55:06 +0800 Subject: [PATCH 01/35] net: netfilter: enable bandwidth control to be able to set mobile data limit --- arch/arm/configs/htcleo_defconfig | 26 +++++++++++++++++++------- net/netfilter/xt_TPROXY.c | 2 ++ net/netfilter/xt_socket.c | 2 ++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/arch/arm/configs/htcleo_defconfig b/arch/arm/configs/htcleo_defconfig index d330e5ef..12f408f2 100644 --- a/arch/arm/configs/htcleo_defconfig +++ b/arch/arm/configs/htcleo_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.32-ics -# Fri Jun 1 01:10:03 CST 2012 +# Fri Aug 1 01:10:03 CST 2012 # CONFIG_ARM=y CONFIG_SYS_SUPPORTS_APM_EMULATION=y @@ -32,7 +32,7 @@ CONFIG_EXPERIMENTAL=y CONFIG_BROKEN_ON_SMP=y CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 -CONFIG_LOCALVERSION="_tytung_HWA_r2.5" +CONFIG_LOCALVERSION="_tytung_HWA_r3.3" # CONFIG_LOCALVERSION_AUTO is not set CONFIG_HAVE_KERNEL_GZIP=y CONFIG_HAVE_KERNEL_BZIP2=y @@ -604,11 +604,10 @@ CONFIG_NETFILTER_XT_CONNMARK=y # CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y CONFIG_NETFILTER_XT_TARGET_CONNMARK=y -# CONFIG_NETFILTER_XT_TARGET_CT is not set # CONFIG_NETFILTER_XT_TARGET_DSCP is not set # CONFIG_NETFILTER_XT_TARGET_HL is not set CONFIG_NETFILTER_XT_TARGET_MARK=y -# CONFIG_NETFILTER_XT_TARGET_NFLOG is not set +CONFIG_NETFILTER_XT_TARGET_NFLOG=y CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y # CONFIG_NETFILTER_XT_TARGET_NOTRACK is not set # CONFIG_NETFILTER_XT_TARGET_RATEEST is not set @@ -632,7 +631,7 @@ CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y # CONFIG_NETFILTER_XT_MATCH_ESP is not set CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y CONFIG_NETFILTER_XT_MATCH_HELPER=y -CONFIG_NETFILTER_XT_MATCH_HL=y +# CONFIG_NETFILTER_XT_MATCH_HL is not set CONFIG_NETFILTER_XT_MATCH_IPRANGE=y CONFIG_NETFILTER_XT_MATCH_LENGTH=y CONFIG_NETFILTER_XT_MATCH_LIMIT=y @@ -709,8 +708,21 @@ CONFIG_IP_NF_ARP_MANGLE=y CONFIG_NF_DEFRAG_IPV6=y CONFIG_NF_CONNTRACK_IPV6=y # CONFIG_IP6_NF_QUEUE is not set -# CONFIG_IP6_NF_IPTABLES is not set -# CONFIG_BRIDGE_NF_EBTABLES is not set +CONFIG_IP6_NF_IPTABLES=y +# CONFIG_IP6_NF_MATCH_AH is not set +# CONFIG_IP6_NF_MATCH_EUI64 is not set +# CONFIG_IP6_NF_MATCH_FRAG is not set +# CONFIG_IP6_NF_MATCH_OPTS is not set +# CONFIG_IP6_NF_MATCH_HL is not set +# CONFIG_IP6_NF_MATCH_IPV6HEADER is not set +# CONFIG_IP6_NF_MATCH_MH is not set +# CONFIG_IP6_NF_MATCH_RT is not set +# CONFIG_IP6_NF_TARGET_HL is not set +CONFIG_IP6_NF_TARGET_LOG=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y # CONFIG_IP_DCCP is not set # CONFIG_IP_SCTP is not set # CONFIG_RDS is not set diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 4fa12857..10640fdd 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -22,6 +22,7 @@ #include +/* #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) #define XT_TPROXY_HAVE_IPV6 1 #include @@ -29,6 +30,7 @@ #include #include #endif +*/ #include #include diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 9b38fd15..0f10dfc6 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -22,11 +22,13 @@ #include #include +/* #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) #define XT_SOCKET_HAVE_IPV6 1 #include #include #endif +*/ #include From e20f7d7077255a4a2b250d3d3b25c8101a48526b Mon Sep 17 00:00:00 2001 From: securecrt Date: Tue, 14 Aug 2012 13:45:36 +0800 Subject: [PATCH 02/35] base: genlock: Remove genlock_release_lock and associated ioctl base: genlock: allow synchronization with a single gralloc handle base: genlock: remove BIT macro usage --- drivers/base/genlock.c | 302 ++++++++++++++++++++++++++++++++-------- include/linux/genlock.h | 11 +- 2 files changed, 250 insertions(+), 63 deletions(-) mode change 100644 => 100755 drivers/base/genlock.c mode change 100644 => 100755 include/linux/genlock.h diff --git a/drivers/base/genlock.c b/drivers/base/genlock.c old mode 100644 new mode 100755 index afe8eb1c..b5f8e42e --- a/drivers/base/genlock.c +++ b/drivers/base/genlock.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2011, Code Aurora Forum. All rights reserved. +/* Copyright (c) 2011-2012, Code Aurora Forum. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -22,7 +22,7 @@ #include #include #include -#include /* for in_interrupt() */ +#include /* Lock states - can either be unlocked, held as an exclusive write lock or a * shared read lock @@ -32,6 +32,9 @@ #define _RDLOCK GENLOCK_RDLOCK #define _WRLOCK GENLOCK_WRLOCK +#define GENLOCK_LOG_ERR(fmt, args...) \ +pr_err("genlock: %s: " fmt, __func__, ##args) + struct genlock { struct list_head active; /* List of handles holding lock */ spinlock_t lock; /* Spinlock to protect the lock internals */ @@ -49,12 +52,29 @@ struct genlock_handle { taken */ }; +/* + * Create a spinlock to protect against a race condition when a lock gets + * released while another process tries to attach it + */ + +static DEFINE_SPINLOCK(genlock_file_lock); + static void genlock_destroy(struct kref *kref) { - struct genlock *lock = container_of(kref, struct genlock, - refcount); + struct genlock *lock = container_of(kref, struct genlock, + refcount); - kfree(lock); + /* + * Clear the private data for the file descriptor in case the fd is + * still active after the lock gets released + */ + + spin_lock(&genlock_file_lock); + if (lock->file) + lock->file->private_data = NULL; + spin_unlock(&genlock_file_lock); + + kfree(lock); } /* @@ -64,6 +84,15 @@ static void genlock_destroy(struct kref *kref) static int genlock_release(struct inode *inodep, struct file *file) { + struct genlock *lock = file->private_data; + /* + * Clear the refrence back to this file structure to avoid + * somehow reusing the lock after the file has been destroyed + */ + + if (lock) + lock->file = NULL; + return 0; } @@ -82,12 +111,21 @@ struct genlock *genlock_create_lock(struct genlock_handle *handle) { struct genlock *lock; - if (handle->lock != NULL) + if (IS_ERR_OR_NULL(handle)) { + GENLOCK_LOG_ERR("Invalid handle\n"); return ERR_PTR(-EINVAL); + } + + if (handle->lock != NULL) { + GENLOCK_LOG_ERR("Handle already has a lock attached\n"); + return ERR_PTR(-EINVAL); + } lock = kzalloc(sizeof(*lock), GFP_KERNEL); - if (lock == NULL) + if (lock == NULL) { + GENLOCK_LOG_ERR("Unable to allocate memory for a lock\n"); return ERR_PTR(-ENOMEM); + } INIT_LIST_HEAD(&lock->active); init_waitqueue_head(&lock->queue); @@ -120,8 +158,10 @@ static int genlock_get_fd(struct genlock *lock) { int ret; - if (!lock->file) + if (!lock->file) { + GENLOCK_LOG_ERR("No file attached to the lock\n"); return -EINVAL; + } ret = get_unused_fd_flags(0); if (ret < 0) @@ -143,19 +183,37 @@ struct genlock *genlock_attach_lock(struct genlock_handle *handle, int fd) struct file *file; struct genlock *lock; - if (handle->lock != NULL) + if (IS_ERR_OR_NULL(handle)) { + GENLOCK_LOG_ERR("Invalid handle\n"); return ERR_PTR(-EINVAL); + } + + if (handle->lock != NULL) { + GENLOCK_LOG_ERR("Handle already has a lock attached\n"); + return ERR_PTR(-EINVAL); + } file = fget(fd); - if (file == NULL) + if (file == NULL) { + GENLOCK_LOG_ERR("Bad file descriptor\n"); return ERR_PTR(-EBADF); + } + /* + * take a spinlock to avoid a race condition if the lock is + * released and then attached + */ + + spin_lock(&genlock_file_lock); lock = file->private_data; + spin_unlock(&genlock_file_lock); fput(file); - if (lock == NULL) + if (lock == NULL) { + GENLOCK_LOG_ERR("File descriptor is invalid\n"); return ERR_PTR(-EINVAL); + } handle->lock = lock; kref_get(&lock->refcount); @@ -199,13 +257,16 @@ static int _genlock_unlock(struct genlock *lock, struct genlock_handle *handle) spin_lock_irqsave(&lock->lock, irqflags); - if (lock->state == _UNLOCKED) + if (lock->state == _UNLOCKED) { + GENLOCK_LOG_ERR("Trying to unlock an unlocked handle\n"); goto done; + } /* Make sure this handle is an owner of the lock */ - if (!handle_has_lock(lock, handle)) + if (!handle_has_lock(lock, handle)) { + GENLOCK_LOG_ERR("handle does not have lock attached to it\n"); goto done; - + } /* If the handle holds no more references to the lock then release it (maybe) */ @@ -228,7 +289,7 @@ static int _genlock_lock(struct genlock *lock, struct genlock_handle *handle, { unsigned long irqflags; int ret = 0; - unsigned int ticks = msecs_to_jiffies(timeout); + unsigned long ticks = msecs_to_jiffies(timeout); spin_lock_irqsave(&lock->lock, irqflags); @@ -247,12 +308,15 @@ static int _genlock_lock(struct genlock *lock, struct genlock_handle *handle, if (handle_has_lock(lock, handle)) { /* - * If the handle already holds the lock and the type matches, - * then just increment the active pointer. This allows the - * handle to do recursive locks + * If the handle already holds the lock and the lock type is + * a read lock then just increment the active pointer. This + * allows the handle to do recursive read locks. Recursive + * write locks are not allowed in order to support + * synchronization within a process using a single gralloc + * handle. */ - if (lock->state == op) { + if (lock->state == _RDLOCK && op == _RDLOCK) { handle->active++; goto done; } @@ -261,32 +325,46 @@ static int _genlock_lock(struct genlock *lock, struct genlock_handle *handle, * If the handle holds a write lock then the owner can switch * to a read lock if they want. Do the transition atomically * then wake up any pending waiters in case they want a read - * lock too. + * lock too. In order to support synchronization within a + * process the caller must explicity request to convert the + * lock type with the GENLOCK_WRITE_TO_READ flag. */ - if (op == _RDLOCK && handle->active == 1) { - lock->state = _RDLOCK; - wake_up(&lock->queue); + if (flags & GENLOCK_WRITE_TO_READ) { + if (lock->state == _WRLOCK && op == _RDLOCK) { + lock->state = _RDLOCK; + wake_up(&lock->queue); + goto done; + } else { + GENLOCK_LOG_ERR("Invalid state to convert" + "write to read\n"); + ret = -EINVAL; + goto done; + } + } + } else { + + /* + * Check to ensure the caller has not attempted to convert a + * write to a read without holding the lock. + */ + + if (flags & GENLOCK_WRITE_TO_READ) { + GENLOCK_LOG_ERR("Handle must have lock to convert" + "write to read\n"); + ret = -EINVAL; goto done; } /* - * Otherwise the user tried to turn a read into a write, and we - * don't allow that. + * If we request a read and the lock is held by a read, then go + * ahead and share the lock */ - ret = -EINVAL; - goto done; + if (op == GENLOCK_RDLOCK && lock->state == _RDLOCK) + goto dolock; } - /* - * If we request a read and the lock is held by a read, then go - * ahead and share the lock - */ - - if (op == GENLOCK_RDLOCK && lock->state == _RDLOCK) - goto dolock; - /* Treat timeout 0 just like a NOBLOCK flag and return if the lock cannot be aquired without blocking */ @@ -295,15 +373,26 @@ static int _genlock_lock(struct genlock *lock, struct genlock_handle *handle, goto done; } - /* Wait while the lock remains in an incompatible state */ + /* + * Wait while the lock remains in an incompatible state + * state op wait + * ------------------- + * unlocked n/a no + * read read no + * read write yes + * write n/a yes + */ - while (lock->state != _UNLOCKED) { - unsigned int elapsed; + while ((lock->state == _RDLOCK && op == _WRLOCK) || + lock->state == _WRLOCK) { + signed long elapsed; spin_unlock_irqrestore(&lock->lock, irqflags); elapsed = wait_event_interruptible_timeout(lock->queue, - lock->state == _UNLOCKED, ticks); + lock->state == _UNLOCKED || + (lock->state == _RDLOCK && op == _RDLOCK), + ticks); spin_lock_irqsave(&lock->lock, irqflags); @@ -312,7 +401,7 @@ static int _genlock_lock(struct genlock *lock, struct genlock_handle *handle, goto done; } - ticks = elapsed; + ticks = (unsigned long) elapsed; } dolock: @@ -320,7 +409,7 @@ dolock: list_add_tail(&handle->entry, &lock->active); lock->state = op; - handle->active = 1; + handle->active++; done: spin_unlock_irqrestore(&lock->lock, irqflags); @@ -329,7 +418,7 @@ done: } /** - * genlock_lock - Acquire or release a lock + * genlock_lock - Acquire or release a lock (depreciated) * @handle - pointer to the genlock handle that is requesting the lock * @op - the operation to perform (RDLOCK, WRLOCK, UNLOCK) * @flags - flags to control the operation @@ -341,11 +430,76 @@ done: int genlock_lock(struct genlock_handle *handle, int op, int flags, uint32_t timeout) { - struct genlock *lock = handle->lock; + struct genlock *lock; + unsigned long irqflags; + int ret = 0; - if (lock == NULL) + if (IS_ERR_OR_NULL(handle)) { + GENLOCK_LOG_ERR("Invalid handle\n"); return -EINVAL; + } + + lock = handle->lock; + + if (lock == NULL) { + GENLOCK_LOG_ERR("Handle does not have a lock attached\n"); + return -EINVAL; + } + + switch (op) { + case GENLOCK_UNLOCK: + ret = _genlock_unlock(lock, handle); + break; + case GENLOCK_RDLOCK: + spin_lock_irqsave(&lock->lock, irqflags); + if (handle_has_lock(lock, handle)) { + /* request the WRITE_TO_READ flag for compatibility */ + flags |= GENLOCK_WRITE_TO_READ; + } + spin_unlock_irqrestore(&lock->lock, irqflags); + /* fall through to take lock */ + case GENLOCK_WRLOCK: + ret = _genlock_lock(lock, handle, op, flags, timeout); + break; + default: + GENLOCK_LOG_ERR("Invalid lock operation\n"); + ret = -EINVAL; + break; + } + + return ret; +} +EXPORT_SYMBOL(genlock_lock); + +/** + * genlock_dreadlock - Acquire or release a lock + * @handle - pointer to the genlock handle that is requesting the lock + * @op - the operation to perform (RDLOCK, WRLOCK, UNLOCK) + * @flags - flags to control the operation + * @timeout - optional timeout to wait for the lock to come free + * + * Returns: 0 on success or error code on failure + */ + +int genlock_dreadlock(struct genlock_handle *handle, int op, int flags, + uint32_t timeout) +{ + struct genlock *lock; + + int ret = 0; + + if (IS_ERR_OR_NULL(handle)) { + GENLOCK_LOG_ERR("Invalid handle\n"); + return -EINVAL; + } + + lock = handle->lock; + + if (lock == NULL) { + GENLOCK_LOG_ERR("Handle does not have a lock attached\n"); + return -EINVAL; + } switch (op) { case GENLOCK_UNLOCK: @@ -356,13 +510,14 @@ int genlock_lock(struct genlock_handle *handle, int op, int flags, ret = _genlock_lock(lock, handle, op, flags, timeout); break; default: + GENLOCK_LOG_ERR("Invalid lock operation\n"); ret = -EINVAL; break; } return ret; } -EXPORT_SYMBOL(genlock_lock); +EXPORT_SYMBOL(genlock_dreadlock); /** * genlock_wait - Wait for the lock to be released @@ -372,13 +527,22 @@ EXPORT_SYMBOL(genlock_lock); int genlock_wait(struct genlock_handle *handle, uint32_t timeout) { - struct genlock *lock = handle->lock; + struct genlock *lock; unsigned long irqflags; int ret = 0; - unsigned int ticks = msecs_to_jiffies(timeout); + unsigned long ticks = msecs_to_jiffies(timeout); - if (lock == NULL) + if (IS_ERR_OR_NULL(handle)) { + GENLOCK_LOG_ERR("Invalid handle\n"); return -EINVAL; + } + + lock = handle->lock; + + if (lock == NULL) { + GENLOCK_LOG_ERR("Handle does not have a lock attached\n"); + return -EINVAL; + } spin_lock_irqsave(&lock->lock, irqflags); @@ -393,7 +557,7 @@ int genlock_wait(struct genlock_handle *handle, uint32_t timeout) } while (lock->state != _UNLOCKED) { - unsigned int elapsed; + signed long elapsed; spin_unlock_irqrestore(&lock->lock, irqflags); @@ -407,7 +571,7 @@ int genlock_wait(struct genlock_handle *handle, uint32_t timeout) break; } - ticks = elapsed; + ticks = (unsigned long) elapsed; } done: @@ -415,12 +579,7 @@ done: return ret; } -/** - * genlock_release_lock - Release a lock attached to a handle - * @handle - Pointer to the handle holding the lock - */ - -void genlock_release_lock(struct genlock_handle *handle) +static void genlock_release_lock(struct genlock_handle *handle) { unsigned long flags; @@ -441,7 +600,6 @@ void genlock_release_lock(struct genlock_handle *handle) handle->lock = NULL; handle->active = 0; } -EXPORT_SYMBOL(genlock_release_lock); /* * Release function called when all references to a handle are released @@ -468,8 +626,10 @@ static const struct file_operations genlock_handle_fops = { static struct genlock_handle *_genlock_get_handle(void) { struct genlock_handle *handle = kzalloc(sizeof(*handle), GFP_KERNEL); - if (handle == NULL) + if (handle == NULL) { + GENLOCK_LOG_ERR("Unable to allocate memory for the handle\n"); return ERR_PTR(-ENOMEM); + } return handle; } @@ -531,6 +691,9 @@ static long genlock_dev_ioctl(struct file *filep, unsigned int cmd, struct genlock *lock; int ret; + if (IS_ERR_OR_NULL(handle)) + return -EINVAL; + switch (cmd) { case GENLOCK_IOC_NEW: { lock = genlock_create_lock(handle); @@ -540,8 +703,11 @@ static long genlock_dev_ioctl(struct file *filep, unsigned int cmd, return 0; } case GENLOCK_IOC_EXPORT: { - if (handle->lock == NULL) + if (handle->lock == NULL) { + GENLOCK_LOG_ERR("Handle does not have a lock" + "attached\n"); return -EINVAL; + } ret = genlock_get_fd(handle->lock); if (ret < 0) @@ -574,6 +740,14 @@ static long genlock_dev_ioctl(struct file *filep, unsigned int cmd, return genlock_lock(handle, param.op, param.flags, param.timeout); } + case GENLOCK_IOC_DREADLOCK: { + if (copy_from_user(¶m, (void __user *) arg, + sizeof(param))) + return -EFAULT; + + return genlock_dreadlock(handle, param.op, param.flags, + param.timeout); + } case GENLOCK_IOC_WAIT: { if (copy_from_user(¶m, (void __user *) arg, sizeof(param))) @@ -582,10 +756,16 @@ static long genlock_dev_ioctl(struct file *filep, unsigned int cmd, return genlock_wait(handle, param.timeout); } case GENLOCK_IOC_RELEASE: { - genlock_release_lock(handle); - return 0; + /* + * Return error - this ioctl has been deprecated. + * Locks should only be released when the handle is + * destroyed + */ + GENLOCK_LOG_ERR("Deprecated RELEASE ioctl called\n"); + return -EINVAL; } default: + GENLOCK_LOG_ERR("Invalid ioctl\n"); return -EINVAL; } } diff --git a/include/linux/genlock.h b/include/linux/genlock.h old mode 100644 new mode 100755 index 2e9f9d68..587c49df --- a/include/linux/genlock.h +++ b/include/linux/genlock.h @@ -12,7 +12,7 @@ void genlock_put_handle(struct genlock_handle *handle); struct genlock *genlock_create_lock(struct genlock_handle *); struct genlock *genlock_attach_lock(struct genlock_handle *, int fd); int genlock_wait(struct genlock_handle *handle, u32 timeout); -void genlock_release_lock(struct genlock_handle *); +/* genlock_release_lock was deprecated */ int genlock_lock(struct genlock_handle *handle, int op, int flags, u32 timeout); #endif @@ -21,7 +21,8 @@ int genlock_lock(struct genlock_handle *handle, int op, int flags, #define GENLOCK_WRLOCK 1 #define GENLOCK_RDLOCK 2 -#define GENLOCK_NOBLOCK (1 << 0) +#define GENLOCK_NOBLOCK (1 << 0) +#define GENLOCK_WRITE_TO_READ (1 << 1) struct genlock_lock { int fd; @@ -37,9 +38,15 @@ struct genlock_lock { struct genlock_lock) #define GENLOCK_IOC_ATTACH _IOW(GENLOCK_IOC_MAGIC, 2, \ struct genlock_lock) + +/* Deprecated */ #define GENLOCK_IOC_LOCK _IOW(GENLOCK_IOC_MAGIC, 3, \ struct genlock_lock) + +/* Deprecated */ #define GENLOCK_IOC_RELEASE _IO(GENLOCK_IOC_MAGIC, 4) #define GENLOCK_IOC_WAIT _IOW(GENLOCK_IOC_MAGIC, 5, \ struct genlock_lock) +#define GENLOCK_IOC_DREADLOCK _IOW(GENLOCK_IOC_MAGIC, 6, \ + struct genlock_lock) #endif From 9b8229cafbe29f53a31c5c646f3839259a5dfd17 Mon Sep 17 00:00:00 2001 From: securecrt Date: Tue, 14 Aug 2012 16:48:25 +0800 Subject: [PATCH 03/35] Staging: android: binder: Don't call dump_stack in binder_vma_open Staging: android: binder: Fix crashes when sharing a binder file between processes drivers:staging:android Typos: fix some comments that have typos in them. fs: Remove missed ->fds_bits from cessation use of fd_set structs internally Staging:android: Change type for binder_debug_no_lock switch to bool Staging: android: binder: Fix use-after-free bug --- drivers/staging/android/binder.c | 270 ++++++++++++++----------------- 1 file changed, 120 insertions(+), 150 deletions(-) mode change 100644 => 100755 drivers/staging/android/binder.c diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c old mode 100644 new mode 100755 index a0763da9..c44eb407 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -3,7 +3,6 @@ * Android IPC Subsystem * * Copyright (C) 2007-2008 Google, Inc. - * Copyright (c) 2012, Code Aurora Forum. All rights reserved. * * This software is licensed under the terms of the GNU General Public * License version 2, as published by the Free Software Foundation, and @@ -31,14 +30,15 @@ #include #include #include -#include #include #include +#include #include "binder.h" static DEFINE_MUTEX(binder_lock); static DEFINE_MUTEX(binder_deferred_lock); +static DEFINE_MUTEX(binder_mmap_lock); static HLIST_HEAD(binder_procs); static HLIST_HEAD(binder_deferred_list); @@ -98,12 +98,12 @@ enum { BINDER_DEBUG_BUFFER_ALLOC = 1U << 13, BINDER_DEBUG_PRIORITY_CAP = 1U << 14, BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 15, - BINDER_DEBUG_TOP_ERRORS = 1U << 16, + BINDER_DEBUG_TOP_ERRORS = 1U << 16, }; static uint32_t binder_debug_mask; module_param_named(debug_mask, binder_debug_mask, uint, S_IWUSR | S_IRUGO); -static int binder_debug_no_lock; +static bool binder_debug_no_lock; module_param_named(proc_no_lock, binder_debug_no_lock, bool, S_IWUSR | S_IRUGO); static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait); @@ -258,7 +258,7 @@ struct binder_ref { }; struct binder_buffer { - struct list_head entry; /* free and allocated entries by addesss */ + struct list_head entry; /* free and allocated entries by address */ struct rb_node rb_node; /* free entry by size or allocated entry */ /* by address */ unsigned free:1; @@ -288,6 +288,7 @@ struct binder_proc { struct rb_root refs_by_node; int pid; struct vm_area_struct *vma; + struct mm_struct *vma_vm_mm; struct task_struct *tsk; struct files_struct *files; struct hlist_node deferred_work_node; @@ -380,8 +381,7 @@ int task_get_unused_fd_flags(struct binder_proc *proc, int flags) repeat: fdt = files_fdtable(files); - fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds, - files->next_fd); + fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, files->next_fd); /* * N.B. For clone tasks sharing a files structure, this test @@ -633,6 +633,11 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, if (mm) { down_write(&mm->mmap_sem); vma = proc->vma; + if (vma && mm != proc->vma_vm_mm) { + pr_err("binder: %d: vma mm and task mm mismatch\n", + proc->pid); + vma = NULL; + } } if (allocate == 0) @@ -640,8 +645,8 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, if (vma == NULL) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: %d: binder_alloc_buf failed to " - "map pages in userspace, no vma\n", proc->pid); + "binder: %d: binder_alloc_buf failed to " + "map pages in userspace, no vma\n", proc->pid); goto err_no_vma; } @@ -654,8 +659,8 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, *page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (*page == NULL) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: %d: binder_alloc_buf failed " - "for page at %p\n", proc->pid, page_addr); + "binder: %d: binder_alloc_buf failed " + "for page at %p\n", proc->pid, page_addr); goto err_alloc_page_failed; } tmp_area.addr = page_addr; @@ -664,9 +669,9 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, ret = map_vm_area(&tmp_area, PAGE_KERNEL, &page_array_ptr); if (ret) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: %d: binder_alloc_buf failed " - "to map page at %p in kernel\n", - proc->pid, page_addr); + "binder: %d: binder_alloc_buf failed " + "to map page at %p in kernel\n", + proc->pid, page_addr); goto err_map_kernel_failed; } user_page_addr = @@ -674,9 +679,9 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, ret = vm_insert_page(vma, user_page_addr, page[0]); if (ret) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: %d: binder_alloc_buf failed " - "to map page at %lx in userspace\n", - proc->pid, user_page_addr); + "binder: %d: binder_alloc_buf failed " + "to map page at %lx in userspace\n", + proc->pid, user_page_addr); goto err_vm_insert_page_failed; } /* vm_insert_page does not seem to increment the refcount */ @@ -724,8 +729,8 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, if (proc->vma == NULL) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: %d: binder_alloc_buf, no vma\n", - proc->pid); + "binder: %d: binder_alloc_buf, no vma\n", + proc->pid); return NULL; } @@ -763,8 +768,8 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, } if (best_fit == NULL) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: %d: binder_alloc_buf size %zd failed, " - "no address space\n", proc->pid, size); + "binder: %d: binder_alloc_buf size %zd failed, " + "no address space\n", proc->pid, size); return NULL; } if (n == NULL) { @@ -999,8 +1004,8 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal, !(node == binder_context_mgr_node && node->has_strong_ref)) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: invalid inc strong " - "node for %d\n", node->debug_id); + "binder: invalid inc strong " + "node for %d\n", node->debug_id); return -EINVAL; } node->internal_strong_refs++; @@ -1016,8 +1021,8 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal, if (!node->has_weak_ref && list_empty(&node->work.entry)) { if (target_list == NULL) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: invalid inc weak node " - "for %d\n", node->debug_id); + "binder: invalid inc weak node " + "for %d\n", node->debug_id); return -EINVAL; } list_add_tail(&node->work.entry, target_list); @@ -1053,7 +1058,7 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) if (node->proc) { rb_erase(&node->rb_node, &node->proc->nodes); binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "binder: refless node %d deleted\n", + "binder: refless node %d deleted\n", node->debug_id); } else { hlist_del(&node->dead_node); @@ -1272,8 +1277,7 @@ static void binder_send_failed_reply(struct binder_transaction *t, binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "binder: send failed reply for " "transaction %d to %d:%d\n", - t->debug_id, - target_thread->proc->pid, + t->debug_id, target_thread->proc->pid, target_thread->pid); binder_pop_transaction(target_thread, t); @@ -1281,11 +1285,12 @@ static void binder_send_failed_reply(struct binder_transaction *t, wake_up_interruptible(&target_thread->wait); } else { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: reply failed, target " - "thread, %d:%d, has error code %d " - "already\n", target_thread->proc->pid, - target_thread->pid, - target_thread->return_error); + "binder: reply failed, target " + "thread, %d:%d, has error code %d " + "already\n", + target_thread->proc->pid, + target_thread->pid, + target_thread->return_error); } return; } else { @@ -1319,15 +1324,14 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, int debug_id = buffer->debug_id; binder_debug(BINDER_DEBUG_TRANSACTION, - "binder: %d buffer release %d, size %zd-%zd, failed at" - " %p\n", proc->pid, buffer->debug_id, + "binder: %d buffer release %d, size %zd-%zd, failed at %p\n", + proc->pid, buffer->debug_id, buffer->data_size, buffer->offsets_size, failed_at); if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); - offp = (size_t *)(buffer->data + ALIGN(buffer->data_size, - sizeof(void *))); + offp = (size_t *)(buffer->data + ALIGN(buffer->data_size, sizeof(void *))); if (failed_at) off_end = failed_at; else @@ -1338,44 +1342,41 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, buffer->data_size < sizeof(*fp) || !IS_ALIGNED(*offp, sizeof(void *))) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: transaction release %d bad" - "offset %zd, size %zd\n", debug_id, - *offp, buffer->data_size); + "binder: transaction release %d bad" + "offset %zd, size %zd\n", debug_id, + *offp, buffer->data_size); continue; } fp = (struct flat_binder_object *)(buffer->data + *offp); switch (fp->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { - struct binder_node *node = binder_get_node(proc, - fp->binder); + struct binder_node *node = binder_get_node(proc, fp->binder); if (node == NULL) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: transaction release %d" - " bad node %p\n", debug_id, fp->binder); + "binder: transaction release %d" + " bad node %p\n", debug_id, + fp->binder); break; } binder_debug(BINDER_DEBUG_TRANSACTION, " node %d u%p\n", node->debug_id, node->ptr); - binder_dec_node(node, fp->type == BINDER_TYPE_BINDER, - 0); + binder_dec_node(node, fp->type == BINDER_TYPE_BINDER, 0); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, - fp->handle); + struct binder_ref *ref = binder_get_ref(proc, fp->handle); if (ref == NULL) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: transaction release %d" - " bad handle %ld\n", debug_id, - fp->handle); + "binder: transaction release %d" + " bad handle %ld\n", debug_id, + fp->handle); break; } binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d (node %d)\n", - ref->debug_id, ref->desc, - ref->node->debug_id); + ref->debug_id, ref->desc, ref->node->debug_id); binder_dec_ref(ref, fp->type == BINDER_TYPE_HANDLE); } break; @@ -1388,8 +1389,8 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, default: binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: transaction release %d bad " - "object type %lx\n", debug_id, fp->type); + "binder: transaction release %d bad " + "object type %lx\n", debug_id, fp->type); break; } } @@ -1614,19 +1615,15 @@ static void binder_transaction(struct binder_proc *proc, case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { struct binder_ref *ref; - struct binder_node *node = binder_get_node(proc, - fp->binder); + struct binder_node *node = binder_get_node(proc, fp->binder); if (node == NULL) { - node = binder_new_node(proc, fp->binder, - fp->cookie); + node = binder_new_node(proc, fp->binder, fp->cookie); if (node == NULL) { return_error = BR_FAILED_REPLY; goto err_binder_new_node_failed; } - node->min_priority = fp->flags & - FLAT_BINDER_FLAG_PRIORITY_MASK; - node->accept_fds = !!(fp->flags & - FLAT_BINDER_FLAG_ACCEPTS_FDS); + node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK; + node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); } if (fp->cookie != node->cookie) { binder_user_error("binder: %d:%d sending u%p " @@ -1656,8 +1653,7 @@ static void binder_transaction(struct binder_proc *proc, } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, - fp->handle); + struct binder_ref *ref = binder_get_ref(proc, fp->handle); if (ref == NULL) { binder_user_error("binder: %d:%d got " "transaction with invalid " @@ -1673,31 +1669,24 @@ static void binder_transaction(struct binder_proc *proc, fp->type = BINDER_TYPE_WEAK_BINDER; fp->binder = ref->node->ptr; fp->cookie = ref->node->cookie; - binder_inc_node(ref->node, fp->type == - BINDER_TYPE_BINDER, 0, NULL); + binder_inc_node(ref->node, fp->type == BINDER_TYPE_BINDER, 0, NULL); binder_debug(BINDER_DEBUG_TRANSACTION, - " ref %d desc %d -> node %d u%p\n", - ref->debug_id, ref->desc, - ref->node->debug_id, - ref->node->ptr); + " ref %d desc %d -> node %d u%p\n", + ref->debug_id, ref->desc, ref->node->debug_id, + ref->node->ptr); } else { struct binder_ref *new_ref; - new_ref = binder_get_ref_for_node(target_proc, - ref->node); + new_ref = binder_get_ref_for_node(target_proc, ref->node); if (new_ref == NULL) { return_error = BR_FAILED_REPLY; goto err_binder_get_ref_for_node_failed; } fp->handle = new_ref->desc; - binder_inc_ref(new_ref, fp->type == - BINDER_TYPE_HANDLE, NULL); + binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL); binder_debug(BINDER_DEBUG_TRANSACTION, - " ref %d desc %d -> ref %d" - " desc %d (node %d)\n", - ref->debug_id, ref->desc, - new_ref->debug_id, - new_ref->desc, - ref->node->debug_id); + " ref %d desc %d -> ref %d desc %d (node %d)\n", + ref->debug_id, ref->desc, new_ref->debug_id, + new_ref->desc, ref->node->debug_id); } } break; @@ -1707,19 +1696,13 @@ static void binder_transaction(struct binder_proc *proc, if (reply) { if (!(in_reply_to->flags & TF_ACCEPT_FDS)) { - binder_user_error("binder: %d:%d got" - " reply with fd, %ld, but" - " target does not allow fds\n", - proc->pid, thread->pid, - fp->handle); + binder_user_error("binder: %d:%d got reply with fd, %ld, but target does not allow fds\n", + proc->pid, thread->pid, fp->handle); return_error = BR_FAILED_REPLY; goto err_fd_not_allowed; } } else if (!target_node->accept_fds) { - binder_user_error( - "binder: %d:%d got transaction" - " with fd, %ld, but target does" - " not allow fds\n", + binder_user_error("binder: %d:%d got transaction with fd, %ld, but target does not allow fds\n", proc->pid, thread->pid, fp->handle); return_error = BR_FAILED_REPLY; goto err_fd_not_allowed; @@ -1727,15 +1710,12 @@ static void binder_transaction(struct binder_proc *proc, file = fget(fp->handle); if (file == NULL) { - binder_user_error( - "binder: %d:%d got transaction" - " with invalid fd, %ld\n", + binder_user_error("binder: %d:%d got transaction with invalid fd, %ld\n", proc->pid, thread->pid, fp->handle); return_error = BR_FAILED_REPLY; goto err_fget_failed; } - target_fd = task_get_unused_fd_flags(target_proc, - O_CLOEXEC); + target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC); if (target_fd < 0) { fput(file); return_error = BR_FAILED_REPLY; @@ -1743,8 +1723,7 @@ static void binder_transaction(struct binder_proc *proc, } task_fd_install(target_proc, target_fd, file); binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %ld -> %d\n", fp->handle, - target_fd); + " fd %ld -> %d\n", fp->handle, target_fd); /* TODO: fput? */ fp->handle = target_fd; } break; @@ -1893,11 +1872,9 @@ int binder_thread_write(struct binder_proc *proc, struct binder_thread *thread, break; } binder_debug(BINDER_DEBUG_USER_REFS, - "binder: %d:%d %s ref %d desc %d s %d w %d" - " for node %d\n", proc->pid, thread->pid, - debug_string, ref->debug_id, ref->desc, - ref->strong, ref->weak, - ref->node->debug_id); + "binder: %d:%d %s ref %d desc %d s %d w %d for node %d\n", + proc->pid, thread->pid, debug_string, ref->debug_id, + ref->desc, ref->strong, ref->weak, ref->node->debug_id); break; } case BC_INCREFS_DONE: @@ -1958,19 +1935,17 @@ int binder_thread_write(struct binder_proc *proc, struct binder_thread *thread, binder_debug(BINDER_DEBUG_USER_REFS, "binder: %d:%d %s node %d ls %d lw %d\n", proc->pid, thread->pid, - cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" - : "BC_ACQUIRE_DONE", - node->debug_id, node->local_strong_refs, - node->local_weak_refs); + cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", + node->debug_id, node->local_strong_refs, node->local_weak_refs); break; } case BC_ATTEMPT_ACQUIRE: binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: BC_ATTEMPT_ACQUIRE not supported\n"); + "binder: BC_ATTEMPT_ACQUIRE not supported\n"); return -EINVAL; case BC_ACQUIRE_RESULT: - binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: BC_ACQUIRE_RESULT not supported\n"); + binder_debug(BINDER_DEBUG_TOP_ERRORS, + "binder: BC_ACQUIRE_RESULT not supported\n"); return -EINVAL; case BC_FREE_BUFFER: { @@ -1996,11 +1971,9 @@ int binder_thread_write(struct binder_proc *proc, struct binder_thread *thread, break; } binder_debug(BINDER_DEBUG_FREE_BUFFER, - "binder: %d:%d BC_FREE_BUFFER u%p found" - " buffer %d for %s transaction\n", - proc->pid, thread->pid, data_ptr, - buffer->debug_id, buffer->transaction ? - "active" : "finished"); + "binder: %d:%d BC_FREE_BUFFER u%p found buffer %d for %s transaction\n", + proc->pid, thread->pid, data_ptr, buffer->debug_id, + buffer->transaction ? "active" : "finished"); if (buffer->transaction) { buffer->transaction->buffer = NULL; @@ -2097,15 +2070,13 @@ int binder_thread_write(struct binder_proc *proc, struct binder_thread *thread, } binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION, - "binder: %d:%d %s %p ref %d desc %d s %d" - " w %d for node %d\n", + "binder: %d:%d %s %p ref %d desc %d s %d w %d for node %d\n", proc->pid, thread->pid, cmd == BC_REQUEST_DEATH_NOTIFICATION ? "BC_REQUEST_DEATH_NOTIFICATION" : "BC_CLEAR_DEATH_NOTIFICATION", cookie, ref->debug_id, ref->desc, - ref->strong, ref->weak, - ref->node->debug_id); + ref->strong, ref->weak, ref->node->debug_id); if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { if (ref->death) { @@ -2119,12 +2090,10 @@ int binder_thread_write(struct binder_proc *proc, struct binder_thread *thread, death = kzalloc(sizeof(*death), GFP_KERNEL); if (death == NULL) { thread->return_error = BR_ERROR; - binder_debug( - BINDER_DEBUG_FAILED_TRANSACTION, - "binder: %d:%d " - "BC_REQUEST_DEATH_NOTIFICATION" - " failed\n", - proc->pid, thread->pid); + binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, + "binder: %d:%d " + "BC_REQUEST_DEATH_NOTIFICATION failed\n", + proc->pid, thread->pid); break; } binder_stats_created(BINDER_STAT_DEATH); @@ -2214,8 +2183,8 @@ int binder_thread_write(struct binder_proc *proc, struct binder_thread *thread, default: binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: %d:%d unknown command %d\n", - proc->pid, thread->pid, cmd); + "binder: %d:%d unknown command %d\n", + proc->pid, thread->pid, cmd); return -EINVAL; } *consumed = ptr - buffer; @@ -2684,11 +2653,9 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) unsigned int size = _IOC_SIZE(cmd); void __user *ubuf = (void __user *)arg; - /*binder_debug(BINDER_DEBUG_TOP_ERRORS, "binder_ioctl: %d:%d %x %lx\n", - proc->pid, current->pid, cmd, arg);*/ + /*printk(KERN_INFO "binder_ioctl: %d:%d %x %lx\n", proc->pid, current->pid, cmd, arg);*/ - ret = wait_event_interruptible(binder_user_error_wait, - binder_stop_on_user_error < 2); + ret = wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret) return ret; @@ -2745,8 +2712,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) break; } case BINDER_SET_MAX_THREADS: - if (copy_from_user(&proc->max_threads, ubuf, - sizeof(proc->max_threads))) { + if (copy_from_user(&proc->max_threads, ubuf, sizeof(proc->max_threads))) { ret = -EINVAL; goto err; } @@ -2754,17 +2720,17 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case BINDER_SET_CONTEXT_MGR: if (binder_context_mgr_node != NULL) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: BINDER_SET_CONTEXT_MGR already set\n"); + "binder: BINDER_SET_CONTEXT_MGR already set\n"); ret = -EBUSY; goto err; } if (binder_context_mgr_uid != -1) { if (binder_context_mgr_uid != current->cred->euid) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: BINDER_SET_" - "CONTEXT_MGR bad uid %d != %d\n", - current->cred->euid, - binder_context_mgr_uid); + "binder: BINDER_SET_" + "CONTEXT_MGR bad uid %d != %d\n", + current->cred->euid, + binder_context_mgr_uid); ret = -EPERM; goto err; } @@ -2808,8 +2774,8 @@ err: wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret && ret != -ERESTARTSYS) binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: %d:%d ioctl %x %lx returned %d\n", - proc->pid, current->pid, cmd, arg, ret); + "binder: %d:%d ioctl %x %lx returned %d\n", + proc->pid, current->pid, cmd, arg, ret); return ret; } @@ -2821,7 +2787,6 @@ static void binder_vma_open(struct vm_area_struct *vma) proc->pid, vma->vm_start, vma->vm_end, (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); - dump_stack(); } static void binder_vma_close(struct vm_area_struct *vma) @@ -2833,6 +2798,7 @@ static void binder_vma_close(struct vm_area_struct *vma) (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); proc->vma = NULL; + proc->vma_vm_mm = NULL; binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); } @@ -2865,6 +2831,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) } vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; + mutex_lock(&binder_mmap_lock); if (proc->buffer) { ret = -EBUSY; failure_string = "already mapped"; @@ -2879,13 +2846,13 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) } proc->buffer = area->addr; proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer; + mutex_unlock(&binder_mmap_lock); #ifdef CONFIG_CPU_CACHE_VIPT if (cache_is_vipt_aliasing()) { while (CACHE_COLOUR((vma->vm_start ^ (uint32_t)proc->buffer))) { binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder_mmap: %d %lx-%lx maps %p bad alignment\n", - proc->pid, vma->vm_start, vma->vm_end, proc->buffer); + "binder_mmap: %d %lx-%lx maps %p bad alignment\n", proc->pid, vma->vm_start, vma->vm_end, proc->buffer); vma->vm_start += PAGE_SIZE; } } @@ -2913,11 +2880,11 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) binder_insert_free_buffer(proc, buffer); proc->free_async_space = proc->buffer_size / 2; barrier(); - proc->files = get_files_struct(current); + proc->files = get_files_struct(proc->tsk); proc->vma = vma; + proc->vma_vm_mm = vma->vm_mm; - /*binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder_mmap: %d %lx-%lx maps %p\n", + /*printk(KERN_INFO "binder_mmap: %d %lx-%lx maps %p\n", proc->pid, vma->vm_start, vma->vm_end, proc->buffer);*/ return 0; @@ -2925,14 +2892,17 @@ err_alloc_small_buf_failed: kfree(proc->pages); proc->pages = NULL; err_alloc_pages_failed: + mutex_lock(&binder_mmap_lock); vfree(proc->buffer); proc->buffer = NULL; err_get_vm_area_failed: err_already_mapped: + mutex_unlock(&binder_mmap_lock); err_bad_arg: binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder_mmap: %d %lx-%lx %s failed %d\n", - proc->pid, vma->vm_start, vma->vm_end, failure_string, ret); + "binder_mmap: %d %lx-%lx %s failed %d\n", + proc->pid, vma->vm_start, vma->vm_end, failure_string, + ret); return ret; } @@ -3087,9 +3057,9 @@ static void binder_deferred_release(struct binder_proc *proc) t->buffer = NULL; buffer->transaction = NULL; binder_debug(BINDER_DEBUG_TOP_ERRORS, - "binder: release proc %d, " - "transaction %d, not freed\n", - proc->pid, t->debug_id); + "binder: release proc %d, " + "transaction %d, not freed\n", + proc->pid, t->debug_id); /*BUG();*/ } binder_free_buf(proc, buffer); From d6b41b0def6ca7bf09bc79255bbc534f80d270be Mon Sep 17 00:00:00 2001 From: securecrt Date: Wed, 15 Aug 2012 13:37:30 +0800 Subject: [PATCH 04/35] staging: android: lowmemorykiller: sysfs node and notifications android, lowmemorykiller: remove task handoff notifier staging: android: lowmemorykiller: Fix task_struct leak staging: android/lowmemorykiller: Don't unregister notifier from atomic context staging: android, lowmemorykiller: convert to use oom_score_adj staging: android/lowmemorykiller: Do not kill kernel threads staging: android/lowmemorykiller: No need for task->signal check staging: android/lowmemorykiller: Better mm handling staging: android/lowmemorykiller: Don't grab tasklist_lock staging: android: lowmemorykiller: Don't wait more than one second for a process to die Staging: android: fixed 80 characters warnings in lowmemorykiller.c staging: android: lowmemorykiller: Ignore shmem pages in page-cache staging: android: lowmemorykiller: Remove bitrotted codepath staging: android: lowmemkiller: Substantially reduce overhead during reclaim staging: android: lowmemorykiller: Don't try to kill the same pid over and over --- drivers/staging/android/lowmemorykiller.c | 201 ++++++++++++++++++---- 1 file changed, 164 insertions(+), 37 deletions(-) mode change 100644 => 100755 drivers/staging/android/lowmemorykiller.c diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c old mode 100644 new mode 100755 index 42cd93ea..05ebece0 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -29,12 +29,17 @@ * */ -#include #include +#include +#include +#include #include +#include +#include #include #include -#include +#include +#include static uint32_t lowmem_debug_level = 2; static int lowmem_adj[6] = { @@ -52,8 +57,12 @@ static size_t lowmem_minfree[6] = { }; static int lowmem_minfree_size = 4; +static size_t lowmem_minfree_notif_trigger; + +static unsigned int offlining; static struct task_struct *lowmem_deathpending; -static DEFINE_SPINLOCK(lowmem_deathpending_lock); +static unsigned long lowmem_deathpending_timeout; +static struct kobject *lowmem_kobj; #define lowmem_print(level, x...) \ do { \ @@ -68,29 +77,66 @@ static struct notifier_block task_nb = { .notifier_call = task_notify_func, }; - -static void task_free_fn(struct work_struct *work) -{ - unsigned long flags; - - task_free_unregister(&task_nb); - spin_lock_irqsave(&lowmem_deathpending_lock, flags); - lowmem_deathpending = NULL; - spin_unlock_irqrestore(&lowmem_deathpending_lock, flags); -} -static DECLARE_WORK(task_free_work, task_free_fn); - static int task_notify_func(struct notifier_block *self, unsigned long val, void *data) { struct task_struct *task = data; - if (task == lowmem_deathpending) { - schedule_work(&task_free_work); - } + if (task == lowmem_deathpending) + lowmem_deathpending = NULL; + return NOTIFY_OK; } +#ifdef CONFIG_MEMORY_HOTPLUG +static int lmk_hotplug_callback(struct notifier_block *self, + unsigned long cmd, void *data) +{ + switch (cmd) { + /* Don't care LMK cases */ + case MEM_ONLINE: + case MEM_OFFLINE: + case MEM_CANCEL_ONLINE: + case MEM_CANCEL_OFFLINE: + case MEM_GOING_ONLINE: + offlining = 0; + lowmem_print(4, "lmk in normal mode\n"); + break; + /* LMK should account for movable zone */ + case MEM_GOING_OFFLINE: + offlining = 1; + lowmem_print(4, "lmk in hotplug mode\n"); + break; + } + return NOTIFY_DONE; +} +#endif + + + +static void lowmem_notify_killzone_approach(void); + +static inline void get_free_ram(int *other_free, int *other_file) +{ + struct zone *zone; + *other_free = global_page_state(NR_FREE_PAGES); + *other_file = global_page_state(NR_FILE_PAGES) - + global_page_state(NR_SHMEM); + + if (offlining) { + /* Discount all free space in the section being offlined */ + for_each_zone(zone) { + if (zone_idx(zone) == ZONE_MOVABLE) { + *other_free -= zone_page_state(zone, + NR_FREE_PAGES); + lowmem_print(4, "lowmem_shrink discounted " + "%lu pages in movable zone\n", + zone_page_state(zone, NR_FREE_PAGES)); + } + } + } +} + static int lowmem_shrink(int nr_to_scan, gfp_t gfp_mask) { struct task_struct *p; @@ -102,10 +148,8 @@ static int lowmem_shrink(int nr_to_scan, gfp_t gfp_mask) int selected_tasksize = 0; int selected_oom_adj; int array_size = ARRAY_SIZE(lowmem_adj); - int other_free = global_page_state(NR_FREE_PAGES); - int other_file = global_page_state(NR_FILE_PAGES); - unsigned long flags; - + int other_free; + int other_file; /* * If we already have a death outstanding, then * bail out right away; indicating to vmscan @@ -113,15 +157,24 @@ static int lowmem_shrink(int nr_to_scan, gfp_t gfp_mask) * this pass. * */ - if (lowmem_deathpending) + if (lowmem_deathpending && + time_before_eq(jiffies, lowmem_deathpending_timeout)) return 0; + get_free_ram(&other_free, &other_file); + + if (other_free < lowmem_minfree_notif_trigger && + other_file < lowmem_minfree_notif_trigger) { + lowmem_notify_killzone_approach(); + } + if (lowmem_adj_size < array_size) array_size = lowmem_adj_size; if (lowmem_minfree_size < array_size) array_size = lowmem_minfree_size; for (i = 0; i < array_size; i++) { - if (other_file < lowmem_minfree[i]) { + if (other_free < lowmem_minfree[i] && + other_file < lowmem_minfree[i]) { min_adj = lowmem_adj[i]; break; } @@ -176,20 +229,14 @@ static int lowmem_shrink(int nr_to_scan, gfp_t gfp_mask) lowmem_print(2, "select %d (%s), adj %d, size %d, to kill\n", p->pid, p->comm, oom_adj, tasksize); } - if (selected) { - spin_lock_irqsave(&lowmem_deathpending_lock, flags); - if (!lowmem_deathpending) { - lowmem_print(1, - "send sigkill to %d (%s), adj %d, size %d\n", - selected->pid, selected->comm, - selected_oom_adj, selected_tasksize); - lowmem_deathpending = selected; - task_free_register(&task_nb); - force_sig(SIGKILL, selected); - rem -= selected_tasksize; - } - spin_unlock_irqrestore(&lowmem_deathpending_lock, flags); + lowmem_print(1, "send sigkill to %d (%s), adj %d, size %d\n", + selected->pid, selected->comm, + selected_oom_adj, selected_tasksize); + lowmem_deathpending = selected; + lowmem_deathpending_timeout = jiffies + HZ; + force_sig(SIGKILL, selected); + rem -= selected_tasksize; } lowmem_print(4, "lowmem_shrink %d, %x, return %d\n", nr_to_scan, gfp_mask, rem); @@ -202,15 +249,93 @@ static struct shrinker lowmem_shrinker = { .seeks = DEFAULT_SEEKS * 16 }; +static void lowmem_notify_killzone_approach(void) +{ + lowmem_print(3, "notification trigger activated\n"); + sysfs_notify(lowmem_kobj, NULL, "notify_trigger_active"); +} + +static ssize_t lowmem_notify_trigger_active_show(struct kobject *k, + struct kobj_attribute *attr, char *buf) +{ + int other_free, other_file; + get_free_ram(&other_free, &other_file); + if (other_free < lowmem_minfree_notif_trigger && + other_file < lowmem_minfree_notif_trigger) + return snprintf(buf, 3, "1\n"); + else + return snprintf(buf, 3, "0\n"); +} + +static struct kobj_attribute lowmem_notify_trigger_active_attr = + __ATTR(notify_trigger_active, S_IRUGO, + lowmem_notify_trigger_active_show, NULL); + +static struct attribute *lowmem_default_attrs[] = { + &lowmem_notify_trigger_active_attr.attr, + NULL, +}; + +static ssize_t lowmem_show(struct kobject *k, struct attribute *attr, char *buf) +{ + struct kobj_attribute *kobj_attr; + kobj_attr = container_of(attr, struct kobj_attribute, attr); + return kobj_attr->show(k, kobj_attr, buf); +} + +static const struct sysfs_ops lowmem_ops = { + .show = lowmem_show, +}; + +static void lowmem_kobj_release(struct kobject *kobj) +{ + /* Nothing to be done here */ +} + +static struct kobj_type lowmem_kobj_type = { + .release = lowmem_kobj_release, + .sysfs_ops = &lowmem_ops, + .default_attrs = lowmem_default_attrs, +}; + static int __init lowmem_init(void) { + int rc; + task_free_register(&task_nb); register_shrinker(&lowmem_shrinker); +#ifdef CONFIG_MEMORY_HOTPLUG + hotplug_memory_notifier(lmk_hotplug_callback, 0); +#endif + + lowmem_kobj = kzalloc(sizeof(*lowmem_kobj), GFP_KERNEL); + if (!lowmem_kobj) { + rc = -ENOMEM; + goto err; + } + + rc = kobject_init_and_add(lowmem_kobj, &lowmem_kobj_type, + mm_kobj, "lowmemkiller"); + if (rc) + goto err_kobj; + return 0; + +err_kobj: + kfree(lowmem_kobj); + +err: + unregister_shrinker(&lowmem_shrinker); + task_free_unregister(&task_nb); + + return rc; } static void __exit lowmem_exit(void) { + kobject_put(lowmem_kobj); + kfree(lowmem_kobj); unregister_shrinker(&lowmem_shrinker); + task_free_unregister(&task_nb); } module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR); @@ -219,6 +344,8 @@ module_param_array_named(adj, lowmem_adj, int, &lowmem_adj_size, module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size, S_IRUGO | S_IWUSR); module_param_named(debug_level, lowmem_debug_level, uint, S_IRUGO | S_IWUSR); +module_param_named(notify_trigger, lowmem_minfree_notif_trigger, uint, + S_IRUGO | S_IWUSR); module_init(lowmem_init); module_exit(lowmem_exit); From 9c3257d8592f5d9e6f82a59cea4b1ac172288d20 Mon Sep 17 00:00:00 2001 From: securecrt Date: Wed, 15 Aug 2012 14:56:37 +0800 Subject: [PATCH 05/35] android: logger: Allow a UID to read it's own log entries staging: logger: hold mutex while removing reader staging: android: logger: clarify non-update of w_off in do_write_log_from_user staging: android: logger: clarify code in clock_interval staging: android: logger: reorder prepare_to_wait and mutex_lock staging: android: logger: simplify and optimize get_entry_len staging: android: logger: Change logger_offset() from macro to function Staging: android: fixed white spaces coding style issue in logger.c android: logger: bump up the logger buffer sizes --- drivers/staging/android/logger.c | 268 +++++++++++++++++++++++++------ drivers/staging/android/logger.h | 29 +++- include/linux/capability.h | 6 +- 3 files changed, 249 insertions(+), 54 deletions(-) mode change 100644 => 100755 drivers/staging/android/logger.c mode change 100644 => 100755 drivers/staging/android/logger.h mode change 100644 => 100755 include/linux/capability.h diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c old mode 100644 new mode 100755 index 1a0c1391..eb3d4ca5 --- a/drivers/staging/android/logger.c +++ b/drivers/staging/android/logger.c @@ -37,7 +37,7 @@ * mutex 'mutex'. */ struct logger_log { - unsigned char *buffer;/* the ring buffer itself */ + unsigned char *buffer;/* the ring buffer itself */ struct miscdevice misc; /* misc device representing the log */ wait_queue_head_t wq; /* wait queue for readers */ struct list_head readers; /* this log's readers */ @@ -57,19 +57,25 @@ struct logger_reader { struct logger_log *log; /* associated log */ struct list_head list; /* entry in logger_log's list */ size_t r_off; /* current read head offset */ + bool r_all; /* reader can read all entries */ + int r_ver; /* reader ABI version */ }; /* logger_offset - returns index 'n' into the log via (optimized) modulus */ -#define logger_offset(n) ((n) & (log->size - 1)) +size_t logger_offset(struct logger_log *log, size_t n) +{ + return n & (log->size-1); +} + /* * file_get_log - Given a file structure, return the associated log * * This isn't aesthetic. We have several goals: * - * 1) Need to quickly obtain the associated log during an I/O operation - * 2) Readers need to maintain state (logger_reader) - * 3) Writers need to be very fast (open() should be a near no-op) + * 1) Need to quickly obtain the associated log during an I/O operation + * 2) Readers need to maintain state (logger_reader) + * 3) Writers need to be very fast (open() should be a near no-op) * * In the reader case, we can trivially go file->logger_reader->logger_log. * For a writer, we don't want to maintain a logger_reader, so we just go @@ -86,25 +92,75 @@ static inline struct logger_log *file_get_log(struct file *file) } /* - * get_entry_len - Grabs the length of the payload of the next entry starting - * from 'off'. + * get_entry_header - returns a pointer to the logger_entry header within + * 'log' starting at offset 'off'. A temporary logger_entry 'scratch' must + * be provided. Typically the return value will be a pointer within + * 'logger->buf'. However, a pointer to 'scratch' may be returned if + * the log entry spans the end and beginning of the circular buffer. + */ +static struct logger_entry *get_entry_header(struct logger_log *log, + size_t off, struct logger_entry *scratch) +{ + size_t len = min(sizeof(struct logger_entry), log->size - off); + if (len != sizeof(struct logger_entry)) { + memcpy(((void *) scratch), log->buffer + off, len); + memcpy(((void *) scratch) + len, log->buffer, + sizeof(struct logger_entry) - len); + return scratch; + } + + return (struct logger_entry *) (log->buffer + off); +} + +/* + * get_entry_msg_len - Grabs the length of the message of the entry + * starting from from 'off'. + * + * An entry length is 2 bytes (16 bits) in host endian order. + * In the log, the length does not include the size of the log entry structure. + * This function returns the size including the log entry structure. * * Caller needs to hold log->mutex. */ -static __u32 get_entry_len(struct logger_log *log, size_t off) +static __u32 get_entry_msg_len(struct logger_log *log, size_t off) { - __u16 val; + struct logger_entry scratch; + struct logger_entry *entry; - switch (log->size - off) { - case 1: - memcpy(&val, log->buffer + off, 1); - memcpy(((char *) &val) + 1, log->buffer, 1); - break; - default: - memcpy(&val, log->buffer + off, 2); + entry = get_entry_header(log, off, &scratch); + return entry->len; +} + +static size_t get_user_hdr_len(int ver) +{ + if (ver < 2) + return sizeof(struct user_logger_entry_compat); + else + return sizeof(struct logger_entry); +} + +static ssize_t copy_header_to_user(int ver, struct logger_entry *entry, + char __user *buf) +{ + void *hdr; + size_t hdr_len; + struct user_logger_entry_compat v1; + + if (ver < 2) { + v1.len = entry->len; + v1.__pad = 0; + v1.pid = entry->pid; + v1.tid = entry->tid; + v1.sec = entry->sec; + v1.nsec = entry->nsec; + hdr = &v1; + hdr_len = sizeof(struct user_logger_entry_compat); + } else { + hdr = entry; + hdr_len = sizeof(struct logger_entry); } - return sizeof(struct logger_entry) + val; + return copy_to_user(buf, hdr, hdr_len); } /* @@ -118,15 +174,31 @@ static ssize_t do_read_log_to_user(struct logger_log *log, char __user *buf, size_t count) { + struct logger_entry scratch; + struct logger_entry *entry; size_t len; + size_t msg_start; /* - * We read from the log in two disjoint operations. First, we read from - * the current read head offset up to 'count' bytes or to the end of + * First, copy the header to userspace, using the version of + * the header requested + */ + entry = get_entry_header(log, reader->r_off, &scratch); + if (copy_header_to_user(reader->r_ver, entry, buf)) + return -EFAULT; + + count -= get_user_hdr_len(reader->r_ver); + buf += get_user_hdr_len(reader->r_ver); + msg_start = logger_offset(log, + reader->r_off + sizeof(struct logger_entry)); + + /* + * We read from the msg in two disjoint operations. First, we read from + * the current msg head offset up to 'count' bytes or to the end of * the log, whichever comes first. */ - len = min(count, log->size - reader->r_off); - if (copy_to_user(buf, log->buffer + reader->r_off, len)) + len = min(count, log->size - msg_start); + if (copy_to_user(buf, log->buffer + msg_start, len)) return -EFAULT; /* @@ -137,9 +209,34 @@ static ssize_t do_read_log_to_user(struct logger_log *log, if (copy_to_user(buf + len, log->buffer, count - len)) return -EFAULT; - reader->r_off = logger_offset(reader->r_off + count); + reader->r_off = logger_offset(log, reader->r_off + + sizeof(struct logger_entry) + count); - return count; + return count + get_user_hdr_len(reader->r_ver); +} + +/* + * get_next_entry_by_uid - Starting at 'off', returns an offset into + * 'log->buffer' which contains the first entry readable by 'euid' + */ +static size_t get_next_entry_by_uid(struct logger_log *log, + size_t off, uid_t euid) +{ + while (off != log->w_off) { + struct logger_entry *entry; + struct logger_entry scratch; + size_t next_len; + + entry = get_entry_header(log, off, &scratch); + + if (entry->euid == euid) + return off; + + next_len = sizeof(struct logger_entry) + entry->len; + off = logger_offset(log, off + next_len); + } + + return off; } /* @@ -147,11 +244,11 @@ static ssize_t do_read_log_to_user(struct logger_log *log, * * Behavior: * - * - O_NONBLOCK works - * - If there are no log entries to read, blocks until log is written to - * - Atomically reads exactly one log entry + * - O_NONBLOCK works + * - If there are no log entries to read, blocks until log is written to + * - Atomically reads exactly one log entry * - * Optimal read size is LOGGER_ENTRY_MAX_LEN. Will set errno to EINVAL if read + * Will set errno to EINVAL if read * buffer is insufficient to hold next entry. */ static ssize_t logger_read(struct file *file, char __user *buf, @@ -164,9 +261,10 @@ static ssize_t logger_read(struct file *file, char __user *buf, start: while (1) { + mutex_lock(&log->mutex); + prepare_to_wait(&log->wq, &wait, TASK_INTERRUPTIBLE); - mutex_lock(&log->mutex); ret = (log->w_off == reader->r_off); mutex_unlock(&log->mutex); if (!ret) @@ -191,6 +289,10 @@ start: mutex_lock(&log->mutex); + if (!reader->r_all) + reader->r_off = get_next_entry_by_uid(log, + reader->r_off, current_euid()); + /* is there still something to read or did we race? */ if (unlikely(log->w_off == reader->r_off)) { mutex_unlock(&log->mutex); @@ -198,7 +300,8 @@ start: } /* get the size of the next entry */ - ret = get_entry_len(log, reader->r_off); + ret = get_user_hdr_len(reader->r_ver) + + get_entry_msg_len(log, reader->r_off); if (count < ret) { ret = -EINVAL; goto out; @@ -224,8 +327,9 @@ static size_t get_next_entry(struct logger_log *log, size_t off, size_t len) size_t count = 0; do { - size_t nr = get_entry_len(log, off); - off = logger_offset(off + nr); + size_t nr = sizeof(struct logger_entry) + + get_entry_msg_len(log, off); + off = logger_offset(log, off + nr); count += nr; } while (count < len); @@ -233,16 +337,28 @@ static size_t get_next_entry(struct logger_log *log, size_t off, size_t len) } /* - * clock_interval - is a < c < b in mod-space? Put another way, does the line - * from a to b cross c? + * is_between - is a < c < b, accounting for wrapping of a, b, and c + * positions in the buffer + * + * That is, if ab, check for c outside (not between) a and b + * + * |------- a xxxxxxxx b --------| + * c^ + * + * |xxxxx b --------- a xxxxxxxxx| + * c^ + * or c^ */ -static inline int clock_interval(size_t a, size_t b, size_t c) +static inline int is_between(size_t a, size_t b, size_t c) { - if (b < a) { - if (a < c || b >= c) + if (a < b) { + /* is c between a and b? */ + if (a < c && c <= b) return 1; } else { - if (a < c && b >= c) + /* is c outside of b through a? */ + if (c <= b || a < c) return 1; } @@ -260,14 +376,14 @@ static inline int clock_interval(size_t a, size_t b, size_t c) static void fix_up_readers(struct logger_log *log, size_t len) { size_t old = log->w_off; - size_t new = logger_offset(old + len); + size_t new = logger_offset(log, old + len); struct logger_reader *reader; - if (clock_interval(old, new, log->head)) + if (is_between(old, new, log->head)) log->head = get_next_entry(log, log->head, len); list_for_each_entry(reader, &log->readers, list) - if (clock_interval(old, new, reader->r_off)) + if (is_between(old, new, reader->r_off)) reader->r_off = get_next_entry(log, reader->r_off, len); } @@ -286,7 +402,7 @@ static void do_write_log(struct logger_log *log, const void *buf, size_t count) if (count != len) memcpy(log->buffer, buf + len, count - len); - log->w_off = logger_offset(log->w_off + count); + log->w_off = logger_offset(log, log->w_off + count); } @@ -309,9 +425,15 @@ static ssize_t do_write_log_from_user(struct logger_log *log, if (count != len) if (copy_from_user(log->buffer, buf + len, count - len)) + /* + * Note that by not updating w_off, this abandons the + * portion of the new entry that *was* successfully + * copied, just above. This is intentional to avoid + * message corruption from missing fragments. + */ return -EFAULT; - log->w_off = logger_offset(log->w_off + count); + log->w_off = logger_offset(log, log->w_off + count); return count; } @@ -336,7 +458,9 @@ ssize_t logger_aio_write(struct kiocb *iocb, const struct iovec *iov, header.tid = current->pid; header.sec = now.tv_sec; header.nsec = now.tv_nsec; + header.euid = current_euid(); header.len = min_t(size_t, iocb->ki_left, LOGGER_ENTRY_MAX_PAYLOAD); + header.hdr_size = sizeof(struct logger_entry); /* null writes succeed, return zero */ if (unlikely(!header.len)) @@ -409,6 +533,10 @@ static int logger_open(struct inode *inode, struct file *file) return -ENOMEM; reader->log = log; + reader->r_ver = 1; + reader->r_all = in_egroup_p(inode->i_gid) || + capable(CAP_SYSLOG); + INIT_LIST_HEAD(&reader->list); mutex_lock(&log->mutex); @@ -433,9 +561,11 @@ static int logger_release(struct inode *ignored, struct file *file) if (file->f_mode & FMODE_READ) { struct logger_reader *reader = file->private_data; struct logger_log *log = reader->log; + mutex_lock(&log->mutex); list_del(&reader->list); mutex_unlock(&log->mutex); + kfree(reader); } @@ -466,6 +596,10 @@ static unsigned int logger_poll(struct file *file, poll_table *wait) poll_wait(file, &log->wq, wait); mutex_lock(&log->mutex); + if (!reader->r_all) + reader->r_off = get_next_entry_by_uid(log, + reader->r_off, current_euid()); + if (log->w_off != reader->r_off) ret |= POLLIN | POLLRDNORM; mutex_unlock(&log->mutex); @@ -473,11 +607,25 @@ static unsigned int logger_poll(struct file *file, poll_table *wait) return ret; } +static long logger_set_version(struct logger_reader *reader, void __user *arg) +{ + int version; + if (copy_from_user(&version, arg, sizeof(int))) + return -EFAULT; + + if ((version < 1) || (version > 2)) + return -EINVAL; + + reader->r_ver = version; + return 0; +} + static long logger_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct logger_log *log = file_get_log(file); struct logger_reader *reader; - long ret = -ENOTTY; + long ret = -EINVAL; + void __user *argp = (void __user *) arg; mutex_lock(&log->mutex); @@ -502,8 +650,14 @@ static long logger_ioctl(struct file *file, unsigned int cmd, unsigned long arg) break; } reader = file->private_data; + + if (!reader->r_all) + reader->r_off = get_next_entry_by_uid(log, + reader->r_off, current_euid()); + if (log->w_off != reader->r_off) - ret = get_entry_len(log, reader->r_off); + ret = get_user_hdr_len(reader->r_ver) + + get_entry_msg_len(log, reader->r_off); else ret = 0; break; @@ -517,6 +671,22 @@ static long logger_ioctl(struct file *file, unsigned int cmd, unsigned long arg) log->head = log->w_off; ret = 0; break; + case LOGGER_GET_VERSION: + if (!(file->f_mode & FMODE_READ)) { + ret = -EBADF; + break; + } + reader = file->private_data; + ret = reader->r_ver; + break; + case LOGGER_SET_VERSION: + if (!(file->f_mode & FMODE_READ)) { + ret = -EBADF; + break; + } + reader = file->private_data; + ret = logger_set_version(reader, argp); + break; } mutex_unlock(&log->mutex); @@ -537,8 +707,8 @@ static const struct file_operations logger_fops = { /* * Defines a log structure with name 'NAME' and a size of 'SIZE' bytes, which - * must be a power of two, greater than LOGGER_ENTRY_MAX_LEN, and less than - * LONG_MAX minus LOGGER_ENTRY_MAX_LEN. + * must be a power of two, and greater than + * (LOGGER_ENTRY_MAX_PAYLOAD + sizeof(struct logger_entry)). */ #define DEFINE_LOGGER_DEVICE(VAR, NAME, SIZE) \ static unsigned char _buf_ ## VAR[SIZE]; \ @@ -558,10 +728,10 @@ static struct logger_log VAR = { \ .size = SIZE, \ }; -DEFINE_LOGGER_DEVICE(log_main, LOGGER_LOG_MAIN, 64*1024) +DEFINE_LOGGER_DEVICE(log_main, LOGGER_LOG_MAIN, 256*1024) DEFINE_LOGGER_DEVICE(log_events, LOGGER_LOG_EVENTS, 256*1024) -DEFINE_LOGGER_DEVICE(log_radio, LOGGER_LOG_RADIO, 64*1024) -DEFINE_LOGGER_DEVICE(log_system, LOGGER_LOG_SYSTEM, 64*1024) +DEFINE_LOGGER_DEVICE(log_radio, LOGGER_LOG_RADIO, 256*1024) +DEFINE_LOGGER_DEVICE(log_system, LOGGER_LOG_SYSTEM, 256*1024) static struct logger_log *get_log_from_minor(int minor) { diff --git a/drivers/staging/android/logger.h b/drivers/staging/android/logger.h old mode 100644 new mode 100755 index 2cb06e9d..3f612a3b --- a/drivers/staging/android/logger.h +++ b/drivers/staging/android/logger.h @@ -20,7 +20,12 @@ #include #include -struct logger_entry { +/* + * The userspace structure for version 1 of the logger_entry ABI. + * This structure is returned to userspace unless the caller requests + * an upgrade to a newer ABI version. + */ +struct user_logger_entry_compat { __u16 len; /* length of the payload */ __u16 __pad; /* no matter what, we get 2 bytes of padding */ __s32 pid; /* generating process's pid */ @@ -30,14 +35,28 @@ struct logger_entry { char msg[0]; /* the entry's payload */ }; +/* + * The structure for version 2 of the logger_entry ABI. + * This structure is returned to userspace if ioctl(LOGGER_SET_VERSION) + * is called with version >= 2 + */ +struct logger_entry { + __u16 len; /* length of the payload */ + __u16 hdr_size; /* sizeof(struct logger_entry_v2) */ + __s32 pid; /* generating process's pid */ + __s32 tid; /* generating process's tid */ + __s32 sec; /* seconds since Epoch */ + __s32 nsec; /* nanoseconds */ + uid_t euid; /* effective UID of logger */ + char msg[0]; /* the entry's payload */ +}; + #define LOGGER_LOG_RADIO "log_radio" /* radio-related messages */ #define LOGGER_LOG_EVENTS "log_events" /* system/hardware events */ #define LOGGER_LOG_SYSTEM "log_system" /* system/framework messages */ #define LOGGER_LOG_MAIN "log_main" /* everything else */ -#define LOGGER_ENTRY_MAX_LEN (4*1024) -#define LOGGER_ENTRY_MAX_PAYLOAD \ - (LOGGER_ENTRY_MAX_LEN - sizeof(struct logger_entry)) +#define LOGGER_ENTRY_MAX_PAYLOAD 4076 #define __LOGGERIO 0xAE @@ -45,5 +64,7 @@ struct logger_entry { #define LOGGER_GET_LOG_LEN _IO(__LOGGERIO, 2) /* used log len */ #define LOGGER_GET_NEXT_ENTRY_LEN _IO(__LOGGERIO, 3) /* next entry len */ #define LOGGER_FLUSH_LOG _IO(__LOGGERIO, 4) /* flush log */ +#define LOGGER_GET_VERSION _IO(__LOGGERIO, 5) /* abi version */ +#define LOGGER_SET_VERSION _IO(__LOGGERIO, 6) /* abi version */ #endif /* _LINUX_LOGGER_H */ diff --git a/include/linux/capability.h b/include/linux/capability.h old mode 100644 new mode 100755 index c8f2a5f7..c4f6d94d --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -357,7 +357,11 @@ struct cpu_vfs_cap_data { #define CAP_MAC_ADMIN 33 -#define CAP_LAST_CAP CAP_MAC_ADMIN +/* Allow configuring the kernel's syslog (printk behaviour) */ + +#define CAP_SYSLOG 34 + +#define CAP_LAST_CAP CAP_SYSLOG #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) From e04d028ecf0cbc29b16a6af5b85981ef93a1020c Mon Sep 17 00:00:00 2001 From: securecrt Date: Wed, 15 Aug 2012 18:03:44 +0800 Subject: [PATCH 06/35] reduced the log size --- drivers/staging/android/logger.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c index eb3d4ca5..05559eec 100755 --- a/drivers/staging/android/logger.c +++ b/drivers/staging/android/logger.c @@ -728,10 +728,10 @@ static struct logger_log VAR = { \ .size = SIZE, \ }; -DEFINE_LOGGER_DEVICE(log_main, LOGGER_LOG_MAIN, 256*1024) +DEFINE_LOGGER_DEVICE(log_main, LOGGER_LOG_MAIN, 64*1024) DEFINE_LOGGER_DEVICE(log_events, LOGGER_LOG_EVENTS, 256*1024) -DEFINE_LOGGER_DEVICE(log_radio, LOGGER_LOG_RADIO, 256*1024) -DEFINE_LOGGER_DEVICE(log_system, LOGGER_LOG_SYSTEM, 256*1024) +DEFINE_LOGGER_DEVICE(log_radio, LOGGER_LOG_RADIO, 64*1024) +DEFINE_LOGGER_DEVICE(log_system, LOGGER_LOG_SYSTEM, 64*1024) static struct logger_log *get_log_from_minor(int minor) { From fced437cd2706f5cb8af6346df85478743dce64e Mon Sep 17 00:00:00 2001 From: securecrt Date: Wed, 15 Aug 2012 18:04:29 +0800 Subject: [PATCH 07/35] pmem: Check for valid virtual address while flushing pmem: Correctly account for aligned blocks --- drivers/misc/pmem.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/misc/pmem.c b/drivers/misc/pmem.c index f1523fd5..bc083e17 100755 --- a/drivers/misc/pmem.c +++ b/drivers/misc/pmem.c @@ -1,7 +1,7 @@ /* drivers/android/pmem.c * * Copyright (C) 2007 Google, Inc. - * Copyright (c) 2009-2010, Code Aurora Forum. All rights reserved. + * Copyright (c) 2009-2012, Code Aurora Forum. All rights reserved. * * This software is licensed under the terms of the GNU General Public * License version 2, as published by the Free Software Foundation, and @@ -1074,17 +1074,17 @@ static void bitmap_bits_set_all(uint32_t *bitp, int bit_start, int bit_end) static int bitmap_allocate_contiguous(uint32_t *bitp, int num_bits_to_alloc, - int total_bits, int spacing) + int total_bits, int spacing, int start_bit) { int bit_start, last_bit, word_index; if (num_bits_to_alloc <= 0) return -1; - for (bit_start = 0; ; - bit_start = (last_bit + + for (bit_start = start_bit; ; + bit_start = ((last_bit + (word_index << PMEM_32BIT_WORD_ORDER) + spacing - 1) - & ~(spacing - 1)) { + & ~(spacing - 1)) + start_bit) { int bit_end = bit_start + num_bits_to_alloc, total_words; if (bit_end > total_bits) @@ -1162,7 +1162,8 @@ static int reserve_quanta(const unsigned int quanta_needed, ret = bitmap_allocate_contiguous(pmem[id].allocator.bitmap.bitmap, quanta_needed, (pmem[id].size + pmem[id].quantum - 1) / pmem[id].quantum, - spacing); + spacing, + start_bit); #if PMEM_DEBUG if (ret < 0) @@ -1915,6 +1916,13 @@ int pmem_cache_maint(struct file *file, unsigned int cmd, if (!file) return -EBADF; + /* + * check that the vaddr passed for flushing is valid + * so that you don't crash the kernel + */ + if (!pmem_addr->vaddr) + return -EINVAL; + data = file->private_data; id = get_id(file); From cdcb35c854ea4426832c0270bb60441684954258 Mon Sep 17 00:00:00 2001 From: securecrt Date: Wed, 15 Aug 2012 18:06:24 +0800 Subject: [PATCH 08/35] pmem_adsp (user-space) as non-cached pmem_venc (user-space) as non-cached --- arch/arm/mach-msm/board-htcleo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-msm/board-htcleo.c b/arch/arm/mach-msm/board-htcleo.c index 4697ef53..4913f043 100755 --- a/arch/arm/mach-msm/board-htcleo.c +++ b/arch/arm/mach-msm/board-htcleo.c @@ -771,7 +771,7 @@ static struct android_pmem_platform_data android_pmem_adsp_pdata = { #else .no_allocator = 0, #endif - .cached = 1, + .cached = 0, }; @@ -784,7 +784,7 @@ static struct android_pmem_platform_data android_pmem_venc_pdata = { #else .no_allocator = 0, #endif - .cached = 1, + .cached = 0, }; static struct platform_device android_pmem_mdp_device = { From 0c4a37e304734e297d586e8a6dfdebcea74e68b3 Mon Sep 17 00:00:00 2001 From: securecrt Date: Wed, 15 Aug 2012 18:07:29 +0800 Subject: [PATCH 09/35] change MSM_NAND_DMA_BUFFER_SIZE to SZ_1M --- drivers/mtd/devices/htcleo_nand.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/devices/htcleo_nand.c b/drivers/mtd/devices/htcleo_nand.c index 9b23d680..12a9b390 100755 --- a/drivers/mtd/devices/htcleo_nand.c +++ b/drivers/mtd/devices/htcleo_nand.c @@ -51,7 +51,7 @@ unsigned crci_mask; #include "msm_nand.h" -#define MSM_NAND_DMA_BUFFER_SIZE SZ_4K +#define MSM_NAND_DMA_BUFFER_SIZE SZ_1M #define MSM_NAND_DMA_BUFFER_SLOTS \ (MSM_NAND_DMA_BUFFER_SIZE / (sizeof(((atomic_t *)0)->counter) * 8)) From cc0db50c3303916e1d867327aa37c688a69c2bd4 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Fri, 17 Aug 2012 01:19:24 +0800 Subject: [PATCH 10/35] ksm: remove unswappable max_kernel_pages ksm: fix bad user data when swapping thp: ksm: free swap when swapcache page is replaced --- mm/ksm.c | 56 ++++++++++++++++++-------------------------------------- 1 file changed, 18 insertions(+), 38 deletions(-) mode change 100644 => 100755 mm/ksm.c diff --git a/mm/ksm.c b/mm/ksm.c old mode 100644 new mode 100755 index e9501f83..17abf485 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -163,9 +163,6 @@ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; -/* Limit on the number of unswappable pages used */ -static unsigned long ksm_max_kernel_pages; - /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = 100; @@ -628,7 +625,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (!ptep) goto out; - if (pte_write(*ptep)) { + if (pte_write(*ptep) || pte_dirty(*ptep)) { pte_t entry; swapped = PageSwapCache(page); @@ -651,7 +648,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, set_pte_at_notify(mm, addr, ptep, entry); goto out_unlock; } - entry = pte_wrprotect(entry); + if (pte_dirty(entry)) + set_page_dirty(page); + entry = pte_mkclean(pte_wrprotect(entry)); set_pte_at_notify(mm, addr, ptep, entry); } *orig_pte = *ptep; @@ -717,6 +716,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); page_remove_rmap(oldpage); + if (!page_mapped(oldpage)) + try_to_free_swap(oldpage); put_page(oldpage); pte_unmap_unlock(ptep, ptl); @@ -827,13 +828,6 @@ static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, struct page *kpage; int err = -EFAULT; - /* - * The number of nodes in the stable tree - * is the number of kernel pages that we hold. - */ - if (ksm_max_kernel_pages && - ksm_max_kernel_pages <= ksm_pages_shared) - return err; kpage = alloc_page(GFP_HIGHUSER); if (!kpage) @@ -1209,6 +1203,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) slot = ksm_scan.mm_slot; if (slot == &ksm_mm_head) { + /* + * A number of pages can hang around indefinitely on per-cpu + * pagevecs, raised page count preventing write_protect_page + * from merging them. Though it doesn't really matter much, + * it is puzzling to see some stuck in pages_volatile until + * other activity jostles them out, and they also prevented + * LTP's KSM test from succeeding deterministically; so drain + * them here (here rather than on entry to ksm_do_scan(), + * so we don't IPI too often when pages_to_scan is set low). + */ + lru_add_drain_all(); + root_unstable_tree = RB_ROOT; spin_lock(&ksm_mmlist_lock); @@ -1577,29 +1583,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, } KSM_ATTR(run); -static ssize_t max_kernel_pages_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err; - unsigned long nr_pages; - - err = strict_strtoul(buf, 10, &nr_pages); - if (err) - return -EINVAL; - - ksm_max_kernel_pages = nr_pages; - - return count; -} - -static ssize_t max_kernel_pages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%lu\n", ksm_max_kernel_pages); -} -KSM_ATTR(max_kernel_pages); - static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1649,7 +1632,6 @@ static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, - &max_kernel_pages_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, @@ -1669,8 +1651,6 @@ static int __init ksm_init(void) struct task_struct *ksm_thread; int err; - ksm_max_kernel_pages = totalram_pages / 4; - err = ksm_slab_init(); if (err) goto out; From c95ed3371a7d0749ca59bc395640ef8a7d2b79b4 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Fri, 17 Aug 2012 01:21:55 +0800 Subject: [PATCH 11/35] staging: zram: fix zram locking Staging: zram: Replace mutex lock by a R/W semaphore Staging: zram: Add a missing GFP_KERNEL specifier in zram_init_device() --- drivers/staging/zram/Kconfig | 25 ------ drivers/staging/zram/Makefile | 0 drivers/staging/zram/xvmalloc.c | 0 drivers/staging/zram/xvmalloc.h | 0 drivers/staging/zram/xvmalloc_int.h | 0 drivers/staging/zram/zram.txt | 0 drivers/staging/zram/zram_drv.c | 134 ++++++++++++++++------------ drivers/staging/zram/zram_drv.h | 8 +- drivers/staging/zram/zram_sysfs.c | 18 ++-- 9 files changed, 95 insertions(+), 90 deletions(-) mode change 100644 => 100755 drivers/staging/zram/Kconfig mode change 100644 => 100755 drivers/staging/zram/Makefile mode change 100644 => 100755 drivers/staging/zram/xvmalloc.c mode change 100644 => 100755 drivers/staging/zram/xvmalloc.h mode change 100644 => 100755 drivers/staging/zram/xvmalloc_int.h mode change 100644 => 100755 drivers/staging/zram/zram.txt mode change 100644 => 100755 drivers/staging/zram/zram_drv.c mode change 100644 => 100755 drivers/staging/zram/zram_drv.h mode change 100644 => 100755 drivers/staging/zram/zram_sysfs.c diff --git a/drivers/staging/zram/Kconfig b/drivers/staging/zram/Kconfig old mode 100644 new mode 100755 index 18c1a971..3bec4dba --- a/drivers/staging/zram/Kconfig +++ b/drivers/staging/zram/Kconfig @@ -21,23 +21,6 @@ config ZRAM See zram.txt for more information. Project home: http://compcache.googlecode.com/ -config ZRAM_NUM_DEVICES - int "Default number of zram devices" - depends on ZRAM - range 1 32 - default 1 - help - Select default number of zram devices. You can override this value - using 'num_devices' module parameter. - -config ZRAM_DEFAULT_PERCENTAGE - int "Default number of zram percentage" - depends on ZRAM - range 10 80 - default 25 - help - Select default zram disk size: percentage of total RAM - config ZRAM_DEBUG bool "Compressed RAM block device debug support" depends on ZRAM @@ -45,11 +28,3 @@ config ZRAM_DEBUG help This option adds additional debugging code to the compressed RAM block device driver. - -config ZRAM_DEFAULT_DISKSIZE - int "Default size of zram in bytes" - depends on ZRAM - default 100663296 - help - Set default zram disk size (default ~ 96MB) - diff --git a/drivers/staging/zram/Makefile b/drivers/staging/zram/Makefile old mode 100644 new mode 100755 diff --git a/drivers/staging/zram/xvmalloc.c b/drivers/staging/zram/xvmalloc.c old mode 100644 new mode 100755 diff --git a/drivers/staging/zram/xvmalloc.h b/drivers/staging/zram/xvmalloc.h old mode 100644 new mode 100755 diff --git a/drivers/staging/zram/xvmalloc_int.h b/drivers/staging/zram/xvmalloc_int.h old mode 100644 new mode 100755 diff --git a/drivers/staging/zram/zram.txt b/drivers/staging/zram/zram.txt old mode 100644 new mode 100755 diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c old mode 100644 new mode 100755 index fc4c2e6f..88383651 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -104,19 +104,33 @@ static int page_zero_filled(void *ptr) return 1; } -static u64 zram_default_disksize_bytes(void) +static void zram_set_disksize(struct zram *zram, size_t totalram_bytes) { -#if 0 - return ((totalram_pages << PAGE_SHIFT) * - default_disksize_perc_ram / 100) & PAGE_MASK; -#endif - return CONFIG_ZRAM_DEFAULT_DISKSIZE; -} + if (!zram->disksize) { + pr_info( + "disk size not provided. You can use disksize_kb module " + "param to specify size.\nUsing default: (%u%% of RAM).\n", + default_disksize_perc_ram + ); + zram->disksize = default_disksize_perc_ram * + (totalram_bytes / 100); + } -static void zram_set_disksize(struct zram *zram, u64 size_bytes) -{ - zram->disksize = size_bytes; - set_capacity(zram->disk, size_bytes >> SECTOR_SHIFT); + if (zram->disksize > 2 * (totalram_bytes)) { + pr_info( + "There is little point creating a zram of greater than " + "twice the size of memory since we expect a 2:1 compression " + "ratio. Note that zram uses about 0.1%% of the size of " + "the disk when not in use so a huge zram is " + "wasteful.\n" + "\tMemory Size: %zu kB\n" + "\tSize you selected: %llu kB\n" + "Continuing anyway ...\n", + totalram_bytes >> 10, zram->disksize + ); + } + + zram->disksize &= PAGE_MASK; } static void zram_free_page(struct zram *zram, size_t index) @@ -546,27 +560,35 @@ static int zram_make_request(struct request_queue *queue, struct bio *bio) { struct zram *zram = queue->queuedata; + if (unlikely(!zram->init_done) && zram_init_device(zram)) + goto error; + + down_read(&zram->init_lock); + if (unlikely(!zram->init_done)) + goto error_unlock; + if (!valid_io_request(zram, bio)) { zram_stat64_inc(zram, &zram->stats.invalid_io); - bio_io_error(bio); - return 0; - } - - if (unlikely(!zram->init_done) && zram_init_device(zram)) { - bio_io_error(bio); - return 0; + goto error_unlock; } __zram_make_request(zram, bio, bio_data_dir(bio)); + up_read(&zram->init_lock); return 0; + +error_unlock: + up_read(&zram->init_lock); +error: + bio_io_error(bio); + return 0; + } -void zram_reset_device(struct zram *zram) +void __zram_reset_device(struct zram *zram) { size_t index; - mutex_lock(&zram->init_lock); zram->init_done = 0; /* Free various per-device buffers */ @@ -602,8 +624,14 @@ void zram_reset_device(struct zram *zram) /* Reset stats */ memset(&zram->stats, 0, sizeof(zram->stats)); - zram_set_disksize(zram, zram_default_disksize_bytes()); - mutex_unlock(&zram->init_lock); + zram->disksize = 0; +} + +void zram_reset_device(struct zram *zram) +{ + down_write(&zram->init_lock); + __zram_reset_device(zram); + up_write(&zram->init_lock); } int zram_init_device(struct zram *zram) @@ -611,37 +639,39 @@ int zram_init_device(struct zram *zram) int ret; size_t num_pages; - mutex_lock(&zram->init_lock); + down_write(&zram->init_lock); if (zram->init_done) { - mutex_unlock(&zram->init_lock); + up_write(&zram->init_lock); return 0; } + zram_set_disksize(zram, totalram_pages << PAGE_SHIFT); + zram->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); if (!zram->compress_workmem) { pr_err("Error allocating compressor working memory!\n"); ret = -ENOMEM; - goto fail; + goto fail_no_table; } - zram->compress_buffer = (void *)__get_free_pages(__GFP_ZERO, 1); + zram->compress_buffer = + (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); if (!zram->compress_buffer) { pr_err("Error allocating compressor buffer space\n"); ret = -ENOMEM; - goto fail; + goto fail_no_table; } num_pages = zram->disksize >> PAGE_SHIFT; zram->table = vmalloc(num_pages * sizeof(*zram->table)); if (!zram->table) { pr_err("Error allocating zram address table\n"); - /* To prevent accessing table entries during cleanup */ - zram->disksize = 0; ret = -ENOMEM; - goto fail; + goto fail_no_table; } - memset(zram->table, 0, num_pages * sizeof(*zram->table)); + memset(zram->table, 0, num_pages * sizeof(*zram->table)); + set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); /* zram devices sort of resembles non-rotational disks */ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); @@ -654,15 +684,17 @@ int zram_init_device(struct zram *zram) } zram->init_done = 1; - mutex_unlock(&zram->init_lock); + up_write(&zram->init_lock); pr_debug("Initialization done!\n"); return 0; +fail_no_table: + /* To prevent accessing table entries during cleanup */ + zram->disksize = 0; fail: - mutex_unlock(&zram->init_lock); - zram_reset_device(zram); - + __zram_reset_device(zram); + up_write(&zram->init_lock); pr_err("Initialization failed: err=%d\n", ret); return ret; } @@ -687,7 +719,7 @@ static int create_device(struct zram *zram, int device_id) int ret = 0; init_rwsem(&zram->lock); - mutex_init(&zram->init_lock); + init_rwsem(&zram->init_lock); spin_lock_init(&zram->stat64_lock); zram->queue = blk_alloc_queue(GFP_KERNEL); @@ -718,12 +750,8 @@ static int create_device(struct zram *zram, int device_id) zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); - /* - * Set some default disksize. To set another disksize, user - * must reset the device and then write a new disksize to - * corresponding device's sysfs node. - */ - zram_set_disksize(zram, zram_default_disksize_bytes()); + /* Actual capacity set using syfs (/sys/block/zram/disksize */ + set_capacity(zram->disk, 0); /* * To ensure that we always get PAGE_SIZE aligned @@ -768,13 +796,6 @@ static int __init zram_init(void) { int ret, dev_id; - /* - * Module parameter not specified by user. Use default - * value as defined during kernel config. - */ - if (zram_num_devices == 0) - zram_num_devices = CONFIG_ZRAM_NUM_DEVICES; - if (zram_num_devices > max_num_devices) { pr_warning("Invalid value for num_devices: %u\n", zram_num_devices); @@ -789,12 +810,15 @@ static int __init zram_init(void) goto out; } + if (!zram_num_devices) { + pr_info("num_devices not specified. Using default: 1\n"); + zram_num_devices = 1; + } + /* Allocate the device array and initialize each one */ pr_info("Creating %u devices ...\n", zram_num_devices); - zram_devices = kzalloc(zram_num_devices * sizeof(struct zram), - GFP_KERNEL); - if (!zram_devices) - { + zram_devices = kzalloc(zram_num_devices * sizeof(struct zram), GFP_KERNEL); + if (!zram_devices) { ret = -ENOMEM; goto unregister; } @@ -836,8 +860,8 @@ static void __exit zram_exit(void) pr_debug("Cleanup done!\n"); } -module_param_named(num_devices, zram_num_devices, uint, 0); -MODULE_PARM_DESC(num_devices, "Number of zram devices"); +module_param(zram_num_devices, uint, 0); +MODULE_PARM_DESC(zram_num_devices, "Number of zram devices"); module_init(zram_init); module_exit(zram_exit); diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h old mode 100644 new mode 100755 index fed0d14b..31617ee7 --- a/drivers/staging/zram/zram_drv.h +++ b/drivers/staging/zram/zram_drv.h @@ -41,7 +41,7 @@ struct zobj_header { /*-- Configurable parameters */ /* Default zram disk size: 25% of total RAM */ -static const unsigned default_disksize_perc_ram = CONFIG_ZRAM_DEFAULT_PERCENTAGE; +static const unsigned default_disksize_perc_ram = 25; /* * Pages that compress to size greater than this are stored @@ -112,8 +112,8 @@ struct zram { struct request_queue *queue; struct gendisk *disk; int init_done; - /* Prevent concurrent execution of device init and reset */ - struct mutex init_lock; + /* Prevent concurrent execution of device init, reset and R/W request */ + struct rw_semaphore init_lock; /* * This is the limit on amount of *uncompressed* worth of data * we can store in a disk. @@ -130,7 +130,7 @@ extern struct attribute_group zram_disk_attr_group; #endif extern int zram_init_device(struct zram *zram); -extern void zram_reset_device(struct zram *zram); +extern void __zram_reset_device(struct zram *zram); #endif diff --git a/drivers/staging/zram/zram_sysfs.c b/drivers/staging/zram/zram_sysfs.c old mode 100644 new mode 100755 index d894928a..41e51a2b --- a/drivers/staging/zram/zram_sysfs.c +++ b/drivers/staging/zram/zram_sysfs.c @@ -55,19 +55,23 @@ static ssize_t disksize_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { int ret; + u64 disksize; struct zram *zram = dev_to_zram(dev); + ret = strict_strtoull(buf, 10, &disksize); + if (ret) + return ret; + + down_write(&zram->init_lock); if (zram->init_done) { + up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); return -EBUSY; } - ret = strict_strtoull(buf, 10, &zram->disksize); - if (ret) - return ret; - - zram->disksize = PAGE_ALIGN(zram->disksize); + zram->disksize = PAGE_ALIGN(disksize); set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); + up_write(&zram->init_lock); return len; } @@ -106,8 +110,10 @@ static ssize_t reset_store(struct device *dev, if (bdev) fsync_bdev(bdev); + down_write(&zram->init_lock); if (zram->init_done) - zram_reset_device(zram); + __zram_reset_device(zram); + up_write(&zram->init_lock); return len; } From 8b041c69af8468d32fe97236fdae7e6bd3f87cf7 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Fri, 17 Aug 2012 01:40:53 +0800 Subject: [PATCH 12/35] mm/ksm.c is doing an unneeded _notify in write_protect_page. --- mm/ksm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/ksm.c b/mm/ksm.c index 17abf485..d40ed022 100755 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -645,7 +645,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * page */ if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { - set_pte_at_notify(mm, addr, ptep, entry); + set_pte_at(mm, addr, ptep, entry); goto out_unlock; } if (pte_dirty(entry)) From 7c81b7476dc1b0931c47849307bf1c92445c4999 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Fri, 17 Aug 2012 01:49:14 +0800 Subject: [PATCH 13/35] ksm: check for ERR_PTR from follow_page() --- mm/ksm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index d40ed022..d8ce84ad 100755 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -314,7 +314,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) do { cond_resched(); page = follow_page(vma, addr, FOLL_GET); - if (!page) + if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) ret = handle_mm_fault(vma->vm_mm, vma, addr, @@ -388,7 +388,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) goto out; page = follow_page(vma, addr, FOLL_GET); - if (!page) + if (IS_ERR_OR_NULL(page)) goto out; if (PageAnon(page)) { flush_anon_page(vma, page, addr); @@ -1320,7 +1320,7 @@ next_mm: static void ksm_do_scan(unsigned int scan_npages) { struct rmap_item *rmap_item; - struct page *page; + struct page *uninitialized_var(page); while (scan_npages--) { cond_resched(); From f1beec1b3240e23e026b29050f14d9632a6708b1 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Sat, 18 Aug 2012 23:45:48 +0800 Subject: [PATCH 14/35] vmalloc(): adjust gfp mask passed on nested vmalloc() invocation --- mm/vmalloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) mode change 100644 => 100755 mm/vmalloc.c diff --git a/mm/vmalloc.c b/mm/vmalloc.c old mode 100644 new mode 100755 index c2287313..b689e2ae --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1470,6 +1470,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, { struct page **pages; unsigned int nr_pages, array_size, i; + gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1477,13 +1478,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, + pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, PAGE_KERNEL, node, caller); area->flags |= VM_VPAGES; } else { - pages = kmalloc_node(array_size, - (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, - node); + pages = kmalloc_node(array_size, nested_gfp, node); } area->pages = pages; area->caller = caller; From e0c9143ea1ec510a41b347be043e98034eedf5c8 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 00:49:43 +0800 Subject: [PATCH 15/35] mm: cleancache core ops functions and config --- Documentation/vm/cleancache.txt | 279 ++++++++++++++++++++++++++++++++ include/linux/cleancache.h | 122 ++++++++++++++ mm/Kconfig | 22 +++ mm/Makefile | 1 + mm/cleancache.c | 244 ++++++++++++++++++++++++++++ 5 files changed, 668 insertions(+) create mode 100755 Documentation/vm/cleancache.txt create mode 100755 include/linux/cleancache.h mode change 100644 => 100755 mm/Kconfig mode change 100644 => 100755 mm/Makefile create mode 100755 mm/cleancache.c diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt new file mode 100755 index 00000000..e0a53567 --- /dev/null +++ b/Documentation/vm/cleancache.txt @@ -0,0 +1,279 @@ +MOTIVATION + +Cleancache is a new optional feature provided by the VFS layer that +potentially dramatically increases page cache effectiveness for +many workloads in many environments at a negligible cost. + +Cleancache can be thought of as a page-granularity victim cache for clean +pages that the kernel's pageframe replacement algorithm (PFRA) would like +to keep around, but can't since there isn't enough memory. So when the +PFRA "evicts" a page, it first attempts to use cleancache code to +put the data contained in that page into "transcendent memory", memory +that is not directly accessible or addressable by the kernel and is +of unknown and possibly time-varying size. + +Later, when a cleancache-enabled filesystem wishes to access a page +in a file on disk, it first checks cleancache to see if it already +contains it; if it does, the page of data is copied into the kernel +and a disk access is avoided. + +Transcendent memory "drivers" for cleancache are currently implemented +in Xen (using hypervisor memory) and zcache (using in-kernel compressed +memory) and other implementations are in development. + +FAQs are included below. + +IMPLEMENTATION OVERVIEW + +A cleancache "backend" that provides transcendent memory registers itself +to the kernel's cleancache "frontend" by calling cleancache_register_ops, +passing a pointer to a cleancache_ops structure with funcs set appropriately. +Note that cleancache_register_ops returns the previous settings so that +chaining can be performed if desired. The functions provided must conform to +certain semantics as follows: + +Most important, cleancache is "ephemeral". Pages which are copied into +cleancache have an indefinite lifetime which is completely unknowable +by the kernel and so may or may not still be in cleancache at any later time. +Thus, as its name implies, cleancache is not suitable for dirty pages. +Cleancache has complete discretion over what pages to preserve and what +pages to discard and when. + +Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a +pool id which, if positive, must be saved in the filesystem's superblock; +a negative return value indicates failure. A "put_page" will copy a +(presumably about-to-be-evicted) page into cleancache and associate it with +the pool id, a file key, and a page index into the file. (The combination +of a pool id, a file key, and an index is sometimes called a "handle".) +A "get_page" will copy the page, if found, from cleancache into kernel memory. +An "invalidate_page" will ensure the page no longer is present in cleancache; +an "invalidate_inode" will invalidate all pages associated with the specified +file; and, when a filesystem is unmounted, an "invalidate_fs" will invalidate +all pages in all files specified by the given pool id and also surrender +the pool id. + +An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache +to treat the pool as shared using a 128-bit UUID as a key. On systems +that may run multiple kernels (such as hard partitioned or virtualized +systems) that may share a clustered filesystem, and where cleancache +may be shared among those kernels, calls to init_shared_fs that specify the +same UUID will receive the same pool id, thus allowing the pages to +be shared. Note that any security requirements must be imposed outside +of the kernel (e.g. by "tools" that control cleancache). Or a +cleancache implementation can simply disable shared_init by always +returning a negative value. + +If a get_page is successful on a non-shared pool, the page is invalidated +(thus making cleancache an "exclusive" cache). On a shared pool, the page +is NOT invalidated on a successful get_page so that it remains accessible to +other sharers. The kernel is responsible for ensuring coherency between +cleancache (shared or not), the page cache, and the filesystem, using +cleancache invalidate operations as required. + +Note that cleancache must enforce put-put-get coherency and get-get +coherency. For the former, if two puts are made to the same handle but +with different data, say AAA by the first put and BBB by the second, a +subsequent get can never return the stale data (AAA). For get-get coherency, +if a get for a given handle fails, subsequent gets for that handle will +never succeed unless preceded by a successful put with that handle. + +Last, cleancache provides no SMP serialization guarantees; if two +different Linux threads are simultaneously putting and invalidating a page +with the same handle, the results are indeterminate. Callers must +lock the page to ensure serial behavior. + +CLEANCACHE PERFORMANCE METRICS + +Cleancache monitoring is done by sysfs files in the +/sys/kernel/mm/cleancache directory. The effectiveness of cleancache +can be measured (across all filesystems) with: + +succ_gets - number of gets that were successful +failed_gets - number of gets that failed +puts - number of puts attempted (all "succeed") +invalidates - number of invalidates attempted + +A backend implementatation may provide additional metrics. + +FAQ + +1) Where's the value? (Andrew Morton) + +Cleancache provides a significant performance benefit to many workloads +in many environments with negligible overhead by improving the +effectiveness of the pagecache. Clean pagecache pages are +saved in transcendent memory (RAM that is otherwise not directly +addressable to the kernel); fetching those pages later avoids "refaults" +and thus disk reads. + +Cleancache (and its sister code "frontswap") provide interfaces for +this transcendent memory (aka "tmem"), which conceptually lies between +fast kernel-directly-addressable RAM and slower DMA/asynchronous devices. +Disallowing direct kernel or userland reads/writes to tmem +is ideal when data is transformed to a different form and size (such +as with compression) or secretly moved (as might be useful for write- +balancing for some RAM-like devices). Evicted page-cache pages (and +swap pages) are a great use for this kind of slower-than-RAM-but-much- +faster-than-disk transcendent memory, and the cleancache (and frontswap) +"page-object-oriented" specification provides a nice way to read and +write -- and indirectly "name" -- the pages. + +In the virtual case, the whole point of virtualization is to statistically +multiplex physical resources across the varying demands of multiple +virtual machines. This is really hard to do with RAM and efforts to +do it well with no kernel change have essentially failed (except in some +well-publicized special-case workloads). Cleancache -- and frontswap -- +with a fairly small impact on the kernel, provide a huge amount +of flexibility for more dynamic, flexible RAM multiplexing. +Specifically, the Xen Transcendent Memory backend allows otherwise +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple +virtual machines, but the pages can be compressed and deduplicated to +optimize RAM utilization. And when guest OS's are induced to surrender +underutilized RAM (e.g. with "self-ballooning"), page cache pages +are the first to go, and cleancache allows those pages to be +saved and reclaimed if overall host system memory conditions allow. + +And the identical interface used for cleancache can be used in +physical systems as well. The zcache driver acts as a memory-hungry +device that stores pages of data in a compressed state. And +the proposed "RAMster" driver shares RAM across multiple physical +systems. + +2) Why does cleancache have its sticky fingers so deep inside the + filesystems and VFS? (Andrew Morton and Christoph Hellwig) + +The core hooks for cleancache in VFS are in most cases a single line +and the minimum set are placed precisely where needed to maintain +coherency (via cleancache_invalidate operations) between cleancache, +the page cache, and disk. All hooks compile into nothingness if +cleancache is config'ed off and turn into a function-pointer- +compare-to-NULL if config'ed on but no backend claims the ops +functions, or to a compare-struct-element-to-negative if a +backend claims the ops functions but a filesystem doesn't enable +cleancache. + +Some filesystems are built entirely on top of VFS and the hooks +in VFS are sufficient, so don't require an "init_fs" hook; the +initial implementation of cleancache didn't provide this hook. +But for some filesystems (such as btrfs), the VFS hooks are +incomplete and one or more hooks in fs-specific code are required. +And for some other filesystems, such as tmpfs, cleancache may +be counterproductive. So it seemed prudent to require a filesystem +to "opt in" to use cleancache, which requires adding a hook in +each filesystem. Not all filesystems are supported by cleancache +only because they haven't been tested. The existing set should +be sufficient to validate the concept, the opt-in approach means +that untested filesystems are not affected, and the hooks in the +existing filesystems should make it very easy to add more +filesystems in the future. + +The total impact of the hooks to existing fs and mm files is only +about 40 lines added (not counting comments and blank lines). + +3) Why not make cleancache asynchronous and batched so it can + more easily interface with real devices with DMA instead + of copying each individual page? (Minchan Kim) + +The one-page-at-a-time copy semantics simplifies the implementation +on both the frontend and backend and also allows the backend to +do fancy things on-the-fly like page compression and +page deduplication. And since the data is "gone" (copied into/out +of the pageframe) before the cleancache get/put call returns, +a great deal of race conditions and potential coherency issues +are avoided. While the interface seems odd for a "real device" +or for real kernel-addressable RAM, it makes perfect sense for +transcendent memory. + +4) Why is non-shared cleancache "exclusive"? And where is the + page "invalidated" after a "get"? (Minchan Kim) + +The main reason is to free up space in transcendent memory and +to avoid unnecessary cleancache_invalidate calls. If you want inclusive, +the page can be "put" immediately following the "get". If +put-after-get for inclusive becomes common, the interface could +be easily extended to add a "get_no_invalidate" call. + +The invalidate is done by the cleancache backend implementation. + +5) What's the performance impact? + +Performance analysis has been presented at OLS'09 and LCA'10. +Briefly, performance gains can be significant on most workloads, +especially when memory pressure is high (e.g. when RAM is +overcommitted in a virtual workload); and because the hooks are +invoked primarily in place of or in addition to a disk read/write, +overhead is negligible even in worst case workloads. Basically +cleancache replaces I/O with memory-copy-CPU-overhead; on older +single-core systems with slow memory-copy speeds, cleancache +has little value, but in newer multicore machines, especially +consolidated/virtualized machines, it has great value. + +6) How do I add cleancache support for filesystem X? (Boaz Harrash) + +Filesystems that are well-behaved and conform to certain +restrictions can utilize cleancache simply by making a call to +cleancache_init_fs at mount time. Unusual, misbehaving, or +poorly layered filesystems must either add additional hooks +and/or undergo extensive additional testing... or should just +not enable the optional cleancache. + +Some points for a filesystem to consider: + +- The FS should be block-device-based (e.g. a ram-based FS such + as tmpfs should not enable cleancache) +- To ensure coherency/correctness, the FS must ensure that all + file removal or truncation operations either go through VFS or + add hooks to do the equivalent cleancache "invalidate" operations +- To ensure coherency/correctness, either inode numbers must + be unique across the lifetime of the on-disk file OR the + FS must provide an "encode_fh" function. +- The FS must call the VFS superblock alloc and deactivate routines + or add hooks to do the equivalent cleancache calls done there. +- To maximize performance, all pages fetched from the FS should + go through the do_mpag_readpage routine or the FS should add + hooks to do the equivalent (cf. btrfs) +- Currently, the FS blocksize must be the same as PAGESIZE. This + is not an architectural restriction, but no backends currently + support anything different. +- A clustered FS should invoke the "shared_init_fs" cleancache + hook to get best performance for some backends. + +7) Why not use the KVA of the inode as the key? (Christoph Hellwig) + +If cleancache would use the inode virtual address instead of +inode/filehandle, the pool id could be eliminated. But, this +won't work because cleancache retains pagecache data pages +persistently even when the inode has been pruned from the +inode unused list, and only invalidates the data page if the file +gets removed/truncated. So if cleancache used the inode kva, +there would be potential coherency issues if/when the inode +kva is reused for a different file. Alternately, if cleancache +invalidated the pages when the inode kva was freed, much of the value +of cleancache would be lost because the cache of pages in cleanache +is potentially much larger than the kernel pagecache and is most +useful if the pages survive inode cache removal. + +8) Why is a global variable required? + +The cleancache_enabled flag is checked in all of the frequently-used +cleancache hooks. The alternative is a function call to check a static +variable. Since cleancache is enabled dynamically at runtime, systems +that don't enable cleancache would suffer thousands (possibly +tens-of-thousands) of unnecessary function calls per second. So the +global variable allows cleancache to be enabled by default at compile +time, but have insignificant performance impact when cleancache remains +disabled at runtime. + +9) Does cleanache work with KVM? + +The memory model of KVM is sufficiently different that a cleancache +backend may have less value for KVM. This remains to be tested, +especially in an overcommitted system. + +10) Does cleancache work in userspace? It sounds useful for + memory hungry caches like web browsers. (Jamie Lokier) + +No plans yet, though we agree it sounds useful, at least for +apps that bypass the page cache (e.g. O_DIRECT). + +Last updated: Dan Magenheimer, April 13 2011 diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h new file mode 100755 index 00000000..04ffb2e6 --- /dev/null +++ b/include/linux/cleancache.h @@ -0,0 +1,122 @@ +#ifndef _LINUX_CLEANCACHE_H +#define _LINUX_CLEANCACHE_H + +#include +#include +#include + +#define CLEANCACHE_KEY_MAX 6 + +/* + * cleancache requires every file with a page in cleancache to have a + * unique key unless/until the file is removed/truncated. For some + * filesystems, the inode number is unique, but for "modern" filesystems + * an exportable filehandle is required (see exportfs.h) + */ +struct cleancache_filekey { + union { + ino_t ino; + __u32 fh[CLEANCACHE_KEY_MAX]; + u32 key[CLEANCACHE_KEY_MAX]; + } u; +}; + +struct cleancache_ops { + int (*init_fs)(size_t); + int (*init_shared_fs)(char *uuid, size_t); + int (*get_page)(int, struct cleancache_filekey, + pgoff_t, struct page *); + void (*put_page)(int, struct cleancache_filekey, + pgoff_t, struct page *); + void (*flush_page)(int, struct cleancache_filekey, pgoff_t); + void (*flush_inode)(int, struct cleancache_filekey); + void (*flush_fs)(int); +}; + +extern struct cleancache_ops + cleancache_register_ops(struct cleancache_ops *ops); +extern void __cleancache_init_fs(struct super_block *); +extern void __cleancache_init_shared_fs(char *, struct super_block *); +extern int __cleancache_get_page(struct page *); +extern void __cleancache_put_page(struct page *); +extern void __cleancache_flush_page(struct address_space *, struct page *); +extern void __cleancache_flush_inode(struct address_space *); +extern void __cleancache_flush_fs(struct super_block *); +extern int cleancache_enabled; + +#ifdef CONFIG_CLEANCACHE +static inline bool cleancache_fs_enabled(struct page *page) +{ + return page->mapping->host->i_sb->cleancache_poolid >= 0; +} +static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) +{ + return mapping->host->i_sb->cleancache_poolid >= 0; +} +#else +#define cleancache_enabled (0) +#define cleancache_fs_enabled(_page) (0) +#define cleancache_fs_enabled_mapping(_page) (0) +#endif + +/* + * The shim layer provided by these inline functions allows the compiler + * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE + * is disabled, to a single global variable check if CONFIG_CLEANCACHE + * is enabled but no cleancache "backend" has dynamically enabled it, + * and, for the most frequent cleancache ops, to a single global variable + * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled + * and a cleancache backend has dynamically enabled cleancache, but the + * filesystem referenced by that cleancache op has not enabled cleancache. + * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially + * no measurable performance impact. + */ + +static inline void cleancache_init_fs(struct super_block *sb) +{ + if (cleancache_enabled) + __cleancache_init_fs(sb); +} + +static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb) +{ + if (cleancache_enabled) + __cleancache_init_shared_fs(uuid, sb); +} + +static inline int cleancache_get_page(struct page *page) +{ + int ret = -1; + + if (cleancache_enabled && cleancache_fs_enabled(page)) + ret = __cleancache_get_page(page); + return ret; +} + +static inline void cleancache_put_page(struct page *page) +{ + if (cleancache_enabled && cleancache_fs_enabled(page)) + __cleancache_put_page(page); +} + +static inline void cleancache_flush_page(struct address_space *mapping, + struct page *page) +{ + /* careful... page->mapping is NULL sometimes when this is called */ + if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) + __cleancache_flush_page(mapping, page); +} + +static inline void cleancache_flush_inode(struct address_space *mapping) +{ + if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) + __cleancache_flush_inode(mapping); +} + +static inline void cleancache_flush_fs(struct super_block *sb) +{ + if (cleancache_enabled) + __cleancache_flush_fs(sb); +} + +#endif /* _LINUX_CLEANCACHE_H */ diff --git a/mm/Kconfig b/mm/Kconfig old mode 100644 new mode 100755 index 2c19c0ba..f86e0d29 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -288,3 +288,25 @@ config NOMMU_INITIAL_TRIM_EXCESS of 1 says that all excess pages should be trimmed. See Documentation/nommu-mmap.txt for more information. +config CLEANCACHE + bool "Enable cleancache driver to cache clean pages if tmem is present" + default n + help + Cleancache can be thought of as a page-granularity victim cache + for clean pages that the kernel's pageframe replacement algorithm + (PFRA) would like to keep around, but can't since there isn't enough + memory. So when the PFRA "evicts" a page, it first attempts to use + cleancacne code to put the data contained in that page into + "transcendent memory", memory that is not directly accessible or + addressable by the kernel and is of unknown and possibly + time-varying size. And when a cleancache-enabled + filesystem wishes to access a page in a file on disk, it first + checks cleancache to see if it already contains it; if it does, + the page is copied into the kernel and a disk access is avoided. + When a transcendent memory driver is available (such as zcache or + Xen transcendent memory), a significant I/O reduction + may be achieved. When none is available, all cleancache calls + are reduced to a single pointer-compare-against-NULL resulting + in a negligible performance hit. + + If unsure, say Y to enable cleancache \ No newline at end of file diff --git a/mm/Makefile b/mm/Makefile old mode 100644 new mode 100755 index 66f54865..82a734fd --- a/mm/Makefile +++ b/mm/Makefile @@ -46,3 +46,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o +obj-$(CONFIG_CLEANCACHE) += cleancache.o diff --git a/mm/cleancache.c b/mm/cleancache.c new file mode 100755 index 00000000..bcaae4c2 --- /dev/null +++ b/mm/cleancache.c @@ -0,0 +1,244 @@ +/* + * Cleancache frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of cleancache. See + * Documentation/vm/cleancache.txt for more information. + * + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include +#include +#include +#include +#include + +/* + * This global enablement flag may be read thousands of times per second + * by cleancache_get/put/flush even on systems where cleancache_ops + * is not claimed (e.g. cleancache is config'ed on but remains + * disabled), so is preferred to the slower alternative: a function + * call that checks a non-global. + */ +int cleancache_enabled; +EXPORT_SYMBOL(cleancache_enabled); + +/* + * cleancache_ops is set by cleancache_ops_register to contain the pointers + * to the cleancache "backend" implementation functions. + */ +static struct cleancache_ops cleancache_ops; + +/* useful stats available in /sys/kernel/mm/cleancache */ +static unsigned long cleancache_succ_gets; +static unsigned long cleancache_failed_gets; +static unsigned long cleancache_puts; +static unsigned long cleancache_flushes; + +/* + * register operations for cleancache, returning previous thus allowing + * detection of multiple backends and possible nesting + */ +struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) +{ + struct cleancache_ops old = cleancache_ops; + + cleancache_ops = *ops; + cleancache_enabled = 1; + return old; +} +EXPORT_SYMBOL(cleancache_register_ops); + +/* Called by a cleancache-enabled filesystem at time of mount */ +void __cleancache_init_fs(struct super_block *sb) +{ + sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); +} +EXPORT_SYMBOL(__cleancache_init_fs); + +/* Called by a cleancache-enabled clustered filesystem at time of mount */ +void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) +{ + sb->cleancache_poolid = + (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); +} +EXPORT_SYMBOL(__cleancache_init_shared_fs); + +/* + * If the filesystem uses exportable filehandles, use the filehandle as + * the key, else use the inode number. + */ +static int cleancache_get_key(struct inode *inode, + struct cleancache_filekey *key) +{ + int (*fhfn)(struct dentry *, __u32 *fh, int *, int); + int len = 0, maxlen = CLEANCACHE_KEY_MAX; + struct super_block *sb = inode->i_sb; + + key->u.ino = inode->i_ino; + if (sb->s_export_op != NULL) { + fhfn = sb->s_export_op->encode_fh; + if (fhfn) { + struct dentry d; + d.d_inode = inode; + len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); + if (len <= 0 || len == 255) + return -1; + if (maxlen > CLEANCACHE_KEY_MAX) + return -1; + } + } + return 0; +} + +/* + * "Get" data from cleancache associated with the poolid/inode/index + * that were specified when the data was put to cleanache and, if + * successful, use it to fill the specified page with data and return 0. + * The pageframe is unchanged and returns -1 if the get fails. + * Page must be locked by caller. + */ +int __cleancache_get_page(struct page *page) +{ + int ret = -1; + int pool_id; + struct cleancache_filekey key = { .u.key = { 0 } }; + + VM_BUG_ON(!PageLocked(page)); + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id < 0) + goto out; + + if (cleancache_get_key(page->mapping->host, &key) < 0) + goto out; + + ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); + if (ret == 0) + cleancache_succ_gets++; + else + cleancache_failed_gets++; +out: + return ret; +} +EXPORT_SYMBOL(__cleancache_get_page); + +/* + * "Put" data from a page to cleancache and associate it with the + * (previously-obtained per-filesystem) poolid and the page's, + * inode and page index. Page must be locked. Note that a put_page + * always "succeeds", though a subsequent get_page may succeed or fail. + */ +void __cleancache_put_page(struct page *page) +{ + int pool_id; + struct cleancache_filekey key = { .u.key = { 0 } }; + + VM_BUG_ON(!PageLocked(page)); + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id >= 0 && + cleancache_get_key(page->mapping->host, &key) >= 0) { + (*cleancache_ops.put_page)(pool_id, key, page->index, page); + cleancache_puts++; + } +} +EXPORT_SYMBOL(__cleancache_put_page); + +/* + * Flush any data from cleancache associated with the poolid and the + * page's inode and page index so that a subsequent "get" will fail. + */ +void __cleancache_flush_page(struct address_space *mapping, struct page *page) +{ + /* careful... page->mapping is NULL sometimes when this is called */ + int pool_id = mapping->host->i_sb->cleancache_poolid; + struct cleancache_filekey key = { .u.key = { 0 } }; + + if (pool_id >= 0) { + VM_BUG_ON(!PageLocked(page)); + if (cleancache_get_key(mapping->host, &key) >= 0) { + (*cleancache_ops.flush_page)(pool_id, key, page->index); + cleancache_flushes++; + } + } +} +EXPORT_SYMBOL(__cleancache_flush_page); + +/* + * Flush all data from cleancache associated with the poolid and the + * mappings's inode so that all subsequent gets to this poolid/inode + * will fail. + */ +void __cleancache_flush_inode(struct address_space *mapping) +{ + int pool_id = mapping->host->i_sb->cleancache_poolid; + struct cleancache_filekey key = { .u.key = { 0 } }; + + if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) + (*cleancache_ops.flush_inode)(pool_id, key); +} +EXPORT_SYMBOL(__cleancache_flush_inode); + +/* + * Called by any cleancache-enabled filesystem at time of unmount; + * note that pool_id is surrendered and may be reutrned by a subsequent + * cleancache_init_fs or cleancache_init_shared_fs + */ +void __cleancache_flush_fs(struct super_block *sb) +{ + if (sb->cleancache_poolid >= 0) { + int old_poolid = sb->cleancache_poolid; + sb->cleancache_poolid = -1; + (*cleancache_ops.flush_fs)(old_poolid); + } +} +EXPORT_SYMBOL(__cleancache_flush_fs); + +#ifdef CONFIG_SYSFS + +/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */ + +#define CLEANCACHE_SYSFS_RO(_name) \ + static ssize_t cleancache_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ + { \ + return sprintf(buf, "%lu\n", cleancache_##_name); \ + } \ + static struct kobj_attribute cleancache_##_name##_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = cleancache_##_name##_show, \ + } + +CLEANCACHE_SYSFS_RO(succ_gets); +CLEANCACHE_SYSFS_RO(failed_gets); +CLEANCACHE_SYSFS_RO(puts); +CLEANCACHE_SYSFS_RO(flushes); + +static struct attribute *cleancache_attrs[] = { + &cleancache_succ_gets_attr.attr, + &cleancache_failed_gets_attr.attr, + &cleancache_puts_attr.attr, + &cleancache_flushes_attr.attr, + NULL, +}; + +static struct attribute_group cleancache_attr_group = { + .attrs = cleancache_attrs, + .name = "cleancache", +}; + +#endif /* CONFIG_SYSFS */ + +static int __init init_cleancache(void) +{ +#ifdef CONFIG_SYSFS + int err; + + err = sysfs_create_group(mm_kobj, &cleancache_attr_group); +#endif /* CONFIG_SYSFS */ + return 0; +} +module_init(init_cleancache) From 8eb6724dbfb99bb1f17f3192483fafc1f9eb73fe Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 00:51:06 +0800 Subject: [PATCH 16/35] add zcache --- drivers/staging/Kconfig | 1 + drivers/staging/Makefile | 1 + drivers/staging/zcache/Kconfig | 13 + drivers/staging/zcache/Makefile | 3 + drivers/staging/zcache/tmem.c | 710 +++++++++++++ drivers/staging/zcache/tmem.h | 195 ++++ drivers/staging/zcache/zcache.c | 1658 +++++++++++++++++++++++++++++++ 7 files changed, 2581 insertions(+) create mode 100755 drivers/staging/zcache/Kconfig create mode 100755 drivers/staging/zcache/Makefile create mode 100755 drivers/staging/zcache/tmem.c create mode 100755 drivers/staging/zcache/tmem.h create mode 100755 drivers/staging/zcache/zcache.c diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 8ee4bfa6..e4c3c9dd 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -125,5 +125,6 @@ source "drivers/staging/iio/Kconfig" source "drivers/staging/zram/Kconfig" +source "drivers/staging/zcache/Kconfig" endif # !STAGING_EXCLUDE_BUILD endif # STAGING diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 5a1b7341..5f0f554b 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -45,4 +45,5 @@ obj-$(CONFIG_DX_SEP) += sep/ obj-$(CONFIG_IIO) += iio/ obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_XVMALLOC) += zram/ +obj-$(CONFIG_ZCACHE) += zcache/ diff --git a/drivers/staging/zcache/Kconfig b/drivers/staging/zcache/Kconfig new file mode 100755 index 00000000..7fabcb2b --- /dev/null +++ b/drivers/staging/zcache/Kconfig @@ -0,0 +1,13 @@ +config ZCACHE + tristate "Dynamic compression of swap pages and clean pagecache pages" + depends on CLEANCACHE || FRONTSWAP + select XVMALLOC + select LZO_COMPRESS + select LZO_DECOMPRESS + default n + help + Zcache doubles RAM efficiency while providing a significant + performance boosts on many workloads. Zcache uses lzo1x + compression and an in-kernel implementation of transcendent + memory to store clean page cache pages and swap in RAM, + providing a noticeable reduction in disk I/O. diff --git a/drivers/staging/zcache/Makefile b/drivers/staging/zcache/Makefile new file mode 100755 index 00000000..f5ec64f9 --- /dev/null +++ b/drivers/staging/zcache/Makefile @@ -0,0 +1,3 @@ +zcache-y := tmem.o + +obj-$(CONFIG_ZCACHE) += zcache.o diff --git a/drivers/staging/zcache/tmem.c b/drivers/staging/zcache/tmem.c new file mode 100755 index 00000000..e954d405 --- /dev/null +++ b/drivers/staging/zcache/tmem.c @@ -0,0 +1,710 @@ +/* + * In-kernel transcendent memory (generic implementation) + * + * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. + * + * The primary purpose of Transcedent Memory ("tmem") is to map object-oriented + * "handles" (triples containing a pool id, and object id, and an index), to + * pages in a page-accessible memory (PAM). Tmem references the PAM pages via + * an abstract "pampd" (PAM page-descriptor), which can be operated on by a + * set of functions (pamops). Each pampd contains some representation of + * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of + * pages and must be able to insert, find, and delete these pages at a + * potential frequency of thousands per second concurrently across many CPUs, + * (and, if used with KVM, across many vcpus across many guests). + * Tmem is tracked with a hierarchy of data structures, organized by + * the elements in a handle-tuple: pool_id, object_id, and page index. + * One or more "clients" (e.g. guests) each provide one or more tmem_pools. + * Each pool, contains a hash table of rb_trees of tmem_objs. Each + * tmem_obj contains a radix-tree-like tree of pointers, with intermediate + * nodes called tmem_objnodes. Each leaf pointer in this tree points to + * a pampd, which is accessible only through a small set of callbacks + * registered by the PAM implementation (see tmem_register_pamops). Tmem + * does all memory allocation via a set of callbacks registered by the tmem + * host implementation (e.g. see tmem_register_hostops). + */ + +#include +#include +#include + +#include "tmem.h" + +/* data structure sentinels used for debugging... see tmem.h */ +#define POOL_SENTINEL 0x87658765 +#define OBJ_SENTINEL 0x12345678 +#define OBJNODE_SENTINEL 0xfedcba09 + +/* + * A tmem host implementation must use this function to register callbacks + * for memory allocation. + */ +static struct tmem_hostops tmem_hostops; + +static void tmem_objnode_tree_init(void); + +void tmem_register_hostops(struct tmem_hostops *m) +{ + tmem_objnode_tree_init(); + tmem_hostops = *m; +} + +/* + * A tmem host implementation must use this function to register + * callbacks for a page-accessible memory (PAM) implementation + */ +static struct tmem_pamops tmem_pamops; + +void tmem_register_pamops(struct tmem_pamops *m) +{ + tmem_pamops = *m; +} + +/* + * Oid's are potentially very sparse and tmem_objs may have an indeterminately + * short life, being added and deleted at a relatively high frequency. + * So an rb_tree is an ideal data structure to manage tmem_objs. But because + * of the potentially huge number of tmem_objs, each pool manages a hashtable + * of rb_trees to reduce search, insert, delete, and rebalancing time. + * Each hashbucket also has a lock to manage concurrent access. + * + * The following routines manage tmem_objs. When any tmem_obj is accessed, + * the hashbucket lock must be held. + */ + +/* searches for object==oid in pool, returns locked object if found */ +static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb, + struct tmem_oid *oidp) +{ + struct rb_node *rbnode; + struct tmem_obj *obj; + + rbnode = hb->obj_rb_root.rb_node; + while (rbnode) { + BUG_ON(RB_EMPTY_NODE(rbnode)); + obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); + switch (tmem_oid_compare(oidp, &obj->oid)) { + case 0: /* equal */ + goto out; + case -1: + rbnode = rbnode->rb_left; + break; + case 1: + rbnode = rbnode->rb_right; + break; + } + } + obj = NULL; +out: + return obj; +} + +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *); + +/* free an object that has no more pampds in it */ +static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb) +{ + struct tmem_pool *pool; + + BUG_ON(obj == NULL); + ASSERT_SENTINEL(obj, OBJ); + BUG_ON(obj->pampd_count > 0); + pool = obj->pool; + BUG_ON(pool == NULL); + if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */ + tmem_pampd_destroy_all_in_obj(obj); + BUG_ON(obj->objnode_tree_root != NULL); + BUG_ON((long)obj->objnode_count != 0); + atomic_dec(&pool->obj_count); + BUG_ON(atomic_read(&pool->obj_count) < 0); + INVERT_SENTINEL(obj, OBJ); + obj->pool = NULL; + tmem_oid_set_invalid(&obj->oid); + rb_erase(&obj->rb_tree_node, &hb->obj_rb_root); +} + +/* + * initialize, and insert an tmem_object_root (called only if find failed) + */ +static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb, + struct tmem_pool *pool, + struct tmem_oid *oidp) +{ + struct rb_root *root = &hb->obj_rb_root; + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct tmem_obj *this; + + BUG_ON(pool == NULL); + atomic_inc(&pool->obj_count); + obj->objnode_tree_height = 0; + obj->objnode_tree_root = NULL; + obj->pool = pool; + obj->oid = *oidp; + obj->objnode_count = 0; + obj->pampd_count = 0; + SET_SENTINEL(obj, OBJ); + while (*new) { + BUG_ON(RB_EMPTY_NODE(*new)); + this = rb_entry(*new, struct tmem_obj, rb_tree_node); + parent = *new; + switch (tmem_oid_compare(oidp, &this->oid)) { + case 0: + BUG(); /* already present; should never happen! */ + break; + case -1: + new = &(*new)->rb_left; + break; + case 1: + new = &(*new)->rb_right; + break; + } + } + rb_link_node(&obj->rb_tree_node, parent, new); + rb_insert_color(&obj->rb_tree_node, root); +} + +/* + * Tmem is managed as a set of tmem_pools with certain attributes, such as + * "ephemeral" vs "persistent". These attributes apply to all tmem_objs + * and all pampds that belong to a tmem_pool. A tmem_pool is created + * or deleted relatively rarely (for example, when a filesystem is + * mounted or unmounted. + */ + +/* flush all data from a pool and, optionally, free it */ +static void tmem_pool_flush(struct tmem_pool *pool, bool destroy) +{ + struct rb_node *rbnode; + struct tmem_obj *obj; + struct tmem_hashbucket *hb = &pool->hashbucket[0]; + int i; + + BUG_ON(pool == NULL); + for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { + spin_lock(&hb->lock); + rbnode = rb_first(&hb->obj_rb_root); + while (rbnode != NULL) { + obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); + rbnode = rb_next(rbnode); + tmem_pampd_destroy_all_in_obj(obj); + tmem_obj_free(obj, hb); + (*tmem_hostops.obj_free)(obj, pool); + } + spin_unlock(&hb->lock); + } + if (destroy) + list_del(&pool->pool_list); +} + +/* + * A tmem_obj contains a radix-tree-like tree in which the intermediate + * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation + * is very specialized and tuned for specific uses and is not particularly + * suited for use from this code, though some code from the core algorithms has + * been reused, thus the copyright notices below). Each tmem_objnode contains + * a set of pointers which point to either a set of intermediate tmem_objnodes + * or a set of of pampds. + * + * Portions Copyright (C) 2001 Momchil Velikov + * Portions Copyright (C) 2001 Christoph Hellwig + * Portions Copyright (C) 2005 SGI, Christoph Lameter + */ + +struct tmem_objnode_tree_path { + struct tmem_objnode *objnode; + int offset; +}; + +/* objnode height_to_maxindex translation */ +static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1]; + +static void tmem_objnode_tree_init(void) +{ + unsigned int ht, tmp; + + for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) { + tmp = ht * OBJNODE_TREE_MAP_SHIFT; + if (tmp >= OBJNODE_TREE_INDEX_BITS) + tmem_objnode_tree_h2max[ht] = ~0UL; + else + tmem_objnode_tree_h2max[ht] = + (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1; + } +} + +static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj) +{ + struct tmem_objnode *objnode; + + ASSERT_SENTINEL(obj, OBJ); + BUG_ON(obj->pool == NULL); + ASSERT_SENTINEL(obj->pool, POOL); + objnode = (*tmem_hostops.objnode_alloc)(obj->pool); + if (unlikely(objnode == NULL)) + goto out; + objnode->obj = obj; + SET_SENTINEL(objnode, OBJNODE); + memset(&objnode->slots, 0, sizeof(objnode->slots)); + objnode->slots_in_use = 0; + obj->objnode_count++; +out: + return objnode; +} + +static void tmem_objnode_free(struct tmem_objnode *objnode) +{ + struct tmem_pool *pool; + int i; + + BUG_ON(objnode == NULL); + for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) + BUG_ON(objnode->slots[i] != NULL); + ASSERT_SENTINEL(objnode, OBJNODE); + INVERT_SENTINEL(objnode, OBJNODE); + BUG_ON(objnode->obj == NULL); + ASSERT_SENTINEL(objnode->obj, OBJ); + pool = objnode->obj->pool; + BUG_ON(pool == NULL); + ASSERT_SENTINEL(pool, POOL); + objnode->obj->objnode_count--; + objnode->obj = NULL; + (*tmem_hostops.objnode_free)(objnode, pool); +} + +/* + * lookup index in object and return associated pampd (or NULL if not found) + */ +static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) +{ + unsigned int height, shift; + struct tmem_objnode **slot = NULL; + + BUG_ON(obj == NULL); + ASSERT_SENTINEL(obj, OBJ); + BUG_ON(obj->pool == NULL); + ASSERT_SENTINEL(obj->pool, POOL); + + height = obj->objnode_tree_height; + if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) + goto out; + if (height == 0 && obj->objnode_tree_root) { + slot = &obj->objnode_tree_root; + goto out; + } + shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; + slot = &obj->objnode_tree_root; + while (height > 0) { + if (*slot == NULL) + goto out; + slot = (struct tmem_objnode **) + ((*slot)->slots + + ((index >> shift) & OBJNODE_TREE_MAP_MASK)); + shift -= OBJNODE_TREE_MAP_SHIFT; + height--; + } +out: + return slot != NULL ? *slot : NULL; +} + +static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index, + void *pampd) +{ + int ret = 0; + struct tmem_objnode *objnode = NULL, *newnode, *slot; + unsigned int height, shift; + int offset = 0; + + /* if necessary, extend the tree to be higher */ + if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) { + height = obj->objnode_tree_height + 1; + if (index > tmem_objnode_tree_h2max[height]) + while (index > tmem_objnode_tree_h2max[height]) + height++; + if (obj->objnode_tree_root == NULL) { + obj->objnode_tree_height = height; + goto insert; + } + do { + newnode = tmem_objnode_alloc(obj); + if (!newnode) { + ret = -ENOMEM; + goto out; + } + newnode->slots[0] = obj->objnode_tree_root; + newnode->slots_in_use = 1; + obj->objnode_tree_root = newnode; + obj->objnode_tree_height++; + } while (height > obj->objnode_tree_height); + } +insert: + slot = obj->objnode_tree_root; + height = obj->objnode_tree_height; + shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; + while (height > 0) { + if (slot == NULL) { + /* add a child objnode. */ + slot = tmem_objnode_alloc(obj); + if (!slot) { + ret = -ENOMEM; + goto out; + } + if (objnode) { + + objnode->slots[offset] = slot; + objnode->slots_in_use++; + } else + obj->objnode_tree_root = slot; + } + /* go down a level */ + offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; + objnode = slot; + slot = objnode->slots[offset]; + shift -= OBJNODE_TREE_MAP_SHIFT; + height--; + } + BUG_ON(slot != NULL); + if (objnode) { + objnode->slots_in_use++; + objnode->slots[offset] = pampd; + } else + obj->objnode_tree_root = pampd; + obj->pampd_count++; +out: + return ret; +} + +static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index) +{ + struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1]; + struct tmem_objnode_tree_path *pathp = path; + struct tmem_objnode *slot = NULL; + unsigned int height, shift; + int offset; + + BUG_ON(obj == NULL); + ASSERT_SENTINEL(obj, OBJ); + BUG_ON(obj->pool == NULL); + ASSERT_SENTINEL(obj->pool, POOL); + height = obj->objnode_tree_height; + if (index > tmem_objnode_tree_h2max[height]) + goto out; + slot = obj->objnode_tree_root; + if (height == 0 && obj->objnode_tree_root) { + obj->objnode_tree_root = NULL; + goto out; + } + shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT; + pathp->objnode = NULL; + do { + if (slot == NULL) + goto out; + pathp++; + offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; + pathp->offset = offset; + pathp->objnode = slot; + slot = slot->slots[offset]; + shift -= OBJNODE_TREE_MAP_SHIFT; + height--; + } while (height > 0); + if (slot == NULL) + goto out; + while (pathp->objnode) { + pathp->objnode->slots[pathp->offset] = NULL; + pathp->objnode->slots_in_use--; + if (pathp->objnode->slots_in_use) { + if (pathp->objnode == obj->objnode_tree_root) { + while (obj->objnode_tree_height > 0 && + obj->objnode_tree_root->slots_in_use == 1 && + obj->objnode_tree_root->slots[0]) { + struct tmem_objnode *to_free = + obj->objnode_tree_root; + + obj->objnode_tree_root = + to_free->slots[0]; + obj->objnode_tree_height--; + to_free->slots[0] = NULL; + to_free->slots_in_use = 0; + tmem_objnode_free(to_free); + } + } + goto out; + } + tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */ + pathp--; + } + obj->objnode_tree_height = 0; + obj->objnode_tree_root = NULL; + +out: + if (slot != NULL) + obj->pampd_count--; + BUG_ON(obj->pampd_count < 0); + return slot; +} + +/* recursively walk the objnode_tree destroying pampds and objnodes */ +static void tmem_objnode_node_destroy(struct tmem_obj *obj, + struct tmem_objnode *objnode, + unsigned int ht) +{ + int i; + + if (ht == 0) + return; + for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) { + if (objnode->slots[i]) { + if (ht == 1) { + obj->pampd_count--; + (*tmem_pamops.free)(objnode->slots[i], + obj->pool); + objnode->slots[i] = NULL; + continue; + } + tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1); + tmem_objnode_free(objnode->slots[i]); + objnode->slots[i] = NULL; + } + } +} + +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) +{ + if (obj->objnode_tree_root == NULL) + return; + if (obj->objnode_tree_height == 0) { + obj->pampd_count--; + (*tmem_pamops.free)(obj->objnode_tree_root, obj->pool); + } else { + tmem_objnode_node_destroy(obj, obj->objnode_tree_root, + obj->objnode_tree_height); + tmem_objnode_free(obj->objnode_tree_root); + obj->objnode_tree_height = 0; + } + obj->objnode_tree_root = NULL; +} + +/* + * Tmem is operated on by a set of well-defined actions: + * "put", "get", "flush", "flush_object", "new pool" and "destroy pool". + * (The tmem ABI allows for subpages and exchanges but these operations + * are not included in this implementation.) + * + * These "tmem core" operations are implemented in the following functions. + */ + +/* + * "Put" a page, e.g. copy a page from the kernel into newly allocated + * PAM space (if such space is available). Tmem_put is complicated by + * a corner case: What if a page with matching handle already exists in + * tmem? To guarantee coherency, one of two actions is necessary: Either + * the data for the page must be overwritten, or the page must be + * "flushed" so that the data is not accessible to a subsequent "get". + * Since these "duplicate puts" are relatively rare, this implementation + * always flushes for simplicity. + */ +int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, + struct page *page) +{ + struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL; + void *pampd = NULL, *pampd_del = NULL; + int ret = -ENOMEM; + bool ephemeral; + struct tmem_hashbucket *hb; + + ephemeral = is_ephemeral(pool); + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = objfound = tmem_obj_find(hb, oidp); + if (obj != NULL) { + pampd = tmem_pampd_lookup_in_obj(objfound, index); + if (pampd != NULL) { + /* if found, is a dup put, flush the old one */ + pampd_del = tmem_pampd_delete_from_obj(obj, index); + BUG_ON(pampd_del != pampd); + (*tmem_pamops.free)(pampd, pool); + if (obj->pampd_count == 0) { + objnew = obj; + objfound = NULL; + } + pampd = NULL; + } + } else { + obj = objnew = (*tmem_hostops.obj_alloc)(pool); + if (unlikely(obj == NULL)) { + ret = -ENOMEM; + goto out; + } + tmem_obj_init(obj, hb, pool, oidp); + } + BUG_ON(obj == NULL); + BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound)); + pampd = (*tmem_pamops.create)(obj->pool, &obj->oid, index, page); + if (unlikely(pampd == NULL)) + goto free; + ret = tmem_pampd_add_to_obj(obj, index, pampd); + if (unlikely(ret == -ENOMEM)) + /* may have partially built objnode tree ("stump") */ + goto delete_and_free; + goto out; + +delete_and_free: + (void)tmem_pampd_delete_from_obj(obj, index); +free: + if (pampd) + (*tmem_pamops.free)(pampd, pool); + if (objnew) { + tmem_obj_free(objnew, hb); + (*tmem_hostops.obj_free)(objnew, pool); + } +out: + spin_unlock(&hb->lock); + return ret; +} + +/* + * "Get" a page, e.g. if one can be found, copy the tmem page with the + * matching handle from PAM space to the kernel. By tmem definition, + * when a "get" is successful on an ephemeral page, the page is "flushed", + * and when a "get" is successful on a persistent page, the page is retained + * in tmem. Note that to preserve + * coherency, "get" can never be skipped if tmem contains the data. + * That is, if a get is done with a certain handle and fails, any + * subsequent "get" must also fail (unless of course there is a + * "put" done with the same handle). + + */ +int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, + uint32_t index, struct page *page) +{ + struct tmem_obj *obj; + void *pampd; + bool ephemeral = is_ephemeral(pool); + uint32_t ret = -1; + struct tmem_hashbucket *hb; + + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; + ephemeral = is_ephemeral(pool); + if (ephemeral) + pampd = tmem_pampd_delete_from_obj(obj, index); + else + pampd = tmem_pampd_lookup_in_obj(obj, index); + if (pampd == NULL) + goto out; + ret = (*tmem_pamops.get_data)(page, pampd, pool); + if (ret < 0) + goto out; + if (ephemeral) { + (*tmem_pamops.free)(pampd, pool); + if (obj->pampd_count == 0) { + tmem_obj_free(obj, hb); + (*tmem_hostops.obj_free)(obj, pool); + obj = NULL; + } + } + ret = 0; +out: + spin_unlock(&hb->lock); + return ret; +} + +/* + * If a page in tmem matches the handle, "flush" this page from tmem such + * that any subsequent "get" does not succeed (unless, of course, there + * was another "put" with the same handle). + */ +int tmem_flush_page(struct tmem_pool *pool, + struct tmem_oid *oidp, uint32_t index) +{ + struct tmem_obj *obj; + void *pampd; + int ret = -1; + struct tmem_hashbucket *hb; + + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; + pampd = tmem_pampd_delete_from_obj(obj, index); + if (pampd == NULL) + goto out; + (*tmem_pamops.free)(pampd, pool); + if (obj->pampd_count == 0) { + tmem_obj_free(obj, hb); + (*tmem_hostops.obj_free)(obj, pool); + } + ret = 0; + +out: + spin_unlock(&hb->lock); + return ret; +} + +/* + * "Flush" all pages in tmem matching this oid. + */ +int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp) +{ + struct tmem_obj *obj; + struct tmem_hashbucket *hb; + int ret = -1; + + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; + tmem_pampd_destroy_all_in_obj(obj); + tmem_obj_free(obj, hb); + (*tmem_hostops.obj_free)(obj, pool); + ret = 0; + +out: + spin_unlock(&hb->lock); + return ret; +} + +/* + * "Flush" all pages (and tmem_objs) from this tmem_pool and disable + * all subsequent access to this tmem_pool. + */ +int tmem_destroy_pool(struct tmem_pool *pool) +{ + int ret = -1; + + if (pool == NULL) + goto out; + tmem_pool_flush(pool, 1); + ret = 0; +out: + return ret; +} + +static LIST_HEAD(tmem_global_pool_list); + +/* + * Create a new tmem_pool with the provided flag and return + * a pool id provided by the tmem host implementation. + */ +void tmem_new_pool(struct tmem_pool *pool, uint32_t flags) +{ + int persistent = flags & TMEM_POOL_PERSIST; + int shared = flags & TMEM_POOL_SHARED; + struct tmem_hashbucket *hb = &pool->hashbucket[0]; + int i; + + for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { + hb->obj_rb_root = RB_ROOT; + spin_lock_init(&hb->lock); + } + INIT_LIST_HEAD(&pool->pool_list); + atomic_set(&pool->obj_count, 0); + SET_SENTINEL(pool, POOL); + list_add_tail(&pool->pool_list, &tmem_global_pool_list); + pool->persistent = persistent; + pool->shared = shared; +} diff --git a/drivers/staging/zcache/tmem.h b/drivers/staging/zcache/tmem.h new file mode 100755 index 00000000..2e07e217 --- /dev/null +++ b/drivers/staging/zcache/tmem.h @@ -0,0 +1,195 @@ +/* + * tmem.h + * + * Transcendent memory + * + * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. + */ + +#ifndef _TMEM_H_ +#define _TMEM_H_ + +#include +#include +#include +#include + +/* + * These are pre-defined by the Xen<->Linux ABI + */ +#define TMEM_PUT_PAGE 4 +#define TMEM_GET_PAGE 5 +#define TMEM_FLUSH_PAGE 6 +#define TMEM_FLUSH_OBJECT 7 +#define TMEM_POOL_PERSIST 1 +#define TMEM_POOL_SHARED 2 +#define TMEM_POOL_PRECOMPRESSED 4 +#define TMEM_POOL_PAGESIZE_SHIFT 4 +#define TMEM_POOL_PAGESIZE_MASK 0xf +#define TMEM_POOL_RESERVED_BITS 0x00ffff00 + +/* + * sentinels have proven very useful for debugging but can be removed + * or disabled before final merge. + */ +#define SENTINELS +#ifdef SENTINELS +#define DECL_SENTINEL uint32_t sentinel; +#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL) +#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL) +#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL) +#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL) +#else +#define DECL_SENTINEL +#define SET_SENTINEL(_x, _y) do { } while (0) +#define INVERT_SENTINEL(_x, _y) do { } while (0) +#define ASSERT_SENTINEL(_x, _y) do { } while (0) +#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0) +#endif + +#define ASSERT_SPINLOCK(_l) WARN_ON(!spin_is_locked(_l)) + +/* + * A pool is the highest-level data structure managed by tmem and + * usually corresponds to a large independent set of pages such as + * a filesystem. Each pool has an id, and certain attributes and counters. + * It also contains a set of hash buckets, each of which contains an rbtree + * of objects and a lock to manage concurrency within the pool. + */ + +#define TMEM_HASH_BUCKET_BITS 8 +#define TMEM_HASH_BUCKETS (1<persistent) +#define is_ephemeral(_p) (!(_p->persistent)) + +/* + * An object id ("oid") is large: 192-bits (to ensure, for example, files + * in a modern filesystem can be uniquely identified). + */ + +struct tmem_oid { + uint64_t oid[3]; +}; + +static inline void tmem_oid_set_invalid(struct tmem_oid *oidp) +{ + oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL; +} + +static inline bool tmem_oid_valid(struct tmem_oid *oidp) +{ + return oidp->oid[0] != -1UL || oidp->oid[1] != -1UL || + oidp->oid[2] != -1UL; +} + +static inline int tmem_oid_compare(struct tmem_oid *left, + struct tmem_oid *right) +{ + int ret; + + if (left->oid[2] == right->oid[2]) { + if (left->oid[1] == right->oid[1]) { + if (left->oid[0] == right->oid[0]) + ret = 0; + else if (left->oid[0] < right->oid[0]) + ret = -1; + else + return 1; + } else if (left->oid[1] < right->oid[1]) + ret = -1; + else + ret = 1; + } else if (left->oid[2] < right->oid[2]) + ret = -1; + else + ret = 1; + return ret; +} + +static inline unsigned tmem_oid_hash(struct tmem_oid *oidp) +{ + return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2], + TMEM_HASH_BUCKET_BITS); +} + +/* + * A tmem_obj contains an identifier (oid), pointers to the parent + * pool and the rb_tree to which it belongs, counters, and an ordered + * set of pampds, structured in a radix-tree-like tree. The intermediate + * nodes of the tree are called tmem_objnodes. + */ + +struct tmem_objnode; + +struct tmem_obj { + struct tmem_oid oid; + struct tmem_pool *pool; + struct rb_node rb_tree_node; + struct tmem_objnode *objnode_tree_root; + unsigned int objnode_tree_height; + unsigned long objnode_count; + long pampd_count; + DECL_SENTINEL +}; + +#define OBJNODE_TREE_MAP_SHIFT 6 +#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT) +#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1) +#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +#define OBJNODE_TREE_MAX_PATH \ + (OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2) + +struct tmem_objnode { + struct tmem_obj *obj; + DECL_SENTINEL + void *slots[OBJNODE_TREE_MAP_SIZE]; + unsigned int slots_in_use; +}; + +/* pampd abstract datatype methods provided by the PAM implementation */ +struct tmem_pamops { + void *(*create)(struct tmem_pool *, struct tmem_oid *, uint32_t, + struct page *); + int (*get_data)(struct page *, void *, struct tmem_pool *); + void (*free)(void *, struct tmem_pool *); +}; +extern void tmem_register_pamops(struct tmem_pamops *m); + +/* memory allocation methods provided by the host implementation */ +struct tmem_hostops { + struct tmem_obj *(*obj_alloc)(struct tmem_pool *); + void (*obj_free)(struct tmem_obj *, struct tmem_pool *); + struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *); + void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *); +}; +extern void tmem_register_hostops(struct tmem_hostops *m); + +/* core tmem accessor functions */ +extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index, + struct page *page); +extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index, + struct page *page); +extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *, + uint32_t index); +extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *); +extern int tmem_destroy_pool(struct tmem_pool *); +extern void tmem_new_pool(struct tmem_pool *, uint32_t); +#endif /* _TMEM_H */ diff --git a/drivers/staging/zcache/zcache.c b/drivers/staging/zcache/zcache.c new file mode 100755 index 00000000..b8a2b30a --- /dev/null +++ b/drivers/staging/zcache/zcache.c @@ -0,0 +1,1658 @@ +/* + * zcache.c + * + * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp. + * Copyright (c) 2010,2011, Nitin Gupta + * + * Zcache provides an in-kernel "host implementation" for transcendent memory + * and, thus indirectly, for cleancache and frontswap. Zcache includes two + * page-accessible memory [1] interfaces, both utilizing lzo1x compression: + * 1) "compression buddies" ("zbud") is used for ephemeral pages + * 2) xvmalloc is used for persistent pages. + * Xvmalloc (based on the TLSF allocator) has very low fragmentation + * so maximizes space efficiency, while zbud allows pairs (and potentially, + * in the future, more than a pair of) compressed pages to be closely linked + * so that reclaiming can be done via the kernel's physical-page-oriented + * "shrinker" interface. + * + * [1] For a definition of page-accessible memory (aka PAM), see: + * http://marc.info/?l=linux-mm&m=127811271605009 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "tmem.h" + +#include "../zram/xvmalloc.h" /* if built in drivers/staging */ + +#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP)) +#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP" +#endif +#ifdef CONFIG_CLEANCACHE +#include +#endif +#ifdef CONFIG_FRONTSWAP +#include +#endif + +#if 0 +/* this is more aggressive but may cause other problems? */ +#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN) +#else +#define ZCACHE_GFP_MASK \ + (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC) +#endif + +/********** + * Compression buddies ("zbud") provides for packing two (or, possibly + * in the future, more) compressed ephemeral pages into a single "raw" + * (physical) page and tracking them with data structures so that + * the raw pages can be easily reclaimed. + * + * A zbud page ("zbpg") is an aligned page containing a list_head, + * a lock, and two "zbud headers". The remainder of the physical + * page is divided up into aligned 64-byte "chunks" which contain + * the compressed data for zero, one, or two zbuds. Each zbpg + * resides on: (1) an "unused list" if it has no zbuds; (2) a + * "buddied" list if it is fully populated with two zbuds; or + * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks + * the one unbuddied zbud uses. The data inside a zbpg cannot be + * read or written unless the zbpg's lock is held. + */ + +#define ZBH_SENTINEL 0x43214321 +#define ZBPG_SENTINEL 0xdeadbeef + +#define ZBUD_MAX_BUDS 2 + +struct zbud_hdr { + uint32_t pool_id; + struct tmem_oid oid; + uint32_t index; + uint16_t size; /* compressed size in bytes, zero means unused */ + DECL_SENTINEL +}; + +struct zbud_page { + struct list_head bud_list; + spinlock_t lock; + struct zbud_hdr buddy[ZBUD_MAX_BUDS]; + DECL_SENTINEL + /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */ +}; + +#define CHUNK_SHIFT 6 +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define CHUNK_MASK (~(CHUNK_SIZE-1)) +#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \ + CHUNK_MASK) >> CHUNK_SHIFT) +#define MAX_CHUNK (NCHUNKS-1) + +static struct { + struct list_head list; + unsigned count; +} zbud_unbuddied[NCHUNKS]; +/* list N contains pages with N chunks USED and NCHUNKS-N unused */ +/* element 0 is never used but optimizing that isn't worth it */ +static unsigned long zbud_cumul_chunk_counts[NCHUNKS]; + +struct list_head zbud_buddied_list; +static unsigned long zcache_zbud_buddied_count; + +/* protects the buddied list and all unbuddied lists */ +static DEFINE_SPINLOCK(zbud_budlists_spinlock); + +static LIST_HEAD(zbpg_unused_list); +static unsigned long zcache_zbpg_unused_list_count; + +/* protects the unused page list */ +static DEFINE_SPINLOCK(zbpg_unused_list_spinlock); + +static atomic_t zcache_zbud_curr_raw_pages; +static atomic_t zcache_zbud_curr_zpages; +static unsigned long zcache_zbud_curr_zbytes; +static unsigned long zcache_zbud_cumul_zpages; +static unsigned long zcache_zbud_cumul_zbytes; +static unsigned long zcache_compress_poor; + +/* forward references */ +static void *zcache_get_free_page(void); +static void zcache_free_page(void *p); + +/* + * zbud helper functions + */ + +static inline unsigned zbud_max_buddy_size(void) +{ + return MAX_CHUNK << CHUNK_SHIFT; +} + +static inline unsigned zbud_size_to_chunks(unsigned size) +{ + BUG_ON(size == 0 || size > zbud_max_buddy_size()); + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; +} + +static inline int zbud_budnum(struct zbud_hdr *zh) +{ + unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1); + struct zbud_page *zbpg = NULL; + unsigned budnum = -1U; + int i; + + for (i = 0; i < ZBUD_MAX_BUDS; i++) + if (offset == offsetof(typeof(*zbpg), buddy[i])) { + budnum = i; + break; + } + BUG_ON(budnum == -1U); + return budnum; +} + +static char *zbud_data(struct zbud_hdr *zh, unsigned size) +{ + struct zbud_page *zbpg; + char *p; + unsigned budnum; + + ASSERT_SENTINEL(zh, ZBH); + budnum = zbud_budnum(zh); + BUG_ON(size == 0 || size > zbud_max_buddy_size()); + zbpg = container_of(zh, struct zbud_page, buddy[budnum]); + ASSERT_SPINLOCK(&zbpg->lock); + p = (char *)zbpg; + if (budnum == 0) + p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) & + CHUNK_MASK); + else if (budnum == 1) + p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK); + return p; +} + +/* + * zbud raw page management + */ + +static struct zbud_page *zbud_alloc_raw_page(void) +{ + struct zbud_page *zbpg = NULL; + struct zbud_hdr *zh0, *zh1; + bool recycled = 0; + + /* if any pages on the zbpg list, use one */ + spin_lock(&zbpg_unused_list_spinlock); + if (!list_empty(&zbpg_unused_list)) { + zbpg = list_first_entry(&zbpg_unused_list, + struct zbud_page, bud_list); + list_del_init(&zbpg->bud_list); + zcache_zbpg_unused_list_count--; + recycled = 1; + } + spin_unlock(&zbpg_unused_list_spinlock); + if (zbpg == NULL) + /* none on zbpg list, try to get a kernel page */ + zbpg = zcache_get_free_page(); + if (likely(zbpg != NULL)) { + INIT_LIST_HEAD(&zbpg->bud_list); + zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; + spin_lock_init(&zbpg->lock); + if (recycled) { + ASSERT_INVERTED_SENTINEL(zbpg, ZBPG); + SET_SENTINEL(zbpg, ZBPG); + BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); + BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); + } else { + atomic_inc(&zcache_zbud_curr_raw_pages); + INIT_LIST_HEAD(&zbpg->bud_list); + SET_SENTINEL(zbpg, ZBPG); + zh0->size = 0; zh1->size = 0; + tmem_oid_set_invalid(&zh0->oid); + tmem_oid_set_invalid(&zh1->oid); + } + } + return zbpg; +} + +static void zbud_free_raw_page(struct zbud_page *zbpg) +{ + struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1]; + + ASSERT_SENTINEL(zbpg, ZBPG); + BUG_ON(!list_empty(&zbpg->bud_list)); + ASSERT_SPINLOCK(&zbpg->lock); + BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); + BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); + INVERT_SENTINEL(zbpg, ZBPG); + spin_unlock(&zbpg->lock); + spin_lock(&zbpg_unused_list_spinlock); + list_add(&zbpg->bud_list, &zbpg_unused_list); + zcache_zbpg_unused_list_count++; + spin_unlock(&zbpg_unused_list_spinlock); +} + +/* + * core zbud handling routines + */ + +static unsigned zbud_free(struct zbud_hdr *zh) +{ + unsigned size; + + ASSERT_SENTINEL(zh, ZBH); + BUG_ON(!tmem_oid_valid(&zh->oid)); + size = zh->size; + BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); + zh->size = 0; + tmem_oid_set_invalid(&zh->oid); + INVERT_SENTINEL(zh, ZBH); + zcache_zbud_curr_zbytes -= size; + atomic_dec(&zcache_zbud_curr_zpages); + return size; +} + +static void zbud_free_and_delist(struct zbud_hdr *zh) +{ + unsigned chunks; + struct zbud_hdr *zh_other; + unsigned budnum = zbud_budnum(zh), size; + struct zbud_page *zbpg = + container_of(zh, struct zbud_page, buddy[budnum]); + + spin_lock(&zbpg->lock); + if (list_empty(&zbpg->bud_list)) { + /* ignore zombie page... see zbud_evict_pages() */ + spin_unlock(&zbpg->lock); + return; + } + size = zbud_free(zh); + ASSERT_SPINLOCK(&zbpg->lock); + zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0]; + if (zh_other->size == 0) { /* was unbuddied: unlist and free */ + chunks = zbud_size_to_chunks(size) ; + spin_lock(&zbud_budlists_spinlock); + BUG_ON(list_empty(&zbud_unbuddied[chunks].list)); + list_del_init(&zbpg->bud_list); + zbud_unbuddied[chunks].count--; + spin_unlock(&zbud_budlists_spinlock); + zbud_free_raw_page(zbpg); + } else { /* was buddied: move remaining buddy to unbuddied list */ + chunks = zbud_size_to_chunks(zh_other->size) ; + spin_lock(&zbud_budlists_spinlock); + list_del_init(&zbpg->bud_list); + zcache_zbud_buddied_count--; + list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list); + zbud_unbuddied[chunks].count++; + spin_unlock(&zbud_budlists_spinlock); + spin_unlock(&zbpg->lock); + } +} + +static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid, + uint32_t index, struct page *page, + void *cdata, unsigned size) +{ + struct zbud_hdr *zh0, *zh1, *zh = NULL; + struct zbud_page *zbpg = NULL, *ztmp; + unsigned nchunks; + char *to; + int i, found_good_buddy = 0; + + nchunks = zbud_size_to_chunks(size) ; + for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) { + spin_lock(&zbud_budlists_spinlock); + if (!list_empty(&zbud_unbuddied[i].list)) { + list_for_each_entry_safe(zbpg, ztmp, + &zbud_unbuddied[i].list, bud_list) { + if (spin_trylock(&zbpg->lock)) { + found_good_buddy = i; + goto found_unbuddied; + } + } + } + spin_unlock(&zbud_budlists_spinlock); + } + /* didn't find a good buddy, try allocating a new page */ + zbpg = zbud_alloc_raw_page(); + if (unlikely(zbpg == NULL)) + goto out; + /* ok, have a page, now compress the data before taking locks */ + spin_lock(&zbpg->lock); + spin_lock(&zbud_budlists_spinlock); + list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list); + zbud_unbuddied[nchunks].count++; + zh = &zbpg->buddy[0]; + goto init_zh; + +found_unbuddied: + ASSERT_SPINLOCK(&zbpg->lock); + zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; + BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0))); + if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */ + ASSERT_SENTINEL(zh0, ZBH); + zh = zh1; + } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */ + ASSERT_SENTINEL(zh1, ZBH); + zh = zh0; + } else + BUG(); + list_del_init(&zbpg->bud_list); + zbud_unbuddied[found_good_buddy].count--; + list_add_tail(&zbpg->bud_list, &zbud_buddied_list); + zcache_zbud_buddied_count++; + +init_zh: + SET_SENTINEL(zh, ZBH); + zh->size = size; + zh->index = index; + zh->oid = *oid; + zh->pool_id = pool_id; + /* can wait to copy the data until the list locks are dropped */ + spin_unlock(&zbud_budlists_spinlock); + + to = zbud_data(zh, size); + memcpy(to, cdata, size); + spin_unlock(&zbpg->lock); + zbud_cumul_chunk_counts[nchunks]++; + atomic_inc(&zcache_zbud_curr_zpages); + zcache_zbud_cumul_zpages++; + zcache_zbud_curr_zbytes += size; + zcache_zbud_cumul_zbytes += size; +out: + return zh; +} + +static int zbud_decompress(struct page *page, struct zbud_hdr *zh) +{ + struct zbud_page *zbpg; + unsigned budnum = zbud_budnum(zh); + size_t out_len = PAGE_SIZE; + char *to_va, *from_va; + unsigned size; + int ret = 0; + + zbpg = container_of(zh, struct zbud_page, buddy[budnum]); + spin_lock(&zbpg->lock); + if (list_empty(&zbpg->bud_list)) { + /* ignore zombie page... see zbud_evict_pages() */ + ret = -EINVAL; + goto out; + } + ASSERT_SENTINEL(zh, ZBH); + BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); + to_va = kmap_atomic(page, KM_USER0); + size = zh->size; + from_va = zbud_data(zh, size); + ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len); + BUG_ON(ret != LZO_E_OK); + BUG_ON(out_len != PAGE_SIZE); + kunmap_atomic(to_va, KM_USER0); +out: + spin_unlock(&zbpg->lock); + return ret; +} + +/* + * The following routines handle shrinking of ephemeral pages by evicting + * pages "least valuable" first. + */ + +static unsigned long zcache_evicted_raw_pages; +static unsigned long zcache_evicted_buddied_pages; +static unsigned long zcache_evicted_unbuddied_pages; + +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid); +static void zcache_put_pool(struct tmem_pool *pool); + +/* + * Flush and free all zbuds in a zbpg, then free the pageframe + */ +static void zbud_evict_zbpg(struct zbud_page *zbpg) +{ + struct zbud_hdr *zh; + int i, j; + uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS]; + struct tmem_oid oid[ZBUD_MAX_BUDS]; + struct tmem_pool *pool; + + ASSERT_SPINLOCK(&zbpg->lock); + BUG_ON(!list_empty(&zbpg->bud_list)); + for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) { + zh = &zbpg->buddy[i]; + if (zh->size) { + pool_id[j] = zh->pool_id; + oid[j] = zh->oid; + index[j] = zh->index; + j++; + zbud_free(zh); + } + } + spin_unlock(&zbpg->lock); + for (i = 0; i < j; i++) { + pool = zcache_get_pool_by_id(pool_id[i]); + if (pool != NULL) { + tmem_flush_page(pool, &oid[i], index[i]); + zcache_put_pool(pool); + } + } + ASSERT_SENTINEL(zbpg, ZBPG); + spin_lock(&zbpg->lock); + zbud_free_raw_page(zbpg); +} + +/* + * Free nr pages. This code is funky because we want to hold the locks + * protecting various lists for as short a time as possible, and in some + * circumstances the list may change asynchronously when the list lock is + * not held. In some cases we also trylock not only to avoid waiting on a + * page in use by another cpu, but also to avoid potential deadlock due to + * lock inversion. + */ +static void zbud_evict_pages(int nr) +{ + struct zbud_page *zbpg; + int i; + + /* first try freeing any pages on unused list */ +retry_unused_list: + spin_lock_bh(&zbpg_unused_list_spinlock); + if (!list_empty(&zbpg_unused_list)) { + /* can't walk list here, since it may change when unlocked */ + zbpg = list_first_entry(&zbpg_unused_list, + struct zbud_page, bud_list); + list_del_init(&zbpg->bud_list); + zcache_zbpg_unused_list_count--; + atomic_dec(&zcache_zbud_curr_raw_pages); + spin_unlock_bh(&zbpg_unused_list_spinlock); + zcache_free_page(zbpg); + zcache_evicted_raw_pages++; + if (--nr <= 0) + goto out; + goto retry_unused_list; + } + spin_unlock_bh(&zbpg_unused_list_spinlock); + + /* now try freeing unbuddied pages, starting with least space avail */ + for (i = 0; i < MAX_CHUNK; i++) { +retry_unbud_list_i: + spin_lock_bh(&zbud_budlists_spinlock); + if (list_empty(&zbud_unbuddied[i].list)) { + spin_unlock_bh(&zbud_budlists_spinlock); + continue; + } + list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) { + if (unlikely(!spin_trylock(&zbpg->lock))) + continue; + list_del_init(&zbpg->bud_list); + zbud_unbuddied[i].count--; + spin_unlock(&zbud_budlists_spinlock); + zcache_evicted_unbuddied_pages++; + /* want budlists unlocked when doing zbpg eviction */ + zbud_evict_zbpg(zbpg); + local_bh_enable(); + if (--nr <= 0) + goto out; + goto retry_unbud_list_i; + } + spin_unlock_bh(&zbud_budlists_spinlock); + } + + /* as a last resort, free buddied pages */ +retry_bud_list: + spin_lock_bh(&zbud_budlists_spinlock); + if (list_empty(&zbud_buddied_list)) { + spin_unlock_bh(&zbud_budlists_spinlock); + goto out; + } + list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) { + if (unlikely(!spin_trylock(&zbpg->lock))) + continue; + list_del_init(&zbpg->bud_list); + zcache_zbud_buddied_count--; + spin_unlock(&zbud_budlists_spinlock); + zcache_evicted_buddied_pages++; + /* want budlists unlocked when doing zbpg eviction */ + zbud_evict_zbpg(zbpg); + local_bh_enable(); + if (--nr <= 0) + goto out; + goto retry_bud_list; + } + spin_unlock_bh(&zbud_budlists_spinlock); +out: + return; +} + +static void zbud_init(void) +{ + int i; + + INIT_LIST_HEAD(&zbud_buddied_list); + zcache_zbud_buddied_count = 0; + for (i = 0; i < NCHUNKS; i++) { + INIT_LIST_HEAD(&zbud_unbuddied[i].list); + zbud_unbuddied[i].count = 0; + } +} + +#ifdef CONFIG_SYSFS +/* + * These sysfs routines show a nice distribution of how many zbpg's are + * currently (and have ever been placed) in each unbuddied list. It's fun + * to watch but can probably go away before final merge. + */ +static int zbud_show_unbuddied_list_counts(char *buf) +{ + int i; + char *p = buf; + + for (i = 0; i < NCHUNKS - 1; i++) + p += sprintf(p, "%u ", zbud_unbuddied[i].count); + p += sprintf(p, "%d\n", zbud_unbuddied[i].count); + return p - buf; +} + +static int zbud_show_cumul_chunk_counts(char *buf) +{ + unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0; + unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0; + unsigned long total_chunks_lte_42 = 0; + char *p = buf; + + for (i = 0; i < NCHUNKS; i++) { + p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]); + chunks += zbud_cumul_chunk_counts[i]; + total_chunks += zbud_cumul_chunk_counts[i]; + sum_total_chunks += i * zbud_cumul_chunk_counts[i]; + if (i == 21) + total_chunks_lte_21 = total_chunks; + if (i == 32) + total_chunks_lte_32 = total_chunks; + if (i == 42) + total_chunks_lte_42 = total_chunks; + } + p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n", + total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42, + chunks == 0 ? 0 : sum_total_chunks / chunks); + return p - buf; +} +#endif + +/********** + * This "zv" PAM implementation combines the TLSF-based xvMalloc + * with lzo1x compression to maximize the amount of data that can + * be packed into a physical page. + * + * Zv represents a PAM page with the index and object (plus a "size" value + * necessary for decompression) immediately preceding the compressed data. + */ + +#define ZVH_SENTINEL 0x43214321 + +struct zv_hdr { + uint32_t pool_id; + struct tmem_oid oid; + uint32_t index; + DECL_SENTINEL +}; + +static const int zv_max_page_size = (PAGE_SIZE / 8) * 7; + +static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id, + struct tmem_oid *oid, uint32_t index, + void *cdata, unsigned clen) +{ + struct page *page; + struct zv_hdr *zv = NULL; + uint32_t offset; + int ret; + + BUG_ON(!irqs_disabled()); + ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr), + &page, &offset, ZCACHE_GFP_MASK); + if (unlikely(ret)) + goto out; + zv = kmap_atomic(page, KM_USER0) + offset; + zv->index = index; + zv->oid = *oid; + zv->pool_id = pool_id; + SET_SENTINEL(zv, ZVH); + memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen); + kunmap_atomic(zv, KM_USER0); +out: + return zv; +} + +static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv) +{ + unsigned long flags; + struct page *page; + uint32_t offset; + uint16_t size; + + ASSERT_SENTINEL(zv, ZVH); + size = xv_get_object_size(zv) - sizeof(*zv); + BUG_ON(size == 0 || size > zv_max_page_size); + INVERT_SENTINEL(zv, ZVH); + page = virt_to_page(zv); + offset = (unsigned long)zv & ~PAGE_MASK; + local_irq_save(flags); + xv_free(xvpool, page, offset); + local_irq_restore(flags); +} + +static void zv_decompress(struct page *page, struct zv_hdr *zv) +{ + size_t clen = PAGE_SIZE; + char *to_va; + unsigned size; + int ret; + + ASSERT_SENTINEL(zv, ZVH); + size = xv_get_object_size(zv) - sizeof(*zv); + BUG_ON(size == 0 || size > zv_max_page_size); + to_va = kmap_atomic(page, KM_USER0); + ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv), + size, to_va, &clen); + kunmap_atomic(to_va, KM_USER0); + BUG_ON(ret != LZO_E_OK); + BUG_ON(clen != PAGE_SIZE); +} + +/* + * zcache core code starts here + */ + +/* useful stats not collected by cleancache or frontswap */ +static unsigned long zcache_flush_total; +static unsigned long zcache_flush_found; +static unsigned long zcache_flobj_total; +static unsigned long zcache_flobj_found; +static unsigned long zcache_failed_eph_puts; +static unsigned long zcache_failed_pers_puts; + +#define MAX_POOLS_PER_CLIENT 16 + +static struct { + struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT]; + struct xv_pool *xvpool; +} zcache_client; + +/* + * Tmem operations assume the poolid implies the invoking client. + * Zcache only has one client (the kernel itself), so translate + * the poolid into the tmem_pool allocated for it. A KVM version + * of zcache would have one client per guest and each client might + * have a poolid==N. + */ +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid) +{ + struct tmem_pool *pool = NULL; + + if (poolid >= 0) { + pool = zcache_client.tmem_pools[poolid]; + if (pool != NULL) + atomic_inc(&pool->refcount); + } + return pool; +} + +static void zcache_put_pool(struct tmem_pool *pool) +{ + if (pool != NULL) + atomic_dec(&pool->refcount); +} + +/* counters for debugging */ +static unsigned long zcache_failed_get_free_pages; +static unsigned long zcache_failed_alloc; +static unsigned long zcache_put_to_flush; +static unsigned long zcache_aborted_preload; +static unsigned long zcache_aborted_shrink; + +/* + * Ensure that memory allocation requests in zcache don't result + * in direct reclaim requests via the shrinker, which would cause + * an infinite loop. Maybe a GFP flag would be better? + */ +static DEFINE_SPINLOCK(zcache_direct_reclaim_lock); + +/* + * for now, used named slabs so can easily track usage; later can + * either just use kmalloc, or perhaps add a slab-like allocator + * to more carefully manage total memory utilization + */ +static struct kmem_cache *zcache_objnode_cache; +static struct kmem_cache *zcache_obj_cache; +static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0); +static unsigned long zcache_curr_obj_count_max; +static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0); +static unsigned long zcache_curr_objnode_count_max; + +/* + * to avoid memory allocation recursion (e.g. due to direct reclaim), we + * preload all necessary data structures so the hostops callbacks never + * actually do a malloc + */ +struct zcache_preload { + void *page; + struct tmem_obj *obj; + int nr; + struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH]; +}; +static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, }; + +static int zcache_do_preload(struct tmem_pool *pool) +{ + struct zcache_preload *kp; + struct tmem_objnode *objnode; + struct tmem_obj *obj; + void *page; + int ret = -ENOMEM; + + if (unlikely(zcache_objnode_cache == NULL)) + goto out; + if (unlikely(zcache_obj_cache == NULL)) + goto out; + if (!spin_trylock(&zcache_direct_reclaim_lock)) { + zcache_aborted_preload++; + goto out; + } + preempt_disable(); + kp = &__get_cpu_var(zcache_preloads); + while (kp->nr < ARRAY_SIZE(kp->objnodes)) { + preempt_enable_no_resched(); + objnode = kmem_cache_alloc(zcache_objnode_cache, + ZCACHE_GFP_MASK); + if (unlikely(objnode == NULL)) { + zcache_failed_alloc++; + goto unlock_out; + } + preempt_disable(); + kp = &__get_cpu_var(zcache_preloads); + if (kp->nr < ARRAY_SIZE(kp->objnodes)) + kp->objnodes[kp->nr++] = objnode; + else + kmem_cache_free(zcache_objnode_cache, objnode); + } + preempt_enable_no_resched(); + obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK); + if (unlikely(obj == NULL)) { + zcache_failed_alloc++; + goto unlock_out; + } + page = (void *)__get_free_page(ZCACHE_GFP_MASK); + if (unlikely(page == NULL)) { + zcache_failed_get_free_pages++; + kmem_cache_free(zcache_obj_cache, obj); + goto unlock_out; + } + preempt_disable(); + kp = &__get_cpu_var(zcache_preloads); + if (kp->obj == NULL) + kp->obj = obj; + else + kmem_cache_free(zcache_obj_cache, obj); + if (kp->page == NULL) + kp->page = page; + else + free_page((unsigned long)page); + ret = 0; +unlock_out: + spin_unlock(&zcache_direct_reclaim_lock); +out: + return ret; +} + +static void *zcache_get_free_page(void) +{ + struct zcache_preload *kp; + void *page; + + kp = &__get_cpu_var(zcache_preloads); + page = kp->page; + BUG_ON(page == NULL); + kp->page = NULL; + return page; +} + +static void zcache_free_page(void *p) +{ + free_page((unsigned long)p); +} + +/* + * zcache implementation for tmem host ops + */ + +static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool) +{ + struct tmem_objnode *objnode = NULL; + unsigned long count; + struct zcache_preload *kp; + + kp = &__get_cpu_var(zcache_preloads); + if (kp->nr <= 0) + goto out; + objnode = kp->objnodes[kp->nr - 1]; + BUG_ON(objnode == NULL); + kp->objnodes[kp->nr - 1] = NULL; + kp->nr--; + count = atomic_inc_return(&zcache_curr_objnode_count); + if (count > zcache_curr_objnode_count_max) + zcache_curr_objnode_count_max = count; +out: + return objnode; +} + +static void zcache_objnode_free(struct tmem_objnode *objnode, + struct tmem_pool *pool) +{ + atomic_dec(&zcache_curr_objnode_count); + BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0); + kmem_cache_free(zcache_objnode_cache, objnode); +} + +static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool) +{ + struct tmem_obj *obj = NULL; + unsigned long count; + struct zcache_preload *kp; + + kp = &__get_cpu_var(zcache_preloads); + obj = kp->obj; + BUG_ON(obj == NULL); + kp->obj = NULL; + count = atomic_inc_return(&zcache_curr_obj_count); + if (count > zcache_curr_obj_count_max) + zcache_curr_obj_count_max = count; + return obj; +} + +static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool) +{ + atomic_dec(&zcache_curr_obj_count); + BUG_ON(atomic_read(&zcache_curr_obj_count) < 0); + kmem_cache_free(zcache_obj_cache, obj); +} + +static struct tmem_hostops zcache_hostops = { + .obj_alloc = zcache_obj_alloc, + .obj_free = zcache_obj_free, + .objnode_alloc = zcache_objnode_alloc, + .objnode_free = zcache_objnode_free, +}; + +/* + * zcache implementations for PAM page descriptor ops + */ + +static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0); +static unsigned long zcache_curr_eph_pampd_count_max; +static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0); +static unsigned long zcache_curr_pers_pampd_count_max; + +/* forward reference */ +static int zcache_compress(struct page *from, void **out_va, size_t *out_len); + +static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid, + uint32_t index, struct page *page) +{ + void *pampd = NULL, *cdata; + size_t clen; + int ret; + bool ephemeral = is_ephemeral(pool); + unsigned long count; + + if (ephemeral) { + ret = zcache_compress(page, &cdata, &clen); + if (ret == 0) + + goto out; + if (clen == 0 || clen > zbud_max_buddy_size()) { + zcache_compress_poor++; + goto out; + } + pampd = (void *)zbud_create(pool->pool_id, oid, index, + page, cdata, clen); + if (pampd != NULL) { + count = atomic_inc_return(&zcache_curr_eph_pampd_count); + if (count > zcache_curr_eph_pampd_count_max) + zcache_curr_eph_pampd_count_max = count; + } + } else { + /* + * FIXME: This is all the "policy" there is for now. + * 3/4 totpages should allow ~37% of RAM to be filled with + * compressed frontswap pages + */ + if (atomic_read(&zcache_curr_pers_pampd_count) > + 3 * totalram_pages / 4) + goto out; + ret = zcache_compress(page, &cdata, &clen); + if (ret == 0) + goto out; + if (clen > zv_max_page_size) { + zcache_compress_poor++; + goto out; + } + pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id, + oid, index, cdata, clen); + if (pampd == NULL) + goto out; + count = atomic_inc_return(&zcache_curr_pers_pampd_count); + if (count > zcache_curr_pers_pampd_count_max) + zcache_curr_pers_pampd_count_max = count; + } +out: + return pampd; +} + +/* + * fill the pageframe corresponding to the struct page with the data + * from the passed pampd + */ +static int zcache_pampd_get_data(struct page *page, void *pampd, + struct tmem_pool *pool) +{ + int ret = 0; + + if (is_ephemeral(pool)) + ret = zbud_decompress(page, pampd); + else + zv_decompress(page, pampd); + return ret; +} + +/* + * free the pampd and remove it from any zcache lists + * pampd must no longer be pointed to from any tmem data structures! + */ +static void zcache_pampd_free(void *pampd, struct tmem_pool *pool) +{ + if (is_ephemeral(pool)) { + zbud_free_and_delist((struct zbud_hdr *)pampd); + atomic_dec(&zcache_curr_eph_pampd_count); + BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0); + } else { + zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd); + atomic_dec(&zcache_curr_pers_pampd_count); + BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0); + } +} + +static struct tmem_pamops zcache_pamops = { + .create = zcache_pampd_create, + .get_data = zcache_pampd_get_data, + .free = zcache_pampd_free, +}; + +/* + * zcache compression/decompression and related per-cpu stuff + */ + +#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS +#define LZO_DSTMEM_PAGE_ORDER 1 +static DEFINE_PER_CPU(unsigned char *, zcache_workmem); +static DEFINE_PER_CPU(unsigned char *, zcache_dstmem); + +static int zcache_compress(struct page *from, void **out_va, size_t *out_len) +{ + int ret = 0; + unsigned char *dmem = __get_cpu_var(zcache_dstmem); + unsigned char *wmem = __get_cpu_var(zcache_workmem); + char *from_va; + + BUG_ON(!irqs_disabled()); + if (unlikely(dmem == NULL || wmem == NULL)) + goto out; /* no buffer, so can't compress */ + from_va = kmap_atomic(from, KM_USER0); + mb(); + ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem); + BUG_ON(ret != LZO_E_OK); + *out_va = dmem; + kunmap_atomic(from_va, KM_USER0); + ret = 1; +out: + return ret; +} + + +static int zcache_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + int cpu = (long)pcpu; + struct zcache_preload *kp; + + switch (action) { + case CPU_UP_PREPARE: + per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages( + GFP_KERNEL | __GFP_REPEAT, + LZO_DSTMEM_PAGE_ORDER), + per_cpu(zcache_workmem, cpu) = + kzalloc(LZO1X_MEM_COMPRESS, + GFP_KERNEL | __GFP_REPEAT); + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + free_pages((unsigned long)per_cpu(zcache_dstmem, cpu), + LZO_DSTMEM_PAGE_ORDER); + per_cpu(zcache_dstmem, cpu) = NULL; + kfree(per_cpu(zcache_workmem, cpu)); + per_cpu(zcache_workmem, cpu) = NULL; + kp = &per_cpu(zcache_preloads, cpu); + while (kp->nr) { + kmem_cache_free(zcache_objnode_cache, + kp->objnodes[kp->nr - 1]); + kp->objnodes[kp->nr - 1] = NULL; + kp->nr--; + } + kmem_cache_free(zcache_obj_cache, kp->obj); + free_page((unsigned long)kp->page); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block zcache_cpu_notifier_block = { + .notifier_call = zcache_cpu_notifier +}; + +#ifdef CONFIG_SYSFS +#define ZCACHE_SYSFS_RO(_name) \ + static ssize_t zcache_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ + { \ + return sprintf(buf, "%lu\n", zcache_##_name); \ + } \ + static struct kobj_attribute zcache_##_name##_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = zcache_##_name##_show, \ + } + +#define ZCACHE_SYSFS_RO_ATOMIC(_name) \ + static ssize_t zcache_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ + { \ + return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \ + } \ + static struct kobj_attribute zcache_##_name##_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = zcache_##_name##_show, \ + } + +#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \ + static ssize_t zcache_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ + { \ + return _func(buf); \ + } \ + static struct kobj_attribute zcache_##_name##_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = zcache_##_name##_show, \ + } + +ZCACHE_SYSFS_RO(curr_obj_count_max); +ZCACHE_SYSFS_RO(curr_objnode_count_max); +ZCACHE_SYSFS_RO(flush_total); +ZCACHE_SYSFS_RO(flush_found); +ZCACHE_SYSFS_RO(flobj_total); +ZCACHE_SYSFS_RO(flobj_found); +ZCACHE_SYSFS_RO(failed_eph_puts); +ZCACHE_SYSFS_RO(failed_pers_puts); +ZCACHE_SYSFS_RO(zbud_curr_zbytes); +ZCACHE_SYSFS_RO(zbud_cumul_zpages); +ZCACHE_SYSFS_RO(zbud_cumul_zbytes); +ZCACHE_SYSFS_RO(zbud_buddied_count); +ZCACHE_SYSFS_RO(zbpg_unused_list_count); +ZCACHE_SYSFS_RO(evicted_raw_pages); +ZCACHE_SYSFS_RO(evicted_unbuddied_pages); +ZCACHE_SYSFS_RO(evicted_buddied_pages); +ZCACHE_SYSFS_RO(failed_get_free_pages); +ZCACHE_SYSFS_RO(failed_alloc); +ZCACHE_SYSFS_RO(put_to_flush); +ZCACHE_SYSFS_RO(aborted_preload); +ZCACHE_SYSFS_RO(aborted_shrink); +ZCACHE_SYSFS_RO(compress_poor); +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages); +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages); +ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count); +ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count); +ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts, + zbud_show_unbuddied_list_counts); +ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts, + zbud_show_cumul_chunk_counts); + +static struct attribute *zcache_attrs[] = { + &zcache_curr_obj_count_attr.attr, + &zcache_curr_obj_count_max_attr.attr, + &zcache_curr_objnode_count_attr.attr, + &zcache_curr_objnode_count_max_attr.attr, + &zcache_flush_total_attr.attr, + &zcache_flobj_total_attr.attr, + &zcache_flush_found_attr.attr, + &zcache_flobj_found_attr.attr, + &zcache_failed_eph_puts_attr.attr, + &zcache_failed_pers_puts_attr.attr, + &zcache_compress_poor_attr.attr, + &zcache_zbud_curr_raw_pages_attr.attr, + &zcache_zbud_curr_zpages_attr.attr, + &zcache_zbud_curr_zbytes_attr.attr, + &zcache_zbud_cumul_zpages_attr.attr, + &zcache_zbud_cumul_zbytes_attr.attr, + &zcache_zbud_buddied_count_attr.attr, + &zcache_zbpg_unused_list_count_attr.attr, + &zcache_evicted_raw_pages_attr.attr, + &zcache_evicted_unbuddied_pages_attr.attr, + &zcache_evicted_buddied_pages_attr.attr, + &zcache_failed_get_free_pages_attr.attr, + &zcache_failed_alloc_attr.attr, + &zcache_put_to_flush_attr.attr, + &zcache_aborted_preload_attr.attr, + &zcache_aborted_shrink_attr.attr, + &zcache_zbud_unbuddied_list_counts_attr.attr, + &zcache_zbud_cumul_chunk_counts_attr.attr, + NULL, +}; + +static struct attribute_group zcache_attr_group = { + .attrs = zcache_attrs, + .name = "zcache", +}; + +#endif /* CONFIG_SYSFS */ +/* + * When zcache is disabled ("frozen"), pools can be created and destroyed, + * but all puts (and thus all other operations that require memory allocation) + * must fail. If zcache is unfrozen, accepts puts, then frozen again, + * data consistency requires all puts while frozen to be converted into + * flushes. + */ +static bool zcache_freeze; + +/* + * zcache shrinker interface (only useful for ephemeral pages, so zbud only) + */ +static int shrink_zcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) +{ + int ret = -1; + + if (nr >= 0) { + if (!(gfp_mask & __GFP_FS)) + /* does this case really need to be skipped? */ + goto out; + if (spin_trylock(&zcache_direct_reclaim_lock)) { + zbud_evict_pages(nr); + spin_unlock(&zcache_direct_reclaim_lock); + } else + zcache_aborted_shrink++; + } + ret = (int)atomic_read(&zcache_zbud_curr_raw_pages); +out: + return ret; +} + +static struct shrinker zcache_shrinker = { + .shrink = shrink_zcache_memory, + .seeks = DEFAULT_SEEKS, +}; + +/* + * zcache shims between cleancache/frontswap ops and tmem + */ + +static int zcache_put_page(int pool_id, struct tmem_oid *oidp, + uint32_t index, struct page *page) +{ + struct tmem_pool *pool; + int ret = -1; + + BUG_ON(!irqs_disabled()); + pool = zcache_get_pool_by_id(pool_id); + if (unlikely(pool == NULL)) + goto out; + if (!zcache_freeze && zcache_do_preload(pool) == 0) { + /* preload does preempt_disable on success */ + ret = tmem_put(pool, oidp, index, page); + if (ret < 0) { + if (is_ephemeral(pool)) + zcache_failed_eph_puts++; + else + zcache_failed_pers_puts++; + } + zcache_put_pool(pool); + preempt_enable_no_resched(); + } else { + zcache_put_to_flush++; + if (atomic_read(&pool->obj_count) > 0) + /* the put fails whether the flush succeeds or not */ + (void)tmem_flush_page(pool, oidp, index); + zcache_put_pool(pool); + } +out: + return ret; +} + +static int zcache_get_page(int pool_id, struct tmem_oid *oidp, + uint32_t index, struct page *page) +{ + struct tmem_pool *pool; + int ret = -1; + unsigned long flags; + + local_irq_save(flags); + pool = zcache_get_pool_by_id(pool_id); + if (likely(pool != NULL)) { + if (atomic_read(&pool->obj_count) > 0) + ret = tmem_get(pool, oidp, index, page); + zcache_put_pool(pool); + } + local_irq_restore(flags); + return ret; +} + +static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index) +{ + struct tmem_pool *pool; + int ret = -1; + unsigned long flags; + + local_irq_save(flags); + zcache_flush_total++; + pool = zcache_get_pool_by_id(pool_id); + if (likely(pool != NULL)) { + if (atomic_read(&pool->obj_count) > 0) + ret = tmem_flush_page(pool, oidp, index); + zcache_put_pool(pool); + } + if (ret >= 0) + zcache_flush_found++; + local_irq_restore(flags); + return ret; +} + +static int zcache_flush_object(int pool_id, struct tmem_oid *oidp) +{ + struct tmem_pool *pool; + int ret = -1; + unsigned long flags; + + local_irq_save(flags); + zcache_flobj_total++; + pool = zcache_get_pool_by_id(pool_id); + if (likely(pool != NULL)) { + if (atomic_read(&pool->obj_count) > 0) + ret = tmem_flush_object(pool, oidp); + zcache_put_pool(pool); + } + if (ret >= 0) + zcache_flobj_found++; + local_irq_restore(flags); + return ret; +} + +static int zcache_destroy_pool(int pool_id) +{ + struct tmem_pool *pool = NULL; + int ret = -1; + + if (pool_id < 0) + goto out; + pool = zcache_client.tmem_pools[pool_id]; + if (pool == NULL) + goto out; + zcache_client.tmem_pools[pool_id] = NULL; + /* wait for pool activity on other cpus to quiesce */ + while (atomic_read(&pool->refcount) != 0) + ; + local_bh_disable(); + ret = tmem_destroy_pool(pool); + local_bh_enable(); + kfree(pool); + pr_info("zcache: destroyed pool id=%d\n", pool_id); +out: + return ret; +} + +static int zcache_new_pool(uint32_t flags) +{ + int poolid = -1; + struct tmem_pool *pool; + + pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL); + if (pool == NULL) { + pr_info("zcache: pool creation failed: out of memory\n"); + goto out; + } + + for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++) + if (zcache_client.tmem_pools[poolid] == NULL) + break; + if (poolid >= MAX_POOLS_PER_CLIENT) { + pr_info("zcache: pool creation failed: max exceeded\n"); + kfree(pool); + poolid = -1; + goto out; + } + atomic_set(&pool->refcount, 0); + pool->client = &zcache_client; + pool->pool_id = poolid; + tmem_new_pool(pool, flags); + zcache_client.tmem_pools[poolid] = pool; + pr_info("zcache: created %s tmem pool, id=%d\n", + flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", + poolid); +out: + return poolid; +} + +/********** + * Two kernel functionalities currently can be layered on top of tmem. + * These are "cleancache" which is used as a second-chance cache for clean + * page cache pages; and "frontswap" which is used for swap pages + * to avoid writes to disk. A generic "shim" is provided here for each + * to translate in-kernel semantics to zcache semantics. + */ + +#ifdef CONFIG_CLEANCACHE +static void zcache_cleancache_put_page(int pool_id, + struct cleancache_filekey key, + pgoff_t index, struct page *page) +{ + u32 ind = (u32) index; + struct tmem_oid oid = *(struct tmem_oid *)&key; + + if (likely(ind == index)) + (void)zcache_put_page(pool_id, &oid, index, page); +} + +static int zcache_cleancache_get_page(int pool_id, + struct cleancache_filekey key, + pgoff_t index, struct page *page) +{ + u32 ind = (u32) index; + struct tmem_oid oid = *(struct tmem_oid *)&key; + int ret = -1; + + if (likely(ind == index)) + ret = zcache_get_page(pool_id, &oid, index, page); + return ret; +} + +static void zcache_cleancache_flush_page(int pool_id, + struct cleancache_filekey key, + pgoff_t index) +{ + u32 ind = (u32) index; + struct tmem_oid oid = *(struct tmem_oid *)&key; + + if (likely(ind == index)) + (void)zcache_flush_page(pool_id, &oid, ind); +} + +static void zcache_cleancache_flush_inode(int pool_id, + struct cleancache_filekey key) +{ + struct tmem_oid oid = *(struct tmem_oid *)&key; + + (void)zcache_flush_object(pool_id, &oid); +} + +static void zcache_cleancache_flush_fs(int pool_id) +{ + if (pool_id >= 0) + (void)zcache_destroy_pool(pool_id); +} + +static int zcache_cleancache_init_fs(size_t pagesize) +{ + BUG_ON(sizeof(struct cleancache_filekey) != + sizeof(struct tmem_oid)); + BUG_ON(pagesize != PAGE_SIZE); + return zcache_new_pool(0); +} + +static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize) +{ + /* shared pools are unsupported and map to private */ + BUG_ON(sizeof(struct cleancache_filekey) != + sizeof(struct tmem_oid)); + BUG_ON(pagesize != PAGE_SIZE); + return zcache_new_pool(0); +} + +static struct cleancache_ops zcache_cleancache_ops = { + .put_page = zcache_cleancache_put_page, + .get_page = zcache_cleancache_get_page, + .flush_page = zcache_cleancache_flush_page, + .flush_inode = zcache_cleancache_flush_inode, + .flush_fs = zcache_cleancache_flush_fs, + .init_shared_fs = zcache_cleancache_init_shared_fs, + .init_fs = zcache_cleancache_init_fs +}; + +struct cleancache_ops zcache_cleancache_register_ops(void) +{ + struct cleancache_ops old_ops = + cleancache_register_ops(&zcache_cleancache_ops); + + return old_ops; +} +#endif + +#ifdef CONFIG_FRONTSWAP +/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ +static int zcache_frontswap_poolid = -1; + +/* + * Swizzling increases objects per swaptype, increasing tmem concurrency + * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS + */ +#define SWIZ_BITS 4 +#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) +#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) +#define iswiz(_ind) (_ind >> SWIZ_BITS) + +static inline struct tmem_oid oswiz(unsigned type, u32 ind) +{ + struct tmem_oid oid = { .oid = { 0 } }; + oid.oid[0] = _oswiz(type, ind); + return oid; +} + +static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, + struct page *page) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + struct tmem_oid oid = oswiz(type, ind); + int ret = -1; + unsigned long flags; + + BUG_ON(!PageLocked(page)); + if (likely(ind64 == ind)) { + local_irq_save(flags); + ret = zcache_put_page(zcache_frontswap_poolid, &oid, + iswiz(ind), page); + local_irq_restore(flags); + } + return ret; +} + +/* returns 0 if the page was successfully gotten from frontswap, -1 if + * was not present (should never happen!) */ +static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, + struct page *page) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + struct tmem_oid oid = oswiz(type, ind); + int ret = -1; + + BUG_ON(!PageLocked(page)); + if (likely(ind64 == ind)) + ret = zcache_get_page(zcache_frontswap_poolid, &oid, + iswiz(ind), page); + return ret; +} + +/* flush a single page from frontswap */ +static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + struct tmem_oid oid = oswiz(type, ind); + + if (likely(ind64 == ind)) + (void)zcache_flush_page(zcache_frontswap_poolid, &oid, + iswiz(ind)); +} + +/* flush all pages from the passed swaptype */ +static void zcache_frontswap_flush_area(unsigned type) +{ + struct tmem_oid oid; + int ind; + + for (ind = SWIZ_MASK; ind >= 0; ind--) { + oid = oswiz(type, ind); + (void)zcache_flush_object(zcache_frontswap_poolid, &oid); + } +} + +static void zcache_frontswap_init(unsigned ignored) +{ + /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ + if (zcache_frontswap_poolid < 0) + zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST); +} + +static struct frontswap_ops zcache_frontswap_ops = { + .put_page = zcache_frontswap_put_page, + .get_page = zcache_frontswap_get_page, + .flush_page = zcache_frontswap_flush_page, + .flush_area = zcache_frontswap_flush_area, + .init = zcache_frontswap_init +}; + +struct frontswap_ops zcache_frontswap_register_ops(void) +{ + struct frontswap_ops old_ops = + frontswap_register_ops(&zcache_frontswap_ops); + + return old_ops; +} +#endif + +/* + * zcache initialization + * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR + * NOTHING HAPPENS! + */ + +static int zcache_enabled; + +static int __init enable_zcache(char *s) +{ + zcache_enabled = 1; + return 1; +} +__setup("zcache", enable_zcache); + +/* allow independent dynamic disabling of cleancache and frontswap */ + +static int use_cleancache = 1; + +static int __init no_cleancache(char *s) +{ + use_cleancache = 0; + return 1; +} + +__setup("nocleancache", no_cleancache); + +static int use_frontswap = 1; + +static int __init no_frontswap(char *s) +{ + use_frontswap = 0; + return 1; +} + +__setup("nofrontswap", no_frontswap); + +static int __init zcache_init(void) +{ +#ifdef CONFIG_SYSFS + int ret = 0; + + ret = sysfs_create_group(mm_kobj, &zcache_attr_group); + if (ret) { + pr_err("zcache: can't create sysfs\n"); + goto out; + } +#endif /* CONFIG_SYSFS */ +#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP) + if (zcache_enabled) { + unsigned int cpu; + + tmem_register_hostops(&zcache_hostops); + tmem_register_pamops(&zcache_pamops); + ret = register_cpu_notifier(&zcache_cpu_notifier_block); + if (ret) { + pr_err("zcache: can't register cpu notifier\n"); + goto out; + } + for_each_online_cpu(cpu) { + void *pcpu = (void *)(long)cpu; + zcache_cpu_notifier(&zcache_cpu_notifier_block, + CPU_UP_PREPARE, pcpu); + } + } + zcache_objnode_cache = kmem_cache_create("zcache_objnode", + sizeof(struct tmem_objnode), 0, 0, NULL); + zcache_obj_cache = kmem_cache_create("zcache_obj", + sizeof(struct tmem_obj), 0, 0, NULL); +#endif +#ifdef CONFIG_CLEANCACHE + if (zcache_enabled && use_cleancache) { + struct cleancache_ops old_ops; + + zbud_init(); + register_shrinker(&zcache_shrinker); + old_ops = zcache_cleancache_register_ops(); + pr_info("zcache: cleancache enabled using kernel " + "transcendent memory and compression buddies\n"); + if (old_ops.init_fs != NULL) + pr_warning("zcache: cleancache_ops overridden"); + } +#endif +#ifdef CONFIG_FRONTSWAP + if (zcache_enabled && use_frontswap) { + struct frontswap_ops old_ops; + + zcache_client.xvpool = xv_create_pool(); + if (zcache_client.xvpool == NULL) { + pr_err("zcache: can't create xvpool\n"); + goto out; + } + old_ops = zcache_frontswap_register_ops(); + pr_info("zcache: frontswap enabled using kernel " + "transcendent memory and xvmalloc\n"); + if (old_ops.init != NULL) + pr_warning("ktmem: frontswap_ops overridden"); + } +#endif +out: + return ret; +} + +module_init(zcache_init) From 1abd4f495eaa84e73b597f238cce704f06c54dc4 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 00:51:37 +0800 Subject: [PATCH 17/35] fs: add field to superblock to support cleancache --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9b678052..925a431e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1383,6 +1383,11 @@ struct super_block { * generic_show_options() */ char *s_options; + + /* + * Saved pool identifier for cleancache (-1 means none) + */ + int cleancache_poolid; }; extern struct timespec current_fs_time(struct super_block *sb); From 8b62d33820d84cc081745802f9a62425151047f4 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 00:52:57 +0800 Subject: [PATCH 18/35] enable zcache & cleancache --- arch/arm/mach-msm/board-htcleo.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-msm/board-htcleo.h b/arch/arm/mach-msm/board-htcleo.h index 8d633974..f20da79f 100755 --- a/arch/arm/mach-msm/board-htcleo.h +++ b/arch/arm/mach-msm/board-htcleo.h @@ -40,7 +40,7 @@ #define MSM_FB_SIZE 0x00600000 #define MSM_PMEM_MDP_BASE 0x3B700000 -#define MSM_PMEM_MDP_SIZE 0x02000000 +#define MSM_PMEM_MDP_SIZE 0x03000000 #define MSM_PMEM_ADSP_BASE 0x3D700000 #define MSM_PMEM_ADSP_SIZE 0x01800000 @@ -59,7 +59,7 @@ /* Begin EBI region */ #define PMEM_KERNEL_EBI1_SIZE 0x00028000 -#define MSM_PMEM_SF_SIZE 0x02000000 +#define MSM_PMEM_SF_SIZE 0x03000000 /* MSM_RAM_CONSOLE uses the last 0x00040000 of EBI memory, defined in msm_iomap.h #define MSM_RAM_CONSOLE_SIZE 0x00040000 From c2ff7098d4011efb6143305f62b71c0f95e38108 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 00:53:53 +0800 Subject: [PATCH 19/35] enable zcache & cleancache --- arch/arm/configs/htcleo_defconfig | 6 ++++++ 1 file changed, 6 insertions(+) mode change 100644 => 100755 arch/arm/configs/htcleo_defconfig diff --git a/arch/arm/configs/htcleo_defconfig b/arch/arm/configs/htcleo_defconfig old mode 100644 new mode 100755 index 12f408f2..a26de0ce --- a/arch/arm/configs/htcleo_defconfig +++ b/arch/arm/configs/htcleo_defconfig @@ -406,6 +406,7 @@ CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 CONFIG_ALIGNMENT_TRAP=y CONFIG_ALLOW_CPU_ALIGNMENT=y # CONFIG_UACCESS_WITH_MEMCPY is not set +CONFIG_CLEANCACHE=y # # Boot options @@ -1688,6 +1689,11 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y # CONFIG_IIO is not set # CONFIG_BTPORT is not set +# +# ZCACHE +# +CONFIG_ZCACHE=y + # # ZRAM # From b6c1977f351a6095b33a667546ec0496cbe8ca20 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 00:57:38 +0800 Subject: [PATCH 20/35] revert #8b62d33 --- arch/arm/mach-msm/board-htcleo.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-msm/board-htcleo.h b/arch/arm/mach-msm/board-htcleo.h index f20da79f..8d633974 100755 --- a/arch/arm/mach-msm/board-htcleo.h +++ b/arch/arm/mach-msm/board-htcleo.h @@ -40,7 +40,7 @@ #define MSM_FB_SIZE 0x00600000 #define MSM_PMEM_MDP_BASE 0x3B700000 -#define MSM_PMEM_MDP_SIZE 0x03000000 +#define MSM_PMEM_MDP_SIZE 0x02000000 #define MSM_PMEM_ADSP_BASE 0x3D700000 #define MSM_PMEM_ADSP_SIZE 0x01800000 @@ -59,7 +59,7 @@ /* Begin EBI region */ #define PMEM_KERNEL_EBI1_SIZE 0x00028000 -#define MSM_PMEM_SF_SIZE 0x03000000 +#define MSM_PMEM_SF_SIZE 0x02000000 /* MSM_RAM_CONSOLE uses the last 0x00040000 of EBI memory, defined in msm_iomap.h #define MSM_RAM_CONSOLE_SIZE 0x00040000 From 3d343ac32afc6eb933d98eb2dc4b3ad532de7f3a Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 12:10:10 +0800 Subject: [PATCH 21/35] mm/fs: add hooks to support cleancache --- fs/buffer.c | 5 +++++ fs/mpage.c | 7 +++++++ fs/super.c | 3 +++ mm/filemap.c | 11 +++++++++++ mm/truncate.c | 8 +++++++- 5 files changed, 33 insertions(+), 1 deletion(-) mode change 100644 => 100755 fs/buffer.c mode change 100644 => 100755 fs/mpage.c mode change 100644 => 100755 fs/super.c mode change 100644 => 100755 mm/filemap.c mode change 100644 => 100755 mm/truncate.c diff --git a/fs/buffer.c b/fs/buffer.c old mode 100644 new mode 100755 index 6fa53025..abf188f3 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -41,6 +41,7 @@ #include #include #include +#include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -276,6 +277,10 @@ void invalidate_bdev(struct block_device *bdev) invalidate_bh_lrus(); invalidate_mapping_pages(mapping, 0, -1); + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ + cleancache_flush_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); diff --git a/fs/mpage.c b/fs/mpage.c old mode 100644 new mode 100755 index 42381bd6..b5677aba --- a/fs/mpage.c +++ b/fs/mpage.c @@ -26,6 +26,7 @@ #include #include #include +#include /* * I/O completion handler for multipage BIOs. @@ -284,6 +285,12 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, } else if (fully_mapped) { SetPageMappedToDisk(page); } + + if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && + cleancache_get_page(page) == 0) { + SetPageUptodate(page); + goto confused; + } /* * This page will go to BIO. Do we need to send this BIO off first? diff --git a/fs/super.c b/fs/super.c old mode 100644 new mode 100755 index aff046b0..a0ba74eb --- a/fs/super.c +++ b/fs/super.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "internal.h" @@ -104,6 +105,7 @@ static struct super_block *alloc_super(struct file_system_type *type) s->s_qcop = sb_quotactl_ops; s->s_op = &default_op; s->s_time_gran = 1000000000; + s->cleancache_poolid = -1; } out: return s; @@ -219,6 +221,7 @@ void deactivate_locked_super(struct super_block *s) s->s_count -= S_BIAS-1; spin_unlock(&sb_lock); vfs_dq_off(s, 0); + cleancache_flush_fs(s); fs->kill_sb(s); put_filesystem(fs); put_super(s); diff --git a/mm/filemap.c b/mm/filemap.c old mode 100644 new mode 100755 index 8e96c907..a4399ff2 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -34,6 +34,7 @@ #include /* for BUG_ON(!in_atomic()) only */ #include #include /* for page_is_file_cache() */ +#include #include "internal.h" /* @@ -119,6 +120,16 @@ void __remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + /* + * if we're uptodate, flush out into the cleancache, otherwise + * invalidate any existing cleancache entries. We can't leave + * stale data around in the cleancache once our page is gone + */ + if (PageUptodate(page) && PageMappedToDisk(page)) + cleancache_put_page(page); + else + cleancache_flush_page(mapping, page); + radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; diff --git a/mm/truncate.c b/mm/truncate.c old mode 100644 new mode 100755 index 258bda7e..31c639ea --- a/mm/truncate.c +++ b/mm/truncate.c @@ -18,6 +18,7 @@ #include #include /* grr. try_to_release_page, do_invalidatepage */ +#include #include "internal.h" @@ -50,6 +51,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { zero_user_segment(page, partial, PAGE_CACHE_SIZE); + cleancache_flush_page(page->mapping, page); if (page_has_private(page)) do_invalidatepage(page, partial); } @@ -213,7 +215,8 @@ void truncate_inode_pages_range(struct address_space *mapping, struct pagevec pvec; pgoff_t next; int i; - + + cleancache_flush_inode(mapping); if (mapping->nrpages == 0) return; @@ -287,6 +290,7 @@ void truncate_inode_pages_range(struct address_space *mapping, } pagevec_release(&pvec); } + cleancache_flush_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -423,6 +427,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int did_range_unmap = 0; int wrapped = 0; + cleancache_flush_inode(mapping); pagevec_init(&pvec, 0); next = start; while (next <= end && !wrapped && @@ -479,6 +484,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_release(&pvec); cond_resched(); } + cleancache_flush_inode(mapping); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); From 7c50bd921f08bf56b04e171115dd7e3f52896f51 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 15:24:37 +0800 Subject: [PATCH 22/35] staging:lowmemkiller add Fudgeswap fudgeswap acts as follows: If set to non zero (defualt is 512k): Check for the amount of SWAP_FREE space avalible If > 0KB is avalible: if fudgeswap > swapfree: other_file += swapfree else: other_file += fugeswap In short: we will add in fugeswap as long as its less then the free swap Setting this to a very large positive number will indicate swap ought to be fully used as free (and will slow the system down) smaller numbers will allow you to put some pressure on SWAP without slowing the system down as much. small negitive numbers will allow the system to be faster at the same minfree level. default is 512 to give a very little bit of pressure to use some swap but this can be modified at runtime via: /sys/module/lowmemorykiller/parameters/fugeswap originally by ezterry Please enter the commit message for your changes. Lines starting --- drivers/staging/android/lowmemorykiller.c | 24 +++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index 05ebece0..5930a813 100755 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -41,6 +41,11 @@ #include #include +#ifdef CONFIG_SWAP +#include +#include +#endif + static uint32_t lowmem_debug_level = 2; static int lowmem_adj[6] = { 0, @@ -64,6 +69,10 @@ static struct task_struct *lowmem_deathpending; static unsigned long lowmem_deathpending_timeout; static struct kobject *lowmem_kobj; +#ifdef CONFIG_SWAP +static int fudgeswap = 512; +#endif + #define lowmem_print(level, x...) \ do { \ if (lowmem_debug_level >= (level)) \ @@ -122,7 +131,19 @@ static inline void get_free_ram(int *other_free, int *other_file) *other_free = global_page_state(NR_FREE_PAGES); *other_file = global_page_state(NR_FILE_PAGES) - global_page_state(NR_SHMEM); +#ifdef CONFIG_SWAP + if(fudgeswap != 0){ + struct sysinfo si; + si_swapinfo(&si); + if(si.freeswap > 0){ + if(fudgeswap > si.freeswap) + other_file += si.freeswap; + else + other_file += fudgeswap; + } + } +#endif if (offlining) { /* Discount all free space in the section being offlined */ for_each_zone(zone) { @@ -347,6 +368,9 @@ module_param_named(debug_level, lowmem_debug_level, uint, S_IRUGO | S_IWUSR); module_param_named(notify_trigger, lowmem_minfree_notif_trigger, uint, S_IRUGO | S_IWUSR); +#ifdef CONFIG_SWAP +module_param_named(fudgeswap, fudgeswap, int, S_IRUGO | S_IWUSR); +#endif module_init(lowmem_init); module_exit(lowmem_exit); From f3a9b636b38756c67fc9a3edf2db3088dfd7fd3c Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 23:10:46 +0800 Subject: [PATCH 23/35] Revert "mm/fs: add hooks to support cleancache" This reverts commit 3d343ac32afc6eb933d98eb2dc4b3ad532de7f3a. --- fs/buffer.c | 5 ----- fs/mpage.c | 7 ------- fs/super.c | 3 --- mm/filemap.c | 11 ----------- mm/truncate.c | 8 +------- 5 files changed, 1 insertion(+), 33 deletions(-) mode change 100755 => 100644 fs/buffer.c mode change 100755 => 100644 fs/mpage.c mode change 100755 => 100644 fs/super.c mode change 100755 => 100644 mm/filemap.c mode change 100755 => 100644 mm/truncate.c diff --git a/fs/buffer.c b/fs/buffer.c old mode 100755 new mode 100644 index abf188f3..6fa53025 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -41,7 +41,6 @@ #include #include #include -#include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -277,10 +276,6 @@ void invalidate_bdev(struct block_device *bdev) invalidate_bh_lrus(); invalidate_mapping_pages(mapping, 0, -1); - /* 99% of the time, we don't need to flush the cleancache on the bdev. - * But, for the strange corners, lets be cautious - */ - cleancache_flush_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); diff --git a/fs/mpage.c b/fs/mpage.c old mode 100755 new mode 100644 index b5677aba..42381bd6 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -26,7 +26,6 @@ #include #include #include -#include /* * I/O completion handler for multipage BIOs. @@ -285,12 +284,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, } else if (fully_mapped) { SetPageMappedToDisk(page); } - - if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && - cleancache_get_page(page) == 0) { - SetPageUptodate(page); - goto confused; - } /* * This page will go to BIO. Do we need to send this BIO off first? diff --git a/fs/super.c b/fs/super.c old mode 100755 new mode 100644 index a0ba74eb..aff046b0 --- a/fs/super.c +++ b/fs/super.c @@ -38,7 +38,6 @@ #include #include #include -#include #include "internal.h" @@ -105,7 +104,6 @@ static struct super_block *alloc_super(struct file_system_type *type) s->s_qcop = sb_quotactl_ops; s->s_op = &default_op; s->s_time_gran = 1000000000; - s->cleancache_poolid = -1; } out: return s; @@ -221,7 +219,6 @@ void deactivate_locked_super(struct super_block *s) s->s_count -= S_BIAS-1; spin_unlock(&sb_lock); vfs_dq_off(s, 0); - cleancache_flush_fs(s); fs->kill_sb(s); put_filesystem(fs); put_super(s); diff --git a/mm/filemap.c b/mm/filemap.c old mode 100755 new mode 100644 index a4399ff2..8e96c907 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -34,7 +34,6 @@ #include /* for BUG_ON(!in_atomic()) only */ #include #include /* for page_is_file_cache() */ -#include #include "internal.h" /* @@ -120,16 +119,6 @@ void __remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - /* - * if we're uptodate, flush out into the cleancache, otherwise - * invalidate any existing cleancache entries. We can't leave - * stale data around in the cleancache once our page is gone - */ - if (PageUptodate(page) && PageMappedToDisk(page)) - cleancache_put_page(page); - else - cleancache_flush_page(mapping, page); - radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; diff --git a/mm/truncate.c b/mm/truncate.c old mode 100755 new mode 100644 index 31c639ea..258bda7e --- a/mm/truncate.c +++ b/mm/truncate.c @@ -18,7 +18,6 @@ #include #include /* grr. try_to_release_page, do_invalidatepage */ -#include #include "internal.h" @@ -51,7 +50,6 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { zero_user_segment(page, partial, PAGE_CACHE_SIZE); - cleancache_flush_page(page->mapping, page); if (page_has_private(page)) do_invalidatepage(page, partial); } @@ -215,8 +213,7 @@ void truncate_inode_pages_range(struct address_space *mapping, struct pagevec pvec; pgoff_t next; int i; - - cleancache_flush_inode(mapping); + if (mapping->nrpages == 0) return; @@ -290,7 +287,6 @@ void truncate_inode_pages_range(struct address_space *mapping, } pagevec_release(&pvec); } - cleancache_flush_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -427,7 +423,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int did_range_unmap = 0; int wrapped = 0; - cleancache_flush_inode(mapping); pagevec_init(&pvec, 0); next = start; while (next <= end && !wrapped && @@ -484,7 +479,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_release(&pvec); cond_resched(); } - cleancache_flush_inode(mapping); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); From 0f794ead76c1db9f60edb03bdd2632c15b936401 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 23:12:10 +0800 Subject: [PATCH 24/35] Revert "enable zcache & cleancache" This reverts commit c2ff7098d4011efb6143305f62b71c0f95e38108. --- arch/arm/configs/htcleo_defconfig | 6 ------ 1 file changed, 6 deletions(-) mode change 100755 => 100644 arch/arm/configs/htcleo_defconfig diff --git a/arch/arm/configs/htcleo_defconfig b/arch/arm/configs/htcleo_defconfig old mode 100755 new mode 100644 index a26de0ce..12f408f2 --- a/arch/arm/configs/htcleo_defconfig +++ b/arch/arm/configs/htcleo_defconfig @@ -406,7 +406,6 @@ CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 CONFIG_ALIGNMENT_TRAP=y CONFIG_ALLOW_CPU_ALIGNMENT=y # CONFIG_UACCESS_WITH_MEMCPY is not set -CONFIG_CLEANCACHE=y # # Boot options @@ -1689,11 +1688,6 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y # CONFIG_IIO is not set # CONFIG_BTPORT is not set -# -# ZCACHE -# -CONFIG_ZCACHE=y - # # ZRAM # From 4cecd4ccb2ef21842d6ab0fb62a15d3b1a1801aa Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 23:12:22 +0800 Subject: [PATCH 25/35] Revert "fs: add field to superblock to support cleancache" This reverts commit 1abd4f495eaa84e73b597f238cce704f06c54dc4. --- include/linux/fs.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 925a431e..9b678052 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1383,11 +1383,6 @@ struct super_block { * generic_show_options() */ char *s_options; - - /* - * Saved pool identifier for cleancache (-1 means none) - */ - int cleancache_poolid; }; extern struct timespec current_fs_time(struct super_block *sb); From 61eb7c5296af5ca7b86f8d8075f3dd248b3534f0 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 23:14:51 +0800 Subject: [PATCH 26/35] Revert "add zcache" This reverts commit 8eb6724dbfb99bb1f17f3192483fafc1f9eb73fe. --- drivers/staging/Kconfig | 1 - drivers/staging/Makefile | 1 - drivers/staging/zcache/Kconfig | 13 - drivers/staging/zcache/Makefile | 3 - drivers/staging/zcache/tmem.c | 710 ------------- drivers/staging/zcache/tmem.h | 195 ---- drivers/staging/zcache/zcache.c | 1658 ------------------------------- 7 files changed, 2581 deletions(-) delete mode 100755 drivers/staging/zcache/Kconfig delete mode 100755 drivers/staging/zcache/Makefile delete mode 100755 drivers/staging/zcache/tmem.c delete mode 100755 drivers/staging/zcache/tmem.h delete mode 100755 drivers/staging/zcache/zcache.c diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index e4c3c9dd..8ee4bfa6 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -125,6 +125,5 @@ source "drivers/staging/iio/Kconfig" source "drivers/staging/zram/Kconfig" -source "drivers/staging/zcache/Kconfig" endif # !STAGING_EXCLUDE_BUILD endif # STAGING diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 5f0f554b..5a1b7341 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -45,5 +45,4 @@ obj-$(CONFIG_DX_SEP) += sep/ obj-$(CONFIG_IIO) += iio/ obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_XVMALLOC) += zram/ -obj-$(CONFIG_ZCACHE) += zcache/ diff --git a/drivers/staging/zcache/Kconfig b/drivers/staging/zcache/Kconfig deleted file mode 100755 index 7fabcb2b..00000000 --- a/drivers/staging/zcache/Kconfig +++ /dev/null @@ -1,13 +0,0 @@ -config ZCACHE - tristate "Dynamic compression of swap pages and clean pagecache pages" - depends on CLEANCACHE || FRONTSWAP - select XVMALLOC - select LZO_COMPRESS - select LZO_DECOMPRESS - default n - help - Zcache doubles RAM efficiency while providing a significant - performance boosts on many workloads. Zcache uses lzo1x - compression and an in-kernel implementation of transcendent - memory to store clean page cache pages and swap in RAM, - providing a noticeable reduction in disk I/O. diff --git a/drivers/staging/zcache/Makefile b/drivers/staging/zcache/Makefile deleted file mode 100755 index f5ec64f9..00000000 --- a/drivers/staging/zcache/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -zcache-y := tmem.o - -obj-$(CONFIG_ZCACHE) += zcache.o diff --git a/drivers/staging/zcache/tmem.c b/drivers/staging/zcache/tmem.c deleted file mode 100755 index e954d405..00000000 --- a/drivers/staging/zcache/tmem.c +++ /dev/null @@ -1,710 +0,0 @@ -/* - * In-kernel transcendent memory (generic implementation) - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - * - * The primary purpose of Transcedent Memory ("tmem") is to map object-oriented - * "handles" (triples containing a pool id, and object id, and an index), to - * pages in a page-accessible memory (PAM). Tmem references the PAM pages via - * an abstract "pampd" (PAM page-descriptor), which can be operated on by a - * set of functions (pamops). Each pampd contains some representation of - * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of - * pages and must be able to insert, find, and delete these pages at a - * potential frequency of thousands per second concurrently across many CPUs, - * (and, if used with KVM, across many vcpus across many guests). - * Tmem is tracked with a hierarchy of data structures, organized by - * the elements in a handle-tuple: pool_id, object_id, and page index. - * One or more "clients" (e.g. guests) each provide one or more tmem_pools. - * Each pool, contains a hash table of rb_trees of tmem_objs. Each - * tmem_obj contains a radix-tree-like tree of pointers, with intermediate - * nodes called tmem_objnodes. Each leaf pointer in this tree points to - * a pampd, which is accessible only through a small set of callbacks - * registered by the PAM implementation (see tmem_register_pamops). Tmem - * does all memory allocation via a set of callbacks registered by the tmem - * host implementation (e.g. see tmem_register_hostops). - */ - -#include -#include -#include - -#include "tmem.h" - -/* data structure sentinels used for debugging... see tmem.h */ -#define POOL_SENTINEL 0x87658765 -#define OBJ_SENTINEL 0x12345678 -#define OBJNODE_SENTINEL 0xfedcba09 - -/* - * A tmem host implementation must use this function to register callbacks - * for memory allocation. - */ -static struct tmem_hostops tmem_hostops; - -static void tmem_objnode_tree_init(void); - -void tmem_register_hostops(struct tmem_hostops *m) -{ - tmem_objnode_tree_init(); - tmem_hostops = *m; -} - -/* - * A tmem host implementation must use this function to register - * callbacks for a page-accessible memory (PAM) implementation - */ -static struct tmem_pamops tmem_pamops; - -void tmem_register_pamops(struct tmem_pamops *m) -{ - tmem_pamops = *m; -} - -/* - * Oid's are potentially very sparse and tmem_objs may have an indeterminately - * short life, being added and deleted at a relatively high frequency. - * So an rb_tree is an ideal data structure to manage tmem_objs. But because - * of the potentially huge number of tmem_objs, each pool manages a hashtable - * of rb_trees to reduce search, insert, delete, and rebalancing time. - * Each hashbucket also has a lock to manage concurrent access. - * - * The following routines manage tmem_objs. When any tmem_obj is accessed, - * the hashbucket lock must be held. - */ - -/* searches for object==oid in pool, returns locked object if found */ -static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb, - struct tmem_oid *oidp) -{ - struct rb_node *rbnode; - struct tmem_obj *obj; - - rbnode = hb->obj_rb_root.rb_node; - while (rbnode) { - BUG_ON(RB_EMPTY_NODE(rbnode)); - obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); - switch (tmem_oid_compare(oidp, &obj->oid)) { - case 0: /* equal */ - goto out; - case -1: - rbnode = rbnode->rb_left; - break; - case 1: - rbnode = rbnode->rb_right; - break; - } - } - obj = NULL; -out: - return obj; -} - -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *); - -/* free an object that has no more pampds in it */ -static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb) -{ - struct tmem_pool *pool; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pampd_count > 0); - pool = obj->pool; - BUG_ON(pool == NULL); - if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */ - tmem_pampd_destroy_all_in_obj(obj); - BUG_ON(obj->objnode_tree_root != NULL); - BUG_ON((long)obj->objnode_count != 0); - atomic_dec(&pool->obj_count); - BUG_ON(atomic_read(&pool->obj_count) < 0); - INVERT_SENTINEL(obj, OBJ); - obj->pool = NULL; - tmem_oid_set_invalid(&obj->oid); - rb_erase(&obj->rb_tree_node, &hb->obj_rb_root); -} - -/* - * initialize, and insert an tmem_object_root (called only if find failed) - */ -static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb, - struct tmem_pool *pool, - struct tmem_oid *oidp) -{ - struct rb_root *root = &hb->obj_rb_root; - struct rb_node **new = &(root->rb_node), *parent = NULL; - struct tmem_obj *this; - - BUG_ON(pool == NULL); - atomic_inc(&pool->obj_count); - obj->objnode_tree_height = 0; - obj->objnode_tree_root = NULL; - obj->pool = pool; - obj->oid = *oidp; - obj->objnode_count = 0; - obj->pampd_count = 0; - SET_SENTINEL(obj, OBJ); - while (*new) { - BUG_ON(RB_EMPTY_NODE(*new)); - this = rb_entry(*new, struct tmem_obj, rb_tree_node); - parent = *new; - switch (tmem_oid_compare(oidp, &this->oid)) { - case 0: - BUG(); /* already present; should never happen! */ - break; - case -1: - new = &(*new)->rb_left; - break; - case 1: - new = &(*new)->rb_right; - break; - } - } - rb_link_node(&obj->rb_tree_node, parent, new); - rb_insert_color(&obj->rb_tree_node, root); -} - -/* - * Tmem is managed as a set of tmem_pools with certain attributes, such as - * "ephemeral" vs "persistent". These attributes apply to all tmem_objs - * and all pampds that belong to a tmem_pool. A tmem_pool is created - * or deleted relatively rarely (for example, when a filesystem is - * mounted or unmounted. - */ - -/* flush all data from a pool and, optionally, free it */ -static void tmem_pool_flush(struct tmem_pool *pool, bool destroy) -{ - struct rb_node *rbnode; - struct tmem_obj *obj; - struct tmem_hashbucket *hb = &pool->hashbucket[0]; - int i; - - BUG_ON(pool == NULL); - for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { - spin_lock(&hb->lock); - rbnode = rb_first(&hb->obj_rb_root); - while (rbnode != NULL) { - obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); - rbnode = rb_next(rbnode); - tmem_pampd_destroy_all_in_obj(obj); - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - } - spin_unlock(&hb->lock); - } - if (destroy) - list_del(&pool->pool_list); -} - -/* - * A tmem_obj contains a radix-tree-like tree in which the intermediate - * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation - * is very specialized and tuned for specific uses and is not particularly - * suited for use from this code, though some code from the core algorithms has - * been reused, thus the copyright notices below). Each tmem_objnode contains - * a set of pointers which point to either a set of intermediate tmem_objnodes - * or a set of of pampds. - * - * Portions Copyright (C) 2001 Momchil Velikov - * Portions Copyright (C) 2001 Christoph Hellwig - * Portions Copyright (C) 2005 SGI, Christoph Lameter - */ - -struct tmem_objnode_tree_path { - struct tmem_objnode *objnode; - int offset; -}; - -/* objnode height_to_maxindex translation */ -static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1]; - -static void tmem_objnode_tree_init(void) -{ - unsigned int ht, tmp; - - for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) { - tmp = ht * OBJNODE_TREE_MAP_SHIFT; - if (tmp >= OBJNODE_TREE_INDEX_BITS) - tmem_objnode_tree_h2max[ht] = ~0UL; - else - tmem_objnode_tree_h2max[ht] = - (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1; - } -} - -static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj) -{ - struct tmem_objnode *objnode; - - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - objnode = (*tmem_hostops.objnode_alloc)(obj->pool); - if (unlikely(objnode == NULL)) - goto out; - objnode->obj = obj; - SET_SENTINEL(objnode, OBJNODE); - memset(&objnode->slots, 0, sizeof(objnode->slots)); - objnode->slots_in_use = 0; - obj->objnode_count++; -out: - return objnode; -} - -static void tmem_objnode_free(struct tmem_objnode *objnode) -{ - struct tmem_pool *pool; - int i; - - BUG_ON(objnode == NULL); - for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) - BUG_ON(objnode->slots[i] != NULL); - ASSERT_SENTINEL(objnode, OBJNODE); - INVERT_SENTINEL(objnode, OBJNODE); - BUG_ON(objnode->obj == NULL); - ASSERT_SENTINEL(objnode->obj, OBJ); - pool = objnode->obj->pool; - BUG_ON(pool == NULL); - ASSERT_SENTINEL(pool, POOL); - objnode->obj->objnode_count--; - objnode->obj = NULL; - (*tmem_hostops.objnode_free)(objnode, pool); -} - -/* - * lookup index in object and return associated pampd (or NULL if not found) - */ -static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) -{ - unsigned int height, shift; - struct tmem_objnode **slot = NULL; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - - height = obj->objnode_tree_height; - if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) - goto out; - if (height == 0 && obj->objnode_tree_root) { - slot = &obj->objnode_tree_root; - goto out; - } - shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; - slot = &obj->objnode_tree_root; - while (height > 0) { - if (*slot == NULL) - goto out; - slot = (struct tmem_objnode **) - ((*slot)->slots + - ((index >> shift) & OBJNODE_TREE_MAP_MASK)); - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } -out: - return slot != NULL ? *slot : NULL; -} - -static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index, - void *pampd) -{ - int ret = 0; - struct tmem_objnode *objnode = NULL, *newnode, *slot; - unsigned int height, shift; - int offset = 0; - - /* if necessary, extend the tree to be higher */ - if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) { - height = obj->objnode_tree_height + 1; - if (index > tmem_objnode_tree_h2max[height]) - while (index > tmem_objnode_tree_h2max[height]) - height++; - if (obj->objnode_tree_root == NULL) { - obj->objnode_tree_height = height; - goto insert; - } - do { - newnode = tmem_objnode_alloc(obj); - if (!newnode) { - ret = -ENOMEM; - goto out; - } - newnode->slots[0] = obj->objnode_tree_root; - newnode->slots_in_use = 1; - obj->objnode_tree_root = newnode; - obj->objnode_tree_height++; - } while (height > obj->objnode_tree_height); - } -insert: - slot = obj->objnode_tree_root; - height = obj->objnode_tree_height; - shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; - while (height > 0) { - if (slot == NULL) { - /* add a child objnode. */ - slot = tmem_objnode_alloc(obj); - if (!slot) { - ret = -ENOMEM; - goto out; - } - if (objnode) { - - objnode->slots[offset] = slot; - objnode->slots_in_use++; - } else - obj->objnode_tree_root = slot; - } - /* go down a level */ - offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; - objnode = slot; - slot = objnode->slots[offset]; - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } - BUG_ON(slot != NULL); - if (objnode) { - objnode->slots_in_use++; - objnode->slots[offset] = pampd; - } else - obj->objnode_tree_root = pampd; - obj->pampd_count++; -out: - return ret; -} - -static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index) -{ - struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1]; - struct tmem_objnode_tree_path *pathp = path; - struct tmem_objnode *slot = NULL; - unsigned int height, shift; - int offset; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - height = obj->objnode_tree_height; - if (index > tmem_objnode_tree_h2max[height]) - goto out; - slot = obj->objnode_tree_root; - if (height == 0 && obj->objnode_tree_root) { - obj->objnode_tree_root = NULL; - goto out; - } - shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT; - pathp->objnode = NULL; - do { - if (slot == NULL) - goto out; - pathp++; - offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; - pathp->offset = offset; - pathp->objnode = slot; - slot = slot->slots[offset]; - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } while (height > 0); - if (slot == NULL) - goto out; - while (pathp->objnode) { - pathp->objnode->slots[pathp->offset] = NULL; - pathp->objnode->slots_in_use--; - if (pathp->objnode->slots_in_use) { - if (pathp->objnode == obj->objnode_tree_root) { - while (obj->objnode_tree_height > 0 && - obj->objnode_tree_root->slots_in_use == 1 && - obj->objnode_tree_root->slots[0]) { - struct tmem_objnode *to_free = - obj->objnode_tree_root; - - obj->objnode_tree_root = - to_free->slots[0]; - obj->objnode_tree_height--; - to_free->slots[0] = NULL; - to_free->slots_in_use = 0; - tmem_objnode_free(to_free); - } - } - goto out; - } - tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */ - pathp--; - } - obj->objnode_tree_height = 0; - obj->objnode_tree_root = NULL; - -out: - if (slot != NULL) - obj->pampd_count--; - BUG_ON(obj->pampd_count < 0); - return slot; -} - -/* recursively walk the objnode_tree destroying pampds and objnodes */ -static void tmem_objnode_node_destroy(struct tmem_obj *obj, - struct tmem_objnode *objnode, - unsigned int ht) -{ - int i; - - if (ht == 0) - return; - for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) { - if (objnode->slots[i]) { - if (ht == 1) { - obj->pampd_count--; - (*tmem_pamops.free)(objnode->slots[i], - obj->pool); - objnode->slots[i] = NULL; - continue; - } - tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1); - tmem_objnode_free(objnode->slots[i]); - objnode->slots[i] = NULL; - } - } -} - -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) -{ - if (obj->objnode_tree_root == NULL) - return; - if (obj->objnode_tree_height == 0) { - obj->pampd_count--; - (*tmem_pamops.free)(obj->objnode_tree_root, obj->pool); - } else { - tmem_objnode_node_destroy(obj, obj->objnode_tree_root, - obj->objnode_tree_height); - tmem_objnode_free(obj->objnode_tree_root); - obj->objnode_tree_height = 0; - } - obj->objnode_tree_root = NULL; -} - -/* - * Tmem is operated on by a set of well-defined actions: - * "put", "get", "flush", "flush_object", "new pool" and "destroy pool". - * (The tmem ABI allows for subpages and exchanges but these operations - * are not included in this implementation.) - * - * These "tmem core" operations are implemented in the following functions. - */ - -/* - * "Put" a page, e.g. copy a page from the kernel into newly allocated - * PAM space (if such space is available). Tmem_put is complicated by - * a corner case: What if a page with matching handle already exists in - * tmem? To guarantee coherency, one of two actions is necessary: Either - * the data for the page must be overwritten, or the page must be - * "flushed" so that the data is not accessible to a subsequent "get". - * Since these "duplicate puts" are relatively rare, this implementation - * always flushes for simplicity. - */ -int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, - struct page *page) -{ - struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL; - void *pampd = NULL, *pampd_del = NULL; - int ret = -ENOMEM; - bool ephemeral; - struct tmem_hashbucket *hb; - - ephemeral = is_ephemeral(pool); - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = objfound = tmem_obj_find(hb, oidp); - if (obj != NULL) { - pampd = tmem_pampd_lookup_in_obj(objfound, index); - if (pampd != NULL) { - /* if found, is a dup put, flush the old one */ - pampd_del = tmem_pampd_delete_from_obj(obj, index); - BUG_ON(pampd_del != pampd); - (*tmem_pamops.free)(pampd, pool); - if (obj->pampd_count == 0) { - objnew = obj; - objfound = NULL; - } - pampd = NULL; - } - } else { - obj = objnew = (*tmem_hostops.obj_alloc)(pool); - if (unlikely(obj == NULL)) { - ret = -ENOMEM; - goto out; - } - tmem_obj_init(obj, hb, pool, oidp); - } - BUG_ON(obj == NULL); - BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound)); - pampd = (*tmem_pamops.create)(obj->pool, &obj->oid, index, page); - if (unlikely(pampd == NULL)) - goto free; - ret = tmem_pampd_add_to_obj(obj, index, pampd); - if (unlikely(ret == -ENOMEM)) - /* may have partially built objnode tree ("stump") */ - goto delete_and_free; - goto out; - -delete_and_free: - (void)tmem_pampd_delete_from_obj(obj, index); -free: - if (pampd) - (*tmem_pamops.free)(pampd, pool); - if (objnew) { - tmem_obj_free(objnew, hb); - (*tmem_hostops.obj_free)(objnew, pool); - } -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Get" a page, e.g. if one can be found, copy the tmem page with the - * matching handle from PAM space to the kernel. By tmem definition, - * when a "get" is successful on an ephemeral page, the page is "flushed", - * and when a "get" is successful on a persistent page, the page is retained - * in tmem. Note that to preserve - * coherency, "get" can never be skipped if tmem contains the data. - * That is, if a get is done with a certain handle and fails, any - * subsequent "get" must also fail (unless of course there is a - * "put" done with the same handle). - - */ -int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, - uint32_t index, struct page *page) -{ - struct tmem_obj *obj; - void *pampd; - bool ephemeral = is_ephemeral(pool); - uint32_t ret = -1; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - ephemeral = is_ephemeral(pool); - if (ephemeral) - pampd = tmem_pampd_delete_from_obj(obj, index); - else - pampd = tmem_pampd_lookup_in_obj(obj, index); - if (pampd == NULL) - goto out; - ret = (*tmem_pamops.get_data)(page, pampd, pool); - if (ret < 0) - goto out; - if (ephemeral) { - (*tmem_pamops.free)(pampd, pool); - if (obj->pampd_count == 0) { - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - obj = NULL; - } - } - ret = 0; -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * If a page in tmem matches the handle, "flush" this page from tmem such - * that any subsequent "get" does not succeed (unless, of course, there - * was another "put" with the same handle). - */ -int tmem_flush_page(struct tmem_pool *pool, - struct tmem_oid *oidp, uint32_t index) -{ - struct tmem_obj *obj; - void *pampd; - int ret = -1; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - pampd = tmem_pampd_delete_from_obj(obj, index); - if (pampd == NULL) - goto out; - (*tmem_pamops.free)(pampd, pool); - if (obj->pampd_count == 0) { - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - } - ret = 0; - -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Flush" all pages in tmem matching this oid. - */ -int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp) -{ - struct tmem_obj *obj; - struct tmem_hashbucket *hb; - int ret = -1; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - tmem_pampd_destroy_all_in_obj(obj); - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - ret = 0; - -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Flush" all pages (and tmem_objs) from this tmem_pool and disable - * all subsequent access to this tmem_pool. - */ -int tmem_destroy_pool(struct tmem_pool *pool) -{ - int ret = -1; - - if (pool == NULL) - goto out; - tmem_pool_flush(pool, 1); - ret = 0; -out: - return ret; -} - -static LIST_HEAD(tmem_global_pool_list); - -/* - * Create a new tmem_pool with the provided flag and return - * a pool id provided by the tmem host implementation. - */ -void tmem_new_pool(struct tmem_pool *pool, uint32_t flags) -{ - int persistent = flags & TMEM_POOL_PERSIST; - int shared = flags & TMEM_POOL_SHARED; - struct tmem_hashbucket *hb = &pool->hashbucket[0]; - int i; - - for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { - hb->obj_rb_root = RB_ROOT; - spin_lock_init(&hb->lock); - } - INIT_LIST_HEAD(&pool->pool_list); - atomic_set(&pool->obj_count, 0); - SET_SENTINEL(pool, POOL); - list_add_tail(&pool->pool_list, &tmem_global_pool_list); - pool->persistent = persistent; - pool->shared = shared; -} diff --git a/drivers/staging/zcache/tmem.h b/drivers/staging/zcache/tmem.h deleted file mode 100755 index 2e07e217..00000000 --- a/drivers/staging/zcache/tmem.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * tmem.h - * - * Transcendent memory - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - */ - -#ifndef _TMEM_H_ -#define _TMEM_H_ - -#include -#include -#include -#include - -/* - * These are pre-defined by the Xen<->Linux ABI - */ -#define TMEM_PUT_PAGE 4 -#define TMEM_GET_PAGE 5 -#define TMEM_FLUSH_PAGE 6 -#define TMEM_FLUSH_OBJECT 7 -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 -#define TMEM_POOL_PRECOMPRESSED 4 -#define TMEM_POOL_PAGESIZE_SHIFT 4 -#define TMEM_POOL_PAGESIZE_MASK 0xf -#define TMEM_POOL_RESERVED_BITS 0x00ffff00 - -/* - * sentinels have proven very useful for debugging but can be removed - * or disabled before final merge. - */ -#define SENTINELS -#ifdef SENTINELS -#define DECL_SENTINEL uint32_t sentinel; -#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL) -#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL) -#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL) -#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL) -#else -#define DECL_SENTINEL -#define SET_SENTINEL(_x, _y) do { } while (0) -#define INVERT_SENTINEL(_x, _y) do { } while (0) -#define ASSERT_SENTINEL(_x, _y) do { } while (0) -#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0) -#endif - -#define ASSERT_SPINLOCK(_l) WARN_ON(!spin_is_locked(_l)) - -/* - * A pool is the highest-level data structure managed by tmem and - * usually corresponds to a large independent set of pages such as - * a filesystem. Each pool has an id, and certain attributes and counters. - * It also contains a set of hash buckets, each of which contains an rbtree - * of objects and a lock to manage concurrency within the pool. - */ - -#define TMEM_HASH_BUCKET_BITS 8 -#define TMEM_HASH_BUCKETS (1<persistent) -#define is_ephemeral(_p) (!(_p->persistent)) - -/* - * An object id ("oid") is large: 192-bits (to ensure, for example, files - * in a modern filesystem can be uniquely identified). - */ - -struct tmem_oid { - uint64_t oid[3]; -}; - -static inline void tmem_oid_set_invalid(struct tmem_oid *oidp) -{ - oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL; -} - -static inline bool tmem_oid_valid(struct tmem_oid *oidp) -{ - return oidp->oid[0] != -1UL || oidp->oid[1] != -1UL || - oidp->oid[2] != -1UL; -} - -static inline int tmem_oid_compare(struct tmem_oid *left, - struct tmem_oid *right) -{ - int ret; - - if (left->oid[2] == right->oid[2]) { - if (left->oid[1] == right->oid[1]) { - if (left->oid[0] == right->oid[0]) - ret = 0; - else if (left->oid[0] < right->oid[0]) - ret = -1; - else - return 1; - } else if (left->oid[1] < right->oid[1]) - ret = -1; - else - ret = 1; - } else if (left->oid[2] < right->oid[2]) - ret = -1; - else - ret = 1; - return ret; -} - -static inline unsigned tmem_oid_hash(struct tmem_oid *oidp) -{ - return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2], - TMEM_HASH_BUCKET_BITS); -} - -/* - * A tmem_obj contains an identifier (oid), pointers to the parent - * pool and the rb_tree to which it belongs, counters, and an ordered - * set of pampds, structured in a radix-tree-like tree. The intermediate - * nodes of the tree are called tmem_objnodes. - */ - -struct tmem_objnode; - -struct tmem_obj { - struct tmem_oid oid; - struct tmem_pool *pool; - struct rb_node rb_tree_node; - struct tmem_objnode *objnode_tree_root; - unsigned int objnode_tree_height; - unsigned long objnode_count; - long pampd_count; - DECL_SENTINEL -}; - -#define OBJNODE_TREE_MAP_SHIFT 6 -#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT) -#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1) -#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define OBJNODE_TREE_MAX_PATH \ - (OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2) - -struct tmem_objnode { - struct tmem_obj *obj; - DECL_SENTINEL - void *slots[OBJNODE_TREE_MAP_SIZE]; - unsigned int slots_in_use; -}; - -/* pampd abstract datatype methods provided by the PAM implementation */ -struct tmem_pamops { - void *(*create)(struct tmem_pool *, struct tmem_oid *, uint32_t, - struct page *); - int (*get_data)(struct page *, void *, struct tmem_pool *); - void (*free)(void *, struct tmem_pool *); -}; -extern void tmem_register_pamops(struct tmem_pamops *m); - -/* memory allocation methods provided by the host implementation */ -struct tmem_hostops { - struct tmem_obj *(*obj_alloc)(struct tmem_pool *); - void (*obj_free)(struct tmem_obj *, struct tmem_pool *); - struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *); - void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *); -}; -extern void tmem_register_hostops(struct tmem_hostops *m); - -/* core tmem accessor functions */ -extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index, - struct page *page); -extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index, - struct page *page); -extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *, - uint32_t index); -extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *); -extern int tmem_destroy_pool(struct tmem_pool *); -extern void tmem_new_pool(struct tmem_pool *, uint32_t); -#endif /* _TMEM_H */ diff --git a/drivers/staging/zcache/zcache.c b/drivers/staging/zcache/zcache.c deleted file mode 100755 index b8a2b30a..00000000 --- a/drivers/staging/zcache/zcache.c +++ /dev/null @@ -1,1658 +0,0 @@ -/* - * zcache.c - * - * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp. - * Copyright (c) 2010,2011, Nitin Gupta - * - * Zcache provides an in-kernel "host implementation" for transcendent memory - * and, thus indirectly, for cleancache and frontswap. Zcache includes two - * page-accessible memory [1] interfaces, both utilizing lzo1x compression: - * 1) "compression buddies" ("zbud") is used for ephemeral pages - * 2) xvmalloc is used for persistent pages. - * Xvmalloc (based on the TLSF allocator) has very low fragmentation - * so maximizes space efficiency, while zbud allows pairs (and potentially, - * in the future, more than a pair of) compressed pages to be closely linked - * so that reclaiming can be done via the kernel's physical-page-oriented - * "shrinker" interface. - * - * [1] For a definition of page-accessible memory (aka PAM), see: - * http://marc.info/?l=linux-mm&m=127811271605009 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "tmem.h" - -#include "../zram/xvmalloc.h" /* if built in drivers/staging */ - -#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP)) -#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP" -#endif -#ifdef CONFIG_CLEANCACHE -#include -#endif -#ifdef CONFIG_FRONTSWAP -#include -#endif - -#if 0 -/* this is more aggressive but may cause other problems? */ -#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN) -#else -#define ZCACHE_GFP_MASK \ - (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC) -#endif - -/********** - * Compression buddies ("zbud") provides for packing two (or, possibly - * in the future, more) compressed ephemeral pages into a single "raw" - * (physical) page and tracking them with data structures so that - * the raw pages can be easily reclaimed. - * - * A zbud page ("zbpg") is an aligned page containing a list_head, - * a lock, and two "zbud headers". The remainder of the physical - * page is divided up into aligned 64-byte "chunks" which contain - * the compressed data for zero, one, or two zbuds. Each zbpg - * resides on: (1) an "unused list" if it has no zbuds; (2) a - * "buddied" list if it is fully populated with two zbuds; or - * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks - * the one unbuddied zbud uses. The data inside a zbpg cannot be - * read or written unless the zbpg's lock is held. - */ - -#define ZBH_SENTINEL 0x43214321 -#define ZBPG_SENTINEL 0xdeadbeef - -#define ZBUD_MAX_BUDS 2 - -struct zbud_hdr { - uint32_t pool_id; - struct tmem_oid oid; - uint32_t index; - uint16_t size; /* compressed size in bytes, zero means unused */ - DECL_SENTINEL -}; - -struct zbud_page { - struct list_head bud_list; - spinlock_t lock; - struct zbud_hdr buddy[ZBUD_MAX_BUDS]; - DECL_SENTINEL - /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */ -}; - -#define CHUNK_SHIFT 6 -#define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define CHUNK_MASK (~(CHUNK_SIZE-1)) -#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \ - CHUNK_MASK) >> CHUNK_SHIFT) -#define MAX_CHUNK (NCHUNKS-1) - -static struct { - struct list_head list; - unsigned count; -} zbud_unbuddied[NCHUNKS]; -/* list N contains pages with N chunks USED and NCHUNKS-N unused */ -/* element 0 is never used but optimizing that isn't worth it */ -static unsigned long zbud_cumul_chunk_counts[NCHUNKS]; - -struct list_head zbud_buddied_list; -static unsigned long zcache_zbud_buddied_count; - -/* protects the buddied list and all unbuddied lists */ -static DEFINE_SPINLOCK(zbud_budlists_spinlock); - -static LIST_HEAD(zbpg_unused_list); -static unsigned long zcache_zbpg_unused_list_count; - -/* protects the unused page list */ -static DEFINE_SPINLOCK(zbpg_unused_list_spinlock); - -static atomic_t zcache_zbud_curr_raw_pages; -static atomic_t zcache_zbud_curr_zpages; -static unsigned long zcache_zbud_curr_zbytes; -static unsigned long zcache_zbud_cumul_zpages; -static unsigned long zcache_zbud_cumul_zbytes; -static unsigned long zcache_compress_poor; - -/* forward references */ -static void *zcache_get_free_page(void); -static void zcache_free_page(void *p); - -/* - * zbud helper functions - */ - -static inline unsigned zbud_max_buddy_size(void) -{ - return MAX_CHUNK << CHUNK_SHIFT; -} - -static inline unsigned zbud_size_to_chunks(unsigned size) -{ - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; -} - -static inline int zbud_budnum(struct zbud_hdr *zh) -{ - unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1); - struct zbud_page *zbpg = NULL; - unsigned budnum = -1U; - int i; - - for (i = 0; i < ZBUD_MAX_BUDS; i++) - if (offset == offsetof(typeof(*zbpg), buddy[i])) { - budnum = i; - break; - } - BUG_ON(budnum == -1U); - return budnum; -} - -static char *zbud_data(struct zbud_hdr *zh, unsigned size) -{ - struct zbud_page *zbpg; - char *p; - unsigned budnum; - - ASSERT_SENTINEL(zh, ZBH); - budnum = zbud_budnum(zh); - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - ASSERT_SPINLOCK(&zbpg->lock); - p = (char *)zbpg; - if (budnum == 0) - p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) & - CHUNK_MASK); - else if (budnum == 1) - p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK); - return p; -} - -/* - * zbud raw page management - */ - -static struct zbud_page *zbud_alloc_raw_page(void) -{ - struct zbud_page *zbpg = NULL; - struct zbud_hdr *zh0, *zh1; - bool recycled = 0; - - /* if any pages on the zbpg list, use one */ - spin_lock(&zbpg_unused_list_spinlock); - if (!list_empty(&zbpg_unused_list)) { - zbpg = list_first_entry(&zbpg_unused_list, - struct zbud_page, bud_list); - list_del_init(&zbpg->bud_list); - zcache_zbpg_unused_list_count--; - recycled = 1; - } - spin_unlock(&zbpg_unused_list_spinlock); - if (zbpg == NULL) - /* none on zbpg list, try to get a kernel page */ - zbpg = zcache_get_free_page(); - if (likely(zbpg != NULL)) { - INIT_LIST_HEAD(&zbpg->bud_list); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - spin_lock_init(&zbpg->lock); - if (recycled) { - ASSERT_INVERTED_SENTINEL(zbpg, ZBPG); - SET_SENTINEL(zbpg, ZBPG); - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); - } else { - atomic_inc(&zcache_zbud_curr_raw_pages); - INIT_LIST_HEAD(&zbpg->bud_list); - SET_SENTINEL(zbpg, ZBPG); - zh0->size = 0; zh1->size = 0; - tmem_oid_set_invalid(&zh0->oid); - tmem_oid_set_invalid(&zh1->oid); - } - } - return zbpg; -} - -static void zbud_free_raw_page(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1]; - - ASSERT_SENTINEL(zbpg, ZBPG); - BUG_ON(!list_empty(&zbpg->bud_list)); - ASSERT_SPINLOCK(&zbpg->lock); - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); - INVERT_SENTINEL(zbpg, ZBPG); - spin_unlock(&zbpg->lock); - spin_lock(&zbpg_unused_list_spinlock); - list_add(&zbpg->bud_list, &zbpg_unused_list); - zcache_zbpg_unused_list_count++; - spin_unlock(&zbpg_unused_list_spinlock); -} - -/* - * core zbud handling routines - */ - -static unsigned zbud_free(struct zbud_hdr *zh) -{ - unsigned size; - - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(!tmem_oid_valid(&zh->oid)); - size = zh->size; - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - zh->size = 0; - tmem_oid_set_invalid(&zh->oid); - INVERT_SENTINEL(zh, ZBH); - zcache_zbud_curr_zbytes -= size; - atomic_dec(&zcache_zbud_curr_zpages); - return size; -} - -static void zbud_free_and_delist(struct zbud_hdr *zh) -{ - unsigned chunks; - struct zbud_hdr *zh_other; - unsigned budnum = zbud_budnum(zh), size; - struct zbud_page *zbpg = - container_of(zh, struct zbud_page, buddy[budnum]); - - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... see zbud_evict_pages() */ - spin_unlock(&zbpg->lock); - return; - } - size = zbud_free(zh); - ASSERT_SPINLOCK(&zbpg->lock); - zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0]; - if (zh_other->size == 0) { /* was unbuddied: unlist and free */ - chunks = zbud_size_to_chunks(size) ; - spin_lock(&zbud_budlists_spinlock); - BUG_ON(list_empty(&zbud_unbuddied[chunks].list)); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[chunks].count--; - spin_unlock(&zbud_budlists_spinlock); - zbud_free_raw_page(zbpg); - } else { /* was buddied: move remaining buddy to unbuddied list */ - chunks = zbud_size_to_chunks(zh_other->size) ; - spin_lock(&zbud_budlists_spinlock); - list_del_init(&zbpg->bud_list); - zcache_zbud_buddied_count--; - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list); - zbud_unbuddied[chunks].count++; - spin_unlock(&zbud_budlists_spinlock); - spin_unlock(&zbpg->lock); - } -} - -static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid, - uint32_t index, struct page *page, - void *cdata, unsigned size) -{ - struct zbud_hdr *zh0, *zh1, *zh = NULL; - struct zbud_page *zbpg = NULL, *ztmp; - unsigned nchunks; - char *to; - int i, found_good_buddy = 0; - - nchunks = zbud_size_to_chunks(size) ; - for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) { - spin_lock(&zbud_budlists_spinlock); - if (!list_empty(&zbud_unbuddied[i].list)) { - list_for_each_entry_safe(zbpg, ztmp, - &zbud_unbuddied[i].list, bud_list) { - if (spin_trylock(&zbpg->lock)) { - found_good_buddy = i; - goto found_unbuddied; - } - } - } - spin_unlock(&zbud_budlists_spinlock); - } - /* didn't find a good buddy, try allocating a new page */ - zbpg = zbud_alloc_raw_page(); - if (unlikely(zbpg == NULL)) - goto out; - /* ok, have a page, now compress the data before taking locks */ - spin_lock(&zbpg->lock); - spin_lock(&zbud_budlists_spinlock); - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list); - zbud_unbuddied[nchunks].count++; - zh = &zbpg->buddy[0]; - goto init_zh; - -found_unbuddied: - ASSERT_SPINLOCK(&zbpg->lock); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0))); - if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */ - ASSERT_SENTINEL(zh0, ZBH); - zh = zh1; - } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */ - ASSERT_SENTINEL(zh1, ZBH); - zh = zh0; - } else - BUG(); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[found_good_buddy].count--; - list_add_tail(&zbpg->bud_list, &zbud_buddied_list); - zcache_zbud_buddied_count++; - -init_zh: - SET_SENTINEL(zh, ZBH); - zh->size = size; - zh->index = index; - zh->oid = *oid; - zh->pool_id = pool_id; - /* can wait to copy the data until the list locks are dropped */ - spin_unlock(&zbud_budlists_spinlock); - - to = zbud_data(zh, size); - memcpy(to, cdata, size); - spin_unlock(&zbpg->lock); - zbud_cumul_chunk_counts[nchunks]++; - atomic_inc(&zcache_zbud_curr_zpages); - zcache_zbud_cumul_zpages++; - zcache_zbud_curr_zbytes += size; - zcache_zbud_cumul_zbytes += size; -out: - return zh; -} - -static int zbud_decompress(struct page *page, struct zbud_hdr *zh) -{ - struct zbud_page *zbpg; - unsigned budnum = zbud_budnum(zh); - size_t out_len = PAGE_SIZE; - char *to_va, *from_va; - unsigned size; - int ret = 0; - - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... see zbud_evict_pages() */ - ret = -EINVAL; - goto out; - } - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - to_va = kmap_atomic(page, KM_USER0); - size = zh->size; - from_va = zbud_data(zh, size); - ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len); - BUG_ON(ret != LZO_E_OK); - BUG_ON(out_len != PAGE_SIZE); - kunmap_atomic(to_va, KM_USER0); -out: - spin_unlock(&zbpg->lock); - return ret; -} - -/* - * The following routines handle shrinking of ephemeral pages by evicting - * pages "least valuable" first. - */ - -static unsigned long zcache_evicted_raw_pages; -static unsigned long zcache_evicted_buddied_pages; -static unsigned long zcache_evicted_unbuddied_pages; - -static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid); -static void zcache_put_pool(struct tmem_pool *pool); - -/* - * Flush and free all zbuds in a zbpg, then free the pageframe - */ -static void zbud_evict_zbpg(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh; - int i, j; - uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS]; - struct tmem_oid oid[ZBUD_MAX_BUDS]; - struct tmem_pool *pool; - - ASSERT_SPINLOCK(&zbpg->lock); - BUG_ON(!list_empty(&zbpg->bud_list)); - for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) { - zh = &zbpg->buddy[i]; - if (zh->size) { - pool_id[j] = zh->pool_id; - oid[j] = zh->oid; - index[j] = zh->index; - j++; - zbud_free(zh); - } - } - spin_unlock(&zbpg->lock); - for (i = 0; i < j; i++) { - pool = zcache_get_pool_by_id(pool_id[i]); - if (pool != NULL) { - tmem_flush_page(pool, &oid[i], index[i]); - zcache_put_pool(pool); - } - } - ASSERT_SENTINEL(zbpg, ZBPG); - spin_lock(&zbpg->lock); - zbud_free_raw_page(zbpg); -} - -/* - * Free nr pages. This code is funky because we want to hold the locks - * protecting various lists for as short a time as possible, and in some - * circumstances the list may change asynchronously when the list lock is - * not held. In some cases we also trylock not only to avoid waiting on a - * page in use by another cpu, but also to avoid potential deadlock due to - * lock inversion. - */ -static void zbud_evict_pages(int nr) -{ - struct zbud_page *zbpg; - int i; - - /* first try freeing any pages on unused list */ -retry_unused_list: - spin_lock_bh(&zbpg_unused_list_spinlock); - if (!list_empty(&zbpg_unused_list)) { - /* can't walk list here, since it may change when unlocked */ - zbpg = list_first_entry(&zbpg_unused_list, - struct zbud_page, bud_list); - list_del_init(&zbpg->bud_list); - zcache_zbpg_unused_list_count--; - atomic_dec(&zcache_zbud_curr_raw_pages); - spin_unlock_bh(&zbpg_unused_list_spinlock); - zcache_free_page(zbpg); - zcache_evicted_raw_pages++; - if (--nr <= 0) - goto out; - goto retry_unused_list; - } - spin_unlock_bh(&zbpg_unused_list_spinlock); - - /* now try freeing unbuddied pages, starting with least space avail */ - for (i = 0; i < MAX_CHUNK; i++) { -retry_unbud_list_i: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_unbuddied[i].list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - continue; - } - list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - list_del_init(&zbpg->bud_list); - zbud_unbuddied[i].count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_unbuddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - local_bh_enable(); - if (--nr <= 0) - goto out; - goto retry_unbud_list_i; - } - spin_unlock_bh(&zbud_budlists_spinlock); - } - - /* as a last resort, free buddied pages */ -retry_bud_list: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_buddied_list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - goto out; - } - list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - list_del_init(&zbpg->bud_list); - zcache_zbud_buddied_count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_buddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - local_bh_enable(); - if (--nr <= 0) - goto out; - goto retry_bud_list; - } - spin_unlock_bh(&zbud_budlists_spinlock); -out: - return; -} - -static void zbud_init(void) -{ - int i; - - INIT_LIST_HEAD(&zbud_buddied_list); - zcache_zbud_buddied_count = 0; - for (i = 0; i < NCHUNKS; i++) { - INIT_LIST_HEAD(&zbud_unbuddied[i].list); - zbud_unbuddied[i].count = 0; - } -} - -#ifdef CONFIG_SYSFS -/* - * These sysfs routines show a nice distribution of how many zbpg's are - * currently (and have ever been placed) in each unbuddied list. It's fun - * to watch but can probably go away before final merge. - */ -static int zbud_show_unbuddied_list_counts(char *buf) -{ - int i; - char *p = buf; - - for (i = 0; i < NCHUNKS - 1; i++) - p += sprintf(p, "%u ", zbud_unbuddied[i].count); - p += sprintf(p, "%d\n", zbud_unbuddied[i].count); - return p - buf; -} - -static int zbud_show_cumul_chunk_counts(char *buf) -{ - unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0; - unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0; - unsigned long total_chunks_lte_42 = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]); - chunks += zbud_cumul_chunk_counts[i]; - total_chunks += zbud_cumul_chunk_counts[i]; - sum_total_chunks += i * zbud_cumul_chunk_counts[i]; - if (i == 21) - total_chunks_lte_21 = total_chunks; - if (i == 32) - total_chunks_lte_32 = total_chunks; - if (i == 42) - total_chunks_lte_42 = total_chunks; - } - p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n", - total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42, - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} -#endif - -/********** - * This "zv" PAM implementation combines the TLSF-based xvMalloc - * with lzo1x compression to maximize the amount of data that can - * be packed into a physical page. - * - * Zv represents a PAM page with the index and object (plus a "size" value - * necessary for decompression) immediately preceding the compressed data. - */ - -#define ZVH_SENTINEL 0x43214321 - -struct zv_hdr { - uint32_t pool_id; - struct tmem_oid oid; - uint32_t index; - DECL_SENTINEL -}; - -static const int zv_max_page_size = (PAGE_SIZE / 8) * 7; - -static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id, - struct tmem_oid *oid, uint32_t index, - void *cdata, unsigned clen) -{ - struct page *page; - struct zv_hdr *zv = NULL; - uint32_t offset; - int ret; - - BUG_ON(!irqs_disabled()); - ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr), - &page, &offset, ZCACHE_GFP_MASK); - if (unlikely(ret)) - goto out; - zv = kmap_atomic(page, KM_USER0) + offset; - zv->index = index; - zv->oid = *oid; - zv->pool_id = pool_id; - SET_SENTINEL(zv, ZVH); - memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen); - kunmap_atomic(zv, KM_USER0); -out: - return zv; -} - -static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv) -{ - unsigned long flags; - struct page *page; - uint32_t offset; - uint16_t size; - - ASSERT_SENTINEL(zv, ZVH); - size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(size == 0 || size > zv_max_page_size); - INVERT_SENTINEL(zv, ZVH); - page = virt_to_page(zv); - offset = (unsigned long)zv & ~PAGE_MASK; - local_irq_save(flags); - xv_free(xvpool, page, offset); - local_irq_restore(flags); -} - -static void zv_decompress(struct page *page, struct zv_hdr *zv) -{ - size_t clen = PAGE_SIZE; - char *to_va; - unsigned size; - int ret; - - ASSERT_SENTINEL(zv, ZVH); - size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(size == 0 || size > zv_max_page_size); - to_va = kmap_atomic(page, KM_USER0); - ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv), - size, to_va, &clen); - kunmap_atomic(to_va, KM_USER0); - BUG_ON(ret != LZO_E_OK); - BUG_ON(clen != PAGE_SIZE); -} - -/* - * zcache core code starts here - */ - -/* useful stats not collected by cleancache or frontswap */ -static unsigned long zcache_flush_total; -static unsigned long zcache_flush_found; -static unsigned long zcache_flobj_total; -static unsigned long zcache_flobj_found; -static unsigned long zcache_failed_eph_puts; -static unsigned long zcache_failed_pers_puts; - -#define MAX_POOLS_PER_CLIENT 16 - -static struct { - struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT]; - struct xv_pool *xvpool; -} zcache_client; - -/* - * Tmem operations assume the poolid implies the invoking client. - * Zcache only has one client (the kernel itself), so translate - * the poolid into the tmem_pool allocated for it. A KVM version - * of zcache would have one client per guest and each client might - * have a poolid==N. - */ -static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid) -{ - struct tmem_pool *pool = NULL; - - if (poolid >= 0) { - pool = zcache_client.tmem_pools[poolid]; - if (pool != NULL) - atomic_inc(&pool->refcount); - } - return pool; -} - -static void zcache_put_pool(struct tmem_pool *pool) -{ - if (pool != NULL) - atomic_dec(&pool->refcount); -} - -/* counters for debugging */ -static unsigned long zcache_failed_get_free_pages; -static unsigned long zcache_failed_alloc; -static unsigned long zcache_put_to_flush; -static unsigned long zcache_aborted_preload; -static unsigned long zcache_aborted_shrink; - -/* - * Ensure that memory allocation requests in zcache don't result - * in direct reclaim requests via the shrinker, which would cause - * an infinite loop. Maybe a GFP flag would be better? - */ -static DEFINE_SPINLOCK(zcache_direct_reclaim_lock); - -/* - * for now, used named slabs so can easily track usage; later can - * either just use kmalloc, or perhaps add a slab-like allocator - * to more carefully manage total memory utilization - */ -static struct kmem_cache *zcache_objnode_cache; -static struct kmem_cache *zcache_obj_cache; -static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_obj_count_max; -static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_objnode_count_max; - -/* - * to avoid memory allocation recursion (e.g. due to direct reclaim), we - * preload all necessary data structures so the hostops callbacks never - * actually do a malloc - */ -struct zcache_preload { - void *page; - struct tmem_obj *obj; - int nr; - struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH]; -}; -static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, }; - -static int zcache_do_preload(struct tmem_pool *pool) -{ - struct zcache_preload *kp; - struct tmem_objnode *objnode; - struct tmem_obj *obj; - void *page; - int ret = -ENOMEM; - - if (unlikely(zcache_objnode_cache == NULL)) - goto out; - if (unlikely(zcache_obj_cache == NULL)) - goto out; - if (!spin_trylock(&zcache_direct_reclaim_lock)) { - zcache_aborted_preload++; - goto out; - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - while (kp->nr < ARRAY_SIZE(kp->objnodes)) { - preempt_enable_no_resched(); - objnode = kmem_cache_alloc(zcache_objnode_cache, - ZCACHE_GFP_MASK); - if (unlikely(objnode == NULL)) { - zcache_failed_alloc++; - goto unlock_out; - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - if (kp->nr < ARRAY_SIZE(kp->objnodes)) - kp->objnodes[kp->nr++] = objnode; - else - kmem_cache_free(zcache_objnode_cache, objnode); - } - preempt_enable_no_resched(); - obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK); - if (unlikely(obj == NULL)) { - zcache_failed_alloc++; - goto unlock_out; - } - page = (void *)__get_free_page(ZCACHE_GFP_MASK); - if (unlikely(page == NULL)) { - zcache_failed_get_free_pages++; - kmem_cache_free(zcache_obj_cache, obj); - goto unlock_out; - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - if (kp->obj == NULL) - kp->obj = obj; - else - kmem_cache_free(zcache_obj_cache, obj); - if (kp->page == NULL) - kp->page = page; - else - free_page((unsigned long)page); - ret = 0; -unlock_out: - spin_unlock(&zcache_direct_reclaim_lock); -out: - return ret; -} - -static void *zcache_get_free_page(void) -{ - struct zcache_preload *kp; - void *page; - - kp = &__get_cpu_var(zcache_preloads); - page = kp->page; - BUG_ON(page == NULL); - kp->page = NULL; - return page; -} - -static void zcache_free_page(void *p) -{ - free_page((unsigned long)p); -} - -/* - * zcache implementation for tmem host ops - */ - -static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool) -{ - struct tmem_objnode *objnode = NULL; - unsigned long count; - struct zcache_preload *kp; - - kp = &__get_cpu_var(zcache_preloads); - if (kp->nr <= 0) - goto out; - objnode = kp->objnodes[kp->nr - 1]; - BUG_ON(objnode == NULL); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; - count = atomic_inc_return(&zcache_curr_objnode_count); - if (count > zcache_curr_objnode_count_max) - zcache_curr_objnode_count_max = count; -out: - return objnode; -} - -static void zcache_objnode_free(struct tmem_objnode *objnode, - struct tmem_pool *pool) -{ - atomic_dec(&zcache_curr_objnode_count); - BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0); - kmem_cache_free(zcache_objnode_cache, objnode); -} - -static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool) -{ - struct tmem_obj *obj = NULL; - unsigned long count; - struct zcache_preload *kp; - - kp = &__get_cpu_var(zcache_preloads); - obj = kp->obj; - BUG_ON(obj == NULL); - kp->obj = NULL; - count = atomic_inc_return(&zcache_curr_obj_count); - if (count > zcache_curr_obj_count_max) - zcache_curr_obj_count_max = count; - return obj; -} - -static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool) -{ - atomic_dec(&zcache_curr_obj_count); - BUG_ON(atomic_read(&zcache_curr_obj_count) < 0); - kmem_cache_free(zcache_obj_cache, obj); -} - -static struct tmem_hostops zcache_hostops = { - .obj_alloc = zcache_obj_alloc, - .obj_free = zcache_obj_free, - .objnode_alloc = zcache_objnode_alloc, - .objnode_free = zcache_objnode_free, -}; - -/* - * zcache implementations for PAM page descriptor ops - */ - -static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_eph_pampd_count_max; -static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_pers_pampd_count_max; - -/* forward reference */ -static int zcache_compress(struct page *from, void **out_va, size_t *out_len); - -static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid, - uint32_t index, struct page *page) -{ - void *pampd = NULL, *cdata; - size_t clen; - int ret; - bool ephemeral = is_ephemeral(pool); - unsigned long count; - - if (ephemeral) { - ret = zcache_compress(page, &cdata, &clen); - if (ret == 0) - - goto out; - if (clen == 0 || clen > zbud_max_buddy_size()) { - zcache_compress_poor++; - goto out; - } - pampd = (void *)zbud_create(pool->pool_id, oid, index, - page, cdata, clen); - if (pampd != NULL) { - count = atomic_inc_return(&zcache_curr_eph_pampd_count); - if (count > zcache_curr_eph_pampd_count_max) - zcache_curr_eph_pampd_count_max = count; - } - } else { - /* - * FIXME: This is all the "policy" there is for now. - * 3/4 totpages should allow ~37% of RAM to be filled with - * compressed frontswap pages - */ - if (atomic_read(&zcache_curr_pers_pampd_count) > - 3 * totalram_pages / 4) - goto out; - ret = zcache_compress(page, &cdata, &clen); - if (ret == 0) - goto out; - if (clen > zv_max_page_size) { - zcache_compress_poor++; - goto out; - } - pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id, - oid, index, cdata, clen); - if (pampd == NULL) - goto out; - count = atomic_inc_return(&zcache_curr_pers_pampd_count); - if (count > zcache_curr_pers_pampd_count_max) - zcache_curr_pers_pampd_count_max = count; - } -out: - return pampd; -} - -/* - * fill the pageframe corresponding to the struct page with the data - * from the passed pampd - */ -static int zcache_pampd_get_data(struct page *page, void *pampd, - struct tmem_pool *pool) -{ - int ret = 0; - - if (is_ephemeral(pool)) - ret = zbud_decompress(page, pampd); - else - zv_decompress(page, pampd); - return ret; -} - -/* - * free the pampd and remove it from any zcache lists - * pampd must no longer be pointed to from any tmem data structures! - */ -static void zcache_pampd_free(void *pampd, struct tmem_pool *pool) -{ - if (is_ephemeral(pool)) { - zbud_free_and_delist((struct zbud_hdr *)pampd); - atomic_dec(&zcache_curr_eph_pampd_count); - BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0); - } else { - zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd); - atomic_dec(&zcache_curr_pers_pampd_count); - BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0); - } -} - -static struct tmem_pamops zcache_pamops = { - .create = zcache_pampd_create, - .get_data = zcache_pampd_get_data, - .free = zcache_pampd_free, -}; - -/* - * zcache compression/decompression and related per-cpu stuff - */ - -#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS -#define LZO_DSTMEM_PAGE_ORDER 1 -static DEFINE_PER_CPU(unsigned char *, zcache_workmem); -static DEFINE_PER_CPU(unsigned char *, zcache_dstmem); - -static int zcache_compress(struct page *from, void **out_va, size_t *out_len) -{ - int ret = 0; - unsigned char *dmem = __get_cpu_var(zcache_dstmem); - unsigned char *wmem = __get_cpu_var(zcache_workmem); - char *from_va; - - BUG_ON(!irqs_disabled()); - if (unlikely(dmem == NULL || wmem == NULL)) - goto out; /* no buffer, so can't compress */ - from_va = kmap_atomic(from, KM_USER0); - mb(); - ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem); - BUG_ON(ret != LZO_E_OK); - *out_va = dmem; - kunmap_atomic(from_va, KM_USER0); - ret = 1; -out: - return ret; -} - - -static int zcache_cpu_notifier(struct notifier_block *nb, - unsigned long action, void *pcpu) -{ - int cpu = (long)pcpu; - struct zcache_preload *kp; - - switch (action) { - case CPU_UP_PREPARE: - per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages( - GFP_KERNEL | __GFP_REPEAT, - LZO_DSTMEM_PAGE_ORDER), - per_cpu(zcache_workmem, cpu) = - kzalloc(LZO1X_MEM_COMPRESS, - GFP_KERNEL | __GFP_REPEAT); - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - free_pages((unsigned long)per_cpu(zcache_dstmem, cpu), - LZO_DSTMEM_PAGE_ORDER); - per_cpu(zcache_dstmem, cpu) = NULL; - kfree(per_cpu(zcache_workmem, cpu)); - per_cpu(zcache_workmem, cpu) = NULL; - kp = &per_cpu(zcache_preloads, cpu); - while (kp->nr) { - kmem_cache_free(zcache_objnode_cache, - kp->objnodes[kp->nr - 1]); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; - } - kmem_cache_free(zcache_obj_cache, kp->obj); - free_page((unsigned long)kp->page); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block zcache_cpu_notifier_block = { - .notifier_call = zcache_cpu_notifier -}; - -#ifdef CONFIG_SYSFS -#define ZCACHE_SYSFS_RO(_name) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", zcache_##_name); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -#define ZCACHE_SYSFS_RO_ATOMIC(_name) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return _func(buf); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -ZCACHE_SYSFS_RO(curr_obj_count_max); -ZCACHE_SYSFS_RO(curr_objnode_count_max); -ZCACHE_SYSFS_RO(flush_total); -ZCACHE_SYSFS_RO(flush_found); -ZCACHE_SYSFS_RO(flobj_total); -ZCACHE_SYSFS_RO(flobj_found); -ZCACHE_SYSFS_RO(failed_eph_puts); -ZCACHE_SYSFS_RO(failed_pers_puts); -ZCACHE_SYSFS_RO(zbud_curr_zbytes); -ZCACHE_SYSFS_RO(zbud_cumul_zpages); -ZCACHE_SYSFS_RO(zbud_cumul_zbytes); -ZCACHE_SYSFS_RO(zbud_buddied_count); -ZCACHE_SYSFS_RO(zbpg_unused_list_count); -ZCACHE_SYSFS_RO(evicted_raw_pages); -ZCACHE_SYSFS_RO(evicted_unbuddied_pages); -ZCACHE_SYSFS_RO(evicted_buddied_pages); -ZCACHE_SYSFS_RO(failed_get_free_pages); -ZCACHE_SYSFS_RO(failed_alloc); -ZCACHE_SYSFS_RO(put_to_flush); -ZCACHE_SYSFS_RO(aborted_preload); -ZCACHE_SYSFS_RO(aborted_shrink); -ZCACHE_SYSFS_RO(compress_poor); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages); -ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count); -ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count); -ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts, - zbud_show_unbuddied_list_counts); -ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts, - zbud_show_cumul_chunk_counts); - -static struct attribute *zcache_attrs[] = { - &zcache_curr_obj_count_attr.attr, - &zcache_curr_obj_count_max_attr.attr, - &zcache_curr_objnode_count_attr.attr, - &zcache_curr_objnode_count_max_attr.attr, - &zcache_flush_total_attr.attr, - &zcache_flobj_total_attr.attr, - &zcache_flush_found_attr.attr, - &zcache_flobj_found_attr.attr, - &zcache_failed_eph_puts_attr.attr, - &zcache_failed_pers_puts_attr.attr, - &zcache_compress_poor_attr.attr, - &zcache_zbud_curr_raw_pages_attr.attr, - &zcache_zbud_curr_zpages_attr.attr, - &zcache_zbud_curr_zbytes_attr.attr, - &zcache_zbud_cumul_zpages_attr.attr, - &zcache_zbud_cumul_zbytes_attr.attr, - &zcache_zbud_buddied_count_attr.attr, - &zcache_zbpg_unused_list_count_attr.attr, - &zcache_evicted_raw_pages_attr.attr, - &zcache_evicted_unbuddied_pages_attr.attr, - &zcache_evicted_buddied_pages_attr.attr, - &zcache_failed_get_free_pages_attr.attr, - &zcache_failed_alloc_attr.attr, - &zcache_put_to_flush_attr.attr, - &zcache_aborted_preload_attr.attr, - &zcache_aborted_shrink_attr.attr, - &zcache_zbud_unbuddied_list_counts_attr.attr, - &zcache_zbud_cumul_chunk_counts_attr.attr, - NULL, -}; - -static struct attribute_group zcache_attr_group = { - .attrs = zcache_attrs, - .name = "zcache", -}; - -#endif /* CONFIG_SYSFS */ -/* - * When zcache is disabled ("frozen"), pools can be created and destroyed, - * but all puts (and thus all other operations that require memory allocation) - * must fail. If zcache is unfrozen, accepts puts, then frozen again, - * data consistency requires all puts while frozen to be converted into - * flushes. - */ -static bool zcache_freeze; - -/* - * zcache shrinker interface (only useful for ephemeral pages, so zbud only) - */ -static int shrink_zcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) -{ - int ret = -1; - - if (nr >= 0) { - if (!(gfp_mask & __GFP_FS)) - /* does this case really need to be skipped? */ - goto out; - if (spin_trylock(&zcache_direct_reclaim_lock)) { - zbud_evict_pages(nr); - spin_unlock(&zcache_direct_reclaim_lock); - } else - zcache_aborted_shrink++; - } - ret = (int)atomic_read(&zcache_zbud_curr_raw_pages); -out: - return ret; -} - -static struct shrinker zcache_shrinker = { - .shrink = shrink_zcache_memory, - .seeks = DEFAULT_SEEKS, -}; - -/* - * zcache shims between cleancache/frontswap ops and tmem - */ - -static int zcache_put_page(int pool_id, struct tmem_oid *oidp, - uint32_t index, struct page *page) -{ - struct tmem_pool *pool; - int ret = -1; - - BUG_ON(!irqs_disabled()); - pool = zcache_get_pool_by_id(pool_id); - if (unlikely(pool == NULL)) - goto out; - if (!zcache_freeze && zcache_do_preload(pool) == 0) { - /* preload does preempt_disable on success */ - ret = tmem_put(pool, oidp, index, page); - if (ret < 0) { - if (is_ephemeral(pool)) - zcache_failed_eph_puts++; - else - zcache_failed_pers_puts++; - } - zcache_put_pool(pool); - preempt_enable_no_resched(); - } else { - zcache_put_to_flush++; - if (atomic_read(&pool->obj_count) > 0) - /* the put fails whether the flush succeeds or not */ - (void)tmem_flush_page(pool, oidp, index); - zcache_put_pool(pool); - } -out: - return ret; -} - -static int zcache_get_page(int pool_id, struct tmem_oid *oidp, - uint32_t index, struct page *page) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - pool = zcache_get_pool_by_id(pool_id); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_get(pool, oidp, index, page); - zcache_put_pool(pool); - } - local_irq_restore(flags); - return ret; -} - -static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - zcache_flush_total++; - pool = zcache_get_pool_by_id(pool_id); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_flush_page(pool, oidp, index); - zcache_put_pool(pool); - } - if (ret >= 0) - zcache_flush_found++; - local_irq_restore(flags); - return ret; -} - -static int zcache_flush_object(int pool_id, struct tmem_oid *oidp) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - zcache_flobj_total++; - pool = zcache_get_pool_by_id(pool_id); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_flush_object(pool, oidp); - zcache_put_pool(pool); - } - if (ret >= 0) - zcache_flobj_found++; - local_irq_restore(flags); - return ret; -} - -static int zcache_destroy_pool(int pool_id) -{ - struct tmem_pool *pool = NULL; - int ret = -1; - - if (pool_id < 0) - goto out; - pool = zcache_client.tmem_pools[pool_id]; - if (pool == NULL) - goto out; - zcache_client.tmem_pools[pool_id] = NULL; - /* wait for pool activity on other cpus to quiesce */ - while (atomic_read(&pool->refcount) != 0) - ; - local_bh_disable(); - ret = tmem_destroy_pool(pool); - local_bh_enable(); - kfree(pool); - pr_info("zcache: destroyed pool id=%d\n", pool_id); -out: - return ret; -} - -static int zcache_new_pool(uint32_t flags) -{ - int poolid = -1; - struct tmem_pool *pool; - - pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL); - if (pool == NULL) { - pr_info("zcache: pool creation failed: out of memory\n"); - goto out; - } - - for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++) - if (zcache_client.tmem_pools[poolid] == NULL) - break; - if (poolid >= MAX_POOLS_PER_CLIENT) { - pr_info("zcache: pool creation failed: max exceeded\n"); - kfree(pool); - poolid = -1; - goto out; - } - atomic_set(&pool->refcount, 0); - pool->client = &zcache_client; - pool->pool_id = poolid; - tmem_new_pool(pool, flags); - zcache_client.tmem_pools[poolid] = pool; - pr_info("zcache: created %s tmem pool, id=%d\n", - flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", - poolid); -out: - return poolid; -} - -/********** - * Two kernel functionalities currently can be layered on top of tmem. - * These are "cleancache" which is used as a second-chance cache for clean - * page cache pages; and "frontswap" which is used for swap pages - * to avoid writes to disk. A generic "shim" is provided here for each - * to translate in-kernel semantics to zcache semantics. - */ - -#ifdef CONFIG_CLEANCACHE -static void zcache_cleancache_put_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (likely(ind == index)) - (void)zcache_put_page(pool_id, &oid, index, page); -} - -static int zcache_cleancache_get_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - int ret = -1; - - if (likely(ind == index)) - ret = zcache_get_page(pool_id, &oid, index, page); - return ret; -} - -static void zcache_cleancache_flush_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (likely(ind == index)) - (void)zcache_flush_page(pool_id, &oid, ind); -} - -static void zcache_cleancache_flush_inode(int pool_id, - struct cleancache_filekey key) -{ - struct tmem_oid oid = *(struct tmem_oid *)&key; - - (void)zcache_flush_object(pool_id, &oid); -} - -static void zcache_cleancache_flush_fs(int pool_id) -{ - if (pool_id >= 0) - (void)zcache_destroy_pool(pool_id); -} - -static int zcache_cleancache_init_fs(size_t pagesize) -{ - BUG_ON(sizeof(struct cleancache_filekey) != - sizeof(struct tmem_oid)); - BUG_ON(pagesize != PAGE_SIZE); - return zcache_new_pool(0); -} - -static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize) -{ - /* shared pools are unsupported and map to private */ - BUG_ON(sizeof(struct cleancache_filekey) != - sizeof(struct tmem_oid)); - BUG_ON(pagesize != PAGE_SIZE); - return zcache_new_pool(0); -} - -static struct cleancache_ops zcache_cleancache_ops = { - .put_page = zcache_cleancache_put_page, - .get_page = zcache_cleancache_get_page, - .flush_page = zcache_cleancache_flush_page, - .flush_inode = zcache_cleancache_flush_inode, - .flush_fs = zcache_cleancache_flush_fs, - .init_shared_fs = zcache_cleancache_init_shared_fs, - .init_fs = zcache_cleancache_init_fs -}; - -struct cleancache_ops zcache_cleancache_register_ops(void) -{ - struct cleancache_ops old_ops = - cleancache_register_ops(&zcache_cleancache_ops); - - return old_ops; -} -#endif - -#ifdef CONFIG_FRONTSWAP -/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ -static int zcache_frontswap_poolid = -1; - -/* - * Swizzling increases objects per swaptype, increasing tmem concurrency - * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS - */ -#define SWIZ_BITS 4 -#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) -#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) -#define iswiz(_ind) (_ind >> SWIZ_BITS) - -static inline struct tmem_oid oswiz(unsigned type, u32 ind) -{ - struct tmem_oid oid = { .oid = { 0 } }; - oid.oid[0] = _oswiz(type, ind); - return oid; -} - -static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - int ret = -1; - unsigned long flags; - - BUG_ON(!PageLocked(page)); - if (likely(ind64 == ind)) { - local_irq_save(flags); - ret = zcache_put_page(zcache_frontswap_poolid, &oid, - iswiz(ind), page); - local_irq_restore(flags); - } - return ret; -} - -/* returns 0 if the page was successfully gotten from frontswap, -1 if - * was not present (should never happen!) */ -static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - int ret = -1; - - BUG_ON(!PageLocked(page)); - if (likely(ind64 == ind)) - ret = zcache_get_page(zcache_frontswap_poolid, &oid, - iswiz(ind), page); - return ret; -} - -/* flush a single page from frontswap */ -static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - - if (likely(ind64 == ind)) - (void)zcache_flush_page(zcache_frontswap_poolid, &oid, - iswiz(ind)); -} - -/* flush all pages from the passed swaptype */ -static void zcache_frontswap_flush_area(unsigned type) -{ - struct tmem_oid oid; - int ind; - - for (ind = SWIZ_MASK; ind >= 0; ind--) { - oid = oswiz(type, ind); - (void)zcache_flush_object(zcache_frontswap_poolid, &oid); - } -} - -static void zcache_frontswap_init(unsigned ignored) -{ - /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ - if (zcache_frontswap_poolid < 0) - zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST); -} - -static struct frontswap_ops zcache_frontswap_ops = { - .put_page = zcache_frontswap_put_page, - .get_page = zcache_frontswap_get_page, - .flush_page = zcache_frontswap_flush_page, - .flush_area = zcache_frontswap_flush_area, - .init = zcache_frontswap_init -}; - -struct frontswap_ops zcache_frontswap_register_ops(void) -{ - struct frontswap_ops old_ops = - frontswap_register_ops(&zcache_frontswap_ops); - - return old_ops; -} -#endif - -/* - * zcache initialization - * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR - * NOTHING HAPPENS! - */ - -static int zcache_enabled; - -static int __init enable_zcache(char *s) -{ - zcache_enabled = 1; - return 1; -} -__setup("zcache", enable_zcache); - -/* allow independent dynamic disabling of cleancache and frontswap */ - -static int use_cleancache = 1; - -static int __init no_cleancache(char *s) -{ - use_cleancache = 0; - return 1; -} - -__setup("nocleancache", no_cleancache); - -static int use_frontswap = 1; - -static int __init no_frontswap(char *s) -{ - use_frontswap = 0; - return 1; -} - -__setup("nofrontswap", no_frontswap); - -static int __init zcache_init(void) -{ -#ifdef CONFIG_SYSFS - int ret = 0; - - ret = sysfs_create_group(mm_kobj, &zcache_attr_group); - if (ret) { - pr_err("zcache: can't create sysfs\n"); - goto out; - } -#endif /* CONFIG_SYSFS */ -#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP) - if (zcache_enabled) { - unsigned int cpu; - - tmem_register_hostops(&zcache_hostops); - tmem_register_pamops(&zcache_pamops); - ret = register_cpu_notifier(&zcache_cpu_notifier_block); - if (ret) { - pr_err("zcache: can't register cpu notifier\n"); - goto out; - } - for_each_online_cpu(cpu) { - void *pcpu = (void *)(long)cpu; - zcache_cpu_notifier(&zcache_cpu_notifier_block, - CPU_UP_PREPARE, pcpu); - } - } - zcache_objnode_cache = kmem_cache_create("zcache_objnode", - sizeof(struct tmem_objnode), 0, 0, NULL); - zcache_obj_cache = kmem_cache_create("zcache_obj", - sizeof(struct tmem_obj), 0, 0, NULL); -#endif -#ifdef CONFIG_CLEANCACHE - if (zcache_enabled && use_cleancache) { - struct cleancache_ops old_ops; - - zbud_init(); - register_shrinker(&zcache_shrinker); - old_ops = zcache_cleancache_register_ops(); - pr_info("zcache: cleancache enabled using kernel " - "transcendent memory and compression buddies\n"); - if (old_ops.init_fs != NULL) - pr_warning("zcache: cleancache_ops overridden"); - } -#endif -#ifdef CONFIG_FRONTSWAP - if (zcache_enabled && use_frontswap) { - struct frontswap_ops old_ops; - - zcache_client.xvpool = xv_create_pool(); - if (zcache_client.xvpool == NULL) { - pr_err("zcache: can't create xvpool\n"); - goto out; - } - old_ops = zcache_frontswap_register_ops(); - pr_info("zcache: frontswap enabled using kernel " - "transcendent memory and xvmalloc\n"); - if (old_ops.init != NULL) - pr_warning("ktmem: frontswap_ops overridden"); - } -#endif -out: - return ret; -} - -module_init(zcache_init) From 5c726afbb6f0983468eec87d86147c91d94b4430 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Mon, 20 Aug 2012 23:15:00 +0800 Subject: [PATCH 27/35] Revert "mm: cleancache core ops functions and config" This reverts commit e0c9143ea1ec510a41b347be043e98034eedf5c8. --- Documentation/vm/cleancache.txt | 279 -------------------------------- include/linux/cleancache.h | 122 -------------- mm/Kconfig | 22 --- mm/Makefile | 1 - mm/cleancache.c | 244 ---------------------------- 5 files changed, 668 deletions(-) delete mode 100755 Documentation/vm/cleancache.txt delete mode 100755 include/linux/cleancache.h mode change 100755 => 100644 mm/Kconfig mode change 100755 => 100644 mm/Makefile delete mode 100755 mm/cleancache.c diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt deleted file mode 100755 index e0a53567..00000000 --- a/Documentation/vm/cleancache.txt +++ /dev/null @@ -1,279 +0,0 @@ -MOTIVATION - -Cleancache is a new optional feature provided by the VFS layer that -potentially dramatically increases page cache effectiveness for -many workloads in many environments at a negligible cost. - -Cleancache can be thought of as a page-granularity victim cache for clean -pages that the kernel's pageframe replacement algorithm (PFRA) would like -to keep around, but can't since there isn't enough memory. So when the -PFRA "evicts" a page, it first attempts to use cleancache code to -put the data contained in that page into "transcendent memory", memory -that is not directly accessible or addressable by the kernel and is -of unknown and possibly time-varying size. - -Later, when a cleancache-enabled filesystem wishes to access a page -in a file on disk, it first checks cleancache to see if it already -contains it; if it does, the page of data is copied into the kernel -and a disk access is avoided. - -Transcendent memory "drivers" for cleancache are currently implemented -in Xen (using hypervisor memory) and zcache (using in-kernel compressed -memory) and other implementations are in development. - -FAQs are included below. - -IMPLEMENTATION OVERVIEW - -A cleancache "backend" that provides transcendent memory registers itself -to the kernel's cleancache "frontend" by calling cleancache_register_ops, -passing a pointer to a cleancache_ops structure with funcs set appropriately. -Note that cleancache_register_ops returns the previous settings so that -chaining can be performed if desired. The functions provided must conform to -certain semantics as follows: - -Most important, cleancache is "ephemeral". Pages which are copied into -cleancache have an indefinite lifetime which is completely unknowable -by the kernel and so may or may not still be in cleancache at any later time. -Thus, as its name implies, cleancache is not suitable for dirty pages. -Cleancache has complete discretion over what pages to preserve and what -pages to discard and when. - -Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a -pool id which, if positive, must be saved in the filesystem's superblock; -a negative return value indicates failure. A "put_page" will copy a -(presumably about-to-be-evicted) page into cleancache and associate it with -the pool id, a file key, and a page index into the file. (The combination -of a pool id, a file key, and an index is sometimes called a "handle".) -A "get_page" will copy the page, if found, from cleancache into kernel memory. -An "invalidate_page" will ensure the page no longer is present in cleancache; -an "invalidate_inode" will invalidate all pages associated with the specified -file; and, when a filesystem is unmounted, an "invalidate_fs" will invalidate -all pages in all files specified by the given pool id and also surrender -the pool id. - -An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache -to treat the pool as shared using a 128-bit UUID as a key. On systems -that may run multiple kernels (such as hard partitioned or virtualized -systems) that may share a clustered filesystem, and where cleancache -may be shared among those kernels, calls to init_shared_fs that specify the -same UUID will receive the same pool id, thus allowing the pages to -be shared. Note that any security requirements must be imposed outside -of the kernel (e.g. by "tools" that control cleancache). Or a -cleancache implementation can simply disable shared_init by always -returning a negative value. - -If a get_page is successful on a non-shared pool, the page is invalidated -(thus making cleancache an "exclusive" cache). On a shared pool, the page -is NOT invalidated on a successful get_page so that it remains accessible to -other sharers. The kernel is responsible for ensuring coherency between -cleancache (shared or not), the page cache, and the filesystem, using -cleancache invalidate operations as required. - -Note that cleancache must enforce put-put-get coherency and get-get -coherency. For the former, if two puts are made to the same handle but -with different data, say AAA by the first put and BBB by the second, a -subsequent get can never return the stale data (AAA). For get-get coherency, -if a get for a given handle fails, subsequent gets for that handle will -never succeed unless preceded by a successful put with that handle. - -Last, cleancache provides no SMP serialization guarantees; if two -different Linux threads are simultaneously putting and invalidating a page -with the same handle, the results are indeterminate. Callers must -lock the page to ensure serial behavior. - -CLEANCACHE PERFORMANCE METRICS - -Cleancache monitoring is done by sysfs files in the -/sys/kernel/mm/cleancache directory. The effectiveness of cleancache -can be measured (across all filesystems) with: - -succ_gets - number of gets that were successful -failed_gets - number of gets that failed -puts - number of puts attempted (all "succeed") -invalidates - number of invalidates attempted - -A backend implementatation may provide additional metrics. - -FAQ - -1) Where's the value? (Andrew Morton) - -Cleancache provides a significant performance benefit to many workloads -in many environments with negligible overhead by improving the -effectiveness of the pagecache. Clean pagecache pages are -saved in transcendent memory (RAM that is otherwise not directly -addressable to the kernel); fetching those pages later avoids "refaults" -and thus disk reads. - -Cleancache (and its sister code "frontswap") provide interfaces for -this transcendent memory (aka "tmem"), which conceptually lies between -fast kernel-directly-addressable RAM and slower DMA/asynchronous devices. -Disallowing direct kernel or userland reads/writes to tmem -is ideal when data is transformed to a different form and size (such -as with compression) or secretly moved (as might be useful for write- -balancing for some RAM-like devices). Evicted page-cache pages (and -swap pages) are a great use for this kind of slower-than-RAM-but-much- -faster-than-disk transcendent memory, and the cleancache (and frontswap) -"page-object-oriented" specification provides a nice way to read and -write -- and indirectly "name" -- the pages. - -In the virtual case, the whole point of virtualization is to statistically -multiplex physical resources across the varying demands of multiple -virtual machines. This is really hard to do with RAM and efforts to -do it well with no kernel change have essentially failed (except in some -well-publicized special-case workloads). Cleancache -- and frontswap -- -with a fairly small impact on the kernel, provide a huge amount -of flexibility for more dynamic, flexible RAM multiplexing. -Specifically, the Xen Transcendent Memory backend allows otherwise -"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple -virtual machines, but the pages can be compressed and deduplicated to -optimize RAM utilization. And when guest OS's are induced to surrender -underutilized RAM (e.g. with "self-ballooning"), page cache pages -are the first to go, and cleancache allows those pages to be -saved and reclaimed if overall host system memory conditions allow. - -And the identical interface used for cleancache can be used in -physical systems as well. The zcache driver acts as a memory-hungry -device that stores pages of data in a compressed state. And -the proposed "RAMster" driver shares RAM across multiple physical -systems. - -2) Why does cleancache have its sticky fingers so deep inside the - filesystems and VFS? (Andrew Morton and Christoph Hellwig) - -The core hooks for cleancache in VFS are in most cases a single line -and the minimum set are placed precisely where needed to maintain -coherency (via cleancache_invalidate operations) between cleancache, -the page cache, and disk. All hooks compile into nothingness if -cleancache is config'ed off and turn into a function-pointer- -compare-to-NULL if config'ed on but no backend claims the ops -functions, or to a compare-struct-element-to-negative if a -backend claims the ops functions but a filesystem doesn't enable -cleancache. - -Some filesystems are built entirely on top of VFS and the hooks -in VFS are sufficient, so don't require an "init_fs" hook; the -initial implementation of cleancache didn't provide this hook. -But for some filesystems (such as btrfs), the VFS hooks are -incomplete and one or more hooks in fs-specific code are required. -And for some other filesystems, such as tmpfs, cleancache may -be counterproductive. So it seemed prudent to require a filesystem -to "opt in" to use cleancache, which requires adding a hook in -each filesystem. Not all filesystems are supported by cleancache -only because they haven't been tested. The existing set should -be sufficient to validate the concept, the opt-in approach means -that untested filesystems are not affected, and the hooks in the -existing filesystems should make it very easy to add more -filesystems in the future. - -The total impact of the hooks to existing fs and mm files is only -about 40 lines added (not counting comments and blank lines). - -3) Why not make cleancache asynchronous and batched so it can - more easily interface with real devices with DMA instead - of copying each individual page? (Minchan Kim) - -The one-page-at-a-time copy semantics simplifies the implementation -on both the frontend and backend and also allows the backend to -do fancy things on-the-fly like page compression and -page deduplication. And since the data is "gone" (copied into/out -of the pageframe) before the cleancache get/put call returns, -a great deal of race conditions and potential coherency issues -are avoided. While the interface seems odd for a "real device" -or for real kernel-addressable RAM, it makes perfect sense for -transcendent memory. - -4) Why is non-shared cleancache "exclusive"? And where is the - page "invalidated" after a "get"? (Minchan Kim) - -The main reason is to free up space in transcendent memory and -to avoid unnecessary cleancache_invalidate calls. If you want inclusive, -the page can be "put" immediately following the "get". If -put-after-get for inclusive becomes common, the interface could -be easily extended to add a "get_no_invalidate" call. - -The invalidate is done by the cleancache backend implementation. - -5) What's the performance impact? - -Performance analysis has been presented at OLS'09 and LCA'10. -Briefly, performance gains can be significant on most workloads, -especially when memory pressure is high (e.g. when RAM is -overcommitted in a virtual workload); and because the hooks are -invoked primarily in place of or in addition to a disk read/write, -overhead is negligible even in worst case workloads. Basically -cleancache replaces I/O with memory-copy-CPU-overhead; on older -single-core systems with slow memory-copy speeds, cleancache -has little value, but in newer multicore machines, especially -consolidated/virtualized machines, it has great value. - -6) How do I add cleancache support for filesystem X? (Boaz Harrash) - -Filesystems that are well-behaved and conform to certain -restrictions can utilize cleancache simply by making a call to -cleancache_init_fs at mount time. Unusual, misbehaving, or -poorly layered filesystems must either add additional hooks -and/or undergo extensive additional testing... or should just -not enable the optional cleancache. - -Some points for a filesystem to consider: - -- The FS should be block-device-based (e.g. a ram-based FS such - as tmpfs should not enable cleancache) -- To ensure coherency/correctness, the FS must ensure that all - file removal or truncation operations either go through VFS or - add hooks to do the equivalent cleancache "invalidate" operations -- To ensure coherency/correctness, either inode numbers must - be unique across the lifetime of the on-disk file OR the - FS must provide an "encode_fh" function. -- The FS must call the VFS superblock alloc and deactivate routines - or add hooks to do the equivalent cleancache calls done there. -- To maximize performance, all pages fetched from the FS should - go through the do_mpag_readpage routine or the FS should add - hooks to do the equivalent (cf. btrfs) -- Currently, the FS blocksize must be the same as PAGESIZE. This - is not an architectural restriction, but no backends currently - support anything different. -- A clustered FS should invoke the "shared_init_fs" cleancache - hook to get best performance for some backends. - -7) Why not use the KVA of the inode as the key? (Christoph Hellwig) - -If cleancache would use the inode virtual address instead of -inode/filehandle, the pool id could be eliminated. But, this -won't work because cleancache retains pagecache data pages -persistently even when the inode has been pruned from the -inode unused list, and only invalidates the data page if the file -gets removed/truncated. So if cleancache used the inode kva, -there would be potential coherency issues if/when the inode -kva is reused for a different file. Alternately, if cleancache -invalidated the pages when the inode kva was freed, much of the value -of cleancache would be lost because the cache of pages in cleanache -is potentially much larger than the kernel pagecache and is most -useful if the pages survive inode cache removal. - -8) Why is a global variable required? - -The cleancache_enabled flag is checked in all of the frequently-used -cleancache hooks. The alternative is a function call to check a static -variable. Since cleancache is enabled dynamically at runtime, systems -that don't enable cleancache would suffer thousands (possibly -tens-of-thousands) of unnecessary function calls per second. So the -global variable allows cleancache to be enabled by default at compile -time, but have insignificant performance impact when cleancache remains -disabled at runtime. - -9) Does cleanache work with KVM? - -The memory model of KVM is sufficiently different that a cleancache -backend may have less value for KVM. This remains to be tested, -especially in an overcommitted system. - -10) Does cleancache work in userspace? It sounds useful for - memory hungry caches like web browsers. (Jamie Lokier) - -No plans yet, though we agree it sounds useful, at least for -apps that bypass the page cache (e.g. O_DIRECT). - -Last updated: Dan Magenheimer, April 13 2011 diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h deleted file mode 100755 index 04ffb2e6..00000000 --- a/include/linux/cleancache.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef _LINUX_CLEANCACHE_H -#define _LINUX_CLEANCACHE_H - -#include -#include -#include - -#define CLEANCACHE_KEY_MAX 6 - -/* - * cleancache requires every file with a page in cleancache to have a - * unique key unless/until the file is removed/truncated. For some - * filesystems, the inode number is unique, but for "modern" filesystems - * an exportable filehandle is required (see exportfs.h) - */ -struct cleancache_filekey { - union { - ino_t ino; - __u32 fh[CLEANCACHE_KEY_MAX]; - u32 key[CLEANCACHE_KEY_MAX]; - } u; -}; - -struct cleancache_ops { - int (*init_fs)(size_t); - int (*init_shared_fs)(char *uuid, size_t); - int (*get_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*put_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*flush_page)(int, struct cleancache_filekey, pgoff_t); - void (*flush_inode)(int, struct cleancache_filekey); - void (*flush_fs)(int); -}; - -extern struct cleancache_ops - cleancache_register_ops(struct cleancache_ops *ops); -extern void __cleancache_init_fs(struct super_block *); -extern void __cleancache_init_shared_fs(char *, struct super_block *); -extern int __cleancache_get_page(struct page *); -extern void __cleancache_put_page(struct page *); -extern void __cleancache_flush_page(struct address_space *, struct page *); -extern void __cleancache_flush_inode(struct address_space *); -extern void __cleancache_flush_fs(struct super_block *); -extern int cleancache_enabled; - -#ifdef CONFIG_CLEANCACHE -static inline bool cleancache_fs_enabled(struct page *page) -{ - return page->mapping->host->i_sb->cleancache_poolid >= 0; -} -static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) -{ - return mapping->host->i_sb->cleancache_poolid >= 0; -} -#else -#define cleancache_enabled (0) -#define cleancache_fs_enabled(_page) (0) -#define cleancache_fs_enabled_mapping(_page) (0) -#endif - -/* - * The shim layer provided by these inline functions allows the compiler - * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE - * is disabled, to a single global variable check if CONFIG_CLEANCACHE - * is enabled but no cleancache "backend" has dynamically enabled it, - * and, for the most frequent cleancache ops, to a single global variable - * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled - * and a cleancache backend has dynamically enabled cleancache, but the - * filesystem referenced by that cleancache op has not enabled cleancache. - * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially - * no measurable performance impact. - */ - -static inline void cleancache_init_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_fs(sb); -} - -static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_shared_fs(uuid, sb); -} - -static inline int cleancache_get_page(struct page *page) -{ - int ret = -1; - - if (cleancache_enabled && cleancache_fs_enabled(page)) - ret = __cleancache_get_page(page); - return ret; -} - -static inline void cleancache_put_page(struct page *page) -{ - if (cleancache_enabled && cleancache_fs_enabled(page)) - __cleancache_put_page(page); -} - -static inline void cleancache_flush_page(struct address_space *mapping, - struct page *page) -{ - /* careful... page->mapping is NULL sometimes when this is called */ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_flush_page(mapping, page); -} - -static inline void cleancache_flush_inode(struct address_space *mapping) -{ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_flush_inode(mapping); -} - -static inline void cleancache_flush_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_flush_fs(sb); -} - -#endif /* _LINUX_CLEANCACHE_H */ diff --git a/mm/Kconfig b/mm/Kconfig old mode 100755 new mode 100644 index f86e0d29..2c19c0ba --- a/mm/Kconfig +++ b/mm/Kconfig @@ -288,25 +288,3 @@ config NOMMU_INITIAL_TRIM_EXCESS of 1 says that all excess pages should be trimmed. See Documentation/nommu-mmap.txt for more information. -config CLEANCACHE - bool "Enable cleancache driver to cache clean pages if tmem is present" - default n - help - Cleancache can be thought of as a page-granularity victim cache - for clean pages that the kernel's pageframe replacement algorithm - (PFRA) would like to keep around, but can't since there isn't enough - memory. So when the PFRA "evicts" a page, it first attempts to use - cleancacne code to put the data contained in that page into - "transcendent memory", memory that is not directly accessible or - addressable by the kernel and is of unknown and possibly - time-varying size. And when a cleancache-enabled - filesystem wishes to access a page in a file on disk, it first - checks cleancache to see if it already contains it; if it does, - the page is copied into the kernel and a disk access is avoided. - When a transcendent memory driver is available (such as zcache or - Xen transcendent memory), a significant I/O reduction - may be achieved. When none is available, all cleancache calls - are reduced to a single pointer-compare-against-NULL resulting - in a negligible performance hit. - - If unsure, say Y to enable cleancache \ No newline at end of file diff --git a/mm/Makefile b/mm/Makefile old mode 100755 new mode 100644 index 82a734fd..66f54865 --- a/mm/Makefile +++ b/mm/Makefile @@ -46,4 +46,3 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o -obj-$(CONFIG_CLEANCACHE) += cleancache.o diff --git a/mm/cleancache.c b/mm/cleancache.c deleted file mode 100755 index bcaae4c2..00000000 --- a/mm/cleancache.c +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Cleancache frontend - * - * This code provides the generic "frontend" layer to call a matching - * "backend" driver implementation of cleancache. See - * Documentation/vm/cleancache.txt for more information. - * - * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. - * Author: Dan Magenheimer - * - * This work is licensed under the terms of the GNU GPL, version 2. - */ - -#include -#include -#include -#include -#include - -/* - * This global enablement flag may be read thousands of times per second - * by cleancache_get/put/flush even on systems where cleancache_ops - * is not claimed (e.g. cleancache is config'ed on but remains - * disabled), so is preferred to the slower alternative: a function - * call that checks a non-global. - */ -int cleancache_enabled; -EXPORT_SYMBOL(cleancache_enabled); - -/* - * cleancache_ops is set by cleancache_ops_register to contain the pointers - * to the cleancache "backend" implementation functions. - */ -static struct cleancache_ops cleancache_ops; - -/* useful stats available in /sys/kernel/mm/cleancache */ -static unsigned long cleancache_succ_gets; -static unsigned long cleancache_failed_gets; -static unsigned long cleancache_puts; -static unsigned long cleancache_flushes; - -/* - * register operations for cleancache, returning previous thus allowing - * detection of multiple backends and possible nesting - */ -struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) -{ - struct cleancache_ops old = cleancache_ops; - - cleancache_ops = *ops; - cleancache_enabled = 1; - return old; -} -EXPORT_SYMBOL(cleancache_register_ops); - -/* Called by a cleancache-enabled filesystem at time of mount */ -void __cleancache_init_fs(struct super_block *sb) -{ - sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); -} -EXPORT_SYMBOL(__cleancache_init_fs); - -/* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) -{ - sb->cleancache_poolid = - (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); -} -EXPORT_SYMBOL(__cleancache_init_shared_fs); - -/* - * If the filesystem uses exportable filehandles, use the filehandle as - * the key, else use the inode number. - */ -static int cleancache_get_key(struct inode *inode, - struct cleancache_filekey *key) -{ - int (*fhfn)(struct dentry *, __u32 *fh, int *, int); - int len = 0, maxlen = CLEANCACHE_KEY_MAX; - struct super_block *sb = inode->i_sb; - - key->u.ino = inode->i_ino; - if (sb->s_export_op != NULL) { - fhfn = sb->s_export_op->encode_fh; - if (fhfn) { - struct dentry d; - d.d_inode = inode; - len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); - if (len <= 0 || len == 255) - return -1; - if (maxlen > CLEANCACHE_KEY_MAX) - return -1; - } - } - return 0; -} - -/* - * "Get" data from cleancache associated with the poolid/inode/index - * that were specified when the data was put to cleanache and, if - * successful, use it to fill the specified page with data and return 0. - * The pageframe is unchanged and returns -1 if the get fails. - * Page must be locked by caller. - */ -int __cleancache_get_page(struct page *page) -{ - int ret = -1; - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id < 0) - goto out; - - if (cleancache_get_key(page->mapping->host, &key) < 0) - goto out; - - ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); - if (ret == 0) - cleancache_succ_gets++; - else - cleancache_failed_gets++; -out: - return ret; -} -EXPORT_SYMBOL(__cleancache_get_page); - -/* - * "Put" data from a page to cleancache and associate it with the - * (previously-obtained per-filesystem) poolid and the page's, - * inode and page index. Page must be locked. Note that a put_page - * always "succeeds", though a subsequent get_page may succeed or fail. - */ -void __cleancache_put_page(struct page *page) -{ - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id >= 0 && - cleancache_get_key(page->mapping->host, &key) >= 0) { - (*cleancache_ops.put_page)(pool_id, key, page->index, page); - cleancache_puts++; - } -} -EXPORT_SYMBOL(__cleancache_put_page); - -/* - * Flush any data from cleancache associated with the poolid and the - * page's inode and page index so that a subsequent "get" will fail. - */ -void __cleancache_flush_page(struct address_space *mapping, struct page *page) -{ - /* careful... page->mapping is NULL sometimes when this is called */ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (pool_id >= 0) { - VM_BUG_ON(!PageLocked(page)); - if (cleancache_get_key(mapping->host, &key) >= 0) { - (*cleancache_ops.flush_page)(pool_id, key, page->index); - cleancache_flushes++; - } - } -} -EXPORT_SYMBOL(__cleancache_flush_page); - -/* - * Flush all data from cleancache associated with the poolid and the - * mappings's inode so that all subsequent gets to this poolid/inode - * will fail. - */ -void __cleancache_flush_inode(struct address_space *mapping) -{ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) - (*cleancache_ops.flush_inode)(pool_id, key); -} -EXPORT_SYMBOL(__cleancache_flush_inode); - -/* - * Called by any cleancache-enabled filesystem at time of unmount; - * note that pool_id is surrendered and may be reutrned by a subsequent - * cleancache_init_fs or cleancache_init_shared_fs - */ -void __cleancache_flush_fs(struct super_block *sb) -{ - if (sb->cleancache_poolid >= 0) { - int old_poolid = sb->cleancache_poolid; - sb->cleancache_poolid = -1; - (*cleancache_ops.flush_fs)(old_poolid); - } -} -EXPORT_SYMBOL(__cleancache_flush_fs); - -#ifdef CONFIG_SYSFS - -/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */ - -#define CLEANCACHE_SYSFS_RO(_name) \ - static ssize_t cleancache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", cleancache_##_name); \ - } \ - static struct kobj_attribute cleancache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = cleancache_##_name##_show, \ - } - -CLEANCACHE_SYSFS_RO(succ_gets); -CLEANCACHE_SYSFS_RO(failed_gets); -CLEANCACHE_SYSFS_RO(puts); -CLEANCACHE_SYSFS_RO(flushes); - -static struct attribute *cleancache_attrs[] = { - &cleancache_succ_gets_attr.attr, - &cleancache_failed_gets_attr.attr, - &cleancache_puts_attr.attr, - &cleancache_flushes_attr.attr, - NULL, -}; - -static struct attribute_group cleancache_attr_group = { - .attrs = cleancache_attrs, - .name = "cleancache", -}; - -#endif /* CONFIG_SYSFS */ - -static int __init init_cleancache(void) -{ -#ifdef CONFIG_SYSFS - int err; - - err = sysfs_create_group(mm_kobj, &cleancache_attr_group); -#endif /* CONFIG_SYSFS */ - return 0; -} -module_init(init_cleancache) From 04d584c63411b8f490013822de2178695ac390ff Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Thu, 23 Aug 2012 00:00:26 +0800 Subject: [PATCH 28/35] drivers/mmc/host/msm-sdcc: remove dead config options CONFIG_MMC_MSM7X00A_RESUME_IN_WQ and CONFIG_MMC_EMBEDDED_SDIO don't exist in Kconfig and is never defined anywhere else, therefore removing all references for it from the source code. --- drivers/mmc/host/msm_sdcc.c | 36 +----------------------------------- drivers/mmc/host/msm_sdcc.h | 3 --- 2 files changed, 1 insertion(+), 38 deletions(-) mode change 100644 => 100755 drivers/mmc/host/msm_sdcc.c mode change 100644 => 100755 drivers/mmc/host/msm_sdcc.h diff --git a/drivers/mmc/host/msm_sdcc.c b/drivers/mmc/host/msm_sdcc.c old mode 100644 new mode 100755 index 806dcad0..1f8cd0ee --- a/drivers/mmc/host/msm_sdcc.c +++ b/drivers/mmc/host/msm_sdcc.c @@ -1265,24 +1265,6 @@ msmsdcc_init_dma(struct msmsdcc_host *host) return 0; } -#ifdef CONFIG_MMC_MSM7X00A_RESUME_IN_WQ -static void -do_resume_work(struct work_struct *work) -{ - struct msmsdcc_host *host = - container_of(work, struct msmsdcc_host, resume_task); - struct mmc_host *mmc = host->mmc; - - if (mmc) { - mmc_resume_host(mmc); - if (host->stat_irq) - enable_irq(host->stat_irq); - } -} - -#endif - - #ifdef CONFIG_HAS_EARLYSUSPEND static void msmsdcc_early_suspend(struct early_suspend *h) { @@ -1382,14 +1364,6 @@ msmsdcc_probe(struct platform_device *pdev) host->dmares = dmares; spin_lock_init(&host->lock); -#ifdef CONFIG_MMC_EMBEDDED_SDIO - if (plat->embedded_sdio) - mmc_set_embedded_sdio_data(mmc, - &plat->embedded_sdio->cis, - &plat->embedded_sdio->cccr, - plat->embedded_sdio->funcs, - plat->embedded_sdio->num_funcs); -#endif /* * Setup DMA @@ -1608,22 +1582,14 @@ msmsdcc_resume(struct platform_device *dev) msmsdcc_writel(host, host->saved_irq0mask, MMCIMASK0); - if (mmc->card && mmc->card->type != MMC_TYPE_SDIO) { -#ifdef CONFIG_MMC_MSM7X00A_RESUME_IN_WQ - schedule_work(&host->resume_task); -#else + if (mmc->card && mmc->card->type != MMC_TYPE_SDIO) mmc_resume_host(mmc); -#endif - } - if (host->stat_irq) enable_irq(host->stat_irq); - #if BUSCLK_PWRSAVE if (host->clks_on) msmsdcc_disable_clocks(host, 1); #endif - } return 0; } diff --git a/drivers/mmc/host/msm_sdcc.h b/drivers/mmc/host/msm_sdcc.h old mode 100644 new mode 100755 index fdb0b9c6..65c4a3e5 --- a/drivers/mmc/host/msm_sdcc.h +++ b/drivers/mmc/host/msm_sdcc.h @@ -258,9 +258,6 @@ struct msmsdcc_host { int polling_enabled; #endif -#ifdef CONFIG_MMC_MSM7X00A_RESUME_IN_WQ - struct work_struct resume_task; -#endif struct tasklet_struct dma_tlet; From 81d981629c93026be48ffc126d36078216cc05ee Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Thu, 23 Aug 2012 00:10:19 +0800 Subject: [PATCH 29/35] mmc: msm_sdcc: Fix possible circular locking dependency warning In the context of request processing thread, data mover lock is acquired after the host lock. In another context, in the completion handler of data mover the locks are acquired in the reverse order, resulting in possible circular lock dependency warning. Hence, schedule a tasklet to process the dma completion so as to avoid nested locks. --- drivers/mmc/host/msm_sdcc.c | 46 +++++++++++++++++++++++++------------ drivers/mmc/host/msm_sdcc.h | 2 +- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/drivers/mmc/host/msm_sdcc.c b/drivers/mmc/host/msm_sdcc.c index 1f8cd0ee..c3193db6 100755 --- a/drivers/mmc/host/msm_sdcc.c +++ b/drivers/mmc/host/msm_sdcc.c @@ -308,42 +308,40 @@ msmsdcc_dma_exec_func(struct msm_dmov_cmd *cmd) } static void -msmsdcc_dma_complete_func(struct msm_dmov_cmd *cmd, - unsigned int result, - struct msm_dmov_errdata *err) +msmsdcc_dma_complete_tlet(unsigned long data) { - struct msmsdcc_dma_data *dma_data = - container_of(cmd, struct msmsdcc_dma_data, hdr); - struct msmsdcc_host *host = dma_data->host; + struct msmsdcc_host *host = (struct msmsdcc_host *)data; unsigned long flags; struct mmc_request *mrq; + struct msm_dmov_errdata err; spin_lock_irqsave(&host->lock, flags); host->dma.active = 0; + err = host->dma.err; mrq = host->curr.mrq; BUG_ON(!mrq); WARN_ON(!mrq->data); - if (!(result & DMOV_RSLT_VALID)) { + if (!(host->dma.result & DMOV_RSLT_VALID)) { pr_err("msmsdcc: Invalid DataMover result\n"); goto out; } - if (result & DMOV_RSLT_DONE) { + if (host->dma.result & DMOV_RSLT_DONE) { host->curr.data_xfered = host->curr.xfer_size; } else { /* Error or flush */ - if (result & DMOV_RSLT_ERROR) + if (host->dma.result & DMOV_RSLT_ERROR) pr_err("%s: DMA error (0x%.8x)\n", - mmc_hostname(host->mmc), result); - if (result & DMOV_RSLT_FLUSH) + mmc_hostname(host->mmc), host->dma.result); + if (host->dma.result & DMOV_RSLT_FLUSH) pr_err("%s: DMA channel flushed (0x%.8x)\n", - mmc_hostname(host->mmc), result); - if (err) + mmc_hostname(host->mmc), host->dma.result); + pr_err("Flush data: %.8x %.8x %.8x %.8x %.8x %.8x\n", - err->flush[0], err->flush[1], err->flush[2], - err->flush[3], err->flush[4], err->flush[5]); + err.flush[0], err.flush[1], err.flush[2], + err.flush[3], err.flush[4], err.flush[5]); if (!mrq->data->error) mrq->data->error = -EIO; } @@ -391,6 +389,22 @@ out: return; } +static void +msmsdcc_dma_complete_func(struct msm_dmov_cmd *cmd, + unsigned int result, + struct msm_dmov_errdata *err) +{ + struct msmsdcc_dma_data *dma_data = + container_of(cmd, struct msmsdcc_dma_data, hdr); + struct msmsdcc_host *host = dma_data->host; + + dma_data->result = result; + if (err) + memcpy(&dma_data->err, err, sizeof(struct msm_dmov_errdata)); + + tasklet_schedule(&host->dma_tlet); +} + static int validate_dma(struct msmsdcc_host *host, struct mmc_data *data) { if (host->dma.channel == -1) @@ -1364,6 +1378,8 @@ msmsdcc_probe(struct platform_device *pdev) host->dmares = dmares; spin_lock_init(&host->lock); + tasklet_init(&host->dma_tlet, msmsdcc_dma_complete_tlet, + (unsigned long)host); /* * Setup DMA diff --git a/drivers/mmc/host/msm_sdcc.h b/drivers/mmc/host/msm_sdcc.h index 65c4a3e5..16504610 100755 --- a/drivers/mmc/host/msm_sdcc.h +++ b/drivers/mmc/host/msm_sdcc.h @@ -190,7 +190,7 @@ struct msmsdcc_dma_data { int busy; /* Set if DM is busy */ int active; unsigned int result; - struct msm_dmov_errdata *err; + struct msm_dmov_errdata err; }; struct msmsdcc_pio_data { From 29e0f0df9d761946e1776b7b2f15eae475211155 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Thu, 23 Aug 2012 00:34:36 +0800 Subject: [PATCH 30/35] mmc: msm_sdcc: Add prog done interrupt support Enable prog done interrupt for stop command(CMD12) that is sent after a multi-block write(CMD25). The PROG_DONE bit is set when the card has finished its programming and is ready for next data. After every write request the card will be polled for ready status using CMD13. For a multi-block write(CMD25) before sending CMD13, stop command (CMD12) will be sent. If we enable prog done interrupt for CMD12, then CMD13 polling can be avoided. The prog done interrupt means that the card is done with its programming and is ready for next request. --- drivers/mmc/host/msm_sdcc.c | 26 ++++++++++++++++++++++++-- drivers/mmc/host/msm_sdcc.h | 6 +++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/msm_sdcc.c b/drivers/mmc/host/msm_sdcc.c index c3193db6..c022cc2f 100755 --- a/drivers/mmc/host/msm_sdcc.c +++ b/drivers/mmc/host/msm_sdcc.c @@ -556,6 +556,11 @@ msmsdcc_start_command_deferred(struct msmsdcc_host *host, (cmd->opcode == 53)) *c |= MCI_CSPM_DATCMD; + if (host->prog_scan && (cmd->opcode == 12)) { + *c |= MCI_CPSM_PROGENA; + host->prog_enable = true; + } + if (cmd == cmd->mrq->stop) *c |= MCI_CSPM_MCIABORT; @@ -626,6 +631,8 @@ msmsdcc_start_data(struct msmsdcc_host *host, struct mmc_data *data, } dsb(); msm_dmov_enqueue_cmd_ext(host->dma.channel, &host->dma.hdr); + if (data->flags & MMC_DATA_WRITE) + host->prog_scan = true; } else { msmsdcc_writel(host, timeout, MMCIDATATIMER); @@ -920,8 +927,23 @@ static void msmsdcc_do_cmdirq(struct msmsdcc_host *host, uint32_t status) else if (host->curr.data) { /* Non DMA */ msmsdcc_stop_data(host); msmsdcc_request_end(host, cmd->mrq); - } else /* host->data == NULL */ + } else { /* host->data == NULL */ + if (!cmd->error && host->prog_enable) { + if (status & MCI_PROGDONE) { + host->prog_scan = false; + host->prog_enable = false; msmsdcc_request_end(host, cmd->mrq); + } else { + host->curr.cmd = cmd; + } + } else { + if (host->prog_enable) { + host->prog_scan = false; + host->prog_enable = false; + } + msmsdcc_request_end(host, cmd->mrq); + } + } } else if (cmd->data) if (!(cmd->data->flags & MMC_DATA_READ)) msmsdcc_start_data(host, cmd->data, @@ -935,7 +957,7 @@ msmsdcc_handle_irq_data(struct msmsdcc_host *host, u32 status, struct mmc_data *data = host->curr.data; if (status & (MCI_CMDSENT | MCI_CMDRESPEND | MCI_CMDCRCFAIL | - MCI_CMDTIMEOUT) && host->curr.cmd) { + MCI_CMDTIMEOUT | MCI_PROGDONE) && host->curr.cmd) { msmsdcc_do_cmdirq(host, status); } diff --git a/drivers/mmc/host/msm_sdcc.h b/drivers/mmc/host/msm_sdcc.h index 16504610..78d8b1ac 100755 --- a/drivers/mmc/host/msm_sdcc.h +++ b/drivers/mmc/host/msm_sdcc.h @@ -155,7 +155,7 @@ #define MCI_IRQENABLE \ (MCI_CMDCRCFAILMASK|MCI_DATACRCFAILMASK|MCI_CMDTIMEOUTMASK| \ MCI_DATATIMEOUTMASK|MCI_TXUNDERRUNMASK|MCI_RXOVERRUNMASK| \ - MCI_CMDRESPENDMASK|MCI_CMDSENTMASK|MCI_DATAENDMASK) + MCI_CMDRESPENDMASK|MCI_CMDSENTMASK|MCI_DATAENDMASK|MCI_PROGDONEMASK) /* * The size of the FIFO in bytes. @@ -264,8 +264,6 @@ struct msmsdcc_host { #ifdef CONFIG_MMC_AUTO_SUSPEND unsigned long suspended; #endif - unsigned int prog_scan; - unsigned int prog_enable; /* Command parameters */ unsigned int cmd_timeout; unsigned int cmd_pio_irqmask; @@ -276,6 +274,8 @@ struct msmsdcc_host { unsigned int dummy_52_needed; unsigned int dummy_52_state; + bool prog_scan; + bool prog_enable; }; #endif From 08802ee5be4234c1fd62a6cf1bcd6c5f8c4216b6 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Thu, 23 Aug 2012 00:43:40 +0800 Subject: [PATCH 31/35] mmc: msm_sdcc: Fix bug in PIO mode when data size is not word aligned The current code for PIO doesn't transfer whole data when data size is not in multiple of 4 bytes. The last few bytes are not written to the card resulting in no DATAEND interrupt from SDCC. This patch allows data transfer for non-aligned data size in PIO mode. --- drivers/mmc/host/msm_sdcc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/msm_sdcc.c b/drivers/mmc/host/msm_sdcc.c index c022cc2f..ed4d6f13 100755 --- a/drivers/mmc/host/msm_sdcc.c +++ b/drivers/mmc/host/msm_sdcc.c @@ -722,6 +722,9 @@ msmsdcc_pio_read(struct msmsdcc_host *host, char *buffer, unsigned int remain) count += remain; }else #endif + if (remain % 4) + remain = ((remain >> 2) + 1) << 2; + while (msmsdcc_readl(host, MMCISTATUS) & MCI_RXDATAAVLBL) { *ptr = msmsdcc_readl(host, MMCIFIFO + (count % MCI_FIFOSIZE)); ptr++; @@ -758,13 +761,14 @@ msmsdcc_pio_write(struct msmsdcc_host *host, char *buffer, } else { #endif do { - unsigned int count, maxcnt; + unsigned int count, maxcnt, sz; maxcnt = status & MCI_TXFIFOEMPTY ? MCI_FIFOSIZE : MCI_FIFOHALFSIZE; count = min(remain, maxcnt); - writesl(base + MMCIFIFO, ptr, count >> 2); + sz = count % 4 ? (count >> 2) + 1 : (count >> 2); + writesl(base + MMCIFIFO, ptr, sz); ptr += count; remain -= count; From 4eade398cdd8ce0f9b1e715fe49e6d86a2a18982 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Thu, 23 Aug 2012 01:05:06 +0800 Subject: [PATCH 32/35] mmc: msm: fix dma usage not to use internal APIs Remove parts of this driver which use internal API calls. This replaces the calls as suggested by Russell King. --- drivers/mmc/host/msm_sdcc.c | 49 +++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/drivers/mmc/host/msm_sdcc.c b/drivers/mmc/host/msm_sdcc.c index ed4d6f13..e22fc1cb 100755 --- a/drivers/mmc/host/msm_sdcc.c +++ b/drivers/mmc/host/msm_sdcc.c @@ -465,14 +465,30 @@ static int msmsdcc_config_dma(struct msmsdcc_host *host, struct mmc_data *data) host->curr.user_pages = 0; box = &nc->cmd[0]; - for (i = 0; i < host->dma.num_ents; i++) { + + /* location of command block must be 64 bit aligned */ + BUG_ON(host->dma.cmd_busaddr & 0x07); + + nc->cmdptr = (host->dma.cmd_busaddr >> 3) | CMD_PTR_LP; + host->dma.hdr.cmdptr = DMOV_CMD_PTR_LIST | + DMOV_CMD_ADDR(host->dma.cmdptr_busaddr); + host->dma.hdr.complete_func = msmsdcc_dma_complete_func; + + n = dma_map_sg(mmc_dev(host->mmc), host->dma.sg, + host->dma.num_ents, host->dma.dir); + if (n == 0) { + printk(KERN_ERR "%s: Unable to map in all sg elements\n", + mmc_hostname(host->mmc)); + host->dma.sg = NULL; + host->dma.num_ents = 0; + return -ENOMEM; + } + + for_each_sg(host->dma.sg, sg, n, i) { + box->cmd = CMD_MODE_BOX; - /* Initialize sg dma address */ - sg->dma_address = page_to_dma(mmc_dev(host->mmc), sg_page(sg)) - + sg->offset; - - if (i == (host->dma.num_ents - 1)) + if (i == n - 1) box->cmd |= CMD_LC; rows = (sg_dma_len(sg) % MCI_FIFOSIZE) ? (sg_dma_len(sg) / MCI_FIFOSIZE) + 1 : @@ -500,27 +516,6 @@ static int msmsdcc_config_dma(struct msmsdcc_host *host, struct mmc_data *data) box->cmd |= CMD_DST_CRCI(crci); } box++; - sg++; - } - - /* location of command block must be 64 bit aligned */ - BUG_ON(host->dma.cmd_busaddr & 0x07); - - nc->cmdptr = (host->dma.cmd_busaddr >> 3) | CMD_PTR_LP; - host->dma.hdr.cmdptr = DMOV_CMD_PTR_LIST | - DMOV_CMD_ADDR(host->dma.cmdptr_busaddr); - host->dma.hdr.complete_func = msmsdcc_dma_complete_func; - - n = dma_map_sg(mmc_dev(host->mmc), host->dma.sg, - host->dma.num_ents, host->dma.dir); -/* dsb inside dma_map_sg will write nc out to mem as well */ - - if (n != host->dma.num_ents) { - printk(KERN_ERR "%s: Unable to map in all sg elements\n", - mmc_hostname(host->mmc)); - host->dma.sg = NULL; - host->dma.num_ents = 0; - return -ENOMEM; } return 0; From 2145485d7d4f0d9cda8f51725269b0421ebac734 Mon Sep 17 00:00:00 2001 From: SecureCRT Date: Thu, 23 Aug 2012 02:45:27 +0800 Subject: [PATCH 33/35] mmc: change clock from 50Mhz to 64Mhz PCLK mmc: change NR_SG from 32 to 128 for better performance --- drivers/mmc/host/msm_sdcc.c | 2 +- drivers/mmc/host/msm_sdcc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/msm_sdcc.c b/drivers/mmc/host/msm_sdcc.c index e22fc1cb..4fef37f7 100755 --- a/drivers/mmc/host/msm_sdcc.c +++ b/drivers/mmc/host/msm_sdcc.c @@ -73,7 +73,7 @@ static int msmsdcc_auto_suspend(struct mmc_host *, int); #define BUSCLK_TIMEOUT (HZ) #define SQN_BUSCLK_TIMEOUT (5 * HZ) static unsigned int msmsdcc_fmin = 144000; -static unsigned int msmsdcc_fmax = 50000000; +static unsigned int msmsdcc_fmax = 64000000; static unsigned int msmsdcc_4bit = 1; static unsigned int msmsdcc_pwrsave = 1; static unsigned int msmsdcc_piopoll = 1; diff --git a/drivers/mmc/host/msm_sdcc.h b/drivers/mmc/host/msm_sdcc.h index 78d8b1ac..1368fc0c 100755 --- a/drivers/mmc/host/msm_sdcc.h +++ b/drivers/mmc/host/msm_sdcc.h @@ -164,7 +164,7 @@ #define MCI_FIFOHALFSIZE (MCI_FIFOSIZE / 2) -#define NR_SG 32 +#define NR_SG 128 struct clk; From 4b10fc19b9dfa387ee0744a968503726b2d4231e Mon Sep 17 00:00:00 2001 From: securecrt Date: Thu, 23 Aug 2012 12:39:53 +0800 Subject: [PATCH 34/35] staging: Add Snappy compression support to zram Zram currently uses LZO compression. With Snappy, it uses less CPU time and is thus more useful. The sacrifice in compression ratio is small. Zram's LZO and Snappy support can be independently enabled at compile time and each zram device can switch between compression methods when unused. When only a single compression method was enabled at compile time, no idirection penalty is incurred. http://driverdev.linuxdriverproject.org/pipermail/devel/2011-April/015114.html --- arch/arm/configs/htcleo_defconfig | 4 + drivers/staging/Kconfig | 2 + drivers/staging/Makefile | 3 +- drivers/staging/snappy/Kconfig | 5 + drivers/staging/snappy/Makefile | 5 + drivers/staging/snappy/csnappy.h | 125 +++++ drivers/staging/snappy/csnappy_compress.c | 497 ++++++++++++++++++++ drivers/staging/snappy/csnappy_decompress.c | 321 +++++++++++++ drivers/staging/snappy/csnappy_internal.h | 83 ++++ drivers/staging/zram/Kconfig | 14 +- drivers/staging/zram/zram_drv.c | 101 +++- drivers/staging/zram/zram_drv.h | 30 ++ drivers/staging/zram/zram_sysfs.c | 58 +++ 13 files changed, 1233 insertions(+), 15 deletions(-) mode change 100644 => 100755 drivers/staging/Kconfig mode change 100644 => 100755 drivers/staging/Makefile create mode 100755 drivers/staging/snappy/Kconfig create mode 100755 drivers/staging/snappy/Makefile create mode 100755 drivers/staging/snappy/csnappy.h create mode 100755 drivers/staging/snappy/csnappy_compress.c create mode 100755 drivers/staging/snappy/csnappy_decompress.c create mode 100755 drivers/staging/snappy/csnappy_internal.h diff --git a/arch/arm/configs/htcleo_defconfig b/arch/arm/configs/htcleo_defconfig index 12f408f2..8d26b82b 100644 --- a/arch/arm/configs/htcleo_defconfig +++ b/arch/arm/configs/htcleo_defconfig @@ -1697,6 +1697,10 @@ CONFIG_ZRAM_NUM_DEVICES=1 CONFIG_ZRAM_DEFAULT_PERCENTAGE=18 # CONFIG_ZRAM_DEBUG is not set CONFIG_ZRAM_DEFAULT_DISKSIZE=100000000 +# CONFIG_ZRAM_LZO is not set +CONFIG_ZRAM_SNAPPY=y +CONFIG_SNAPPY_COMPRESS=y +CONFIG_SNAPPY_DECOMPRESS=y # # File systems diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig old mode 100644 new mode 100755 index 8ee4bfa6..b8964347 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -125,5 +125,7 @@ source "drivers/staging/iio/Kconfig" source "drivers/staging/zram/Kconfig" +source "drivers/staging/snappy/Kconfig" + endif # !STAGING_EXCLUDE_BUILD endif # STAGING diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile old mode 100644 new mode 100755 index 5a1b7341..621dc916 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -45,4 +45,5 @@ obj-$(CONFIG_DX_SEP) += sep/ obj-$(CONFIG_IIO) += iio/ obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_XVMALLOC) += zram/ - +obj-$(CONFIG_SNAPPY_COMPRESS) += snappy/ +obj-$(CONFIG_SNAPPY_DECOMPRESS) += snappy/ diff --git a/drivers/staging/snappy/Kconfig b/drivers/staging/snappy/Kconfig new file mode 100755 index 00000000..24f69085 --- /dev/null +++ b/drivers/staging/snappy/Kconfig @@ -0,0 +1,5 @@ +config SNAPPY_COMPRESS + tristate "Google Snappy Compression" + +config SNAPPY_DECOMPRESS + tristate "Google Snappy Decompression" diff --git a/drivers/staging/snappy/Makefile b/drivers/staging/snappy/Makefile new file mode 100755 index 00000000..399d070a --- /dev/null +++ b/drivers/staging/snappy/Makefile @@ -0,0 +1,5 @@ +snappy_compress-objs := csnappy_compress.o +snappy_decompress-objs := csnappy_decompress.o + +obj-$(CONFIG_SNAPPY_COMPRESS) += csnappy_compress.o +obj-$(CONFIG_SNAPPY_DECOMPRESS) += csnappy_decompress.o diff --git a/drivers/staging/snappy/csnappy.h b/drivers/staging/snappy/csnappy.h new file mode 100755 index 00000000..1e0a54ea --- /dev/null +++ b/drivers/staging/snappy/csnappy.h @@ -0,0 +1,125 @@ +#ifndef __CSNAPPY_H__ +#define __CSNAPPY_H__ +/* +File modified for the Linux Kernel by +Zeev Tarantov +*/ +#ifdef __cplusplus +extern "C" { +#endif + +#define CSNAPPY_VERSION 4 + +#define CSNAPPY_WORKMEM_BYTES_POWER_OF_TWO 15 +#define CSNAPPY_WORKMEM_BYTES (1 << CSNAPPY_WORKMEM_BYTES_POWER_OF_TWO) + +/* + * Returns the maximal size of the compressed representation of + * input data that is "source_len" bytes in length; + */ +uint32_t +csnappy_max_compressed_length(uint32_t source_len) __attribute__((const)); + +/* + * Flat array compression that does not emit the "uncompressed length" + * prefix. Compresses "input" array to the "output" array. + * + * REQUIRES: "input" is at most 32KiB long. + * REQUIRES: "output" points to an array of memory that is at least + * "csnappy_max_compressed_length(input_length)" in size. + * REQUIRES: working_memory has (1 << workmem_bytes_power_of_two) bytes. + * REQUIRES: 9 <= workmem_bytes_power_of_two <= 15. + * + * Returns an "end" pointer into "output" buffer. + * "end - output" is the compressed size of "input". + */ +char* +csnappy_compress_fragment( + const char *input, + const uint32_t input_length, + char *output, + void *working_memory, + const int workmem_bytes_power_of_two); + +/* + * REQUIRES: "compressed" must point to an area of memory that is at + * least "csnappy_max_compressed_length(input_length)" bytes in length. + * REQUIRES: working_memory has (1 << workmem_bytes_power_of_two) bytes. + * REQUIRES: 9 <= workmem_bytes_power_of_two <= 15. + * + * Takes the data stored in "input[0..input_length]" and stores + * it in the array pointed to by "compressed". + * + * "*out_compressed_length" is set to the length of the compressed output. + */ +void +csnappy_compress( + const char *input, + uint32_t input_length, + char *compressed, + uint32_t *out_compressed_length, + void *working_memory, + const int workmem_bytes_power_of_two); + +/* + * Reads header of compressed data to get stored length of uncompressed data. + * REQUIRES: start points to compressed data. + * REQUIRES: n is length of available compressed data. + * + * Returns SNAPPY_E_HEADER_BAD on error. + * Returns number of bytes read from input on success. + * Stores decoded length into *result. + */ +int +csnappy_get_uncompressed_length( + const char *start, + uint32_t n, + uint32_t *result); + +/* + * Safely decompresses all data from array "src" of length "src_len" containing + * entire compressed stream (with header) into array "dst" of size "dst_len". + * REQUIRES: dst_len is at least csnappy_get_uncompressed_length(...). + * + * Iff sucessful, returns CSNAPPY_E_OK. + * If recorded length in header is greater than dst_len, returns + * CSNAPPY_E_OUTPUT_INSUF. + * If compressed data is malformed, does not write more than dst_len into dst. + */ +int +csnappy_decompress( + const char *src, + uint32_t src_len, + char *dst, + uint32_t dst_len); + +/* + * Safely decompresses stream src_len bytes long read from src to dst. + * Amount of available space at dst must be provided in *dst_len by caller. + * If compressed stream needs more space, it will not overflow and return + * CSNAPPY_E_OUTPUT_OVERRUN. + * On success, sets *dst_len to actal number of bytes decompressed. + * Iff sucessful, returns CSNAPPY_E_OK. + */ +int +csnappy_decompress_noheader( + const char *src, + uint32_t src_len, + char *dst, + uint32_t *dst_len); + +/* + * Return values (< 0 = Error) + */ +#define CSNAPPY_E_OK 0 +#define CSNAPPY_E_HEADER_BAD (-1) +#define CSNAPPY_E_OUTPUT_INSUF (-2) +#define CSNAPPY_E_OUTPUT_OVERRUN (-3) +#define CSNAPPY_E_INPUT_NOT_CONSUMED (-4) +#define CSNAPPY_E_DATA_MALFORMED (-5) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/drivers/staging/snappy/csnappy_compress.c b/drivers/staging/snappy/csnappy_compress.c new file mode 100755 index 00000000..7344f772 --- /dev/null +++ b/drivers/staging/snappy/csnappy_compress.c @@ -0,0 +1,497 @@ +/* +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +File modified for the Linux Kernel by +Zeev Tarantov +*/ + +#include "csnappy_internal.h" +#ifdef __KERNEL__ +#include +#include +#endif +#include "csnappy.h" + + +static inline char* +encode_varint32(char *sptr, uint32_t v) +{ + uint8_t* ptr = (uint8_t *)sptr; + static const int B = 128; + if (v < (1<<7)) { + *(ptr++) = v; + } else if (v < (1<<14)) { + *(ptr++) = v | B; + *(ptr++) = v>>7; + } else if (v < (1<<21)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = v>>14; + } else if (v < (1<<28)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = v>>21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = (v>>21) | B; + *(ptr++) = v>>28; + } + return (char *)ptr; +} + + +/* + * Any hash function will produce a valid compressed bitstream, but a good + * hash function reduces the number of collisions and thus yields better + * compression for compressible input, and more speed for incompressible + * input. Of course, it doesn't hurt if the hash function is reasonably fast + * either, as it gets called a lot. + */ +static inline uint32_t HashBytes(uint32_t bytes, int shift) +{ + uint32_t kMul = 0x1e35a7bd; + return (bytes * kMul) >> shift; +} +static inline uint32_t Hash(const char *p, int shift) +{ + return HashBytes(UNALIGNED_LOAD32(p), shift); +} + + +/* + * *** DO NOT CHANGE THE VALUE OF kBlockSize *** + + * New Compression code chops up the input into blocks of at most + * the following size. This ensures that back-references in the + * output never cross kBlockSize block boundaries. This can be + * helpful in implementing blocked decompression. However the + * decompression code should not rely on this guarantee since older + * compression code may not obey it. + */ +#define kBlockLog 15 +#define kBlockSize (1 << kBlockLog) + + +/* + * Return the largest n such that + * + * s1[0,n-1] == s2[0,n-1] + * and n <= (s2_limit - s2). + * + * Does not read *s2_limit or beyond. + * Does not read *(s1 + (s2_limit - s2)) or beyond. + * Requires that s2_limit >= s2. + * + * Separate implementation for x86_64, for speed. Uses the fact that + * x86_64 is little endian. + */ +#if defined(__x86_64__) +static inline int +FindMatchLength(const char *s1, const char *s2, const char *s2_limit) +{ + uint64_t x; + int matched, matching_bits; + DCHECK_GE(s2_limit, s2); + matched = 0; + /* + * Find out how long the match is. We loop over the data 64 bits at a + * time until we find a 64-bit block that doesn't match; then we find + * the first non-matching bit and use that to calculate the total + * length of the match. + */ + while (likely(s2 <= s2_limit - 8)) { + if (unlikely(UNALIGNED_LOAD64(s1 + matched) == + UNALIGNED_LOAD64(s2))) { + s2 += 8; + matched += 8; + } else { + /* + * On current (mid-2008) Opteron models there is a 3% + * more efficient code sequence to find the first + * non-matching byte. However, what follows is ~10% + * better on Intel Core 2 and newer, and we expect AMD's + * bsf instruction to improve. + */ + x = UNALIGNED_LOAD64(s1 + matched) ^ + UNALIGNED_LOAD64(s2); + matching_bits = FindLSBSetNonZero64(x); + matched += matching_bits >> 3; + return matched; + } + } + while (likely(s2 < s2_limit)) { + if (likely(s1[matched] == *s2)) { + ++s2; + ++matched; + } else { + return matched; + } + } + return matched; +} +#else /* !defined(__x86_64__) */ +static inline int +FindMatchLength(const char *s1, const char *s2, const char *s2_limit) +{ + /* Implementation based on the x86-64 version, above. */ + int matched = 0; + DCHECK_GE(s2_limit, s2); + + while (s2 <= s2_limit - 4 && + UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) { + s2 += 4; + matched += 4; + } +#if defined(__LITTLE_ENDIAN) + if (s2 <= s2_limit - 4) { + uint32_t x = UNALIGNED_LOAD32(s1 + matched) ^ + UNALIGNED_LOAD32(s2); + int matching_bits = FindLSBSetNonZero(x); + matched += matching_bits >> 3; + } else { + while ((s2 < s2_limit) && (s1[matched] == *s2)) { + ++s2; + ++matched; + } + } +#else + while ((s2 < s2_limit) && (s1[matched] == *s2)) { + ++s2; + ++matched; + } +#endif + return matched; +} +#endif /* !defined(__x86_64__) */ + + +static inline char* +EmitLiteral(char *op, const char *literal, int len, int allow_fast_path) +{ + int n = len - 1; /* Zero-length literals are disallowed */ + if (n < 60) { + /* Fits in tag byte */ + *op++ = LITERAL | (n << 2); + /* + The vast majority of copies are below 16 bytes, for which a + call to memcpy is overkill. This fast path can sometimes + copy up to 15 bytes too much, but that is okay in the + main loop, since we have a bit to go on for both sides: + - The input will always have kInputMarginBytes = 15 extra + available bytes, as long as we're in the main loop, and + if not, allow_fast_path = false. + - The output will always have 32 spare bytes (see + snappy_max_compressed_length). + */ + if (allow_fast_path && len <= 16) { + UNALIGNED_STORE64(op, UNALIGNED_LOAD64(literal)); + UNALIGNED_STORE64(op + 8, + UNALIGNED_LOAD64(literal + 8)); + return op + len; + } + } else { + /* Encode in upcoming bytes */ + char *base = op; + int count = 0; + op++; + while (n > 0) { + *op++ = n & 0xff; + n >>= 8; + count++; + } + DCHECK_GE(count, 1); + DCHECK_LE(count, 4); + *base = LITERAL | ((59+count) << 2); + } + memcpy(op, literal, len); + return op + len; +} + +static inline char* +EmitCopyLessThan64(char *op, int offset, int len) +{ + DCHECK_LE(len, 64); + DCHECK_GE(len, 4); + DCHECK_LT(offset, 65536); + + if ((len < 12) && (offset < 2048)) { + int len_minus_4 = len - 4; + DCHECK_LT(len_minus_4, 8); /* Must fit in 3 bits */ + *op++ = COPY_1_BYTE_OFFSET | + ((len_minus_4) << 2) | + ((offset >> 8) << 5); + *op++ = offset & 0xff; + } else { + *op++ = COPY_2_BYTE_OFFSET | ((len-1) << 2); + put_unaligned_le16(offset, op); + op += 2; + } + return op; +} + +static inline char* +EmitCopy(char *op, int offset, int len) +{ + /* Emit 64 byte copies but make sure to keep at least four bytes + * reserved */ + while (len >= 68) { + op = EmitCopyLessThan64(op, offset, 64); + len -= 64; + } + + /* Emit an extra 60 byte copy if have too much data to fit in one + * copy */ + if (len > 64) { + op = EmitCopyLessThan64(op, offset, 60); + len -= 60; + } + + /* Emit remainder */ + op = EmitCopyLessThan64(op, offset, len); + return op; +} + + +/* + * For 0 <= offset <= 4, GetUint32AtOffset(UNALIGNED_LOAD64(p), offset) will + * equal UNALIGNED_LOAD32(p + offset). Motivation: On x86-64 hardware we have + * empirically found that overlapping loads such as + * UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2) + * are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32_t. + */ +static inline uint32_t +GetUint32AtOffset(uint64_t v, int offset) +{ + DCHECK(0 <= offset && offset <= 4); +#ifdef __LITTLE_ENDIAN + return v >> (8 * offset); +#else + return v >> (32 - 8 * offset); +#endif +} + +#define kInputMarginBytes 15 +char* +csnappy_compress_fragment( + const char *input, + const uint32_t input_size, + char *op, + void *working_memory, + const int workmem_bytes_power_of_two) +{ + const char *ip, *ip_end, *base_ip, *next_emit, *ip_limit, *next_ip, + *candidate, *base; + uint16_t *table = (uint16_t *)working_memory; + uint64_t input_bytes; + uint32_t hash, next_hash, prev_hash, cur_hash, skip, candidate_bytes; + int shift, matched; + + DCHECK_GE(workmem_bytes_power_of_two, 9); + DCHECK_LE(workmem_bytes_power_of_two, 15); + /* Table of 2^X bytes, need (X-1) bits to address table of uint16_t. + * How many bits of 32bit hash function result are discarded? */ + shift = 33 - workmem_bytes_power_of_two; + /* "ip" is the input pointer, and "op" is the output pointer. */ + ip = input; + DCHECK_LE(input_size, kBlockSize); + ip_end = input + input_size; + base_ip = ip; + /* Bytes in [next_emit, ip) will be emitted as literal bytes. Or + [next_emit, ip_end) after the main loop. */ + next_emit = ip; + + if (unlikely(input_size < kInputMarginBytes)) + goto emit_remainder; + + memset(working_memory, 0, 1 << workmem_bytes_power_of_two); + + ip_limit = input + input_size - kInputMarginBytes; + next_hash = Hash(++ip, shift); + +main_loop: + DCHECK_LT(next_emit, ip); + /* + * The body of this loop calls EmitLiteral once and then EmitCopy one or + * more times. (The exception is that when we're close to exhausting + * the input we goto emit_remainder.) + * + * In the first iteration of this loop we're just starting, so + * there's nothing to copy, so calling EmitLiteral once is + * necessary. And we only start a new iteration when the + * current iteration has determined that a call to EmitLiteral will + * precede the next call to EmitCopy (if any). + * + * Step 1: Scan forward in the input looking for a 4-byte-long match. + * If we get close to exhausting the input then goto emit_remainder. + * + * Heuristic match skipping: If 32 bytes are scanned with no matches + * found, start looking only at every other byte. If 32 more bytes are + * scanned, look at every third byte, etc.. When a match is found, + * immediately go back to looking at every byte. This is a small loss + * (~5% performance, ~0.1% density) for compressible data due to more + * bookkeeping, but for non-compressible data (such as JPEG) it's a huge + * win since the compressor quickly "realizes" the data is incompressible + * and doesn't bother looking for matches everywhere. + * + * The "skip" variable keeps track of how many bytes there are since the + * last match; dividing it by 32 (ie. right-shifting by five) gives the + * number of bytes to move ahead for each iteration. + */ + skip = 32; + + next_ip = ip; + do { + ip = next_ip; + hash = next_hash; + DCHECK_EQ(hash, Hash(ip, shift)); + next_ip = ip + (skip++ >> 5); + if (unlikely(next_ip > ip_limit)) + goto emit_remainder; + next_hash = Hash(next_ip, shift); + candidate = base_ip + table[hash]; + DCHECK_GE(candidate, base_ip); + DCHECK_LT(candidate, ip); + + table[hash] = ip - base_ip; + } while (likely(UNALIGNED_LOAD32(ip) != + UNALIGNED_LOAD32(candidate))); + + /* + * Step 2: A 4-byte match has been found. We'll later see if more + * than 4 bytes match. But, prior to the match, input + * bytes [next_emit, ip) are unmatched. Emit them as "literal bytes." + */ + DCHECK_LE(next_emit + 16, ip_end); + op = EmitLiteral(op, next_emit, ip - next_emit, 1); + + /* + * Step 3: Call EmitCopy, and then see if another EmitCopy could + * be our next move. Repeat until we find no match for the + * input immediately after what was consumed by the last EmitCopy call. + * + * If we exit this loop normally then we need to call EmitLiteral next, + * though we don't yet know how big the literal will be. We handle that + * by proceeding to the next iteration of the main loop. We also can exit + * this loop via goto if we get close to exhausting the input. + */ + input_bytes = 0; + candidate_bytes = 0; + + do { + /* We have a 4-byte match at ip, and no need to emit any + "literal bytes" prior to ip. */ + base = ip; + matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end); + ip += matched; + DCHECK_EQ(0, memcmp(base, candidate, matched)); + op = EmitCopy(op, base - candidate, matched); + /* We could immediately start working at ip now, but to improve + compression we first update table[Hash(ip - 1, ...)]. */ + next_emit = ip; + if (unlikely(ip >= ip_limit)) + goto emit_remainder; + input_bytes = UNALIGNED_LOAD64(ip - 1); + prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift); + table[prev_hash] = ip - base_ip - 1; + cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift); + candidate = base_ip + table[cur_hash]; + candidate_bytes = UNALIGNED_LOAD32(candidate); + table[cur_hash] = ip - base_ip; + } while (GetUint32AtOffset(input_bytes, 1) == candidate_bytes); + + next_hash = HashBytes(GetUint32AtOffset(input_bytes, 2), shift); + ++ip; + goto main_loop; + +emit_remainder: + /* Emit the remaining bytes as a literal */ + if (next_emit < ip_end) + op = EmitLiteral(op, next_emit, ip_end - next_emit, 0); + + return op; +} +#if defined(__KERNEL__) && !defined(STATIC) +EXPORT_SYMBOL(csnappy_compress_fragment); +#endif + +uint32_t __attribute__((const)) +csnappy_max_compressed_length(uint32_t source_len) +{ + return 32 + source_len + source_len/6; +} +#if defined(__KERNEL__) && !defined(STATIC) +EXPORT_SYMBOL(csnappy_max_compressed_length); +#endif + +void +csnappy_compress( + const char *input, + uint32_t input_length, + char *compressed, + uint32_t *compressed_length, + void *working_memory, + const int workmem_bytes_power_of_two) +{ + int workmem_size; + int num_to_read; + uint32_t written = 0; + char *p = encode_varint32(compressed, input_length); + written += (p - compressed); + compressed = p; + while (input_length > 0) { + num_to_read = min(input_length, (uint32_t)kBlockSize); + workmem_size = workmem_bytes_power_of_two; + if (num_to_read < kBlockSize) { + for (workmem_size = 9; + workmem_size < workmem_bytes_power_of_two; + ++workmem_size) { + if ((1 << (workmem_size-1)) >= num_to_read) + break; + } + } + p = csnappy_compress_fragment( + input, num_to_read, compressed, + working_memory, workmem_size); + written += (p - compressed); + compressed = p; + input_length -= num_to_read; + input += num_to_read; + } + *compressed_length = written; +} +#if defined(__KERNEL__) && !defined(STATIC) +EXPORT_SYMBOL(csnappy_compress); + +MODULE_LICENSE("BSD"); +MODULE_DESCRIPTION("Snappy Compressor"); +#endif diff --git a/drivers/staging/snappy/csnappy_decompress.c b/drivers/staging/snappy/csnappy_decompress.c new file mode 100755 index 00000000..d05d8173 --- /dev/null +++ b/drivers/staging/snappy/csnappy_decompress.c @@ -0,0 +1,321 @@ +/* +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +File modified for the Linux Kernel by +Zeev Tarantov +*/ + +#include "csnappy_internal.h" +#ifdef __KERNEL__ +#include +#include +#endif +#include "csnappy.h" + + +/* Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits */ +static const uint32_t wordmask[] = { + 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu +}; + +/* + * Data stored per entry in lookup table: + * Range Bits-used Description + * ------------------------------------ + * 1..64 0..7 Literal/copy length encoded in opcode byte + * 0..7 8..10 Copy offset encoded in opcode byte / 256 + * 0..4 11..13 Extra bytes after opcode + * + * We use eight bits for the length even though 7 would have sufficed + * because of efficiency reasons: + * (1) Extracting a byte is faster than a bit-field + * (2) It properly aligns copy offset so we do not need a <<8 + */ +static const uint16_t char_table[256] = { + 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002, + 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004, + 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006, + 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008, + 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a, + 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c, + 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e, + 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010, + 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012, + 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014, + 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016, + 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018, + 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a, + 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c, + 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e, + 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020, + 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022, + 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024, + 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026, + 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028, + 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a, + 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c, + 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e, + 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030, + 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032, + 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034, + 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036, + 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038, + 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a, + 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c, + 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e, + 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040 +}; + +/* + * Copy "len" bytes from "src" to "op", one byte at a time. Used for + * handling COPY operations where the input and output regions may + * overlap. For example, suppose: + * src == "ab" + * op == src + 2 + * len == 20 + * After IncrementalCopy(src, op, len), the result will have + * eleven copies of "ab" + * ababababababababababab + * Note that this does not match the semantics of either memcpy() + * or memmove(). + */ +static inline void IncrementalCopy(const char *src, char *op, int len) +{ + DCHECK_GT(len, 0); + do { + *op++ = *src++; + } while (--len > 0); +} + +/* + * Equivalent to IncrementalCopy except that it can write up to ten extra + * bytes after the end of the copy, and that it is faster. + * + * The main part of this loop is a simple copy of eight bytes at a time until + * we've copied (at least) the requested amount of bytes. However, if op and + * src are less than eight bytes apart (indicating a repeating pattern of + * length < 8), we first need to expand the pattern in order to get the correct + * results. For instance, if the buffer looks like this, with the eight-byte + * and patterns marked as intervals: + * + * abxxxxxxxxxxxx + * [------] src + * [------] op + * + * a single eight-byte copy from to will repeat the pattern once, + * after which we can move two bytes without moving : + * + * ababxxxxxxxxxx + * [------] src + * [------] op + * + * and repeat the exercise until the two no longer overlap. + * + * This allows us to do very well in the special case of one single byte + * repeated many times, without taking a big hit for more general cases. + * + * The worst case of extra writing past the end of the match occurs when + * op - src == 1 and len == 1; the last copy will read from byte positions + * [0..7] and write to [4..11], whereas it was only supposed to write to + * position 1. Thus, ten excess bytes. + */ +static const int kMaxIncrementCopyOverflow = 10; +static inline void IncrementalCopyFastPath(const char *src, char *op, int len) +{ + while (op - src < 8) { + UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src)); + len -= op - src; + op += op - src; + } + while (len > 0) { + UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src)); + src += 8; + op += 8; + len -= 8; + } +} + + +/* A type that writes to a flat array. */ +struct SnappyArrayWriter { + char *base; + char *op; + char *op_limit; +}; + +static inline int +SAW__Append(struct SnappyArrayWriter *this, + const char *ip, uint32_t len, int allow_fast_path) +{ + char *op = this->op; + const int space_left = this->op_limit - op; + /*Fast path, used for the majority (about 90%) of dynamic invocations.*/ + if (allow_fast_path && len <= 16 && space_left >= 16) { + UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip)); + UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8)); + } else { + if (space_left < len) + return CSNAPPY_E_OUTPUT_OVERRUN; + memcpy(op, ip, len); + } + this->op = op + len; + return CSNAPPY_E_OK; +} + +static inline int +SAW__AppendFromSelf(struct SnappyArrayWriter *this, + uint32_t offset, uint32_t len) +{ + char *op = this->op; + const int space_left = this->op_limit - op; + /* -1u catches offset==0 */ + if (op - this->base <= offset - 1u) + return CSNAPPY_E_DATA_MALFORMED; + /* Fast path, used for the majority (70-80%) of dynamic invocations. */ + if (len <= 16 && offset >= 8 && space_left >= 16) { + UNALIGNED_STORE64(op, UNALIGNED_LOAD64(op - offset)); + UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(op - offset + 8)); + } else if (space_left >= len + kMaxIncrementCopyOverflow) { + IncrementalCopyFastPath(op - offset, op, len); + } else { + if (space_left < len) + return CSNAPPY_E_OUTPUT_OVERRUN; + IncrementalCopy(op - offset, op, len); + } + this->op = op + len; + return CSNAPPY_E_OK; +} + + +int +csnappy_get_uncompressed_length( + const char *src, + uint32_t src_len, + uint32_t *result) +{ + const char *src_base = src; + uint32_t shift = 0; + uint8_t c; + /* Length is encoded in 1..5 bytes */ + *result = 0; + for (;;) { + if (shift >= 32) + goto err_out; + if (src_len == 0) + goto err_out; + c = *(const uint8_t *)src++; + src_len -= 1; + *result |= (uint32_t)(c & 0x7f) << shift; + if (c < 128) + break; + shift += 7; + } + return src - src_base; +err_out: + return CSNAPPY_E_HEADER_BAD; +} +#if defined(__KERNEL__) && !defined(STATIC) +EXPORT_SYMBOL(csnappy_get_uncompressed_length); +#endif + +int +csnappy_decompress_noheader( + const char *src, + uint32_t src_remaining, + char *dst, + uint32_t *dst_len) +{ + struct SnappyArrayWriter writer; + uint32_t length, trailer, opword, extra_bytes; + int ret; + uint8_t opcode; + char scratch[5]; + writer.op = writer.base = dst; + writer.op_limit = writer.op + *dst_len; + while (src_remaining) { + if (unlikely(src_remaining < 5)) { + memcpy(scratch, src, src_remaining); + src = scratch; + } + opcode = *(const uint8_t *)src++; + opword = char_table[opcode]; + extra_bytes = opword >> 11; + trailer = get_unaligned_le32(src) & wordmask[extra_bytes]; + src += extra_bytes; + src_remaining -= 1 + extra_bytes; + length = opword & 0xff; + if (opcode & 0x3) { + trailer += opword & 0x700; + ret = SAW__AppendFromSelf(&writer, trailer, length); + if (ret < 0) + return ret; + } else { + length += trailer; + if (unlikely(src_remaining < length)) + return CSNAPPY_E_DATA_MALFORMED; + ret = src_remaining >= 16; + ret = SAW__Append(&writer, src, length, ret); + if (ret < 0) + return ret; + src += length; + src_remaining -= length; + } + } + *dst_len = writer.op - writer.base; + return CSNAPPY_E_OK; +} +#if defined(__KERNEL__) && !defined(STATIC) +EXPORT_SYMBOL(csnappy_decompress_noheader); +#endif + +int +csnappy_decompress( + const char *src, + uint32_t src_len, + char *dst, + uint32_t dst_len) +{ + int n; + uint32_t olen = 0; + /* Read uncompressed length from the front of the compressed input */ + n = csnappy_get_uncompressed_length(src, src_len, &olen); + if (unlikely(n < CSNAPPY_E_OK)) + return n; + /* Protect against possible DoS attack */ + if (unlikely(olen > dst_len)) + return CSNAPPY_E_OUTPUT_INSUF; + return csnappy_decompress_noheader(src + n, src_len - n, dst, &olen); +} +#if defined(__KERNEL__) && !defined(STATIC) +EXPORT_SYMBOL(csnappy_decompress); + +MODULE_LICENSE("BSD"); +MODULE_DESCRIPTION("Snappy Decompressor"); +#endif diff --git a/drivers/staging/snappy/csnappy_internal.h b/drivers/staging/snappy/csnappy_internal.h new file mode 100755 index 00000000..6f1a5465 --- /dev/null +++ b/drivers/staging/snappy/csnappy_internal.h @@ -0,0 +1,83 @@ +/* +Copyright 2011 Google Inc. All Rights Reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Various stubs for the open-source version of Snappy. + +File modified for the Linux Kernel by +Zeev Tarantov +*/ + +#ifndef CSNAPPY_INTERNAL_H_ +#define CSNAPPY_INTERNAL_H_ + +#ifndef __KERNEL__ +#include "csnappy_internal_userspace.h" +#else + +#include +#include +#include +#include +#include + +#ifdef DEBUG +#define DCHECK(cond) if (!(cond)) \ + printk(KERN_DEBUG "assert failed @ %s:%i\n", \ + __FILE__, __LINE__) +#else +#define DCHECK(cond) +#endif + +#define UNALIGNED_LOAD16(_p) get_unaligned((const uint16_t *)(_p)) +#define UNALIGNED_LOAD32(_p) get_unaligned((const uint32_t *)(_p)) +#define UNALIGNED_LOAD64(_p) get_unaligned((const uint64_t *)(_p)) +#define UNALIGNED_STORE16(_p, _val) put_unaligned((_val), (uint16_t *)(_p)) +#define UNALIGNED_STORE32(_p, _val) put_unaligned((_val), (uint32_t *)(_p)) +#define UNALIGNED_STORE64(_p, _val) put_unaligned((_val), (uint64_t *)(_p)) + +#define FindLSBSetNonZero(n) __builtin_ctz(n) +#define FindLSBSetNonZero64(n) __builtin_ctzll(n) + +#endif /* __KERNEL__ */ + +#define DCHECK_EQ(a, b) DCHECK(((a) == (b))) +#define DCHECK_NE(a, b) DCHECK(((a) != (b))) +#define DCHECK_GT(a, b) DCHECK(((a) > (b))) +#define DCHECK_GE(a, b) DCHECK(((a) >= (b))) +#define DCHECK_LT(a, b) DCHECK(((a) < (b))) +#define DCHECK_LE(a, b) DCHECK(((a) <= (b))) + +enum { + LITERAL = 0, + COPY_1_BYTE_OFFSET = 1, /* 3 bit length + 3 bits of offset in opcode */ + COPY_2_BYTE_OFFSET = 2, + COPY_4_BYTE_OFFSET = 3 +}; + +#endif /* CSNAPPY_INTERNAL_H_ */ diff --git a/drivers/staging/zram/Kconfig b/drivers/staging/zram/Kconfig index 3bec4dba..24027b43 100755 --- a/drivers/staging/zram/Kconfig +++ b/drivers/staging/zram/Kconfig @@ -6,8 +6,6 @@ config ZRAM tristate "Compressed RAM block device support" depends on BLOCK && SYSFS select XVMALLOC - select LZO_COMPRESS - select LZO_DECOMPRESS default n help Creates virtual block devices called /dev/zramX (X = 0, 1, ...). @@ -28,3 +26,15 @@ config ZRAM_DEBUG help This option adds additional debugging code to the compressed RAM block device driver. +config ZRAM_LZO + bool "LZO compression" + default y + depends on ZRAM + select LZO_COMPRESS + select LZO_DECOMPRESS + +config ZRAM_SNAPPY + bool "Snappy compression" + depends on ZRAM + select SNAPPY_COMPRESS + select SNAPPY_DECOMPRESS diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 88383651..c61261ac 100755 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -29,12 +29,90 @@ #include #include #include -#include #include #include #include "zram_drv.h" +#if defined(CONFIG_ZRAM_LZO) +#include +#ifdef MULTIPLE_COMPRESSORS +static const struct zram_compressor lzo_compressor = { + .name = "LZO", + .workmem_bytes = LZO1X_MEM_COMPRESS, + .compress = &lzo1x_1_compress, + .decompress = &lzo1x_decompress_safe +}; +#else /* !MULTIPLE_COMPRESSORS */ +#define WMSIZE LZO1X_MEM_COMPRESS +#define COMPRESS(s, sl, d, dl, wm) \ + lzo1x_1_compress(s, sl, d, dl, wm) +#define DECOMPRESS(s, sl, d, dl) \ + lzo1x_decompress_safe(s, sl, d, dl) +#endif /* !MULTIPLE_COMPRESSORS */ +#endif /* defined(CONFIG_ZRAM_LZO) */ + +#if defined(CONFIG_ZRAM_SNAPPY) +#include "../snappy/csnappy.h" /* if built in drivers/staging */ +#define WMSIZE_ORDER ((PAGE_SHIFT > 14) ? (15) : (PAGE_SHIFT+1)) +static int +snappy_compress_( + const unsigned char *src, + size_t src_len, + unsigned char *dst, + size_t *dst_len, + void *workmem) +{ + const unsigned char *end = csnappy_compress_fragment( + src, (uint32_t)src_len, dst, workmem, WMSIZE_ORDER); + *dst_len = end - dst; + return 0; +} +static int +snappy_decompress_( + const unsigned char *src, + size_t src_len, + unsigned char *dst, + size_t *dst_len) +{ + uint32_t dst_len_ = (uint32_t)*dst_len; + int ret = csnappy_decompress_noheader(src, src_len, dst, &dst_len_); + *dst_len = (size_t)dst_len_; + return ret; +} +#ifdef MULTIPLE_COMPRESSORS +static const struct zram_compressor snappy_compressor = { + .name = "SNAPPY", + .workmem_bytes = (1 << WMSIZE_ORDER), + .compress = &snappy_compress_, + .decompress = &snappy_decompress_ +}; +#else /* !MULTIPLE_COMPRESSORS */ +#define WMSIZE (1 << WMSIZE_ORDER) +#define COMPRESS(s, sl, d, dl, wm) \ + snappy_compress_(s, sl, d, dl, wm) +#define DECOMPRESS(s, sl, d, dl) \ + snappy_decompress_(s, sl, d, dl) +#endif /* !MULTIPLE_COMPRESSORS */ +#endif /* defined(CONFIG_ZRAM_SNAPPY) */ + +#ifdef MULTIPLE_COMPRESSORS +const struct zram_compressor * const zram_compressors[] = { +#if defined(CONFIG_ZRAM_LZO) + &lzo_compressor, +#endif +#if defined(CONFIG_ZRAM_SNAPPY) + &snappy_compressor, +#endif + NULL +}; +#define WMSIZE (zram->compressor->workmem_bytes) +#define COMPRESS(s, sl, d, dl, wm) \ + (zram->compressor->compress(s, sl, d, dl, wm)) +#define DECOMPRESS(s, sl, d, dl) \ + (zram->compressor->decompress(s, sl, d, dl)) +#endif /* MULTIPLE_COMPRESSORS */ + /* Globals */ static int zram_major; struct zram *zram_devices; @@ -257,7 +335,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, cmem = kmap_atomic(zram->table[index].page, KM_USER1) + zram->table[index].offset; - ret = lzo1x_decompress_safe(cmem + sizeof(*zheader), + ret = DECOMPRESS(cmem + sizeof(*zheader), xv_get_object_size(cmem) - sizeof(*zheader), uncmem, &clen); @@ -271,7 +349,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, kunmap_atomic(user_mem, KM_USER0); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) { + if (unlikely(ret)) { pr_err("Decompression failed! err=%d, page=%u\n", ret, index); zram_stat64_inc(zram, &zram->stats.failed_reads); return ret; @@ -305,13 +383,13 @@ static int zram_read_before_write(struct zram *zram, char *mem, u32 index) return 0; } - ret = lzo1x_decompress_safe(cmem + sizeof(*zheader), + ret = DECOMPRESS(cmem + sizeof(*zheader), xv_get_object_size(cmem) - sizeof(*zheader), mem, &clen); kunmap_atomic(cmem, KM_USER0); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) { + if (unlikely(ret)) { pr_err("Decompression failed! err=%d, page=%u\n", ret, index); zram_stat64_inc(zram, &zram->stats.failed_reads); return ret; @@ -377,18 +455,13 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen, + COMPRESS(uncmem, PAGE_SIZE, src, &clen, zram->compress_workmem); kunmap_atomic(user_mem, KM_USER0); if (is_partial_io(bvec)) kfree(uncmem); - if (unlikely(ret != LZO_E_OK)) { - pr_err("Compression failed! err=%d\n", ret); - goto out; - } - /* * Page is incompressible. Store it as-is (uncompressed) * since we do not want to return too many disk write @@ -648,7 +721,7 @@ int zram_init_device(struct zram *zram) zram_set_disksize(zram, totalram_pages << PAGE_SHIFT); - zram->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + zram->compress_workmem = kzalloc(WMSIZE, GFP_KERNEL); if (!zram->compress_workmem) { pr_err("Error allocating compressor working memory!\n"); ret = -ENOMEM; @@ -753,6 +826,10 @@ static int create_device(struct zram *zram, int device_id) /* Actual capacity set using syfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); + /* Can be changed using sysfs (/sys/block/zram/compressor) */ +#ifdef MULTIPLE_COMPRESSORS + zram->compressor = zram_compressors[0]; +#endif /* * To ensure that we always get PAGE_SIZE aligned * and n*PAGE_SIZED sized I/O requests. diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h index 31617ee7..6ccb855b 100755 --- a/drivers/staging/zram/zram_drv.h +++ b/drivers/staging/zram/zram_drv.h @@ -66,6 +66,13 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; #define ZRAM_SECTOR_PER_LOGICAL_BLOCK \ (1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT)) +#if defined(CONFIG_ZRAM_LZO) + defined(CONFIG_ZRAM_SNAPPY) == 0 +#error At least one of CONFIG_ZRAM_LZO, CONFIG_ZRAM_SNAPPY must be defined! +#endif +#if defined(CONFIG_ZRAM_LZO) + defined(CONFIG_ZRAM_SNAPPY) > 1 +#define MULTIPLE_COMPRESSORS +#endif + /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { /* Page is stored uncompressed */ @@ -103,6 +110,9 @@ struct zram_stats { struct zram { struct xv_pool *mem_pool; +#ifdef MULTIPLE_COMPRESSORS + const struct zram_compressor *compressor; +#endif void *compress_workmem; void *compress_buffer; struct table *table; @@ -132,5 +142,25 @@ extern struct attribute_group zram_disk_attr_group; extern int zram_init_device(struct zram *zram); extern void __zram_reset_device(struct zram *zram); +#ifdef MULTIPLE_COMPRESSORS +struct zram_compressor { + const char *name; + int (*compress)( + const unsigned char *src, + size_t src_len, + unsigned char *dst, + size_t *dst_len, + void *workmem); + int (*decompress)( + const unsigned char *src, + size_t src_len, + unsigned char *dst, + size_t *dst_len); + unsigned workmem_bytes; +}; + +extern const struct zram_compressor * const zram_compressors[]; +#endif + #endif diff --git a/drivers/staging/zram/zram_sysfs.c b/drivers/staging/zram/zram_sysfs.c index 41e51a2b..231924e0 100755 --- a/drivers/staging/zram/zram_sysfs.c +++ b/drivers/staging/zram/zram_sysfs.c @@ -76,6 +76,57 @@ static ssize_t disksize_store(struct device *dev, return len; } +#ifdef MULTIPLE_COMPRESSORS +static ssize_t compressor_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + char * const buf_base = buf; + const struct zram_compressor *p, *curr; + unsigned int i = 0; + struct zram *zram = dev_to_zram(dev); + curr = zram->compressor; + p = zram_compressors[i]; + while (p) { + if (curr == p) + buf += sprintf(buf, "*"); + buf += sprintf(buf, "%u - %s\n", i, p->name); + p = zram_compressors[++i]; + } + return buf - buf_base; +} + +static ssize_t compressor_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + const struct zram_compressor *p; + unsigned long requested; + unsigned int i = 0; + int ret; + struct zram *zram = dev_to_zram(dev); + + if (zram->init_done) { + pr_info("Cannot change compressor for initialized device\n"); + return -EBUSY; + } + + ret = strict_strtoul(buf, 10, &requested); + if (ret) + return ret; + + p = zram_compressors[i]; + while (p && (i < requested)) + p = zram_compressors[++i]; + + if (!p) { + pr_info("No compressor with index #%lu\n", requested); + return -EINVAL; + } + + zram->compressor = p; + return len; +} +#endif /* MULTIPLE_COMPRESSORS */ + static ssize_t initstate_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -194,6 +245,10 @@ static ssize_t mem_used_total_show(struct device *dev, return sprintf(buf, "%llu\n", val); } +#ifdef MULTIPLE_COMPRESSORS +static DEVICE_ATTR(compressor, S_IRUGO | S_IWUSR, + compressor_show, compressor_store); +#endif static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, disksize_show, disksize_store); static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); @@ -208,6 +263,9 @@ static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); static struct attribute *zram_disk_attrs[] = { +#ifdef MULTIPLE_COMPRESSORS + &dev_attr_compressor.attr, +#endif &dev_attr_disksize.attr, &dev_attr_initstate.attr, &dev_attr_reset.attr, From c9ff1491834a531715dd5658ccf90b524545b7a9 Mon Sep 17 00:00:00 2001 From: securecrt Date: Thu, 23 Aug 2012 12:45:29 +0800 Subject: [PATCH 35/35] disable KSM --- arch/arm/configs/htcleo_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/configs/htcleo_defconfig b/arch/arm/configs/htcleo_defconfig index 8d26b82b..b6f35061 100644 --- a/arch/arm/configs/htcleo_defconfig +++ b/arch/arm/configs/htcleo_defconfig @@ -401,7 +401,7 @@ CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_HAVE_MLOCK=y CONFIG_HAVE_MLOCKED_PAGE_BIT=y -CONFIG_KSM=y +# CONFIG_KSM is not set CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 CONFIG_ALIGNMENT_TRAP=y CONFIG_ALLOW_CPU_ALIGNMENT=y