From fe15ce446beb3a33583af81ffe6c9d01a75314ed Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:23 +1100 Subject: fs: change d_delete semantics Change d_delete from a dentry deletion notification to a dentry caching advise, more like ->drop_inode. Require it to be constant and idempotent, and not take d_lock. This is how all existing filesystems use the callback anyway. This makes fine grained dentry locking of dput and dentry lru scanning much simpler. Signed-off-by: Nick Piggin --- fs/proc/base.c | 2 +- fs/proc/generic.c | 2 +- fs/proc/proc_sysctl.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 182845147fe..d932fdb6a24 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1744,7 +1744,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) return 0; } -static int pid_delete_dentry(struct dentry * dentry) +static int pid_delete_dentry(const struct dentry * dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, diff --git a/fs/proc/generic.c b/fs/proc/generic.c index dd29f033766..1d607be36d9 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -400,7 +400,7 @@ static const struct inode_operations proc_link_inode_operations = { * smarter: we could keep a "volatile" flag in the * inode to indicate which ones to keep. */ -static int proc_delete_dentry(struct dentry * dentry) +static int proc_delete_dentry(const struct dentry * dentry) { return 1; } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b652cb00906..a256d770ea1 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -392,7 +392,7 @@ static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) return !PROC_I(dentry->d_inode)->sysctl->unregistering; } -static int proc_sys_delete(struct dentry *dentry) +static int proc_sys_delete(const struct dentry *dentry) { return !!PROC_I(dentry->d_inode)->sysctl->unregistering; } -- cgit v1.2.3 From 621e155a3591962420eacdd39f6f0aa29ceb221e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:27 +1100 Subject: fs: change d_compare for rcu-walk Change d_compare so it may be called from lock-free RCU lookups. This does put significant restrictions on what may be done from the callback, however there don't seem to have been any problems with in-tree fses. If some strange use case pops up that _really_ cannot cope with the rcu-walk rules, we can just add new rcu-unaware callbacks, which would cause name lookup to drop out of rcu-walk mode. For in-tree filesystems, this is just a mechanical change. Signed-off-by: Nick Piggin --- fs/proc/proc_sysctl.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a256d770ea1..ae4b0fd9033 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -397,15 +397,16 @@ static int proc_sys_delete(const struct dentry *dentry) return !!PROC_I(dentry->d_inode)->sysctl->unregistering; } -static int proc_sys_compare(struct dentry *dir, struct qstr *qstr, - struct qstr *name) +static int proc_sys_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) { - struct dentry *dentry = container_of(qstr, struct dentry, d_name); - if (qstr->len != name->len) + if (name->len != len) return 1; - if (memcmp(qstr->name, name->name, name->len)) + if (memcmp(name->name, str, len)) return 1; - return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl); + return !sysctl_is_seen(PROC_I(inode)->sysctl); } static const struct dentry_operations proc_sys_dentry_operations = { -- cgit v1.2.3 From fa0d7e3de6d6fc5004ad9dea0dd6b286af8f03e9 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:49 +1100 Subject: fs: icache RCU free inodes RCU free the struct inode. This will allow: - Subsequent store-free path walking patch. The inode must be consulted for permissions when walking, so an RCU inode reference is a must. - sb_inode_list_lock to be moved inside i_lock because sb list walkers who want to take i_lock no longer need to take sb_inode_list_lock to walk the list in the first place. This will simplify and optimize locking. - Could remove some nested trylock loops in dcache code - Could potentially simplify things a bit in VM land. Do not need to take the page lock to follow page->mapping. The downsides of this is the performance cost of using RCU. In a simple creat/unlink microbenchmark, performance drops by about 10% due to inability to reuse cache-hot slab objects. As iterations increase and RCU freeing starts kicking over, this increases to about 20%. In cases where inode lifetimes are longer (ie. many inodes may be allocated during the average life span of a single inode), a lot of this cache reuse is not applicable, so the regression caused by this patch is smaller. The cache-hot regression could largely be avoided by using SLAB_DESTROY_BY_RCU, however this adds some complexity to list walking and store-free path walking, so I prefer to implement this at a later date, if it is shown to be a win in real situations. I haven't found a regression in any non-micro benchmark so I doubt it will be a problem. Signed-off-by: Nick Piggin --- fs/proc/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 3ddb6068177..6bcb926b101 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -65,11 +65,18 @@ static struct inode *proc_alloc_inode(struct super_block *sb) return inode; } -static void proc_destroy_inode(struct inode *inode) +static void proc_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } +static void proc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, proc_i_callback); +} + static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; -- cgit v1.2.3 From 31e6b01f4183ff419a6d1f86177cbf4662347cec Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:52 +1100 Subject: fs: rcu-walk for path lookup Perform common cases of path lookups without any stores or locking in the ancestor dentry elements. This is called rcu-walk, as opposed to the current algorithm which is a refcount based walk, or ref-walk. This results in far fewer atomic operations on every path element, significantly improving path lookup performance. It also avoids cacheline bouncing on common dentries, significantly improving scalability. The overall design is like this: * LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk. * Take the RCU lock for the entire path walk, starting with the acquiring of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are not required for dentry persistence. * synchronize_rcu is called when unregistering a filesystem, so we can access d_ops and i_ops during rcu-walk. * Similarly take the vfsmount lock for the entire path walk. So now mnt refcounts are not required for persistence. Also we are free to perform mount lookups, and to assume dentry mount points and mount roots are stable up and down the path. * Have a per-dentry seqlock to protect the dentry name, parent, and inode, so we can load this tuple atomically, and also check whether any of its members have changed. * Dentry lookups (based on parent, candidate string tuple) recheck the parent sequence after the child is found in case anything changed in the parent during the path walk. * inode is also RCU protected so we can load d_inode and use the inode for limited things. * i_mode, i_uid, i_gid can be tested for exec permissions during path walk. * i_op can be loaded. When we reach the destination dentry, we lock it, recheck lookup sequence, and increment its refcount and mountpoint refcount. RCU and vfsmount locks are dropped. This is termed "dropping rcu-walk". If the dentry refcount does not match, we can not drop rcu-walk gracefully at the current point in the lokup, so instead return -ECHILD (for want of a better errno). This signals the path walking code to re-do the entire lookup with a ref-walk. Aside from the final dentry, there are other situations that may be encounted where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take a reference on the last good dentry) and continue with a ref-walk. Again, if we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup using ref-walk. But it is very important that we can continue with ref-walk for most cases, particularly to avoid the overhead of double lookups, and to gain the scalability advantages on common path elements (like cwd and root). The cases where rcu-walk cannot continue are: * NULL dentry (ie. any uncached path element) * parent with d_inode->i_op->permission or ACLs * dentries with d_revalidate * Following links In future patches, permission checks and d_revalidate become rcu-walk aware. It may be possible eventually to make following links rcu-walk aware. Uncached path elements will always require dropping to ref-walk mode, at the very least because i_mutex needs to be grabbed, and objects allocated. Signed-off-by: Nick Piggin --- fs/proc/proc_sysctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ae4b0fd9033..998e3a715bc 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -402,6 +402,10 @@ static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry, const struct inode *inode, unsigned int len, const char *str, const struct qstr *name) { + /* Although proc doesn't have negative dentries, rcu-walk means + * that inode here can be NULL */ + if (!inode) + return 0; if (name->len != len) return 1; if (memcmp(name->name, str, len)) -- cgit v1.2.3 From fb045adb99d9b7c562dc7fef834857f78249daa1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:55 +1100 Subject: fs: dcache reduce branches in lookup path Reduce some branches and memory accesses in dcache lookup by adding dentry flags to indicate common d_ops are set, rather than having to check them. This saves a pointer memory access (dentry->d_op) in common path lookup situations, and saves another pointer load and branch in cases where we have d_op but not the particular operation. Patched with: git grep -E '[.>]([[:space:]])*d_op([[:space:]])*=' | xargs sed -e 's/\([^\t ]*\)->d_op = \(.*\);/d_set_d_op(\1, \2);/' -e 's/\([^\t ]*\)\.d_op = \(.*\);/d_set_d_op(\&\1, \2);/' -i Signed-off-by: Nick Piggin --- fs/proc/base.c | 12 ++++++------ fs/proc/generic.c | 2 +- fs/proc/proc_sysctl.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index d932fdb6a24..85f0a80912a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1969,7 +1969,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; - dentry->d_op = &tid_fd_dentry_operations; + d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (tid_fd_revalidate(dentry, NULL)) @@ -2137,7 +2137,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, ei->fd = fd; inode->i_mode = S_IFREG | S_IRUSR; inode->i_fop = &proc_fdinfo_file_operations; - dentry->d_op = &tid_fd_dentry_operations; + d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (tid_fd_revalidate(dentry, NULL)) @@ -2196,7 +2196,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir, if (p->fop) inode->i_fop = p->fop; ei->op = p->op; - dentry->d_op = &pid_dentry_operations; + d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, NULL)) @@ -2615,7 +2615,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir, if (p->fop) inode->i_fop = p->fop; ei->op = p->op; - dentry->d_op = &proc_base_dentry_operations; + d_set_d_op(dentry, &proc_base_dentry_operations); d_add(dentry, inode); error = NULL; out: @@ -2926,7 +2926,7 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); - dentry->d_op = &pid_dentry_operations; + d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ @@ -3169,7 +3169,7 @@ static struct dentry *proc_task_instantiate(struct inode *dir, inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); - dentry->d_op = &pid_dentry_operations; + d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 1d607be36d9..f766be29d2c 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -439,7 +439,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, out_unlock: if (inode) { - dentry->d_op = &proc_dentry_operations; + d_set_d_op(dentry, &proc_dentry_operations); d_add(dentry, inode); return NULL; } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 998e3a715bc..35efd85a4d3 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -120,7 +120,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; err = NULL; - dentry->d_op = &proc_sys_dentry_operations; + d_set_d_op(dentry, &proc_sys_dentry_operations); d_add(dentry, inode); out: @@ -201,7 +201,7 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent, dput(child); return -ENOMEM; } else { - child->d_op = &proc_sys_dentry_operations; + d_set_d_op(child, &proc_sys_dentry_operations); d_add(child, inode); } } else { -- cgit v1.2.3 From 34286d6662308d82aed891852d04c7c3a2649b16 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:57 +1100 Subject: fs: rcu-walk aware d_revalidate method Require filesystems be aware of .d_revalidate being called in rcu-walk mode (nd->flags & LOOKUP_RCU). For now do a simple push down, returning -ECHILD from all implementations. Signed-off-by: Nick Piggin --- fs/proc/base.c | 33 ++++++++++++++++++++++++++------- fs/proc/proc_sysctl.c | 3 +++ 2 files changed, 29 insertions(+), 7 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 85f0a80912a..dc5b2fcadc3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1719,10 +1719,16 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat */ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); + struct inode *inode; + struct task_struct *task; const struct cred *cred; + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + if (task) { if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { @@ -1888,12 +1894,19 @@ static int proc_fd_link(struct inode *inode, struct path *path) static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - int fd = proc_fd(inode); + struct inode *inode; + struct task_struct *task; + int fd; struct files_struct *files; const struct cred *cred; + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + fd = proc_fd(inode); + if (task) { files = get_files_struct(task); if (files) { @@ -2563,8 +2576,14 @@ static const struct pid_entry proc_base_stuff[] = { */ static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); + struct inode *inode; + struct task_struct *task; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); if (task) { put_task_struct(task); return 1; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 35efd85a4d3..c9097f43b42 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -389,6 +390,8 @@ static const struct inode_operations proc_sys_dir_operations = { static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) { + if (nd->flags & LOOKUP_RCU) + return -ECHILD; return !PROC_I(dentry->d_inode)->sysctl->unregistering; } -- cgit v1.2.3 From b74c79e99389cd79b31fcc08f82c24e492e63c7e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:58 +1100 Subject: fs: provide rcu-walk aware permission i_ops Signed-off-by: Nick Piggin --- fs/proc/base.c | 6 ++++-- fs/proc/proc_sysctl.c | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index dc5b2fcadc3..b953d41d9ab 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2114,11 +2114,13 @@ static const struct file_operations proc_fd_operations = { * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). */ -static int proc_fd_permission(struct inode *inode, int mask) +static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags) { int rv; - rv = generic_permission(inode, mask, NULL); + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + rv = generic_permission(inode, mask, flags, NULL); if (rv == 0) return 0; if (task_pid(current) == proc_pid(inode)) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c9097f43b42..09a1f92a34e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -295,7 +295,7 @@ out: return ret; } -static int proc_sys_permission(struct inode *inode, int mask) +static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags) { /* * sysctl entries that are not writeable, @@ -305,6 +305,9 @@ static int proc_sys_permission(struct inode *inode, int mask) struct ctl_table *table; int error; + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; -- cgit v1.2.3