From 474279dc0f7745124fc76b474c8dc1294f8e87ce Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 1 Oct 2013 16:11:26 -0400 Subject: split __lookup_mnt() in two functions Instead of passing the direction as argument (and checking it on every step through the hash chain), just have separate __lookup_mnt() and __lookup_mnt_last(). And use the standard iterators... Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 645268f23eb6..1f844fbfce72 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1111,7 +1111,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, if (!d_mountpoint(path->dentry)) break; - mounted = __lookup_mnt(path->mnt, path->dentry, 1); + mounted = __lookup_mnt(path->mnt, path->dentry); if (!mounted) break; path->mnt = &mounted->mnt; @@ -1132,7 +1132,7 @@ static void follow_mount_rcu(struct nameidata *nd) { while (d_mountpoint(nd->path.dentry)) { struct mount *mounted; - mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); + mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); if (!mounted) break; nd->path.mnt = &mounted->mnt; -- cgit v1.2.3 From 48a066e72d970a3e225a9c18690d570c736fc455 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Sep 2013 22:06:07 -0400 Subject: RCU'd vfsmounts * RCU-delayed freeing of vfsmounts * vfsmount_lock replaced with a seqlock (mount_lock) * sequence number from mount_lock is stored in nameidata->m_seq and used when we exit RCU mode * new vfsmount flag - MNT_SYNC_UMOUNT. Set by umount_tree() when its caller knows that vfsmount will have no surviving references. * synchronize_rcu() done between unlocking namespace_sem in namespace_unlock() and doing pending mntput(). * new helper: legitimize_mnt(mnt, seq). Checks the mount_lock sequence number against seq, then grabs reference to mnt. Then it rechecks mount_lock again to close the race and either returns success or drops the reference it has acquired. The subtle point is that in case of MNT_SYNC_UMOUNT we can simply decrement the refcount and sod off - aforementioned synchronize_rcu() makes sure that final mntput() won't come until we leave RCU mode. We need that, since we don't want to end up with some lazy pathwalk racing with umount() and stealing the final mntput() from it - caller of umount() may expect it to return only once the fs is shut down and we don't want to break that. In other cases (i.e. with MNT_SYNC_UMOUNT absent) we have to do full-blown mntput() in case of mount_lock sequence number mismatch happening just as we'd grabbed the reference, but in those cases we won't be stealing the final mntput() from anything that would care. * mntput_no_expire() doesn't lock anything on the fast path now. Incidentally, SMP and UP cases are handled the same way - no ifdefs there. * normal pathname resolution does *not* do any writes to mount_lock. It does, of course, bump the refcounts of vfsmount and dentry in the very end, but that's it. Signed-off-by: Al Viro --- fs/dcache.c | 20 +++++--- fs/mount.h | 10 ++-- fs/namei.c | 50 ++++++++++--------- fs/namespace.c | 135 ++++++++++++++++++++++++++++++++------------------ include/linux/mount.h | 2 + include/linux/namei.h | 2 +- 6 files changed, 136 insertions(+), 83 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/dcache.c b/fs/dcache.c index eb0978da1bd4..aafa2a146434 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2887,24 +2887,28 @@ static int prepend_path(const struct path *path, struct vfsmount *vfsmnt = path->mnt; struct mount *mnt = real_mount(vfsmnt); int error = 0; - unsigned seq = 0; + unsigned seq, m_seq = 0; char *bptr; int blen; - br_read_lock(&vfsmount_lock); rcu_read_lock(); +restart_mnt: + read_seqbegin_or_lock(&mount_lock, &m_seq); + seq = 0; restart: bptr = *buffer; blen = *buflen; + error = 0; read_seqbegin_or_lock(&rename_lock, &seq); while (dentry != root->dentry || vfsmnt != root->mnt) { struct dentry * parent; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { + struct mount *parent = ACCESS_ONCE(mnt->mnt_parent); /* Global root? */ - if (mnt_has_parent(mnt)) { - dentry = mnt->mnt_mountpoint; - mnt = mnt->mnt_parent; + if (mnt != parent) { + dentry = ACCESS_ONCE(mnt->mnt_mountpoint); + mnt = parent; vfsmnt = &mnt->mnt; continue; } @@ -2938,7 +2942,11 @@ restart: goto restart; } done_seqretry(&rename_lock, seq); - br_read_unlock(&vfsmount_lock); + if (need_seqretry(&mount_lock, m_seq)) { + m_seq = 1; + goto restart_mnt; + } + done_seqretry(&mount_lock, m_seq); if (error >= 0 && bptr == *buffer) { if (--blen < 0) diff --git a/fs/mount.h b/fs/mount.h index f0866076de6e..d64c594be6c4 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -1,7 +1,6 @@ #include #include #include -#include struct mnt_namespace { atomic_t count; @@ -30,6 +29,7 @@ struct mount { struct mount *mnt_parent; struct dentry *mnt_mountpoint; struct vfsmount mnt; + struct rcu_head mnt_rcu; #ifdef CONFIG_SMP struct mnt_pcp __percpu *mnt_pcp; #else @@ -80,21 +80,23 @@ static inline int is_mounted(struct vfsmount *mnt) extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); +extern bool legitimize_mnt(struct vfsmount *, unsigned); + static inline void get_mnt_ns(struct mnt_namespace *ns) { atomic_inc(&ns->count); } -extern struct lglock vfsmount_lock; +extern seqlock_t mount_lock; static inline void lock_mount_hash(void) { - br_write_lock(&vfsmount_lock); + write_seqlock(&mount_lock); } static inline void unlock_mount_hash(void) { - br_write_unlock(&vfsmount_lock); + write_sequnlock(&mount_lock); } struct proc_mounts { diff --git a/fs/namei.c b/fs/namei.c index 1f844fbfce72..cb0ebae07e52 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -484,14 +484,12 @@ EXPORT_SYMBOL(path_put); static inline void lock_rcu_walk(void) { - br_read_lock(&vfsmount_lock); rcu_read_lock(); } static inline void unlock_rcu_walk(void) { rcu_read_unlock(); - br_read_unlock(&vfsmount_lock); } /** @@ -512,26 +510,23 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) BUG_ON(!(nd->flags & LOOKUP_RCU)); /* - * Get a reference to the parent first: we're - * going to make "path_put(nd->path)" valid in - * non-RCU context for "terminate_walk()". - * - * If this doesn't work, return immediately with - * RCU walking still active (and then we will do - * the RCU walk cleanup in terminate_walk()). + * After legitimizing the bastards, terminate_walk() + * will do the right thing for non-RCU mode, and all our + * subsequent exit cases should rcu_read_unlock() + * before returning. Do vfsmount first; if dentry + * can't be legitimized, just set nd->path.dentry to NULL + * and rely on dput(NULL) being a no-op. */ - if (!lockref_get_not_dead(&parent->d_lockref)) + if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) return -ECHILD; - - /* - * After the mntget(), we terminate_walk() will do - * the right thing for non-RCU mode, and all our - * subsequent exit cases should unlock_rcu_walk() - * before returning. - */ - mntget(nd->path.mnt); nd->flags &= ~LOOKUP_RCU; + if (!lockref_get_not_dead(&parent->d_lockref)) { + nd->path.dentry = NULL; + unlock_rcu_walk(); + return -ECHILD; + } + /* * For a negative lookup, the lookup sequence point is the parents * sequence point, and it only needs to revalidate the parent dentry. @@ -608,16 +603,21 @@ static int complete_walk(struct nameidata *nd) if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; + if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) { + unlock_rcu_walk(); + return -ECHILD; + } if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) { unlock_rcu_walk(); + mntput(nd->path.mnt); return -ECHILD; } if (read_seqcount_retry(&dentry->d_seq, nd->seq)) { unlock_rcu_walk(); dput(dentry); + mntput(nd->path.mnt); return -ECHILD; } - mntget(nd->path.mnt); unlock_rcu_walk(); } @@ -909,15 +909,15 @@ int follow_up(struct path *path) struct mount *parent; struct dentry *mountpoint; - br_read_lock(&vfsmount_lock); + read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); @@ -1048,8 +1048,8 @@ static int follow_managed(struct path *path, unsigned flags) /* Something is mounted on this dentry in another * namespace and/or whatever was mounted there in this - * namespace got unmounted before we managed to get the - * vfsmount_lock */ + * namespace got unmounted before lookup_mnt() could + * get it */ } /* Handle an automount point */ @@ -1864,6 +1864,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (flags & LOOKUP_RCU) { lock_rcu_walk(); nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + nd->m_seq = read_seqbegin(&mount_lock); } else { path_get(&nd->path); } @@ -1872,6 +1873,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->root.mnt = NULL; + nd->m_seq = read_seqbegin(&mount_lock); if (*name=='/') { if (flags & LOOKUP_RCU) { lock_rcu_walk(); diff --git a/fs/namespace.c b/fs/namespace.c index 500202ce10db..ac2ce8a766e1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -53,7 +53,7 @@ EXPORT_SYMBOL_GPL(fs_kobj); * It should be taken for write in all cases where the vfsmount * tree or hash is modified or when a vfsmount structure is modified. */ -DEFINE_BRLOCK(vfsmount_lock); +__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) { @@ -547,16 +547,38 @@ static void free_vfsmnt(struct mount *mnt) kmem_cache_free(mnt_cache, mnt); } +/* call under rcu_read_lock */ +bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +{ + struct mount *mnt; + if (read_seqretry(&mount_lock, seq)) + return false; + if (bastard == NULL) + return true; + mnt = real_mount(bastard); + mnt_add_count(mnt, 1); + if (likely(!read_seqretry(&mount_lock, seq))) + return true; + if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { + mnt_add_count(mnt, -1); + return false; + } + rcu_read_unlock(); + mntput(bastard); + rcu_read_lock(); + return false; +} + /* * find the first mount at @dentry on vfsmount @mnt. - * vfsmount_lock must be held for read or write. + * call under rcu_read_lock() */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { struct list_head *head = mount_hashtable + hash(mnt, dentry); struct mount *p; - list_for_each_entry(p, head, mnt_hash) + list_for_each_entry_rcu(p, head, mnt_hash) if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) return p; return NULL; @@ -564,7 +586,7 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) /* * find the last mount at @dentry on vfsmount @mnt. - * vfsmount_lock must be held for read or write. + * mount_lock must be held. */ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) { @@ -596,17 +618,17 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) struct vfsmount *lookup_mnt(struct path *path) { struct mount *child_mnt; + struct vfsmount *m; + unsigned seq; - br_read_lock(&vfsmount_lock); - child_mnt = __lookup_mnt(path->mnt, path->dentry); - if (child_mnt) { - mnt_add_count(child_mnt, 1); - br_read_unlock(&vfsmount_lock); - return &child_mnt->mnt; - } else { - br_read_unlock(&vfsmount_lock); - return NULL; - } + rcu_read_lock(); + do { + seq = read_seqbegin(&mount_lock); + child_mnt = __lookup_mnt(path->mnt, path->dentry); + m = child_mnt ? &child_mnt->mnt : NULL; + } while (!legitimize_mnt(m, seq)); + rcu_read_unlock(); + return m; } static struct mountpoint *new_mountpoint(struct dentry *dentry) @@ -874,38 +896,46 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return ERR_PTR(err); } +static void delayed_free(struct rcu_head *head) +{ + struct mount *mnt = container_of(head, struct mount, mnt_rcu); + kfree(mnt->mnt_devname); +#ifdef CONFIG_SMP + free_percpu(mnt->mnt_pcp); +#endif + kmem_cache_free(mnt_cache, mnt); +} + static void mntput_no_expire(struct mount *mnt) { put_again: -#ifdef CONFIG_SMP - br_read_lock(&vfsmount_lock); - if (likely(mnt->mnt_ns)) { - /* shouldn't be the last one */ - mnt_add_count(mnt, -1); - br_read_unlock(&vfsmount_lock); + rcu_read_lock(); + mnt_add_count(mnt, -1); + if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ + rcu_read_unlock(); return; } - br_read_unlock(&vfsmount_lock); - lock_mount_hash(); - mnt_add_count(mnt, -1); if (mnt_get_count(mnt)) { + rcu_read_unlock(); unlock_mount_hash(); return; } -#else - mnt_add_count(mnt, -1); - if (likely(mnt_get_count(mnt))) - return; - lock_mount_hash(); -#endif if (unlikely(mnt->mnt_pinned)) { mnt_add_count(mnt, mnt->mnt_pinned + 1); mnt->mnt_pinned = 0; + rcu_read_unlock(); unlock_mount_hash(); acct_auto_close_mnt(&mnt->mnt); goto put_again; } + if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { + rcu_read_unlock(); + unlock_mount_hash(); + return; + } + mnt->mnt.mnt_flags |= MNT_DOOMED; + rcu_read_unlock(); list_del(&mnt->mnt_instance); unlock_mount_hash(); @@ -924,7 +954,8 @@ put_again: fsnotify_vfsmount_delete(&mnt->mnt); dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); - free_vfsmnt(mnt); + mnt_free_id(mnt); + call_rcu(&mnt->mnt_rcu, delayed_free); } void mntput(struct vfsmount *mnt) @@ -1137,6 +1168,8 @@ static void namespace_unlock(void) list_splice_init(&unmounted, &head); up_write(&namespace_sem); + synchronize_rcu(); + while (!list_empty(&head)) { mnt = list_first_entry(&head, struct mount, mnt_hash); list_del_init(&mnt->mnt_hash); @@ -1152,10 +1185,13 @@ static inline void namespace_lock(void) } /* - * vfsmount lock must be held for write + * mount_lock must be held * namespace_sem must be held for write + * how = 0 => just this tree, don't propagate + * how = 1 => propagate; we know that nobody else has reference to any victims + * how = 2 => lazy umount */ -void umount_tree(struct mount *mnt, int propagate) +void umount_tree(struct mount *mnt, int how) { LIST_HEAD(tmp_list); struct mount *p; @@ -1163,7 +1199,7 @@ void umount_tree(struct mount *mnt, int propagate) for (p = mnt; p; p = next_mnt(p, mnt)) list_move(&p->mnt_hash, &tmp_list); - if (propagate) + if (how) propagate_umount(&tmp_list); list_for_each_entry(p, &tmp_list, mnt_hash) { @@ -1171,6 +1207,8 @@ void umount_tree(struct mount *mnt, int propagate) list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; + if (how < 2) + p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; list_del_init(&p->mnt_child); if (mnt_has_parent(p)) { put_mountpoint(p->mnt_mp); @@ -1262,14 +1300,18 @@ static int do_umount(struct mount *mnt, int flags) lock_mount_hash(); event++; - if (!(flags & MNT_DETACH)) - shrink_submounts(mnt); - - retval = -EBUSY; - if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { + if (flags & MNT_DETACH) { if (!list_empty(&mnt->mnt_list)) - umount_tree(mnt, 1); + umount_tree(mnt, 2); retval = 0; + } else { + shrink_submounts(mnt); + retval = -EBUSY; + if (!propagate_mount_busy(mnt, 2)) { + if (!list_empty(&mnt->mnt_list)) + umount_tree(mnt, 1); + retval = 0; + } } unlock_mount_hash(); namespace_unlock(); @@ -1955,7 +1997,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) struct mount *parent; int err; - mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); + mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT); mp = lock_mount(path); if (IS_ERR(mp)) @@ -2172,7 +2214,7 @@ resume: * process a list of expirable mountpoints with the intent of discarding any * submounts of a specific parent mountpoint * - * vfsmount_lock must be held for write + * mount_lock must be held for write */ static void shrink_submounts(struct mount *mnt) { @@ -2558,7 +2600,7 @@ out_type: /* * Return true if path is reachable from root * - * namespace_sem or vfsmount_lock is held + * namespace_sem or mount_lock is held */ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, const struct path *root) @@ -2573,9 +2615,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, int path_is_under(struct path *path1, struct path *path2) { int res; - br_read_lock(&vfsmount_lock); + read_seqlock_excl(&mount_lock); res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); return res; } EXPORT_SYMBOL(path_is_under); @@ -2748,8 +2790,6 @@ void __init mnt_init(void) for (u = 0; u < HASH_SIZE; u++) INIT_LIST_HEAD(&mountpoint_hashtable[u]); - br_lock_init(&vfsmount_lock); - err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", @@ -2788,9 +2828,8 @@ void kern_unmount(struct vfsmount *mnt) { /* release long term mount so mount point can be released */ if (!IS_ERR_OR_NULL(mnt)) { - lock_mount_hash(); real_mount(mnt)->mnt_ns = NULL; - unlock_mount_hash(); + synchronize_rcu(); /* yecchhh... */ mntput(mnt); } } diff --git a/include/linux/mount.h b/include/linux/mount.h index 38cd98f112a0..371d346fa270 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -49,6 +49,8 @@ struct mnt_namespace; #define MNT_LOCK_READONLY 0x400000 #define MNT_LOCKED 0x800000 +#define MNT_DOOMED 0x1000000 +#define MNT_SYNC_UMOUNT 0x2000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ diff --git a/include/linux/namei.h b/include/linux/namei.h index 8e47bc7a1665..492de72560fa 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -16,7 +16,7 @@ struct nameidata { struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags; - unsigned seq; + unsigned seq, m_seq; int last_type; unsigned depth; char *saved_names[MAX_NESTED_LINKS + 1]; -- cgit v1.2.3 From 8b61e74ffc6310e1d35a9b51c8463093851f8bcf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 Nov 2013 12:45:01 -0500 Subject: get rid of {lock,unlock}_rcu_walk() those have become aliases for rcu_read_{lock,unlock}() Signed-off-by: Al Viro --- fs/namei.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index cb0ebae07e52..e5c0118ba9f8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -482,16 +482,6 @@ EXPORT_SYMBOL(path_put); * to restart the path walk from the beginning in ref-walk mode. */ -static inline void lock_rcu_walk(void) -{ - rcu_read_lock(); -} - -static inline void unlock_rcu_walk(void) -{ - rcu_read_unlock(); -} - /** * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data @@ -523,7 +513,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) if (!lockref_get_not_dead(&parent->d_lockref)) { nd->path.dentry = NULL; - unlock_rcu_walk(); + rcu_read_unlock(); return -ECHILD; } @@ -561,17 +551,17 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) spin_unlock(&fs->lock); } - unlock_rcu_walk(); + rcu_read_unlock(); return 0; unlock_and_drop_dentry: spin_unlock(&fs->lock); drop_dentry: - unlock_rcu_walk(); + rcu_read_unlock(); dput(dentry); goto drop_root_mnt; out: - unlock_rcu_walk(); + rcu_read_unlock(); drop_root_mnt: if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; @@ -604,21 +594,21 @@ static int complete_walk(struct nameidata *nd) nd->root.mnt = NULL; if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) { - unlock_rcu_walk(); + rcu_read_unlock(); return -ECHILD; } if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) { - unlock_rcu_walk(); + rcu_read_unlock(); mntput(nd->path.mnt); return -ECHILD; } if (read_seqcount_retry(&dentry->d_seq, nd->seq)) { - unlock_rcu_walk(); + rcu_read_unlock(); dput(dentry); mntput(nd->path.mnt); return -ECHILD; } - unlock_rcu_walk(); + rcu_read_unlock(); } if (likely(!(nd->flags & LOOKUP_JUMPED))) @@ -1174,7 +1164,7 @@ failed: nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - unlock_rcu_walk(); + rcu_read_unlock(); return -ECHILD; } @@ -1501,7 +1491,7 @@ static void terminate_walk(struct nameidata *nd) nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - unlock_rcu_walk(); + rcu_read_unlock(); } } @@ -1862,7 +1852,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { - lock_rcu_walk(); + rcu_read_lock(); nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); nd->m_seq = read_seqbegin(&mount_lock); } else { @@ -1876,7 +1866,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->m_seq = read_seqbegin(&mount_lock); if (*name=='/') { if (flags & LOOKUP_RCU) { - lock_rcu_walk(); + rcu_read_lock(); set_root_rcu(nd); } else { set_root(nd); @@ -1888,7 +1878,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct fs_struct *fs = current->fs; unsigned seq; - lock_rcu_walk(); + rcu_read_lock(); do { seq = read_seqcount_begin(&fs->seq); @@ -1920,7 +1910,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (f.need_put) *fp = f.file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - lock_rcu_walk(); + rcu_read_lock(); } else { path_get(&nd->path); fdput(f); -- cgit v1.2.3 From b18825a7c8e37a7cf6abb97a12a6ad71af160de7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Sep 2013 19:22:53 +0100 Subject: VFS: Put a small type field into struct dentry::d_flags Put a type field into struct dentry::d_flags to indicate if the dentry is one of the following types that relate particularly to pathwalk: Miss (negative dentry) Directory "Automount" directory (defective - no i_op->lookup()) Symlink Other (regular, socket, fifo, device) The type field is set to one of the first five types on a dentry by calls to __d_instantiate() and d_obtain_alias() from information in the inode (if one is given). The type is cleared by dentry_unlink_inode() when it reconstitutes an existing dentry as a negative dentry. Accessors provided are: d_set_type(dentry, type) d_is_directory(dentry) d_is_autodir(dentry) d_is_symlink(dentry) d_is_file(dentry) d_is_negative(dentry) d_is_positive(dentry) A bunch of checks in pathname resolution switched to those. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/dcache.c | 42 +++++++++++++++++--- fs/namei.c | 95 ++++++++++++++++++--------------------------- include/linux/dcache.h | 103 +++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 158 insertions(+), 82 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/dcache.c b/fs/dcache.c index fb7bcf3ba5d6..525770e576db 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -343,6 +343,7 @@ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; + __d_clear_type(dentry); dentry->d_inode = NULL; hlist_del_init(&dentry->d_alias); dentry_rcuwalk_barrier(dentry); @@ -1648,14 +1649,42 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) } EXPORT_SYMBOL(d_set_d_op); +static unsigned d_flags_for_inode(struct inode *inode) +{ + unsigned add_flags = DCACHE_FILE_TYPE; + + if (!inode) + return DCACHE_MISS_TYPE; + + if (S_ISDIR(inode->i_mode)) { + add_flags = DCACHE_DIRECTORY_TYPE; + if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { + if (unlikely(!inode->i_op->lookup)) + add_flags = DCACHE_AUTODIR_TYPE; + else + inode->i_opflags |= IOP_LOOKUP; + } + } else if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { + if (unlikely(inode->i_op->follow_link)) + add_flags = DCACHE_SYMLINK_TYPE; + else + inode->i_opflags |= IOP_NOFOLLOW; + } + + if (unlikely(IS_AUTOMOUNT(inode))) + add_flags |= DCACHE_NEED_AUTOMOUNT; + return add_flags; +} + static void __d_instantiate(struct dentry *dentry, struct inode *inode) { + unsigned add_flags = d_flags_for_inode(inode); + spin_lock(&dentry->d_lock); - if (inode) { - if (unlikely(IS_AUTOMOUNT(inode))) - dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; + dentry->d_flags &= ~DCACHE_ENTRY_TYPE; + dentry->d_flags |= add_flags; + if (inode) hlist_add_head(&dentry->d_alias, &inode->i_dentry); - } dentry->d_inode = inode; dentry_rcuwalk_barrier(dentry); spin_unlock(&dentry->d_lock); @@ -1860,6 +1889,7 @@ struct dentry *d_obtain_alias(struct inode *inode) static const struct qstr anonstring = QSTR_INIT("/", 1); struct dentry *tmp; struct dentry *res; + unsigned add_flags; if (!inode) return ERR_PTR(-ESTALE); @@ -1885,9 +1915,11 @@ struct dentry *d_obtain_alias(struct inode *inode) } /* attach a disconnected dentry */ + add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED; + spin_lock(&tmp->d_lock); tmp->d_inode = inode; - tmp->d_flags |= DCACHE_DISCONNECTED; + tmp->d_flags |= add_flags; hlist_add_head(&tmp->d_alias, &inode->i_dentry); hlist_bl_lock(&tmp->d_sb->s_anon); hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon); diff --git a/fs/namei.c b/fs/namei.c index e5c0118ba9f8..e1fa43346c61 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1501,18 +1501,9 @@ static void terminate_walk(struct nameidata *nd) * so we keep a cache of "no, this doesn't need follow_link" * for the common case. */ -static inline int should_follow_link(struct inode *inode, int follow) +static inline int should_follow_link(struct dentry *dentry, int follow) { - if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { - if (likely(inode->i_op->follow_link)) - return follow; - - /* This gets set once for the inode lifetime */ - spin_lock(&inode->i_lock); - inode->i_opflags |= IOP_NOFOLLOW; - spin_unlock(&inode->i_lock); - } - return 0; + return unlikely(d_is_symlink(dentry)) ? follow : 0; } static inline int walk_component(struct nameidata *nd, struct path *path, @@ -1542,7 +1533,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path, if (!inode) goto out_path_put; - if (should_follow_link(inode, follow)) { + if (should_follow_link(path->dentry, follow)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { err = -ECHILD; @@ -1600,26 +1591,6 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) return res; } -/* - * We really don't want to look at inode->i_op->lookup - * when we don't have to. So we keep a cache bit in - * the inode ->i_opflags field that says "yes, we can - * do lookup on this inode". - */ -static inline int can_lookup(struct inode *inode) -{ - if (likely(inode->i_opflags & IOP_LOOKUP)) - return 1; - if (likely(!inode->i_op->lookup)) - return 0; - - /* We do this once for the lifetime of the inode */ - spin_lock(&inode->i_lock); - inode->i_opflags |= IOP_LOOKUP; - spin_unlock(&inode->i_lock); - return 1; -} - /* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: @@ -1823,7 +1794,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (err) return err; } - if (!can_lookup(nd->inode)) { + if (!d_is_directory(nd->path.dentry)) { err = -ENOTDIR; break; } @@ -1841,9 +1812,10 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; if (flags & LOOKUP_ROOT) { - struct inode *inode = nd->root.dentry->d_inode; + struct dentry *root = nd->root.dentry; + struct inode *inode = root->d_inode; if (*name) { - if (!can_lookup(inode)) + if (!d_is_directory(root)) return -ENOTDIR; retval = inode_permission(inode, MAY_EXEC); if (retval) @@ -1899,7 +1871,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, dentry = f.file->f_path.dentry; if (*name) { - if (!can_lookup(dentry->d_inode)) { + if (!d_is_directory(dentry)) { fdput(f); return -ENOTDIR; } @@ -1981,7 +1953,7 @@ static int path_lookupat(int dfd, const char *name, err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) { - if (!can_lookup(nd->inode)) { + if (!d_is_directory(nd->path.dentry)) { path_put(&nd->path); err = -ENOTDIR; } @@ -2273,7 +2245,7 @@ done: } path->dentry = dentry; path->mnt = mntget(nd->path.mnt); - if (should_follow_link(dentry->d_inode, nd->flags & LOOKUP_FOLLOW)) + if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW)) return 1; follow_mount(path); error = 0; @@ -2417,12 +2389,14 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) * 10. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ -static int may_delete(struct inode *dir,struct dentry *victim,int isdir) +static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) { + struct inode *inode = victim->d_inode; int error; - if (!victim->d_inode) + if (d_is_negative(victim)) return -ENOENT; + BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); @@ -2432,15 +2406,16 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)|| - IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + + if (check_sticky(dir, inode) || IS_APPEND(inode) || + IS_IMMUTABLE(inode) || IS_SWAPFILE(inode)) return -EPERM; if (isdir) { - if (!S_ISDIR(victim->d_inode->i_mode)) + if (!d_is_directory(victim) && !d_is_autodir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; - } else if (S_ISDIR(victim->d_inode->i_mode)) + } else if (d_is_directory(victim) || d_is_autodir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; @@ -2974,7 +2949,7 @@ retry_lookup: /* * create/update audit record if it already exists. */ - if (path->dentry->d_inode) + if (d_is_positive(path->dentry)) audit_inode(name, path->dentry, 0); /* @@ -3003,12 +2978,12 @@ retry_lookup: finish_lookup: /* we _can_ be in RCU mode here */ error = -ENOENT; - if (!inode) { + if (d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } - if (should_follow_link(inode, !symlink_ok)) { + if (should_follow_link(path->dentry, !symlink_ok)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { error = -ECHILD; @@ -3037,10 +3012,11 @@ finish_open: } audit_inode(name, nd->path.dentry, 0); error = -EISDIR; - if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) + if ((open_flag & O_CREAT) && + (d_is_directory(nd->path.dentry) || d_is_autodir(nd->path.dentry))) goto out; error = -ENOTDIR; - if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode)) + if ((nd->flags & LOOKUP_DIRECTORY) && !d_is_directory(nd->path.dentry)) goto out; if (!S_ISREG(nd->inode->i_mode)) will_truncate = false; @@ -3266,7 +3242,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, nd.root.mnt = mnt; nd.root.dentry = dentry; - if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) + if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU); @@ -3316,8 +3292,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname, goto unlock; error = -EEXIST; - if (dentry->d_inode) + if (d_is_positive(dentry)) goto fail; + /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - @@ -3706,7 +3683,7 @@ retry: if (nd.last.name[nd.last.len]) goto slashes; inode = dentry->d_inode; - if (!inode) + if (d_is_negative(dentry)) goto slashes; ihold(inode); error = security_path_unlink(&nd.path, dentry); @@ -3731,8 +3708,12 @@ exit1: return error; slashes: - error = !dentry->d_inode ? -ENOENT : - S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR; + if (d_is_negative(dentry)) + error = -ENOENT; + else if (d_is_directory(dentry) || d_is_autodir(dentry)) + error = -EISDIR; + else + error = -ENOTDIR; goto exit2; } @@ -4046,7 +4027,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { int error; - int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); + int is_dir = d_is_directory(old_dentry) || d_is_autodir(old_dentry); const unsigned char *old_name; if (old_dentry->d_inode == new_dentry->d_inode) @@ -4134,10 +4115,10 @@ retry: goto exit3; /* source must exist */ error = -ENOENT; - if (!old_dentry->d_inode) + if (d_is_negative(old_dentry)) goto exit4; /* unless the source is a directory trailing slashes give -ENOTDIR */ - if (!S_ISDIR(old_dentry->d_inode->i_mode)) { + if (!d_is_directory(old_dentry) && !d_is_autodir(old_dentry)) { error = -ENOTDIR; if (oldnd.last.name[oldnd.last.len]) goto exit4; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 716c3760ee39..57e87e749a48 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -169,13 +169,13 @@ struct dentry_operations { */ /* d_flags entries */ -#define DCACHE_OP_HASH 0x0001 -#define DCACHE_OP_COMPARE 0x0002 -#define DCACHE_OP_REVALIDATE 0x0004 -#define DCACHE_OP_DELETE 0x0008 -#define DCACHE_OP_PRUNE 0x0010 +#define DCACHE_OP_HASH 0x00000001 +#define DCACHE_OP_COMPARE 0x00000002 +#define DCACHE_OP_REVALIDATE 0x00000004 +#define DCACHE_OP_DELETE 0x00000008 +#define DCACHE_OP_PRUNE 0x00000010 -#define DCACHE_DISCONNECTED 0x0020 +#define DCACHE_DISCONNECTED 0x00000020 /* This dentry is possibly not currently connected to the dcache tree, in * which case its parent will either be itself, or will have this flag as * well. nfsd will not use a dentry with this bit set, but will first @@ -186,30 +186,38 @@ struct dentry_operations { * dentry into place and return that dentry rather than the passed one, * typically using d_splice_alias. */ -#define DCACHE_REFERENCED 0x0040 /* Recently used, don't discard. */ -#define DCACHE_RCUACCESS 0x0080 /* Entry has ever been RCU-visible */ +#define DCACHE_REFERENCED 0x00000040 /* Recently used, don't discard. */ +#define DCACHE_RCUACCESS 0x00000080 /* Entry has ever been RCU-visible */ -#define DCACHE_CANT_MOUNT 0x0100 -#define DCACHE_GENOCIDE 0x0200 -#define DCACHE_SHRINK_LIST 0x0400 +#define DCACHE_CANT_MOUNT 0x00000100 +#define DCACHE_GENOCIDE 0x00000200 +#define DCACHE_SHRINK_LIST 0x00000400 -#define DCACHE_OP_WEAK_REVALIDATE 0x0800 +#define DCACHE_OP_WEAK_REVALIDATE 0x00000800 -#define DCACHE_NFSFS_RENAMED 0x1000 +#define DCACHE_NFSFS_RENAMED 0x00001000 /* this dentry has been "silly renamed" and has to be deleted on the last * dput() */ -#define DCACHE_COOKIE 0x2000 /* For use by dcookie subsystem */ -#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x4000 +#define DCACHE_COOKIE 0x00002000 /* For use by dcookie subsystem */ +#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x00004000 /* Parent inode is watched by some fsnotify listener */ -#define DCACHE_MOUNTED 0x10000 /* is a mountpoint */ -#define DCACHE_NEED_AUTOMOUNT 0x20000 /* handle automount on this dir */ -#define DCACHE_MANAGE_TRANSIT 0x40000 /* manage transit from this dirent */ +#define DCACHE_DENTRY_KILLED 0x00008000 + +#define DCACHE_MOUNTED 0x00010000 /* is a mountpoint */ +#define DCACHE_NEED_AUTOMOUNT 0x00020000 /* handle automount on this dir */ +#define DCACHE_MANAGE_TRANSIT 0x00040000 /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) -#define DCACHE_LRU_LIST 0x80000 -#define DCACHE_DENTRY_KILLED 0x100000 +#define DCACHE_LRU_LIST 0x00080000 + +#define DCACHE_ENTRY_TYPE 0x00700000 +#define DCACHE_MISS_TYPE 0x00000000 /* Negative dentry */ +#define DCACHE_DIRECTORY_TYPE 0x00100000 /* Normal directory */ +#define DCACHE_AUTODIR_TYPE 0x00200000 /* Lookupless directory (presumed automount) */ +#define DCACHE_SYMLINK_TYPE 0x00300000 /* Symlink */ +#define DCACHE_FILE_TYPE 0x00400000 /* Other file type */ extern seqlock_t rename_lock; @@ -394,6 +402,61 @@ static inline bool d_mountpoint(const struct dentry *dentry) return dentry->d_flags & DCACHE_MOUNTED; } +/* + * Directory cache entry type accessor functions. + */ +static inline void __d_set_type(struct dentry *dentry, unsigned type) +{ + dentry->d_flags = (dentry->d_flags & ~DCACHE_ENTRY_TYPE) | type; +} + +static inline void __d_clear_type(struct dentry *dentry) +{ + __d_set_type(dentry, DCACHE_MISS_TYPE); +} + +static inline void d_set_type(struct dentry *dentry, unsigned type) +{ + spin_lock(&dentry->d_lock); + __d_set_type(dentry, type); + spin_unlock(&dentry->d_lock); +} + +static inline unsigned __d_entry_type(const struct dentry *dentry) +{ + return dentry->d_flags & DCACHE_ENTRY_TYPE; +} + +static inline bool d_is_directory(const struct dentry *dentry) +{ + return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE; +} + +static inline bool d_is_autodir(const struct dentry *dentry) +{ + return __d_entry_type(dentry) == DCACHE_AUTODIR_TYPE; +} + +static inline bool d_is_symlink(const struct dentry *dentry) +{ + return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE; +} + +static inline bool d_is_file(const struct dentry *dentry) +{ + return __d_entry_type(dentry) == DCACHE_FILE_TYPE; +} + +static inline bool d_is_negative(const struct dentry *dentry) +{ + return __d_entry_type(dentry) == DCACHE_MISS_TYPE; +} + +static inline bool d_is_positive(const struct dentry *dentry) +{ + return !d_is_negative(dentry); +} + extern int sysctl_vfs_cache_pressure; static inline unsigned long vfs_pressure_ratio(unsigned long val) -- cgit v1.2.3 From 13a2c3be03973d61b6cb89ff870e758c86327bb7 Mon Sep 17 00:00:00 2001 From: J. Bruce Fields Date: Wed, 23 Oct 2013 16:09:16 -0400 Subject: dcache: fix outdated DCACHE_NEED_LOOKUP comment The DCACHE_NEED_LOOKUP case referred to here was removed with 39e3c9553f34381a1b664c27b0c696a266a5735e "vfs: remove DCACHE_NEED_LOOKUP". There are only four real_lookup() callers and all of them pass in an unhashed dentry just returned from d_alloc. Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index e1fa43346c61..2a5a7aa9f43f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1298,8 +1298,8 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, } /* - * Call i_op->lookup on the dentry. The dentry must be negative but may be - * hashed if it was pouplated with DCACHE_NEED_LOOKUP. + * Call i_op->lookup on the dentry. The dentry must be negative and + * unhashed. * * dir->d_inode->i_mutex must be held */ -- cgit v1.2.3 From 6cedba8962f440c72447f811d0d530a8a9dc637a Mon Sep 17 00:00:00 2001 From: J. Bruce Fields Date: Mon, 5 Mar 2012 11:40:41 -0500 Subject: vfs: take i_mutex on renamed file A read delegation is used by NFSv4 as a guarantee that a client can perform local read opens without informing the server. The open operation takes the last component of the pathname as an argument, thus is also a lookup operation, and giving the client the above guarantee means informing the client before we allow anything that would change the set of names pointing to the inode. Therefore, we need to break delegations on rename, link, and unlink. We also need to prevent new delegations from being acquired while one of these operations is in progress. We could add some completely new locking for that purpose, but it's simpler to use the i_mutex, since that's already taken by all the operations we care about. The single exception is rename. So, modify rename to take the i_mutex on the file that is being renamed. Also fix up lockdep and Documentation/filesystems/directory-locking to reflect the change. Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- Documentation/filesystems/directory-locking | 31 ++++++++++++++++++++--------- fs/namei.c | 10 +++++----- 2 files changed, 27 insertions(+), 14 deletions(-) (limited to 'fs/namei.c') diff --git a/Documentation/filesystems/directory-locking b/Documentation/filesystems/directory-locking index ff7b611abf33..09bbf9a54f80 100644 --- a/Documentation/filesystems/directory-locking +++ b/Documentation/filesystems/directory-locking @@ -2,6 +2,10 @@ kinds of locks - per-inode (->i_mutex) and per-filesystem (->s_vfs_rename_mutex). + When taking the i_mutex on multiple non-directory objects, we +always acquire the locks in order by increasing address. We'll call +that "inode pointer" order in the following. + For our purposes all operations fall in 5 classes: 1) read access. Locking rules: caller locks directory we are accessing. @@ -12,8 +16,9 @@ kinds of locks - per-inode (->i_mutex) and per-filesystem locks victim and calls the method. 4) rename() that is _not_ cross-directory. Locking rules: caller locks -the parent, finds source and target, if target already exists - locks it -and then calls the method. +the parent and finds source and target. If target already exists, lock +it. If source is a non-directory, lock it. If that means we need to +lock both, lock them in inode pointer order. 5) link creation. Locking rules: * lock parent @@ -30,7 +35,9 @@ rules: fail with -ENOTEMPTY * if new parent is equal to or is a descendent of source fail with -ELOOP - * if target exists - lock it. + * If target exists, lock it. If source is a non-directory, lock + it. In case that means we need to lock both source and target, + do so in inode pointer order. * call the method. @@ -56,9 +63,11 @@ objects - A < B iff A is an ancestor of B. renames will be blocked on filesystem lock and we don't start changing the order until we had acquired all locks). -(3) any operation holds at most one lock on non-directory object and - that lock is acquired after all other locks. (Proof: see descriptions - of operations). +(3) locks on non-directory objects are acquired only after locks on + directory objects, and are acquired in inode pointer order. + (Proof: all operations but renames take lock on at most one + non-directory object, except renames, which take locks on source and + target in inode pointer order in the case they are not directories.) Now consider the minimal deadlock. Each process is blocked on attempt to acquire some lock and already holds at least one lock. Let's @@ -66,9 +75,13 @@ consider the set of contended locks. First of all, filesystem lock is not contended, since any process blocked on it is not holding any locks. Thus all processes are blocked on ->i_mutex. - Non-directory objects are not contended due to (3). Thus link -creation can't be a part of deadlock - it can't be blocked on source -and it means that it doesn't hold any locks. + By (3), any process holding a non-directory lock can only be +waiting on another non-directory lock with a larger address. Therefore +the process holding the "largest" such lock can always make progress, and +non-directory objects are not included in the set of contended locks. + + Thus link creation can't be a part of deadlock - it can't be +blocked on source and it means that it doesn't hold any locks. Any contended object is either held by cross-directory rename or has a child that is also contended. Indeed, suppose that it is held by diff --git a/fs/namei.c b/fs/namei.c index 2a5a7aa9f43f..88cec0330bf7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3918,7 +3918,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * That's where 4.4 screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. - * c) we have to lock _three_ objects - parents and victim (if it exists). + * c) we have to lock _four_ objects - parents and victim (if it exists), + * and source (if it is not a directory). * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change @@ -3994,6 +3995,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode *target = new_dentry->d_inode; + struct inode *source = old_dentry->d_inode; int error; error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); @@ -4001,8 +4003,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, return error; dget(new_dentry); - if (target) - mutex_lock(&target->i_mutex); + lock_two_nondirectories(source, target); error = -EBUSY; if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) @@ -4017,8 +4018,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) d_move(old_dentry, new_dentry); out: - if (target) - mutex_unlock(&target->i_mutex); + unlock_two_nondirectories(source, target); dput(new_dentry); return error; } -- cgit v1.2.3 From 9accbb977ab78234b8f298df5f306ed08d06bedb Mon Sep 17 00:00:00 2001 From: J. Bruce Fields Date: Tue, 28 Aug 2012 07:03:24 -0400 Subject: namei: minor vfs_unlink cleanup We'll be using dentry->d_inode in one more place. Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/namei.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 88cec0330bf7..e633a58d4222 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3617,6 +3617,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) int vfs_unlink(struct inode *dir, struct dentry *dentry) { + struct inode *target = dentry->d_inode; int error = may_delete(dir, dentry, 0); if (error) @@ -3625,7 +3626,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (!dir->i_op->unlink) return -EPERM; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&target->i_mutex); if (d_mountpoint(dentry)) error = -EBUSY; else { @@ -3636,11 +3637,11 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) dont_mount(dentry); } } - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&target->i_mutex); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { - fsnotify_link_count(dentry->d_inode); + fsnotify_link_count(target); d_delete(dentry); } -- cgit v1.2.3 From b21996e36c8e3b92a84e972378bde80b43acd890 Mon Sep 17 00:00:00 2001 From: J. Bruce Fields Date: Tue, 20 Sep 2011 09:14:34 -0400 Subject: locks: break delegations on unlink We need to break delegations on any operation that changes the set of links pointing to an inode. Start with unlink. Such operations also hold the i_mutex on a parent directory. Breaking a delegation may require waiting for a timeout (by default 90 seconds) in the case of a unresponsive NFS client. To avoid blocking all directory operations, we therefore drop locks before waiting for the delegation. The logic then looks like: acquire locks ... test for delegation; if found: take reference on inode release locks wait for delegation break drop reference on inode retry It is possible this could never terminate. (Even if we take precautions to prevent another delegation being acquired on the same inode, we could get a different inode on each retry.) But this seems very unlikely. The initial test for a delegation happens after the lock on the target inode is acquired, but the directory inode may have been acquired further up the call stack. We therefore add a "struct inode **" argument to any intervening functions, which we use to pass the inode back up to the caller in the case it needs a delegation synchronously broken. Cc: David Howells Cc: Tyler Hicks Cc: Dustin Kirkland Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- drivers/base/devtmpfs.c | 2 +- fs/cachefiles/namei.c | 2 +- fs/ecryptfs/inode.c | 4 ++-- fs/namei.c | 42 +++++++++++++++++++++++++++++++++++++++--- fs/nfsd/vfs.c | 2 +- include/linux/fs.h | 2 +- ipc/mqueue.c | 2 +- 7 files changed, 46 insertions(+), 10 deletions(-) (limited to 'fs/namei.c') diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 7413d065906b..1b8490e2fbde 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -324,7 +324,7 @@ static int handle_remove(const char *nodename, struct device *dev) mutex_lock(&dentry->d_inode->i_mutex); notify_change(dentry, &newattrs); mutex_unlock(&dentry->d_inode->i_mutex); - err = vfs_unlink(parent.dentry->d_inode, dentry); + err = vfs_unlink(parent.dentry->d_inode, dentry, NULL); if (!err || err == -ENOENT) deleted = 1; } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index f4a08d7fa2f7..31d480c0e046 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -294,7 +294,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, if (ret < 0) { cachefiles_io_error(cache, "Unlink security error"); } else { - ret = vfs_unlink(dir->d_inode, rep); + ret = vfs_unlink(dir->d_inode, rep, NULL); if (preemptive) cachefiles_mark_object_buried(cache, rep); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 0f9b66eaa767..dc60b8bd09ec 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -153,7 +153,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, dget(lower_dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_unlink(lower_dir_inode, lower_dentry); + rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL); if (rc) { printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); goto out_unlock; @@ -208,7 +208,7 @@ ecryptfs_do_create(struct inode *directory_inode, inode = __ecryptfs_get_inode(lower_dentry->d_inode, directory_inode->i_sb); if (IS_ERR(inode)) { - vfs_unlink(lower_dir_dentry->d_inode, lower_dentry); + vfs_unlink(lower_dir_dentry->d_inode, lower_dentry, NULL); goto out_lock; } fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); diff --git a/fs/namei.c b/fs/namei.c index e633a58d4222..67ce331a3ed8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3615,7 +3615,25 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) return do_rmdir(AT_FDCWD, pathname); } -int vfs_unlink(struct inode *dir, struct dentry *dentry) +/** + * vfs_unlink - unlink a filesystem object + * @dir: parent directory + * @dentry: victim + * @delegated_inode: returns victim inode, if the inode is delegated. + * + * The caller must hold dir->i_mutex. + * + * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and + * return a reference to the inode in delegated_inode. The caller + * should then break the delegation on that inode and retry. Because + * breaking a delegation may take a long time, the caller should drop + * dir->i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ +int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; int error = may_delete(dir, dentry, 0); @@ -3632,11 +3650,20 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) else { error = security_inode_unlink(dir, dentry); if (!error) { + error = break_deleg(target, O_WRONLY|O_NONBLOCK); + if (error) { + if (error == -EWOULDBLOCK && delegated_inode) { + *delegated_inode = target; + ihold(target); + } + goto out; + } error = dir->i_op->unlink(dir, dentry); if (!error) dont_mount(dentry); } } +out: mutex_unlock(&target->i_mutex); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ @@ -3661,6 +3688,7 @@ static long do_unlinkat(int dfd, const char __user *pathname) struct dentry *dentry; struct nameidata nd; struct inode *inode = NULL; + struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: name = user_path_parent(dfd, pathname, &nd, lookup_flags); @@ -3675,7 +3703,7 @@ retry: error = mnt_want_write(nd.path.mnt); if (error) goto exit1; - +retry_deleg: mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); error = PTR_ERR(dentry); @@ -3690,13 +3718,21 @@ retry: error = security_path_unlink(&nd.path, dentry); if (error) goto exit2; - error = vfs_unlink(nd.path.dentry->d_inode, dentry); + error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode); exit2: dput(dentry); } mutex_unlock(&nd.path.dentry->d_inode->i_mutex); if (inode) iput(inode); /* truncate the inode here */ + inode = NULL; + if (delegated_inode) { + error = break_deleg(delegated_inode, O_WRONLY); + iput(delegated_inode); + delegated_inode = NULL; + if (!error) + goto retry_deleg; + } mnt_drop_write(nd.path.mnt); exit1: path_put(&nd.path); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 13886f7f40d5..7a810235d599 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1910,7 +1910,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (host_err) goto out_put; if (type != S_IFDIR) - host_err = vfs_unlink(dirp, rdentry); + host_err = vfs_unlink(dirp, rdentry, NULL); else host_err = vfs_rmdir(dirp, rdentry); if (!host_err) diff --git a/include/linux/fs.h b/include/linux/fs.h index 8e4be1be1a62..a5799233142a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1455,7 +1455,7 @@ extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); extern int vfs_symlink(struct inode *, struct dentry *, const char *); extern int vfs_link(struct dentry *, struct inode *, struct dentry *); extern int vfs_rmdir(struct inode *, struct dentry *); -extern int vfs_unlink(struct inode *, struct dentry *); +extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); /* diff --git a/ipc/mqueue.c b/ipc/mqueue.c index ae1996d3c539..95827ce2f3c7 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -886,7 +886,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) err = -ENOENT; } else { ihold(inode); - err = vfs_unlink(dentry->d_parent->d_inode, dentry); + err = vfs_unlink(dentry->d_parent->d_inode, dentry, NULL); } dput(dentry); -- cgit v1.2.3 From 5a14696c1795d3843673b5cf1982d0e5357a5bbf Mon Sep 17 00:00:00 2001 From: J. Bruce Fields Date: Tue, 28 Aug 2012 07:50:40 -0700 Subject: locks: helper functions for delegation breaking We'll need the same logic for rename and link. Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/namei.c | 13 +++---------- include/linux/fs.h | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 12 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 67ce331a3ed8..cfaeaae0f2db 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3650,14 +3650,9 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate else { error = security_inode_unlink(dir, dentry); if (!error) { - error = break_deleg(target, O_WRONLY|O_NONBLOCK); - if (error) { - if (error == -EWOULDBLOCK && delegated_inode) { - *delegated_inode = target; - ihold(target); - } + error = try_break_deleg(target, delegated_inode); + if (error) goto out; - } error = dir->i_op->unlink(dir, dentry); if (!error) dont_mount(dentry); @@ -3727,9 +3722,7 @@ exit2: iput(inode); /* truncate the inode here */ inode = NULL; if (delegated_inode) { - error = break_deleg(delegated_inode, O_WRONLY); - iput(delegated_inode); - delegated_inode = NULL; + error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } diff --git a/include/linux/fs.h b/include/linux/fs.h index a5799233142a..931f919f44e1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1905,6 +1905,9 @@ extern bool fs_fully_visible(struct file_system_type *); extern int current_umask(void); +extern void ihold(struct inode * inode); +extern void iput(struct inode *); + /* /sys/fs */ extern struct kobject *fs_kobj; @@ -1972,6 +1975,28 @@ static inline int break_deleg(struct inode *inode, unsigned int mode) return 0; } +static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +{ + int ret; + + ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); + if (ret == -EWOULDBLOCK && delegated_inode) { + *delegated_inode = inode; + ihold(inode); + } + return ret; +} + +static inline int break_deleg_wait(struct inode **delegated_inode) +{ + int ret; + + ret = break_deleg(*delegated_inode, O_WRONLY); + iput(*delegated_inode); + *delegated_inode = NULL; + return ret; +} + #else /* !CONFIG_FILE_LOCKING */ static inline int locks_mandatory_locked(struct inode *inode) { @@ -2015,6 +2040,18 @@ static inline int break_deleg(struct inode *inode, unsigned int mode) { return 0; } + +static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +{ + return 0; +} + +static inline int break_deleg_wait(struct inode **delegated_inode) +{ + BUG(); + return 0; +} + #endif /* CONFIG_FILE_LOCKING */ /* fs/open.c */ @@ -2350,8 +2387,6 @@ extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence); extern int inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); extern void address_space_init_once(struct address_space *mapping); -extern void ihold(struct inode * inode); -extern void iput(struct inode *); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); -- cgit v1.2.3 From 8e6d782cab50884ba94324632700e6233a252f6a Mon Sep 17 00:00:00 2001 From: J. Bruce Fields Date: Tue, 20 Sep 2011 16:59:58 -0400 Subject: locks: break delegations on rename Cc: David Howells Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- .../lustre/lustre/include/linux/lustre_compat25.h | 4 +- drivers/staging/lustre/lustre/lvfs/lvfs_linux.c | 2 +- fs/cachefiles/namei.c | 2 +- fs/ecryptfs/inode.c | 3 +- fs/namei.c | 47 ++++++++++++++++++++-- fs/nfsd/vfs.c | 2 +- include/linux/fs.h | 2 +- 7 files changed, 51 insertions(+), 11 deletions(-) (limited to 'fs/namei.c') diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h index 9243dfab43d3..80b019bf969c 100644 --- a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h +++ b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h @@ -105,8 +105,8 @@ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, #define ll_vfs_unlink(inode,entry,mnt) vfs_unlink(inode,entry) #define ll_vfs_mknod(dir,entry,mnt,mode,dev) vfs_mknod(dir,entry,mode,dev) #define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry) -#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \ - vfs_rename(old,old_dir,new,new_dir) +#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1,delegated_inode) \ + vfs_rename(old,old_dir,new,new_dir,delegated_inode) #define cfs_bio_io_error(a,b) bio_io_error((a)) #define cfs_bio_endio(a,b,c) bio_endio((a),(c)) diff --git a/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c index 18e1b47a1d65..4ed7c9f0a8be 100644 --- a/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c +++ b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c @@ -220,7 +220,7 @@ int lustre_rename(struct dentry *dir, struct vfsmount *mnt, GOTO(put_old, err = PTR_ERR(dchild_new)); err = ll_vfs_rename(dir->d_inode, dchild_old, mnt, - dir->d_inode, dchild_new, mnt); + dir->d_inode, dchild_new, mnt, NULL); dput(dchild_new); put_old: diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 31d480c0e046..ca65f39dc8dc 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -396,7 +396,7 @@ try_again: cachefiles_io_error(cache, "Rename security error %d", ret); } else { ret = vfs_rename(dir->d_inode, rep, - cache->graveyard->d_inode, grave); + cache->graveyard->d_inode, grave, NULL); if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index dc60b8bd09ec..c23b01bb7e04 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -640,7 +640,8 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_lock; } rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, - lower_new_dir_dentry->d_inode, lower_new_dentry); + lower_new_dir_dentry->d_inode, lower_new_dentry, + NULL); if (rc) goto out_lock; if (target_inode) diff --git a/fs/namei.c b/fs/namei.c index cfaeaae0f2db..ce7e580e4e14 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4022,7 +4022,8 @@ out: } static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + struct inode **delegated_inode) { struct inode *target = new_dentry->d_inode; struct inode *source = old_dentry->d_inode; @@ -4039,6 +4040,14 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) goto out; + error = try_break_deleg(source, delegated_inode); + if (error) + goto out; + if (target) { + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; + } error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); if (error) goto out; @@ -4053,8 +4062,30 @@ out: return error; } +/** + * vfs_rename - rename a filesystem object + * @old_dir: parent of source + * @old_dentry: source + * @new_dir: parent of destination + * @new_dentry: destination + * @delegated_inode: returns an inode needing a delegation break + * + * The caller must hold multiple mutexes--see lock_rename()). + * + * If vfs_rename discovers a delegation in need of breaking at either + * the source or destination, it will return -EWOULDBLOCK and return a + * reference to the inode in delegated_inode. The caller should then + * break the delegation and retry. Because breaking a delegation may + * take a long time, the caller should drop all locks before doing + * so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + struct inode **delegated_inode) { int error; int is_dir = d_is_directory(old_dentry) || d_is_autodir(old_dentry); @@ -4082,7 +4113,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (is_dir) error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); else - error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); + error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,delegated_inode); if (!error) fsnotify_move(old_dir, new_dir, old_name, is_dir, new_dentry->d_inode, old_dentry); @@ -4098,6 +4129,7 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, struct dentry *old_dentry, *new_dentry; struct dentry *trap; struct nameidata oldnd, newnd; + struct inode *delegated_inode = NULL; struct filename *from; struct filename *to; unsigned int lookup_flags = 0; @@ -4137,6 +4169,7 @@ retry: newnd.flags &= ~LOOKUP_PARENT; newnd.flags |= LOOKUP_RENAME_TARGET; +retry_deleg: trap = lock_rename(new_dir, old_dir); old_dentry = lookup_hash(&oldnd); @@ -4173,13 +4206,19 @@ retry: if (error) goto exit5; error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); + new_dir->d_inode, new_dentry, + &delegated_inode); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_dir, old_dir); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } mnt_drop_write(oldnd.path.mnt); exit2: if (retry_estale(error, lookup_flags)) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7a810235d599..45bf0295894d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1837,7 +1837,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (host_err) goto out_dput_new; } - host_err = vfs_rename(fdir, odentry, tdir, ndentry); + host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); if (!host_err) { host_err = commit_metadata(tfhp); if (!host_err) diff --git a/include/linux/fs.h b/include/linux/fs.h index 931f919f44e1..5bcff883fa90 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1456,7 +1456,7 @@ extern int vfs_symlink(struct inode *, struct dentry *, const char *); extern int vfs_link(struct dentry *, struct inode *, struct dentry *); extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); -extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); +extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **); /* * VFS dentry helper functions. -- cgit v1.2.3 From 146a8595c6399ee6ab4b5cc34c0d28aa4835fdc5 Mon Sep 17 00:00:00 2001 From: J. Bruce Fields Date: Tue, 20 Sep 2011 17:14:31 -0400 Subject: locks: break delegations on link Cc: Tyler Hicks Cc: Dustin Kirkland Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/ecryptfs/inode.c | 2 +- fs/namei.c | 36 ++++++++++++++++++++++++++++++++---- fs/nfsd/vfs.c | 2 +- include/linux/fs.h | 2 +- 4 files changed, 35 insertions(+), 7 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index c23b01bb7e04..1c628f023041 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -475,7 +475,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, dget(lower_new_dentry); lower_dir_dentry = lock_parent(lower_new_dentry); rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode, - lower_new_dentry); + lower_new_dentry, NULL); if (rc || !lower_new_dentry->d_inode) goto out_lock; rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); diff --git a/fs/namei.c b/fs/namei.c index ce7e580e4e14..251178a1e383 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3819,7 +3819,26 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn return sys_symlinkat(oldname, AT_FDCWD, newname); } -int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) +/** + * vfs_link - create a new link + * @old_dentry: object to be linked + * @dir: new parent + * @new_dentry: where to create the new link + * @delegated_inode: returns inode needing a delegation break + * + * The caller must hold dir->i_mutex + * + * If vfs_link discovers a delegation on the to-be-linked file in need + * of breaking, it will return -EWOULDBLOCK and return a reference to the + * inode in delegated_inode. The caller should then break the delegation + * and retry. Because breaking a delegation may take a long time, the + * caller should drop the i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ +int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; @@ -3855,8 +3874,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; - else - error = dir->i_op->link(old_dentry, dir, new_dentry); + else { + error = try_break_deleg(inode, delegated_inode); + if (!error) + error = dir->i_op->link(old_dentry, dir, new_dentry); + } if (!error && (inode->i_state & I_LINKABLE)) { spin_lock(&inode->i_lock); @@ -3883,6 +3905,7 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, { struct dentry *new_dentry; struct path old_path, new_path; + struct inode *delegated_inode = NULL; int how = 0; int error; @@ -3921,9 +3944,14 @@ retry: error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); + error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, how)) { how |= LOOKUP_REVAL; goto retry; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 45bf0295894d..27ba21b5f383 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1736,7 +1736,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserrno(host_err); goto out_dput; } - host_err = vfs_link(dold, dirp, dnew); + host_err = vfs_link(dold, dirp, dnew, NULL); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); if (!err) diff --git a/include/linux/fs.h b/include/linux/fs.h index 5bcff883fa90..6e36e7118ec1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1453,7 +1453,7 @@ extern int vfs_create(struct inode *, struct dentry *, umode_t, bool); extern int vfs_mkdir(struct inode *, struct dentry *, umode_t); extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); extern int vfs_symlink(struct inode *, struct dentry *, const char *); -extern int vfs_link(struct dentry *, struct inode *, struct dentry *); +extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **); extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **); -- cgit v1.2.3