From 1cce1eea0aff51201753fcaca421df825b0813b6 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov
Date: Wed, 14 Dec 2016 15:56:33 +0200
Subject: inotify: Convert to using per-namespace limits

This patchset converts inotify to using the newly introduced
per-userns sysctl infrastructure.

Currently the inotify instances/watches are being accounted in the
user_struct structure. This means that in setups where multiple
users in unprivileged containers map to the same underlying
real user (i.e. pointing to the same user_struct) the inotify limits
are going to be shared as well, allowing one user(or application) to exhaust
all others limits.

Fix this by switching the inotify sysctls to using the
per-namespace/per-user limits. This will allow the server admin to
set sensible global limits, which can further be tuned inside every
individual user namespace. Additionally, in order to preserve the
sysctl ABI make the existing inotify instances/watches sysctls
modify the values of the initial user namespace.

Signed-off-by: Nikolay Borisov <n.borisov.lkml@gmail.com>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/fsnotify_backend.h | 3 ++-
 include/linux/sched.h            | 4 ----
 include/linux/user_namespace.h   | 4 ++++
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 0cf34d6cc253..c8f2738113f4 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -16,6 +16,7 @@
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <linux/atomic.h>
+#include <linux/user_namespace.h>
 
 /*
  * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
@@ -170,7 +171,7 @@ struct fsnotify_group {
 		struct inotify_group_private_data {
 			spinlock_t	idr_lock;
 			struct idr      idr;
-			struct user_struct      *user;
+			struct ucounts *ucounts;
 		} inotify_data;
 #endif
 #ifdef CONFIG_FANOTIFY
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4d1905245c7a..d2334229167f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -868,10 +868,6 @@ struct user_struct {
 	atomic_t __count;	/* reference count */
 	atomic_t processes;	/* How many processes does this user have? */
 	atomic_t sigpending;	/* How many pending signals does this user have? */
-#ifdef CONFIG_INOTIFY_USER
-	atomic_t inotify_watches; /* How many inotify watches does this user have? */
-	atomic_t inotify_devs;	/* How many inotify devs does this user have opened? */
-#endif
 #ifdef CONFIG_FANOTIFY
 	atomic_t fanotify_listeners;
 #endif
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index eb209d4523f5..363e0e8082a9 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -32,6 +32,10 @@ enum ucount_type {
 	UCOUNT_NET_NAMESPACES,
 	UCOUNT_MNT_NAMESPACES,
 	UCOUNT_CGROUP_NAMESPACES,
+#ifdef CONFIG_INOTIFY_USER
+	UCOUNT_INOTIFY_INSTANCES,
+	UCOUNT_INOTIFY_WATCHES,
+#endif
 	UCOUNT_COUNTS,
 };
 
-- 
cgit v1.2.3


From 9227dd2a84a765fcfef1677ff17de0958b192eda Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Mon, 23 Jan 2017 17:26:31 +1300
Subject: exec: Remove LSM_UNSAFE_PTRACE_CAP

With previous changes every location that tests for
LSM_UNSAFE_PTRACE_CAP also tests for LSM_UNSAFE_PTRACE making the
LSM_UNSAFE_PTRACE_CAP redundant, so remove it.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/exec.c                  | 8 ++------
 include/linux/security.h   | 3 +--
 security/apparmor/domain.c | 2 +-
 security/commoncap.c       | 2 +-
 security/selinux/hooks.c   | 3 +--
 security/smack/smack_lsm.c | 2 +-
 6 files changed, 7 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index e57946610733..c195ebb8e2aa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1426,12 +1426,8 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	struct task_struct *p = current, *t;
 	unsigned n_fs;
 
-	if (p->ptrace) {
-		if (ptracer_capable(p, current_user_ns()))
-			bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
-		else
-			bprm->unsafe |= LSM_UNSAFE_PTRACE;
-	}
+	if (p->ptrace)
+		bprm->unsafe |= LSM_UNSAFE_PTRACE;
 
 	/*
 	 * This isn't strictly necessary, but it makes it harder for LSMs to
diff --git a/include/linux/security.h b/include/linux/security.h
index c2125e9093e8..9d9ee90f1f35 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -140,8 +140,7 @@ struct request_sock;
 /* bprm->unsafe reasons */
 #define LSM_UNSAFE_SHARE	1
 #define LSM_UNSAFE_PTRACE	2
-#define LSM_UNSAFE_PTRACE_CAP	4
-#define LSM_UNSAFE_NO_NEW_PRIVS	8
+#define LSM_UNSAFE_NO_NEW_PRIVS	4
 
 #ifdef CONFIG_MMU
 extern int mmap_min_addr_handler(struct ctl_table *table, int write,
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index a4d90aa1045a..04185b7fd38a 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -469,7 +469,7 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
 		;
 	}
 
-	if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
+	if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
 		error = may_change_ptraced_domain(new_profile);
 		if (error)
 			goto audit;
diff --git a/security/commoncap.c b/security/commoncap.c
index cbb203c91406..8ec6b7fe909e 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -548,7 +548,7 @@ skip:
 
 	if ((is_setid ||
 	     !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
-	    ((bprm->unsafe & ~(LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) ||
+	    ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
 	     !ptracer_capable(current, new->user_ns))) {
 		/* downgrade; they get no more than they had, and maybe less */
 		if (!ns_capable(new->user_ns, CAP_SETUID) ||
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c7c6619431d5..cece6fe55f02 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2404,8 +2404,7 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
 
 		/* Make sure that anyone attempting to ptrace over a task that
 		 * changes its SID has the appropriate permit */
-		if (bprm->unsafe &
-		    (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
+		if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
 			u32 ptsid = ptrace_parent_sid(current);
 			if (ptsid != 0) {
 				rc = avc_has_perm(ptsid, new_tsec->sid,
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 94dc9d406ce3..bc2ff09f1494 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -934,7 +934,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm)
 	    isp->smk_task != sbsp->smk_root)
 		return 0;
 
-	if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
+	if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
 		struct task_struct *tracer;
 		rc = 0;
 
-- 
cgit v1.2.3


From 93faccbbfa958a9668d3ab4e30f38dd205cee8d8 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Wed, 1 Feb 2017 06:06:16 +1300
Subject: fs: Better permission checking for submounts

To support unprivileged users mounting filesystems two permission
checks have to be performed: a test to see if the user allowed to
create a mount in the mount namespace, and a test to see if
the user is allowed to access the specified filesystem.

The automount case is special in that mounting the original filesystem
grants permission to mount the sub-filesystems, to any user who
happens to stumble across the their mountpoint and satisfies the
ordinary filesystem permission checks.

Attempting to handle the automount case by using override_creds
almost works.  It preserves the idea that permission to mount
the original filesystem is permission to mount the sub-filesystem.
Unfortunately using override_creds messes up the filesystems
ordinary permission checks.

Solve this by being explicit that a mount is a submount by introducing
vfs_submount, and using it where appropriate.

vfs_submount uses a new mount internal mount flags MS_SUBMOUNT, to let
sget and friends know that a mount is a submount so they can take appropriate
action.

sget and sget_userns are modified to not perform any permission checks
on submounts.

follow_automount is modified to stop using override_creds as that
has proven problemantic.

do_mount is modified to always remove the new MS_SUBMOUNT flag so
that we know userspace will never by able to specify it.

autofs4 is modified to stop using current_real_cred that was put in
there to handle the previous version of submount permission checking.

cifs is modified to pass the mountpoint all of the way down to vfs_submount.

debugfs is modified to pass the mountpoint all of the way down to
trace_automount by adding a new parameter.  To make this change easier
a new typedef debugfs_automount_t is introduced to capture the type of
the debugfs automount function.

Cc: stable@vger.kernel.org
Fixes: 069d5ac9ae0d ("autofs:  Fix automounts by using current_real_cred()->uid")
Fixes: aeaa4a79ff6a ("fs: Call d_automount with the filesystems creds")
Reviewed-by: Trond Myklebust <trond.myklebust@primarydata.com>
Reviewed-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/afs/mntpt.c          |  2 +-
 fs/autofs4/waitq.c      |  4 ++--
 fs/cifs/cifs_dfs_ref.c  |  7 ++++---
 fs/debugfs/inode.c      |  8 ++++----
 fs/namei.c              |  3 ---
 fs/namespace.c          | 17 ++++++++++++++++-
 fs/nfs/namespace.c      |  2 +-
 fs/nfs/nfs4namespace.c  |  2 +-
 fs/super.c              | 13 ++++++++++---
 include/linux/debugfs.h |  3 ++-
 include/linux/mount.h   |  3 +++
 include/uapi/linux/fs.h |  1 +
 kernel/trace/trace.c    |  4 ++--
 13 files changed, 47 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 81dd075356b9..d4fb0afc0097 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 
 	/* try and do the mount */
 	_debug("--- attempting mount %s -o %s ---", devname, options);
-	mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
+	mnt = vfs_submount(mntpt, &afs_fs_type, devname, options);
 	_debug("--- mount result %p ---", mnt);
 
 	free_page((unsigned long) devname);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 1278335ce366..79fbd85db4ba 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -436,8 +436,8 @@ int autofs4_wait(struct autofs_sb_info *sbi,
 		memcpy(&wq->name, &qstr, sizeof(struct qstr));
 		wq->dev = autofs4_get_dev(sbi);
 		wq->ino = autofs4_get_ino(sbi);
-		wq->uid = current_real_cred()->uid;
-		wq->gid = current_real_cred()->gid;
+		wq->uid = current_cred()->uid;
+		wq->gid = current_cred()->gid;
 		wq->pid = pid;
 		wq->tgid = tgid;
 		wq->status = -EINTR; /* Status return if interrupted */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ec9dbbcca3b9..9156be545b0f 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -245,7 +245,8 @@ compose_mount_options_err:
  * @fullpath:		full path in UNC format
  * @ref:		server's referral
  */
-static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
+static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
+		struct cifs_sb_info *cifs_sb,
 		const char *fullpath, const struct dfs_info3_param *ref)
 {
 	struct vfsmount *mnt;
@@ -259,7 +260,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
 	if (IS_ERR(mountdata))
 		return (struct vfsmount *)mountdata;
 
-	mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata);
+	mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata);
 	kfree(mountdata);
 	kfree(devname);
 	return mnt;
@@ -334,7 +335,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 			mnt = ERR_PTR(-EINVAL);
 			break;
 		}
-		mnt = cifs_dfs_do_refmount(cifs_sb,
+		mnt = cifs_dfs_do_refmount(mntpt, cifs_sb,
 				full_path, referrals + i);
 		cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
 			 __func__, referrals[i].node_name, mnt);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index f17fcf89e18e..1e30f74a9527 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -187,9 +187,9 @@ static const struct super_operations debugfs_super_operations = {
 
 static struct vfsmount *debugfs_automount(struct path *path)
 {
-	struct vfsmount *(*f)(void *);
-	f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
-	return f(d_inode(path->dentry)->i_private);
+	debugfs_automount_t f;
+	f = (debugfs_automount_t)path->dentry->d_fsdata;
+	return f(path->dentry, d_inode(path->dentry)->i_private);
 }
 
 static const struct dentry_operations debugfs_dops = {
@@ -504,7 +504,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
  */
 struct dentry *debugfs_create_automount(const char *name,
 					struct dentry *parent,
-					struct vfsmount *(*f)(void *),
+					debugfs_automount_t f,
 					void *data)
 {
 	struct dentry *dentry = start_creating(name, parent);
diff --git a/fs/namei.c b/fs/namei.c
index 6fa3e9138fe4..da689c9c005e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1100,7 +1100,6 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 			    bool *need_mntput)
 {
 	struct vfsmount *mnt;
-	const struct cred *old_cred;
 	int err;
 
 	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
@@ -1129,9 +1128,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 	if (nd->total_link_count >= 40)
 		return -ELOOP;
 
-	old_cred = override_creds(&init_cred);
 	mnt = path->dentry->d_op->d_automount(path);
-	revert_creds(old_cred);
 	if (IS_ERR(mnt)) {
 		/*
 		 * The filesystem is allowed to return -EISDIR here to indicate
diff --git a/fs/namespace.c b/fs/namespace.c
index 487ba30bb5c6..089a6b23135a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -989,6 +989,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
 
+struct vfsmount *
+vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
+	     const char *name, void *data)
+{
+	/* Until it is worked out how to pass the user namespace
+	 * through from the parent mount to the submount don't support
+	 * unprivileged mounts with submounts.
+	 */
+	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
+		return ERR_PTR(-EPERM);
+
+	return vfs_kern_mount(type, MS_SUBMOUNT, name, data);
+}
+EXPORT_SYMBOL_GPL(vfs_submount);
+
 static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 					int flag)
 {
@@ -2794,7 +2809,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
 
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
-		   MS_STRICTATIME | MS_NOREMOTELOCK);
+		   MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
 
 	if (flags & MS_REMOUNT)
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 5551e8ef67fd..e49d831c4e85 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -226,7 +226,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
 					   const char *devname,
 					   struct nfs_clone_mount *mountdata)
 {
-	return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
+	return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata);
 }
 
 /**
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index d21104912676..d8b040bd9814 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 				mountdata->hostname,
 				mountdata->mnt_path);
 
-		mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata);
+		mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata);
 		if (!IS_ERR(mnt))
 			break;
 	}
diff --git a/fs/super.c b/fs/super.c
index 1709ed029a2c..4185844f7a12 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -469,7 +469,7 @@ struct super_block *sget_userns(struct file_system_type *type,
 	struct super_block *old;
 	int err;
 
-	if (!(flags & MS_KERNMOUNT) &&
+	if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) &&
 	    !(type->fs_flags & FS_USERNS_MOUNT) &&
 	    !capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -499,7 +499,7 @@ retry:
 	}
 	if (!s) {
 		spin_unlock(&sb_lock);
-		s = alloc_super(type, flags, user_ns);
+		s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns);
 		if (!s)
 			return ERR_PTR(-ENOMEM);
 		goto retry;
@@ -540,8 +540,15 @@ struct super_block *sget(struct file_system_type *type,
 {
 	struct user_namespace *user_ns = current_user_ns();
 
+	/* We don't yet pass the user namespace of the parent
+	 * mount through to here so always use &init_user_ns
+	 * until that changes.
+	 */
+	if (flags & MS_SUBMOUNT)
+		user_ns = &init_user_ns;
+
 	/* Ensure the requestor has permissions over the target filesystem */
-	if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+	if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
 	return sget_userns(type, test, set, flags, user_ns, data);
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 014cc564d1c4..233006be30aa 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -97,9 +97,10 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
 struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 				      const char *dest);
 
+typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *);
 struct dentry *debugfs_create_automount(const char *name,
 					struct dentry *parent,
-					struct vfsmount *(*f)(void *),
+					debugfs_automount_t f,
 					void *data);
 
 void debugfs_remove(struct dentry *dentry);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index c6f55158d5e5..8e0352af06b7 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -90,6 +90,9 @@ struct file_system_type;
 extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 				      int flags, const char *name,
 				      void *data);
+extern struct vfsmount *vfs_submount(const struct dentry *mountpoint,
+				     struct file_system_type *type,
+				     const char *name, void *data);
 
 extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
 extern void mark_mounts_for_expiry(struct list_head *mounts);
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 36da93fbf188..048a85e9f017 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -132,6 +132,7 @@ struct inodes_stat_t {
 #define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
 
 /* These sb flags are internal to the kernel */
+#define MS_SUBMOUNT     (1<<26)
 #define MS_NOREMOTELOCK	(1<<27)
 #define MS_NOSEC	(1<<28)
 #define MS_BORN		(1<<29)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d7449783987a..310f0ea0d1a2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7503,7 +7503,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	ftrace_init_tracefs(tr, d_tracer);
 }
 
-static struct vfsmount *trace_automount(void *ingore)
+static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
 {
 	struct vfsmount *mnt;
 	struct file_system_type *type;
@@ -7516,7 +7516,7 @@ static struct vfsmount *trace_automount(void *ingore)
 	type = get_fs_type("tracefs");
 	if (!type)
 		return NULL;
-	mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
+	mnt = vfs_submount(mntpt, type, "tracefs", NULL);
 	put_filesystem(type);
 	if (IS_ERR(mnt))
 		return NULL;
-- 
cgit v1.2.3


From 0f1b92cbdd0309afae0af1963e8cccddb3d2eaff Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Mon, 30 Jan 2017 18:06:11 +0300
Subject: introduce the walk_process_tree() helper

Add the new helper to walk the process tree, the next patch adds a user.
Note that it visits the group leaders only, proc_visitor can do
for_each_thread itself or we can trivially extend walk_process_tree() to
do this.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/sched.h |  3 +++
 kernel/fork.c         | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2334229167f..6261bfc12853 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3053,6 +3053,9 @@ extern bool current_is_single_threaded(void);
 #define for_each_process_thread(p, t)	\
 	for_each_process(p) for_each_thread(p, t)
 
+typedef int (*proc_visitor)(struct task_struct *p, void *data);
+void walk_process_tree(struct task_struct *top, proc_visitor, void *);
+
 static inline int get_nr_threads(struct task_struct *tsk)
 {
 	return tsk->signal->nr_threads;
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8ab827c..135b7a49ad59 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2053,6 +2053,38 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 }
 #endif
 
+void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
+{
+	struct task_struct *leader, *parent, *child;
+	int res;
+
+	read_lock(&tasklist_lock);
+	leader = top = top->group_leader;
+down:
+	for_each_thread(leader, parent) {
+		list_for_each_entry(child, &parent->children, sibling) {
+			res = visitor(child, data);
+			if (res) {
+				if (res < 0)
+					goto out;
+				leader = child;
+				goto down;
+			}
+up:
+			;
+		}
+	}
+
+	if (leader != top) {
+		child = leader;
+		parent = child->real_parent;
+		leader = parent->group_leader;
+		goto up;
+	}
+out:
+	read_unlock(&tasklist_lock);
+}
+
 #ifndef ARCH_MIN_MMSTRUCT_ALIGN
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
-- 
cgit v1.2.3


From d6cffbbe9a7e51eb705182965a189457c17ba8a3 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Fri, 10 Feb 2017 10:35:02 +0300
Subject: proc/sysctl: prune stale dentries during unregistering

Currently unregistering sysctl table does not prune its dentries.
Stale dentries could slowdown sysctl operations significantly.

For example, command:

 # for i in {1..100000} ; do unshare -n -- sysctl -a &> /dev/null ; done
 creates a millions of stale denties around sysctls of loopback interface:

 # sysctl fs.dentry-state
 fs.dentry-state = 25812579  24724135        45      0       0       0

 All of them have matching names thus lookup have to scan though whole
 hash chain and call d_compare (proc_sys_compare) which checks them
 under system-wide spinlock (sysctl_lock).

 # time sysctl -a > /dev/null
 real    1m12.806s
 user    0m0.016s
 sys     1m12.400s

Currently only memory reclaimer could remove this garbage.
But without significant memory pressure this never happens.

This patch collects sysctl inodes into list on sysctl table header and
prunes all their dentries once that table unregisters.

Konstantin Khlebnikov <khlebnikov@yandex-team.ru> writes:
> On 10.02.2017 10:47, Al Viro wrote:
>> how about >> the matching stats *after* that patch?
>
> dcache size doesn't grow endlessly, so stats are fine
>
> # sysctl fs.dentry-state
> fs.dentry-state = 92712	58376	45	0	0	0
>
> # time sysctl -a &>/dev/null
>
> real	0m0.013s
> user	0m0.004s
> sys	0m0.008s

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/inode.c        |  3 ++-
 fs/proc/internal.h     |  7 ++++--
 fs/proc/proc_sysctl.c  | 59 ++++++++++++++++++++++++++++++++++++--------------
 include/linux/sysctl.h |  1 +
 4 files changed, 51 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 842a5ff5b85c..7ad9ed7958af 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
 	de = PDE(inode);
 	if (de)
 		pde_put(de);
+
 	head = PROC_I(inode)->sysctl;
 	if (head) {
 		RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
-		sysctl_head_put(head);
+		proc_sys_evict_inode(inode, head);
 	}
 }
 
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index e2c3c461fa20..5d6960f5f1c0 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -65,6 +65,7 @@ struct proc_inode {
 	struct proc_dir_entry *pde;
 	struct ctl_table_header *sysctl;
 	struct ctl_table *sysctl_entry;
+	struct list_head sysctl_inodes;
 	const struct proc_ns_operations *ns_ops;
 	struct inode vfs_inode;
 };
@@ -237,10 +238,12 @@ extern void proc_thread_self_init(void);
  */
 #ifdef CONFIG_PROC_SYSCTL
 extern int proc_sys_init(void);
-extern void sysctl_head_put(struct ctl_table_header *);
+extern void proc_sys_evict_inode(struct inode *inode,
+				 struct ctl_table_header *head);
 #else
 static inline void proc_sys_init(void) { }
-static inline void sysctl_head_put(struct ctl_table_header *head) { }
+static inline void proc_sys_evict_inode(struct  inode *inode,
+					struct ctl_table_header *head) { }
 #endif
 
 /*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d4e37acd4821..8efb1e10b025 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
 	head->set = set;
 	head->parent = NULL;
 	head->node = node;
+	INIT_LIST_HEAD(&head->inodes);
 	if (node) {
 		struct ctl_table *entry;
 		for (entry = table; entry->procname; entry++, node++)
@@ -259,6 +260,29 @@ static void unuse_table(struct ctl_table_header *p)
 			complete(p->unregistering);
 }
 
+/* called under sysctl_lock */
+static void proc_sys_prune_dcache(struct ctl_table_header *head)
+{
+	struct inode *inode, *prev = NULL;
+	struct proc_inode *ei;
+
+	list_for_each_entry(ei, &head->inodes, sysctl_inodes) {
+		inode = igrab(&ei->vfs_inode);
+		if (inode) {
+			spin_unlock(&sysctl_lock);
+			iput(prev);
+			prev = inode;
+			d_prune_aliases(inode);
+			spin_lock(&sysctl_lock);
+		}
+	}
+	if (prev) {
+		spin_unlock(&sysctl_lock);
+		iput(prev);
+		spin_lock(&sysctl_lock);
+	}
+}
+
 /* called under sysctl_lock, will reacquire if has to wait */
 static void start_unregistering(struct ctl_table_header *p)
 {
@@ -277,6 +301,11 @@ static void start_unregistering(struct ctl_table_header *p)
 		/* anything non-NULL; we'll never dereference it */
 		p->unregistering = ERR_PTR(-EINVAL);
 	}
+	/*
+	 * Prune dentries for unregistered sysctls: namespaced sysctls
+	 * can have duplicate names and contaminate dcache very badly.
+	 */
+	proc_sys_prune_dcache(p);
 	/*
 	 * do not remove from the list until nobody holds it; walking the
 	 * list in do_sysctl() relies on that.
@@ -284,21 +313,6 @@ static void start_unregistering(struct ctl_table_header *p)
 	erase_header(p);
 }
 
-static void sysctl_head_get(struct ctl_table_header *head)
-{
-	spin_lock(&sysctl_lock);
-	head->count++;
-	spin_unlock(&sysctl_lock);
-}
-
-void sysctl_head_put(struct ctl_table_header *head)
-{
-	spin_lock(&sysctl_lock);
-	if (!--head->count)
-		kfree_rcu(head, rcu);
-	spin_unlock(&sysctl_lock);
-}
-
 static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
 {
 	BUG_ON(!head);
@@ -440,11 +454,15 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 
 	inode->i_ino = get_next_ino();
 
-	sysctl_head_get(head);
 	ei = PROC_I(inode);
 	ei->sysctl = head;
 	ei->sysctl_entry = table;
 
+	spin_lock(&sysctl_lock);
+	list_add(&ei->sysctl_inodes, &head->inodes);
+	head->count++;
+	spin_unlock(&sysctl_lock);
+
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 	inode->i_mode = table->mode;
 	if (!S_ISDIR(table->mode)) {
@@ -466,6 +484,15 @@ out:
 	return inode;
 }
 
+void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
+{
+	spin_lock(&sysctl_lock);
+	list_del(&PROC_I(inode)->sysctl_inodes);
+	if (!--head->count)
+		kfree_rcu(head, rcu);
+	spin_unlock(&sysctl_lock);
+}
+
 static struct ctl_table_header *grab_header(struct inode *inode)
 {
 	struct ctl_table_header *head = PROC_I(inode)->sysctl;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index adf4e51cf597..b7e82049fec7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -143,6 +143,7 @@ struct ctl_table_header
 	struct ctl_table_set *set;
 	struct ctl_dir *parent;
 	struct ctl_node *node;
+	struct list_head inodes; /* head for proc_inode->sysctl_inodes */
 };
 
 struct ctl_dir {
-- 
cgit v1.2.3