From c952cd4e949ab3d07287efc2e80246e03727d15d Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 10 Mar 2017 09:52:20 +0800 Subject: nfsd: map the ENOKEY to nfserr_perm for avoiding warning Now that Ext4 and f2fs filesystems support encrypted directories and files, attempts to access those files may return ENOKEY, resulting in the following WARNING. Map ENOKEY to nfserr_perm instead of nfserr_io. [ 1295.411759] ------------[ cut here ]------------ [ 1295.411787] WARNING: CPU: 0 PID: 12786 at fs/nfsd/nfsproc.c:796 nfserrno+0x74/0x80 [nfsd] [ 1295.411806] nfsd: non-standard errno: -126 [ 1295.411816] Modules linked in: nfsd nfs_acl auth_rpcgss nfsv4 nfs lockd fscache tun bridge stp llc fuse ip_set nfnetlink vmw_vsock_vmci_transport vsock snd_seq_midi snd_seq_midi_event coretemp crct10dif_pclmul crc32_generic crc32_pclmul snd_ens1371 gameport ghash_clmulni_intel snd_ac97_codec f2fs intel_rapl_perf ac97_bus snd_seq ppdev snd_pcm snd_rawmidi snd_timer vmw_balloon snd_seq_device snd joydev soundcore parport_pc parport nfit acpi_cpufreq tpm_tis vmw_vmci tpm_tis_core tpm shpchp i2c_piix4 grace sunrpc xfs libcrc32c vmwgfx drm_kms_helper ttm drm crc32c_intel e1000 mptspi scsi_transport_spi serio_raw mptscsih mptbase ata_generic pata_acpi fjes [last unloaded: nfs_acl] [ 1295.412522] CPU: 0 PID: 12786 Comm: nfsd Tainted: G W 4.11.0-rc1+ #521 [ 1295.412959] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015 [ 1295.413814] Call Trace: [ 1295.414252] dump_stack+0x63/0x86 [ 1295.414666] __warn+0xcb/0xf0 [ 1295.415087] warn_slowpath_fmt+0x5f/0x80 [ 1295.415502] ? put_filp+0x42/0x50 [ 1295.415927] nfserrno+0x74/0x80 [nfsd] [ 1295.416339] nfsd_open+0xd7/0x180 [nfsd] [ 1295.416746] nfs4_get_vfs_file+0x367/0x3c0 [nfsd] [ 1295.417182] ? security_inode_permission+0x41/0x60 [ 1295.417591] nfsd4_process_open2+0x9b2/0x1200 [nfsd] [ 1295.418007] nfsd4_open+0x481/0x790 [nfsd] [ 1295.418409] nfsd4_proc_compound+0x395/0x680 [nfsd] [ 1295.418812] nfsd_dispatch+0xb8/0x1f0 [nfsd] [ 1295.419233] svc_process_common+0x4d9/0x830 [sunrpc] [ 1295.419631] svc_process+0xfe/0x1b0 [sunrpc] [ 1295.420033] nfsd+0xe9/0x150 [nfsd] [ 1295.420420] kthread+0x101/0x140 [ 1295.420802] ? nfsd_destroy+0x60/0x60 [nfsd] [ 1295.421199] ? kthread_park+0x90/0x90 [ 1295.421598] ret_from_fork+0x2c/0x40 [ 1295.421996] ---[ end trace 0d5a969cd7852e1f ]--- Signed-off-by: Kinglong Mee Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsproc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index fa82b7707e85..03a7e9da4da0 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -786,6 +786,7 @@ nfserrno (int errno) { nfserr_serverfault, -ESERVERFAULT }, { nfserr_serverfault, -ENFILE }, { nfserr_io, -EUCLEAN }, + { nfserr_perm, -ENOKEY }, }; int i; -- cgit v1.2.3 From abcb4dacb098a1baca746406a8775e9930f47f3f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 10 Mar 2017 11:36:39 +1100 Subject: NFSD: further refinement of content of /proc/fs/nfsd/versions Prior to e35659f1b03c ("NFSD: correctly range-check v4.x minor version when setting versions.") v4.0 could not be disabled without disabling all NFSv4 protocols. So the 'versions' file contained ±4 ±4.1 ±4.2. Writing "-4" would disable all v4 completely. Writing +4 would enabled those minor versions that are currently enabled, either by default or otherwise. After that commit, it was possible to disable v4.0 independently. To maximize backward compatibility with use cases which never disabled v4.0, the "versions" file would never contain "+4.0" - that was implied by "+4", unless explicitly negated by "-4.0". This introduced an inconsistency in that it was possible to disable all minor versions, but still have the major version advertised. e.g. "-4.0 -4.1 -4.2 +4" would result in NFSv4 support being advertised, but all attempts to use it rejected. Commit d3635ff07e8c ("nfsd: fix configuration of supported minor versions") and following removed this inconsistency. If all minor version were disabled, the major would be disabled too. If any minor was enabled, the major would be disabled. This patch also treated "+4" as equivalent to "+4.0" and "-4" as "-4.0". A consequence of this is that writing "-4" would only disable 4.0. This is a regression against the earlier behaviour, in a use case that rpc.nfsd actually uses. The command "rpc.nfsd -N 4" will write "+2 +3 -4" to the versions files. Previously, that would disable v4 completely. Now it will only disable v4.0. Also "4.0" never appears in the "versions" file when read. So if only v4.1 is available, the previous kernel would have reported "+4 -4.0 +4.1 -4.2" the current kernel reports "-4 +4.1 -4.2" which could easily confuse. This patch restores the implication that "+4" and "-4" apply more globals and do not imply "4.0". Specifically: writing "-4" will disable all 4.x minor versions. writing "+4" will enable all 4.1 minor version if none are currently enabled. rpc.nfsd will list minor versions before major versions, so rpc.nfsd -V 4.2 -N 4.1 will write "-4.1 +4.2 +2 +3 +4" so it would be a regression for "+4" to enable always all versions. reading "-4" implies that no v4.x are enabled reading "+4" implies that some v4.x are enabled, and that v4.0 is enabled unless "-4.0" is also present. All other minor versions will explicitly be listed. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 73e75ac90525..8bf8f667a8cf 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -538,13 +538,21 @@ out_free: static ssize_t nfsd_print_version_support(char *buf, int remaining, const char *sep, - unsigned vers, unsigned minor) + unsigned vers, int minor) { - const char *format = (minor == 0) ? "%s%c%u" : "%s%c%u.%u"; + const char *format = minor < 0 ? "%s%c%u" : "%s%c%u.%u"; bool supported = !!nfsd_vers(vers, NFSD_TEST); - if (vers == 4 && !nfsd_minorversion(minor, NFSD_TEST)) + if (vers == 4 && minor >= 0 && + !nfsd_minorversion(minor, NFSD_TEST)) supported = false; + if (minor == 0 && supported) + /* + * special case for backward compatability. + * +4.0 is never reported, it is implied by + * +4, unless -4.0 is present. + */ + return 0; return snprintf(buf, remaining, format, sep, supported ? '+' : '-', vers, minor); } @@ -554,7 +562,6 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) char *mesg = buf; char *vers, *minorp, sign; int len, num, remaining; - unsigned minor; ssize_t tlen = 0; char *sep; struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); @@ -575,6 +582,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) if (len <= 0) return -EINVAL; do { enum vers_op cmd; + unsigned minor; sign = *vers; if (sign == '+' || sign == '-') num = simple_strtol((vers+1), &minorp, 0); @@ -585,8 +593,8 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) return -EINVAL; if (kstrtouint(minorp+1, 0, &minor) < 0) return -EINVAL; - } else - minor = 0; + } + cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET; switch(num) { case 2: @@ -594,8 +602,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) nfsd_vers(num, cmd); break; case 4: - if (nfsd_minorversion(minor, cmd) >= 0) - break; + if (*minorp == '.') { + if (nfsd_minorversion(minor, cmd) < 0) + return -EINVAL; + } else if ((cmd == NFSD_SET) != nfsd_vers(num, NFSD_TEST)) { + /* + * Either we have +4 and no minors are enabled, + * or we have -4 and at least one minor is enabled. + * In either case, propagate 'cmd' to all minors. + */ + minor = 0; + while (nfsd_minorversion(minor, cmd) >= 0) + minor++; + } + break; default: return -EINVAL; } @@ -612,9 +632,11 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) sep = ""; remaining = SIMPLE_TRANSACTION_LIMIT; for (num=2 ; num <= 4 ; num++) { + int minor; if (!nfsd_vers(num, NFSD_AVAIL)) continue; - minor = 0; + + minor = -1; do { len = nfsd_print_version_support(buf, remaining, sep, num, minor); @@ -624,7 +646,8 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) buf += len; tlen += len; minor++; - sep = " "; + if (len) + sep = " "; } while (num == 4 && minor <= NFSD_SUPPORTED_MINOR_VERSION); } out: -- cgit v1.2.3 From 928c6fb3a9bfd6c5b287aa3465226add551c13c0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 10 Mar 2017 11:36:39 +1100 Subject: NFSD: fix nfsd_minorversion(.., NFSD_AVAIL) Current code will return 1 if the version is supported, and -1 if it isn't. This is confusing and inconsistent with the one place where this is used. So change to return 1 if it is supported, and zero if not. i.e. an error is never returned. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 786a4a2cb2d7..892137b1e330 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -167,7 +167,8 @@ nfsd_adjust_nfsd_versions4(void) int nfsd_minorversion(u32 minorversion, enum vers_op change) { - if (minorversion > NFSD_SUPPORTED_MINOR_VERSION) + if (minorversion > NFSD_SUPPORTED_MINOR_VERSION && + change != NFSD_AVAIL) return -1; switch(change) { case NFSD_SET: -- cgit v1.2.3 From 800a938f0bf9130c8256116649c0cc5806bfb2fd Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 10 Mar 2017 11:36:39 +1100 Subject: NFSD: fix nfsd_reset_versions for NFSv4. If you write "-2 -3 -4" to the "versions" file, it will notice that no versions are enabled, and nfsd_reset_versions() is called. This enables all major versions, not no minor versions. So we lose the invariant that NFSv4 is only advertised when at least one minor is enabled. Fix the code to explicitly enable minor versions for v4, change it to use nfsd_vers() to test and set, and simplify the code. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 892137b1e330..31e1f9593457 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -416,23 +416,20 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) void nfsd_reset_versions(void) { - int found_one = 0; int i; - for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) { - if (nfsd_program.pg_vers[i]) - found_one = 1; - } + for (i = 0; i < NFSD_NRVERS; i++) + if (nfsd_vers(i, NFSD_TEST)) + return; - if (!found_one) { - for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) - nfsd_program.pg_vers[i] = nfsd_version[i]; -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) - for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) - nfsd_acl_program.pg_vers[i] = - nfsd_acl_version[i]; -#endif - } + for (i = 0; i < NFSD_NRVERS; i++) + if (i != 4) + nfsd_vers(i, NFSD_SET); + else { + int minor = 0; + while (nfsd_minorversion(minor, NFSD_SET) >= 0) + minor++; + } } /* -- cgit v1.2.3 From 1b53cf9815bb4744958d41f3795d5d5a1d365e2d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 21 Feb 2017 15:07:11 -0800 Subject: fscrypt: remove broken support for detecting keyring key revocation Filesystem encryption ostensibly supported revoking a keyring key that had been used to "unlock" encrypted files, causing those files to become "locked" again. This was, however, buggy for several reasons, the most severe of which was that when key revocation happened to be detected for an inode, its fscrypt_info was immediately freed, even while other threads could be using it for encryption or decryption concurrently. This could be exploited to crash the kernel or worse. This patch fixes the use-after-free by removing the code which detects the keyring key having been revoked, invalidated, or expired. Instead, an encrypted inode that is "unlocked" now simply remains unlocked until it is evicted from memory. Note that this is no worse than the case for block device-level encryption, e.g. dm-crypt, and it still remains possible for a privileged user to evict unused pages, inodes, and dentries by running 'sync; echo 3 > /proc/sys/vm/drop_caches', or by simply unmounting the filesystem. In fact, one of those actions was already needed anyway for key revocation to work even somewhat sanely. This change is not expected to break any applications. In the future I'd like to implement a real API for fscrypt key revocation that interacts sanely with ongoing filesystem operations --- waiting for existing operations to complete and blocking new operations, and invalidating and sanitizing key material and plaintext from the VFS caches. But this is a hard problem, and for now this bug must be fixed. This bug affected almost all versions of ext4, f2fs, and ubifs encryption, and it was potentially reachable in any kernel configured with encryption support (CONFIG_EXT4_ENCRYPTION=y, CONFIG_EXT4_FS_ENCRYPTION=y, CONFIG_F2FS_FS_ENCRYPTION=y, or CONFIG_UBIFS_FS_ENCRYPTION=y). Note that older kernels did not use the shared fs/crypto/ code, but due to the potential security implications of this bug, it may still be worthwhile to backport this fix to them. Fixes: b7236e21d55f ("ext4 crypto: reorganize how we store keys in the inode") Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Acked-by: Michael Halcrow --- fs/crypto/crypto.c | 10 +-------- fs/crypto/fname.c | 2 +- fs/crypto/fscrypt_private.h | 4 ---- fs/crypto/keyinfo.c | 52 ++++++++------------------------------------- 4 files changed, 11 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 02a7a9286449..6d6eca394d4d 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -327,7 +327,6 @@ EXPORT_SYMBOL(fscrypt_decrypt_page); static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *dir; - struct fscrypt_info *ci; int dir_has_key, cached_with_key; if (flags & LOOKUP_RCU) @@ -339,18 +338,11 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - ci = d_inode(dir)->i_crypt_info; - if (ci && ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD)))) - ci = NULL; - /* this should eventually be an flag in d_flags */ spin_lock(&dentry->d_lock); cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); - dir_has_key = (ci != NULL); + dir_has_key = (d_inode(dir)->i_crypt_info != NULL); dput(dir); /* diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 13052b85c393..37b49894c762 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -350,7 +350,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = fscrypt_get_crypt_info(dir); + ret = fscrypt_get_encryption_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index fdbb8af32eaf..e39696e64494 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -67,7 +67,6 @@ struct fscrypt_info { u8 ci_filename_mode; u8 ci_flags; struct crypto_skcipher *ci_ctfm; - struct key *ci_keyring_key; u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; }; @@ -101,7 +100,4 @@ extern int fscrypt_do_page_crypto(const struct inode *inode, extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); -/* keyinfo.c */ -extern int fscrypt_get_crypt_info(struct inode *); - #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 02eb6b9e4438..cb3e82abf034 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -95,6 +95,7 @@ static int validate_user_key(struct fscrypt_info *crypt_info, kfree(description); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); + down_read(&keyring_key->sem); if (keyring_key->type != &key_type_logon) { printk_once(KERN_WARNING @@ -102,11 +103,9 @@ static int validate_user_key(struct fscrypt_info *crypt_info, res = -ENOKEY; goto out; } - down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct fscrypt_key)) { res = -EINVAL; - up_read(&keyring_key->sem); goto out; } master_key = (struct fscrypt_key *)ukp->data; @@ -117,17 +116,11 @@ static int validate_user_key(struct fscrypt_info *crypt_info, "%s: key size incorrect: %d\n", __func__, master_key->size); res = -ENOKEY; - up_read(&keyring_key->sem); goto out; } res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; - - crypt_info->ci_keyring_key = keyring_key; - return 0; out: + up_read(&keyring_key->sem); key_put(keyring_key); return res; } @@ -169,12 +162,11 @@ static void put_crypt_info(struct fscrypt_info *ci) if (!ci) return; - key_put(ci->ci_keyring_key); crypto_free_skcipher(ci->ci_ctfm); kmem_cache_free(fscrypt_info_cachep, ci); } -int fscrypt_get_crypt_info(struct inode *inode) +int fscrypt_get_encryption_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -184,21 +176,15 @@ int fscrypt_get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; + if (inode->i_crypt_info) + return 0; + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; if (!inode->i_sb->s_cop->get_context) return -EOPNOTSUPP; -retry: - crypt_info = ACCESS_ONCE(inode->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - fscrypt_put_encryption_info(inode, crypt_info); - goto retry; - } res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { @@ -229,7 +215,6 @@ retry: crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); @@ -273,14 +258,8 @@ retry: if (res) goto out; - kzfree(raw_key); - raw_key = NULL; - if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { - put_crypt_info(crypt_info); - goto retry; - } - return 0; - + if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL) + crypt_info = NULL; out: if (res == -ENOKEY) res = 0; @@ -288,6 +267,7 @@ out: kzfree(raw_key); return res; } +EXPORT_SYMBOL(fscrypt_get_encryption_info); void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) { @@ -305,17 +285,3 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) put_crypt_info(ci); } EXPORT_SYMBOL(fscrypt_put_encryption_info); - -int fscrypt_get_encryption_info(struct inode *inode) -{ - struct fscrypt_info *ci = inode->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return fscrypt_get_crypt_info(inode); - return 0; -} -EXPORT_SYMBOL(fscrypt_get_encryption_info); -- cgit v1.2.3 From 94840e3c802daa1a62985957f36ac48faf8ceedd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 22 Feb 2017 13:25:14 -0800 Subject: fscrypt: eliminate ->prepare_context() operation The only use of the ->prepare_context() fscrypt operation was to allow ext4 to evict inline data from the inode before ->set_context(). However, there is no reason why this cannot be done as simply the first step in ->set_context(), and in fact it makes more sense to do it that way because then the policy modes and flags get validated before any real work is done. Therefore, merge ext4_prepare_context() into ext4_set_context(), and remove ->prepare_context(). Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/policy.c | 7 ------- fs/ext4/super.c | 10 ++++------ include/linux/fscrypt_common.h | 1 - 3 files changed, 4 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 14b76da71269..4908906d54d5 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -33,17 +33,10 @@ static int create_encryption_context_from_policy(struct inode *inode, const struct fscrypt_policy *policy) { struct fscrypt_context ctx; - int res; if (!inode->i_sb->s_cop->set_context) return -EOPNOTSUPP; - if (inode->i_sb->s_cop->prepare_context) { - res = inode->i_sb->s_cop->prepare_context(inode); - if (res) - return res; - } - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2e03a0a88d92..a9448db1cf7e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1120,17 +1120,16 @@ static int ext4_get_context(struct inode *inode, void *ctx, size_t len) EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); } -static int ext4_prepare_context(struct inode *inode) -{ - return ext4_convert_inline_data(inode); -} - static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { handle_t *handle = fs_data; int res, res2, retries = 0; + res = ext4_convert_inline_data(inode); + if (res) + return res; + /* * If a journal handle was specified, then the encryption context is * being set on a new inode via inheritance and is part of a larger @@ -1196,7 +1195,6 @@ static unsigned ext4_max_namelen(struct inode *inode) static const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, - .prepare_context = ext4_prepare_context, .set_context = ext4_set_context, .dummy_context = ext4_dummy_context, .is_encrypted = ext4_encrypted_inode, diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h index 547f81592ba1..10c1abfbac6c 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt_common.h @@ -87,7 +87,6 @@ struct fscrypt_operations { unsigned int flags; const char *key_prefix; int (*get_context)(struct inode *, void *, size_t); - int (*prepare_context)(struct inode *); int (*set_context)(struct inode *, const void *, size_t, void *); int (*dummy_context)(struct inode *); bool (*is_encrypted)(struct inode *); -- cgit v1.2.3 From b9cf625d6ecde0d372e23ae022feead72b4228a6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 15 Mar 2017 14:52:02 -0400 Subject: ext4: mark inode dirty after converting inline directory If ext4_convert_inline_data() was called on a directory with inline data, the filesystem was left in an inconsistent state (as considered by e2fsck) because the file size was not increased to cover the new block. This happened because the inode was not marked dirty after i_disksize was updated. Fix this by marking the inode dirty at the end of ext4_finish_convert_inline_dir(). This bug was probably not noticed before because most users mark the inode dirty afterwards for other reasons. But if userspace executed FS_IOC_SET_ENCRYPTION_POLICY with invalid parameters, as exercised by 'kvm-xfstests -c adv generic/396', then the inode was never marked dirty after updating i_disksize. Cc: stable@vger.kernel.org # 3.10+ Fixes: 3c47d54170b6a678875566b1b8d6dcf57904e49b Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 30a9f210d1e3..375fb1c05d49 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1169,10 +1169,9 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, set_buffer_uptodate(dir_block); err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); if (err) - goto out; + return err; set_buffer_verified(dir_block); -out: - return err; + return ext4_mark_inode_dirty(handle, inode); } static int ext4_convert_inline_data_nolock(handle_t *handle, -- cgit v1.2.3 From cd9cb405e0b948363811dc74dbb2890f56f2cb87 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 15 Mar 2017 15:08:48 -0400 Subject: jbd2: don't leak memory if setting up journal fails In journal_init_common(), if we failed to allocate the j_wbuf array, or if we failed to create the buffer_head for the journal superblock, we leaked the memory allocated for the revocation tables. Fix this. Cc: stable@vger.kernel.org # 4.9 Fixes: f0c9fd5458bacf7b12a9a579a727dc740cbe047e Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/jbd2/journal.c | 22 +++++++++++----------- fs/jbd2/revoke.c | 1 + 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a1a359bfcc9c..5adc2fb62b0f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1125,10 +1125,8 @@ static journal_t *journal_init_common(struct block_device *bdev, /* Set up a default-sized revoke table for the new mount. */ err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); - if (err) { - kfree(journal); - return NULL; - } + if (err) + goto err_cleanup; spin_lock_init(&journal->j_history_lock); @@ -1145,23 +1143,25 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_wbufsize = n; journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), GFP_KERNEL); - if (!journal->j_wbuf) { - kfree(journal); - return NULL; - } + if (!journal->j_wbuf) + goto err_cleanup; bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); if (!bh) { pr_err("%s: Cannot get buffer for journal superblock\n", __func__); - kfree(journal->j_wbuf); - kfree(journal); - return NULL; + goto err_cleanup; } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; return journal; + +err_cleanup: + kfree(journal->j_wbuf); + jbd2_journal_destroy_revoke(journal); + kfree(journal); + return NULL; } /* jbd2_journal_init_dev and jbd2_journal_init_inode: diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index cfc38b552118..f9aefcda5854 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -280,6 +280,7 @@ int jbd2_journal_init_revoke(journal_t *journal, int hash_size) fail1: jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); + journal->j_revoke_table[0] = NULL; fail0: return -ENOMEM; } -- cgit v1.2.3 From 966fa72a716ceafc69de901a31f7cc1f52b35f81 Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Tue, 14 Mar 2017 08:17:00 +0530 Subject: kernfs: Check KERNFS_HAS_RELEASE before calling kernfs_release_file() Recently started seeing a kernel oops when a module tries removing a memory mapped sysfs bin_attribute. On closer investigation the root cause seems to be kernfs_release_file() trying to call kernfs_op.release() callback that's NULL for such sysfs bin_attributes. The oops occurs when kernfs_release_file() is called from kernfs_drain_open_files() to cleanup any open handles with active memory mappings. The patch fixes this by checking for flag KERNFS_HAS_RELEASE before calling kernfs_release_file() in function kernfs_drain_open_files(). On ppc64-le arch with cxl module the oops back-trace is of the form below: [ 861.381126] Unable to handle kernel paging request for instruction fetch [ 861.381360] Faulting instruction address: 0x00000000 [ 861.381428] Oops: Kernel access of bad area, sig: 11 [#1] .... [ 861.382481] NIP: 0000000000000000 LR: c000000000362c60 CTR: 0000000000000000 .... Call Trace: [c000000f1680b750] [c000000000362c34] kernfs_drain_open_files+0x104/0x1d0 (unreliable) [c000000f1680b790] [c00000000035fa00] __kernfs_remove+0x260/0x2c0 [c000000f1680b820] [c000000000360da0] kernfs_remove_by_name_ns+0x60/0xe0 [c000000f1680b8b0] [c0000000003638f4] sysfs_remove_bin_file+0x24/0x40 [c000000f1680b8d0] [c00000000062a164] device_remove_bin_file+0x24/0x40 [c000000f1680b8f0] [d000000009b7b22c] cxl_sysfs_afu_remove+0x144/0x170 [cxl] [c000000f1680b940] [d000000009b7c7e4] cxl_remove+0x6c/0x1a0 [cxl] [c000000f1680b990] [c00000000052f694] pci_device_remove+0x64/0x110 [c000000f1680b9d0] [c0000000006321d4] device_release_driver_internal+0x1f4/0x2b0 [c000000f1680ba20] [c000000000525cb0] pci_stop_bus_device+0xa0/0xd0 [c000000f1680ba60] [c000000000525e80] pci_stop_and_remove_bus_device+0x20/0x40 [c000000f1680ba90] [c00000000004a6c4] pci_hp_remove_devices+0x84/0xc0 [c000000f1680bad0] [c00000000004a688] pci_hp_remove_devices+0x48/0xc0 [c000000f1680bb10] [c0000000009dfda4] eeh_reset_device+0xb0/0x290 [c000000f1680bbb0] [c000000000032b4c] eeh_handle_normal_event+0x47c/0x530 [c000000f1680bc60] [c000000000032e64] eeh_handle_event+0x174/0x350 [c000000f1680bd10] [c000000000033228] eeh_event_handler+0x1e8/0x1f0 [c000000f1680bdc0] [c0000000000d384c] kthread+0x14c/0x190 [c000000f1680be30] [c00000000000b5a0] ret_from_kernel_thread+0x5c/0xbc Fixes: f83f3c515654 ("kernfs: fix locking around kernfs_ops->release() callback") Signed-off-by: Vaibhav Jain Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 8e4dc7ab584c..ac2dfe0c5a9c 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -809,7 +809,8 @@ void kernfs_drain_open_files(struct kernfs_node *kn) if (kn->flags & KERNFS_HAS_MMAP) unmap_mapping_range(inode->i_mapping, 0, 0, 1); - kernfs_release_file(kn, of); + if (kn->flags & KERNFS_HAS_RELEASE) + kernfs_release_file(kn, of); } mutex_unlock(&kernfs_open_file_mutex); -- cgit v1.2.3 From dac7a4b4b1f664934e8b713f529b629f67db313c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 25 Mar 2017 17:22:47 -0400 Subject: ext4: lock the xattr block before checksuming it We must lock the xattr block before calculating or verifying the checksum in order to avoid spurious checksum failures. https://bugzilla.kernel.org/show_bug.cgi?id=193661 Reported-by: Colin Ian King Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/xattr.c | 65 +++++++++++++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 67636acf7624..996e7900d4c8 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -131,31 +131,26 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, } static int ext4_xattr_block_csum_verify(struct inode *inode, - sector_t block_nr, - struct ext4_xattr_header *hdr) + struct buffer_head *bh) { - if (ext4_has_metadata_csum(inode->i_sb) && - (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) - return 0; - return 1; -} - -static void ext4_xattr_block_csum_set(struct inode *inode, - sector_t block_nr, - struct ext4_xattr_header *hdr) -{ - if (!ext4_has_metadata_csum(inode->i_sb)) - return; + struct ext4_xattr_header *hdr = BHDR(bh); + int ret = 1; - hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); + if (ext4_has_metadata_csum(inode->i_sb)) { + lock_buffer(bh); + ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, + bh->b_blocknr, hdr)); + unlock_buffer(bh); + } + return ret; } -static inline int ext4_handle_dirty_xattr_block(handle_t *handle, - struct inode *inode, - struct buffer_head *bh) +static void ext4_xattr_block_csum_set(struct inode *inode, + struct buffer_head *bh) { - ext4_xattr_block_csum_set(inode, bh->b_blocknr, BHDR(bh)); - return ext4_handle_dirty_metadata(handle, inode, bh); + if (ext4_has_metadata_csum(inode->i_sb)) + BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, + bh->b_blocknr, BHDR(bh)); } static inline const struct xattr_handler * @@ -233,7 +228,7 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) return -EFSCORRUPTED; - if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) + if (!ext4_xattr_block_csum_verify(inode, bh)) return -EFSBADCRC; error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, bh->b_data); @@ -618,23 +613,22 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, } } + ext4_xattr_block_csum_set(inode, bh); /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect * from a race where someone else frees the block (and releases * its journal_head) before we are done dirtying the buffer. In * nojournal mode this race is harmless and we actually cannot - * call ext4_handle_dirty_xattr_block() with locked buffer as + * call ext4_handle_dirty_metadata() with locked buffer as * that function can call sync_dirty_buffer() so for that case * we handle the dirtying after unlocking the buffer. */ if (ext4_handle_valid(handle)) - error = ext4_handle_dirty_xattr_block(handle, inode, - bh); + error = ext4_handle_dirty_metadata(handle, inode, bh); unlock_buffer(bh); if (!ext4_handle_valid(handle)) - error = ext4_handle_dirty_xattr_block(handle, inode, - bh); + error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); @@ -863,13 +857,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ext4_xattr_cache_insert(ext4_mb_cache, bs->bh); } + ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) goto bad_block; if (!error) - error = ext4_handle_dirty_xattr_block(handle, - inode, - bs->bh); + error = ext4_handle_dirty_metadata(handle, + inode, + bs->bh); if (error) goto cleanup; goto inserted; @@ -967,10 +962,11 @@ inserted: ce->e_reusable = 0; ea_bdebug(new_bh, "reusing; refcount now=%d", ref); + ext4_xattr_block_csum_set(inode, new_bh); unlock_buffer(new_bh); - error = ext4_handle_dirty_xattr_block(handle, - inode, - new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, + new_bh); if (error) goto cleanup_dquot; } @@ -1020,11 +1016,12 @@ getblk_failed: goto getblk_failed; } memcpy(new_bh->b_data, s->base, new_bh->b_size); + ext4_xattr_block_csum_set(inode, new_bh); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); ext4_xattr_cache_insert(ext4_mb_cache, new_bh); - error = ext4_handle_dirty_xattr_block(handle, - inode, new_bh); + error = ext4_handle_dirty_metadata(handle, inode, + new_bh); if (error) goto cleanup; } -- cgit v1.2.3 From d67d64f423147cf4fe8212658255e1160a4ef02c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 25 Mar 2017 17:33:31 -0400 Subject: ext4: fix two spelling nits Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- fs/ext4/move_extent.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f622d4a577e3..f303d3a7f44a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5400,7 +5400,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). * Report at least one sector for such files, so tools like tar, rsync, - * others doen't incorrectly think the file is completely sparse. + * others don't incorrectly think the file is completely sparse. */ if (unlikely(ext4_has_inline_data(inode))) stat->blocks += (stat->size + 511) >> 9; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 6fc14def0c70..615bc03d0fbd 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -511,7 +511,7 @@ mext_check_arguments(struct inode *orig_inode, if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { ext4_debug("ext4 move extent: orig and donor's start " - "offset are not alligned [ino:orig %lu, donor %lu]\n", + "offsets are not aligned [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } -- cgit v1.2.3 From d4ea7e3c5c0e341c15b073016dbf3ab6c65f12f3 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 15 Mar 2017 21:50:09 -0400 Subject: NFS: Fix old dentry rehash after move Now that nfs_rename()'s d_move has moved within the RPC task's rpc_call_done callback, rehashing new_dentry will actually rehash the old dentry's name in nfs_rename(). d_move() is going to rehash the new dentry for us anyway, so doing it again here is unnecessary. Reported-by: Chuck Lever Fixes: 920b4530fb80 ("NFS: nfs_rename() handle -ERESTARTSYS dentry left behind") Signed-off-by: Benjamin Coddington Tested-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index fb499a3f21b5..f92ba8d6c556 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2055,7 +2055,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct dentry *dentry = NULL, *rehash = NULL; + struct dentry *dentry = NULL; struct rpc_task *task; int error = -EBUSY; @@ -2078,10 +2078,8 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, * To prevent any new references to the target during the * rename, we unhash the dentry in advance. */ - if (!d_unhashed(new_dentry)) { + if (!d_unhashed(new_dentry)) d_drop(new_dentry); - rehash = new_dentry; - } if (d_count(new_dentry) > 2) { int err; @@ -2098,7 +2096,6 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; new_dentry = dentry; - rehash = NULL; new_inode = NULL; } } @@ -2119,8 +2116,6 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, error = task->tk_status; rpc_put_task(task); out: - if (rehash) - d_rehash(rehash); trace_nfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, error); /* new dentry created? */ -- cgit v1.2.3 From 551afbb85b3898e78068405d78708245999c19c0 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Mon, 20 Mar 2017 18:07:00 -0400 Subject: NFS cleanup struct nfs4_filelayout_segment Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayout.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h index 2896cb833a11..4c4d436a6796 100644 --- a/fs/nfs/filelayout/filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -55,15 +55,15 @@ struct nfs4_file_layout_dsaddr { }; struct nfs4_filelayout_segment { - struct pnfs_layout_segment generic_hdr; - u32 stripe_type; - u32 commit_through_mds; - u32 stripe_unit; - u32 first_stripe_index; - u64 pattern_offset; - struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ - unsigned int num_fh; - struct nfs_fh **fh_array; + struct pnfs_layout_segment generic_hdr; + u32 stripe_type; + u32 commit_through_mds; + u32 stripe_unit; + u32 first_stripe_index; + u64 pattern_offset; + struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ + unsigned int num_fh; + struct nfs_fh **fh_array; }; struct nfs4_filelayout { -- cgit v1.2.3 From 629dc8704b922f0c46f3025bd3486c2bc51eb7a6 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Mon, 20 Mar 2017 18:07:01 -0400 Subject: NFS store nfs4_deviceid in struct nfs4_filelayout_segment In preparation for moving the filelayout getdeviceinfo call from filelayout_alloc_lseg called by pnfs_process_layout Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayout.c | 13 +++++-------- fs/nfs/filelayout/filelayout.h | 1 + 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 7aff350f15b1..cad74c1c79ff 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -572,7 +572,6 @@ static int filelayout_check_layout(struct pnfs_layout_hdr *lo, struct nfs4_filelayout_segment *fl, struct nfs4_layoutget_res *lgr, - struct nfs4_deviceid *id, gfp_t gfp_flags) { struct nfs4_deviceid_node *d; @@ -602,7 +601,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, } /* find and reference the deviceid */ - d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id, + d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &fl->deviceid, lo->plh_lc_cred, gfp_flags); if (d == NULL) goto out; @@ -657,7 +656,6 @@ static int filelayout_decode_layout(struct pnfs_layout_hdr *flo, struct nfs4_filelayout_segment *fl, struct nfs4_layoutget_res *lgr, - struct nfs4_deviceid *id, gfp_t gfp_flags) { struct xdr_stream stream; @@ -682,9 +680,9 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, if (unlikely(!p)) goto out_err; - memcpy(id, p, sizeof(*id)); + memcpy(&fl->deviceid, p, sizeof(fl->deviceid)); p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); - nfs4_print_deviceid(id); + nfs4_print_deviceid(&fl->deviceid); nfl_util = be32_to_cpup(p++); if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) @@ -831,15 +829,14 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, { struct nfs4_filelayout_segment *fl; int rc; - struct nfs4_deviceid id; dprintk("--> %s\n", __func__); fl = kzalloc(sizeof(*fl), gfp_flags); if (!fl) return NULL; - rc = filelayout_decode_layout(layoutid, fl, lgr, &id, gfp_flags); - if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id, gfp_flags)) { + rc = filelayout_decode_layout(layoutid, fl, lgr, gfp_flags); + if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, gfp_flags)) { _filelayout_free_lseg(fl); return NULL; } diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h index 4c4d436a6796..79323b5dab0c 100644 --- a/fs/nfs/filelayout/filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -61,6 +61,7 @@ struct nfs4_filelayout_segment { u32 stripe_unit; u32 first_stripe_index; u64 pattern_offset; + struct nfs4_deviceid deviceid; struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ unsigned int num_fh; struct nfs_fh **fh_array; -- cgit v1.2.3 From 8d40b0f14846f7d45c7c72d343fe62cb866dda34 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Mon, 20 Mar 2017 18:07:02 -0400 Subject: NFS filelayout:call GETDEVICEINFO after pnfs_layout_process completes Fix a filelayout GETDEVICEINFO call hang triggered from the LAYOUTGET pnfs_layout_process where the GETDEVICEINFO call is waiting for a session slot, and the LAYOUGET call is waiting for pnfs_layout_process to complete before freeing the slot GETDEVICEINFO is waiting for.. This occurs in testing against the pynfs pNFS server where the the on-wire reply highest_slotid and slot id are zero, and the target high slot id is 8 (negotiated in CREATE_SESSION). The internal fore channel slot table max_slotid, the maximum allowed table slotid value, has been reduced via nfs41_set_max_slotid_locked from 8 to 1. Thus there is one slot (slotid 0) available for use but it has not been freed by LAYOUTGET proir to the GETDEVICEINFO request. In order to ensure that layoutrecall callbacks are processed in the correct order, nfs4_proc_layoutget processing needs to be finished e.g. pnfs_layout_process) before giving up the slot that identifies the layoutget (see referring_call_exists). Move the filelayout_check_layout nfs4_find_get_device call outside of the pnfs_layout_process call tree. Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayout.c | 138 +++++++++++++++++++++++++++-------------- 1 file changed, 91 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index cad74c1c79ff..367f8eb19bfa 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -560,6 +560,50 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) return PNFS_ATTEMPTED; } +static int +filelayout_check_deviceid(struct pnfs_layout_hdr *lo, + struct nfs4_filelayout_segment *fl, + gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d; + struct nfs4_file_layout_dsaddr *dsaddr; + int status = -EINVAL; + + /* find and reference the deviceid */ + d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &fl->deviceid, + lo->plh_lc_cred, gfp_flags); + if (d == NULL) + goto out; + + dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + /* Found deviceid is unavailable */ + if (filelayout_test_devid_unavailable(&dsaddr->id_node)) + goto out_put; + + fl->dsaddr = dsaddr; + + if (fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %u\n", + __func__, fl->first_stripe_index); + goto out_put; + } + + if ((fl->stripe_type == STRIPE_SPARSE && + fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) || + (fl->stripe_type == STRIPE_DENSE && + fl->num_fh != dsaddr->stripe_count)) { + dprintk("%s num_fh %u not valid for given packing\n", + __func__, fl->num_fh); + goto out_put; + } + status = 0; +out: + return status; +out_put: + nfs4_fl_put_deviceid(dsaddr); + goto out; +} + /* * filelayout_check_layout() * @@ -574,8 +618,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) { - struct nfs4_deviceid_node *d; - struct nfs4_file_layout_dsaddr *dsaddr; int status = -EINVAL; dprintk("--> %s\n", __func__); @@ -600,41 +642,10 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out; } - /* find and reference the deviceid */ - d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &fl->deviceid, - lo->plh_lc_cred, gfp_flags); - if (d == NULL) - goto out; - - dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); - /* Found deviceid is unavailable */ - if (filelayout_test_devid_unavailable(&dsaddr->id_node)) - goto out_put; - - fl->dsaddr = dsaddr; - - if (fl->first_stripe_index >= dsaddr->stripe_count) { - dprintk("%s Bad first_stripe_index %u\n", - __func__, fl->first_stripe_index); - goto out_put; - } - - if ((fl->stripe_type == STRIPE_SPARSE && - fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) || - (fl->stripe_type == STRIPE_DENSE && - fl->num_fh != dsaddr->stripe_count)) { - dprintk("%s num_fh %u not valid for given packing\n", - __func__, fl->num_fh); - goto out_put; - } - status = 0; out: dprintk("--> %s returns %d\n", __func__, status); return status; -out_put: - nfs4_fl_put_deviceid(dsaddr); - goto out; } static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl) @@ -885,18 +896,51 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, return min(stripe_unit - (unsigned int)stripe_offset, size); } +static struct pnfs_layout_segment * +fl_pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + bool strict_iomode, + gfp_t gfp_flags) +{ + struct pnfs_layout_segment *lseg = NULL; + struct pnfs_layout_hdr *lo; + struct nfs4_filelayout_segment *fl; + int status; + + lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode, + gfp_flags); + if (!lseg) + lseg = ERR_PTR(-ENOMEM); + if (IS_ERR(lseg)) + goto out; + + lo = NFS_I(ino)->layout; + fl = FILELAYOUT_LSEG(lseg); + + status = filelayout_check_deviceid(lo, fl, gfp_flags); + if (status) + lseg = ERR_PTR(status); +out: + if (IS_ERR(lseg)) + pnfs_put_lseg(lseg); + return lseg; +} + static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { if (!pgio->pg_lseg) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - 0, - NFS4_MAX_UINT64, - IOMODE_READ, - false, - GFP_KERNEL); + pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_READ, + false, + GFP_KERNEL); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -916,13 +960,13 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, int status; if (!pgio->pg_lseg) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - 0, - NFS4_MAX_UINT64, - IOMODE_RW, - false, - GFP_NOFS); + pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_RW, + false, + GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; -- cgit v1.2.3 From 9d0d1c8b1c9d80b17cfa86ecd50c8933a742585c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 24 Mar 2017 15:04:50 -0700 Subject: Btrfs: bring back repair during read Commit 20a7db8ab3f2 ("btrfs: add dummy callback for readpage_io_failed and drop checks") made a cleanup around readpage_io_failed_hook, and it was supposed to keep the original sematics, but it also unexpectedly disabled repair during read for dup, raid1 and raid10. This fixes the problem by letting data's inode call the generic readpage_io_failed callback by returning -EAGAIN from its readpage_io_failed_hook in order to notify end_bio_extent_readpage to do the rest. We don't call it directly because the generic one takes an offset from end_bio_extent_readpage() to calculate the index in the checksum array and inode's readpage_io_failed_hook doesn't offer that offset. Cc: David Sterba Signed-off-by: Liu Bo Reviewed-by: David Sterba [ keep the const function attribute ] Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 46 ++++++++++++++++++++++++++++------------------ fs/btrfs/inode.c | 6 +++--- 2 files changed, 31 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8df797432740..27fdb250b446 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2584,26 +2584,36 @@ static void end_bio_extent_readpage(struct bio *bio) if (tree->ops) { ret = tree->ops->readpage_io_failed_hook(page, mirror); - if (!ret && !bio->bi_error) - uptodate = 1; - } else { + if (ret == -EAGAIN) { + /* + * Data inode's readpage_io_failed_hook() always + * returns -EAGAIN. + * + * The generic bio_readpage_error handles errors + * the following way: If possible, new read + * requests are created and submitted and will + * end up in end_bio_extent_readpage as well (if + * we're lucky, not in the !uptodate case). In + * that case it returns 0 and we just go on with + * the next page in our bio. If it can't handle + * the error it will return -EIO and we remain + * responsible for that page. + */ + ret = bio_readpage_error(bio, offset, page, + start, end, mirror); + if (ret == 0) { + uptodate = !bio->bi_error; + offset += len; + continue; + } + } + /* - * The generic bio_readpage_error handles errors the - * following way: If possible, new read requests are - * created and submitted and will end up in - * end_bio_extent_readpage as well (if we're lucky, not - * in the !uptodate case). In that case it returns 0 and - * we just go on with the next page in our bio. If it - * can't handle the error it will return -EIO and we - * remain responsible for that page. + * metadata's readpage_io_failed_hook() always returns + * -EIO and fixes nothing. -EIO is also returned if + * data inode error could not be fixed. */ - ret = bio_readpage_error(bio, offset, page, start, end, - mirror); - if (ret == 0) { - uptodate = !bio->bi_error; - offset += len; - continue; - } + ASSERT(ret == -EIO); } readpage_ok: if (likely(uptodate)) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e57191072aa3..876f1d36030c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10523,9 +10523,9 @@ out_inode: } __attribute__((const)) -static int dummy_readpage_io_failed_hook(struct page *page, int failed_mirror) +static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror) { - return 0; + return -EAGAIN; } static const struct inode_operations btrfs_dir_inode_operations = { @@ -10570,7 +10570,7 @@ static const struct extent_io_ops btrfs_extent_io_ops = { .submit_bio_hook = btrfs_submit_bio_hook, .readpage_end_io_hook = btrfs_readpage_end_io_hook, .merge_bio_hook = btrfs_merge_bio_hook, - .readpage_io_failed_hook = dummy_readpage_io_failed_hook, + .readpage_io_failed_hook = btrfs_readpage_io_failed_hook, /* optional callbacks */ .fill_delalloc = run_delalloc_range, -- cgit v1.2.3 From ce0dcee626c482183b42d45b6ea43198c7223fc7 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 14 Mar 2017 05:25:09 -0500 Subject: btrfs: Change qgroup_meta_rsv to 64bit Using an int value is causing qg->reserved to become negative and exclusive -EDQUOT to be reached prematurely. This affects exclusive qgroups only. TEST CASE: DEVICE=/dev/vdb MOUNTPOINT=/mnt SUBVOL=$MOUNTPOINT/tmp umount $SUBVOL umount $MOUNTPOINT mkfs.btrfs -f $DEVICE mount /dev/vdb $MOUNTPOINT btrfs quota enable $MOUNTPOINT btrfs subvol create $SUBVOL umount $MOUNTPOINT mount /dev/vdb $MOUNTPOINT mount -o subvol=tmp $DEVICE $SUBVOL btrfs qgroup limit -e 3G $SUBVOL btrfs quota rescan /mnt -w for i in `seq 1 44000`; do dd if=/dev/zero of=/mnt/tmp/test_$i bs=10k count=1 if [[ $? > 0 ]]; then btrfs qgroup show -pcref $SUBVOL exit 1 fi done Signed-off-by: Goldwyn Rodrigues [ add reproducer to changelog ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/qgroup.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f03c2f285eb1..660d485b6e8b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1258,7 +1258,7 @@ struct btrfs_root { atomic_t will_be_snapshoted; /* For qgroup metadata space reserve */ - atomic_t qgroup_meta_rsv; + atomic64_t qgroup_meta_rsv; }; static inline u32 btrfs_inode_sectorsize(const struct inode *inode) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 73fdc6bdaea9..982c56f79515 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1342,7 +1342,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->orphan_inodes, 0); atomic_set(&root->refs, 1); atomic_set(&root->will_be_snapshoted, 0); - atomic_set(&root->qgroup_meta_rsv, 0); + atomic64_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a5da750c1087..a59801dc2a34 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2948,20 +2948,20 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, ret = qgroup_reserve(root, num_bytes, enforce); if (ret < 0) return ret; - atomic_add(num_bytes, &root->qgroup_meta_rsv); + atomic64_add(num_bytes, &root->qgroup_meta_rsv); return ret; } void btrfs_qgroup_free_meta_all(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - int reserved; + u64 reserved; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || !is_fstree(root->objectid)) return; - reserved = atomic_xchg(&root->qgroup_meta_rsv, 0); + reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0); if (reserved == 0) return; btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved); @@ -2976,8 +2976,8 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) return; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes); - atomic_sub(num_bytes, &root->qgroup_meta_rsv); + WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes); + atomic64_sub(num_bytes, &root->qgroup_meta_rsv); btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes); } -- cgit v1.2.3 From 457ae7268b29c33dee1c0feb143a15f6029d177b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 17 Mar 2017 23:51:20 +0300 Subject: Btrfs: fix an integer overflow check This isn't super serious because you need CAP_ADMIN to run this code. I added this integer overflow check last year but apparently I am rubbish at writing integer overflow checks... There are two issues. First, access_ok() works on unsigned long type and not u64 so on 32 bit systems the access_ok() could be checking a truncated size. The other issue is that we should be using a stricter limit so we don't overflow the kzalloc() setting ctx->clone_roots later in the function after the access_ok(): alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); Fixes: f5ecec3ce21f ("btrfs: send: silence an integer overflow warning") Signed-off-by: Dan Carpenter Reviewed-by: David Sterba [ added comment ] Signed-off-by: David Sterba --- fs/btrfs/send.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 456c8901489b..a60d5bfb8a49 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6305,8 +6305,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + /* + * Check that we don't overflow at later allocations, we request + * clone_sources_count + 1 items, and compare to unsigned long inside + * access_ok. + */ if (arg->clone_sources_count > - ULLONG_MAX / sizeof(*arg->clone_sources)) { + ULONG_MAX / sizeof(struct clone_root) - 1) { ret = -EINVAL; goto out; } -- cgit v1.2.3 From fabbbee0eb0f4b763576ac1e2db4fc3bf6dcc0cc Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 30 Mar 2017 10:10:55 -0400 Subject: PNFS fix fallback to MDS if got error on commit to DS Upong receiving some errors (EACCES) on commit to the DS the code doesn't fallback to MDS and intead retrieds to the same DS again. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 367f8eb19bfa..c9230fecc77e 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -202,10 +202,10 @@ static int filelayout_async_handle_error(struct rpc_task *task, task->tk_status); nfs4_mark_deviceid_unavailable(devid); pnfs_error_mark_layout_for_return(inode, lseg); - pnfs_set_lo_fail(lseg); rpc_wake_up(&tbl->slot_tbl_waitq); /* fall through */ default: + pnfs_set_lo_fail(lseg); reset: dprintk("%s Retry through MDS. Error %d\n", __func__, task->tk_status); -- cgit v1.2.3 From 0e3d3e5df07dcf8a50d96e0ecd6ab9a888f55dfc Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 30 Mar 2017 13:49:03 -0400 Subject: NFSv4.1 fix infinite loop on IO BAD_STATEID error Commit 63d63cbf5e03 "NFSv4.1: Don't recheck delegations that have already been checked" introduced a regression where when a client received BAD_STATEID error it would not send any TEST_STATEID and instead go into an infinite loop of resending the IO that caused the BAD_STATEID. Fixes: 63d63cbf5e03 ("NFSv4.1: Don't recheck delegations that have already been checked") Signed-off-by: Olga Kornievskaia Cc: stable@vger.kernel.org # 4.9+ Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c780d98035cc..201ca3f2c4ba 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2442,17 +2442,14 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state) } nfs4_stateid_copy(&stateid, &delegation->stateid); - if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) || + !test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, + &delegation->flags)) { rcu_read_unlock(); nfs_finish_clear_delegation_stateid(state, &stateid); return; } - if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) { - rcu_read_unlock(); - return; - } - cred = get_rpccred(delegation->cred); rcu_read_unlock(); status = nfs41_test_and_free_expired_stateid(server, &stateid, cred); -- cgit v1.2.3 From f17f8a14e82cdf34cd6473e3644f3c672b3884f6 Mon Sep 17 00:00:00 2001 From: Tigran Mkrtchyan Date: Thu, 30 Mar 2017 17:31:18 +0200 Subject: nfs: flexfiles: fix kernel OOPS if MDS returns unsupported DS type this fix aims to fix dereferencing of a mirror in an error state when MDS returns unsupported DS type (IOW, not v3), which causes the following oops: [ 220.370709] BUG: unable to handle kernel NULL pointer dereference at 0000000000000065 [ 220.370842] IP: ff_layout_mirror_valid+0x2d/0x110 [nfs_layout_flexfiles] [ 220.370920] PGD 0 [ 220.370972] Oops: 0000 [#1] SMP [ 220.371013] Modules linked in: nfnetlink_queue nfnetlink_log bluetooth nfs_layout_flexfiles rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache nf_conntrack_netbios_ns nf_conntrack_broadcast xt_CT ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_raw ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security iptable_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack libcrc32c iptable_mangle iptable_security ebtable_filter ebtables ip6table_filter ip6_tables binfmt_misc intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel btrfs kvm arc4 snd_hda_codec_hdmi iwldvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate mac80211 xor uvcvideo [ 220.371814] videobuf2_vmalloc videobuf2_memops snd_hda_codec_idt mei_wdt videobuf2_v4l2 snd_hda_codec_generic iTCO_wdt ppdev videobuf2_core iTCO_vendor_support dell_rbtn dell_wmi iwlwifi sparse_keymap dell_laptop dell_smbios snd_hda_intel dcdbas videodev snd_hda_codec dell_smm_hwmon snd_hda_core media cfg80211 intel_uncore snd_hwdep raid6_pq snd_seq intel_rapl_perf snd_seq_device joydev i2c_i801 rfkill lpc_ich snd_pcm parport_pc mei_me parport snd_timer dell_smo8800 mei snd shpchp soundcore tpm_tis tpm_tis_core tpm nfsd auth_rpcgss nfs_acl lockd grace sunrpc i915 nouveau mxm_wmi ttm i2c_algo_bit drm_kms_helper crc32c_intel e1000e drm sdhci_pci firewire_ohci sdhci serio_raw mmc_core firewire_core ptp crc_itu_t pps_core wmi fjes video [ 220.372568] CPU: 7 PID: 4988 Comm: cat Not tainted 4.10.5-200.fc25.x86_64 #1 [ 220.372647] Hardware name: Dell Inc. Latitude E6520/0J4TFW, BIOS A06 07/11/2011 [ 220.372729] task: ffff94791f6ea580 task.stack: ffffb72b88c0c000 [ 220.372802] RIP: 0010:ff_layout_mirror_valid+0x2d/0x110 [nfs_layout_flexfiles] [ 220.372883] RSP: 0018:ffffb72b88c0f970 EFLAGS: 00010246 [ 220.372945] RAX: 0000000000000000 RBX: ffff9479015ca600 RCX: ffffffffffffffed [ 220.373025] RDX: ffffffffffffffed RSI: ffff9479753dc980 RDI: 0000000000000000 [ 220.373104] RBP: ffffb72b88c0f988 R08: 000000000001c980 R09: ffffffffc0ea6112 [ 220.373184] R10: ffffef17477d9640 R11: ffff9479753dd6c0 R12: ffff9479211c7440 [ 220.373264] R13: ffff9478f45b7790 R14: 0000000000000001 R15: ffff9479015ca600 [ 220.373345] FS: 00007f555fa3e700(0000) GS:ffff9479753c0000(0000) knlGS:0000000000000000 [ 220.373435] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 220.373506] CR2: 0000000000000065 CR3: 0000000196044000 CR4: 00000000000406e0 [ 220.373586] Call Trace: [ 220.373627] nfs4_ff_layout_prepare_ds+0x5e/0x200 [nfs_layout_flexfiles] [ 220.373708] ff_layout_pg_init_read+0x81/0x160 [nfs_layout_flexfiles] [ 220.373806] __nfs_pageio_add_request+0x11f/0x4a0 [nfs] [ 220.373886] ? nfs_create_request.part.14+0x37/0x330 [nfs] [ 220.373967] nfs_pageio_add_request+0xb2/0x260 [nfs] [ 220.374042] readpage_async_filler+0xaf/0x280 [nfs] [ 220.374103] read_cache_pages+0xef/0x1b0 [ 220.374166] ? nfs_read_completion+0x210/0x210 [nfs] [ 220.374239] nfs_readpages+0x129/0x200 [nfs] [ 220.374293] __do_page_cache_readahead+0x1d0/0x2f0 [ 220.374352] ondemand_readahead+0x17d/0x2a0 [ 220.374403] page_cache_sync_readahead+0x2e/0x50 [ 220.374460] generic_file_read_iter+0x6c8/0x950 [ 220.374532] ? nfs_mapping_need_revalidate_inode+0x17/0x40 [nfs] [ 220.374617] nfs_file_read+0x6e/0xc0 [nfs] [ 220.374670] __vfs_read+0xe2/0x150 [ 220.374715] vfs_read+0x96/0x130 [ 220.374758] SyS_read+0x55/0xc0 [ 220.374801] entry_SYSCALL_64_fastpath+0x1a/0xa9 [ 220.374856] RIP: 0033:0x7f555f570bd0 [ 220.374900] RSP: 002b:00007ffeb73e1b38 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [ 220.374986] RAX: ffffffffffffffda RBX: 00007f555f839ae0 RCX: 00007f555f570bd0 [ 220.375066] RDX: 0000000000020000 RSI: 00007f555fa41000 RDI: 0000000000000003 [ 220.375145] RBP: 0000000000021010 R08: ffffffffffffffff R09: 0000000000000000 [ 220.375226] R10: 00007f555fa40010 R11: 0000000000000246 R12: 0000000000022000 [ 220.375305] R13: 0000000000021010 R14: 0000000000001000 R15: 0000000000002710 [ 220.375386] Code: 66 66 90 55 48 89 e5 41 54 53 49 89 fc 48 83 ec 08 48 85 f6 74 2e 48 8b 4e 30 48 89 f3 48 81 f9 00 f0 ff ff 77 1e 48 85 c9 74 15 <48> 83 79 78 00 b8 01 00 00 00 74 2c 48 83 c4 08 5b 41 5c 5d c3 [ 220.375653] RIP: ff_layout_mirror_valid+0x2d/0x110 [nfs_layout_flexfiles] RSP: ffffb72b88c0f970 [ 220.375748] CR2: 0000000000000065 [ 220.403538] ---[ end trace bcdca752211b7da9 ]--- Signed-off-by: Tigran Mkrtchyan Signed-off-by: Anna Schumaker --- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 85fde93dff77..457cfeb1d5c1 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -208,6 +208,10 @@ static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, } else goto outerr; } + + if (IS_ERR(mirror->mirror_ds)) + goto outerr; + if (mirror->mirror_ds->ds == NULL) { struct nfs4_deviceid_node *devid; devid = &mirror->mirror_ds->id_node; -- cgit v1.2.3 From 4742a35d9de745e867405b4311e1aac412f0ace1 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 31 Mar 2017 15:12:01 -0700 Subject: hugetlbfs: initialize shared policy as part of inode allocation Any time after inode allocation, destroy_inode can be called. The hugetlbfs inode contains a shared_policy structure, and mpol_free_shared_policy is unconditionally called as part of hugetlbfs_destroy_inode. Initialize the policy as part of inode allocation so that any quick (error path) calls to destroy_inode will be handed an initialized policy. syzkaller fuzzer found this bug, that resulted in the following: BUG: KASAN: user-memory-access in atomic_inc include/asm-generic/atomic-instrumented.h:87 [inline] at addr 000000131730bd7a BUG: KASAN: user-memory-access in __lock_acquire+0x21a/0x3a80 kernel/locking/lockdep.c:3239 at addr 000000131730bd7a Write of size 4 by task syz-executor6/14086 CPU: 3 PID: 14086 Comm: syz-executor6 Not tainted 4.11.0-rc3+ #364 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: atomic_inc include/asm-generic/atomic-instrumented.h:87 [inline] __lock_acquire+0x21a/0x3a80 kernel/locking/lockdep.c:3239 lock_acquire+0x1ee/0x590 kernel/locking/lockdep.c:3762 __raw_write_lock include/linux/rwlock_api_smp.h:210 [inline] _raw_write_lock+0x33/0x50 kernel/locking/spinlock.c:295 mpol_free_shared_policy+0x43/0xb0 mm/mempolicy.c:2536 hugetlbfs_destroy_inode+0xca/0x120 fs/hugetlbfs/inode.c:952 alloc_inode+0x10d/0x180 fs/inode.c:216 new_inode_pseudo+0x69/0x190 fs/inode.c:889 new_inode+0x1c/0x40 fs/inode.c:918 hugetlbfs_get_inode+0x40/0x420 fs/hugetlbfs/inode.c:734 hugetlb_file_setup+0x329/0x9f0 fs/hugetlbfs/inode.c:1282 newseg+0x422/0xd30 ipc/shm.c:575 ipcget_new ipc/util.c:285 [inline] ipcget+0x21e/0x580 ipc/util.c:639 SYSC_shmget ipc/shm.c:673 [inline] SyS_shmget+0x158/0x230 ipc/shm.c:657 entry_SYSCALL_64_fastpath+0x1f/0xc2 Analysis provided by Tetsuo Handa Link: http://lkml.kernel.org/r/1490477850-7944-1-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Dmitry Vyukov Acked-by: Hillf Danton Cc: Tetsuo Handa Cc: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8f96461236f6..7163fe014b57 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -695,14 +695,11 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, inode = new_inode(sb); if (inode) { - struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode->i_mode = S_IFDIR | config->mode; inode->i_uid = config->uid; inode->i_gid = config->gid; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - info = HUGETLBFS_I(inode); - mpol_shared_policy_init(&info->policy, NULL); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ @@ -733,7 +730,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode = new_inode(sb); if (inode) { - struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, @@ -741,15 +737,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_mapping->private_data = resv_map; - info = HUGETLBFS_I(inode); - /* - * The policy is initialized here even if we are creating a - * private inode because initialization simply creates an - * an empty rb tree and calls rwlock_init(), later when we - * call mpol_free_shared_policy() it will just return because - * the rb tree will still be empty. - */ - mpol_shared_policy_init(&info->policy, NULL); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -937,6 +924,18 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } + + /* + * Any time after allocation, hugetlbfs_destroy_inode can be called + * for the inode. mpol_free_shared_policy is unconditionally called + * as part of hugetlbfs_destroy_inode. So, initialize policy here + * in case of a quick call to destroy. + * + * Note that the policy is initialized even if we are creating a + * private inode. This simplifies hugetlbfs_destroy_inode. + */ + mpol_shared_policy_init(&p->policy, NULL); + return &p->vfs_inode; } -- cgit v1.2.3