From 18cc912b8a2acaf32589241fbac47192ab90db14 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 31 Mar 2022 16:29:00 -0400 Subject: fs: change test in inode_insert5 for adding to the sb list inode_insert5 currently looks at I_CREATING to decide whether to insert the inode into the sb list. This test is a bit ambiguous, as I_CREATING state is not directly related to that list. This test is also problematic for some upcoming ceph changes to add fscrypt support. We need to be able to allocate an inode using new_inode and insert it into the hash later iff we end up using it, and doing that now means that we double add it and corrupt the list. What we really want to know in this test is whether the inode is already in its superblock list, and then add it if it isn't. Have it test for list_empty instead and ensure that we always initialize the list by doing it in inode_init_once. It's only ever removed from the list with list_del_init, so that should be sufficient. Suggested-by: Al Viro Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ilya Dryomov --- fs/inode.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index bd4da9c5207e..d5db55df442b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -422,6 +422,7 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); + INIT_LIST_HEAD(&inode->i_sb_list); __address_space_init_once(&inode->i_data); i_size_ordered_init(inode); } @@ -1021,7 +1022,6 @@ struct inode *new_inode_pseudo(struct super_block *sb) spin_lock(&inode->i_lock); inode->i_state = 0; spin_unlock(&inode->i_lock); - INIT_LIST_HEAD(&inode->i_sb_list); } return inode; } @@ -1165,7 +1165,6 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; - bool 
creating = inode->i_state & I_CREATING; again: spin_lock(&inode_hash_lock); @@ -1199,7 +1198,12 @@ again: inode->i_state |= I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); - if (!creating) + + /* + * Add inode to the sb list if it's not already. It has I_NEW at this + * point, so it should be safe to test i_sb_list locklessly. + */ + if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); unlock: spin_unlock(&inode_hash_lock); -- cgit v1.2.3 From d3e94fdc4ef476ca1edd468cc11badf2dbbb3c00 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 8 Jan 2021 15:34:38 -0500 Subject: fscrypt: export fscrypt_fname_encrypt and fscrypt_fname_encrypted_size For ceph, we want to use our own scheme for handling filenames that are longer than NAME_MAX after encryption and Base64 encoding. This allows us to have a consistent view of the encrypted filenames for clients that don't support fscrypt and clients that do but that don't have the key. Currently, fs/crypto only supports encrypting filenames using fscrypt_setup_filename, but that also handles encoding nokey names. Ceph can't use that because it handles nokey names in a different way. Export fscrypt_fname_encrypt. Rename fscrypt_fname_encrypted_size to __fscrypt_fname_encrypted_size and add a new wrapper called fscrypt_fname_encrypted_size that takes an inode argument rather than a pointer to a fscrypt_policy union.
Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Acked-by: Eric Biggers Signed-off-by: Ilya Dryomov --- fs/crypto/fname.c | 36 ++++++++++++++++++++++++++++++------ fs/crypto/fscrypt_private.h | 9 +++------ fs/crypto/hooks.c | 6 +++--- include/linux/fscrypt.h | 4 ++++ 4 files changed, 40 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 14e0ef5e9a20..12bd61d20f69 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -86,7 +86,8 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) /** * fscrypt_fname_encrypt() - encrypt a filename * @inode: inode of the parent directory (for regular filenames) - * or of the symlink (for symlink targets) + * or of the symlink (for symlink targets). Key must already be + * set up. * @iname: the filename to encrypt * @out: (output) the encrypted filename * @olen: size of the encrypted filename. It must be at least @iname->len. @@ -137,6 +138,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, return 0; } +EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt); /** * fname_decrypt() - decrypt a filename @@ -264,9 +266,9 @@ static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst) return bp - dst; } -bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, - u32 orig_len, u32 max_len, - u32 *encrypted_len_ret) +bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret) { int padding = 4 << (fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAGS_PAD_MASK); @@ -280,6 +282,29 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, return true; } +/** + * fscrypt_fname_encrypted_size() - calculate length of encrypted filename + * @inode: parent inode of dentry name being encrypted. Key must + * already be set up. 
+ * @orig_len: length of the original filename + * @max_len: maximum length to return + * @encrypted_len_ret: where calculated length should be returned (on success) + * + * Filenames that are shorter than the maximum length may have their lengths + * increased slightly by encryption, due to padding that is applied. + * + * Return: false if the orig_len is greater than max_len. Otherwise, true and + * fill out encrypted_len_ret with the length (up to max_len). + */ +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret) +{ + return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, + orig_len, max_len, + encrypted_len_ret); +} +EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); + /** * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames * @max_encrypted_len: maximum length of encrypted filenames the buffer will be @@ -435,8 +460,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; if (fscrypt_has_encryption_key(dir)) { - if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy, - iname->len, NAME_MAX, + if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 6b4c8094cc7b..11fe9d213ae1 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -297,14 +297,11 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci); /* fname.c */ -int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, - u8 *out, unsigned int olen); -bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, - u32 orig_len, u32 max_len, - u32 *encrypted_len_ret); +bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret); /* 
hkdf.c */ - struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index af74599ae1cf..7c01025879b3 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -228,9 +228,9 @@ int fscrypt_prepare_symlink(struct inode *dir, const char *target, * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. */ - if (!fscrypt_fname_encrypted_size(policy, len, - max_len - sizeof(struct fscrypt_symlink_data), - &disk_link->len)) + if (!__fscrypt_fname_encrypted_size(policy, len, + max_len - sizeof(struct fscrypt_symlink_data), + &disk_link->len)) return -ENAMETOOLONG; disk_link->len += sizeof(struct fscrypt_symlink_data); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index e60d57c99cb6..5926a4081c6d 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -327,6 +327,10 @@ void fscrypt_free_inode(struct inode *inode); int fscrypt_drop_inode(struct inode *inode); /* fname.c */ +int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen); +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret); int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, int lookup, struct fscrypt_name *fname); -- cgit v1.2.3 From 637fa738b590ec0e3414931d1e07c4f195eb5215 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 1 Sep 2020 12:56:42 -0400 Subject: fscrypt: add fscrypt_context_for_new_inode Most filesystems just call fscrypt_set_context on new inodes, which usually causes a setxattr. That's a bit late for ceph, which can send along a full set of attributes with the create request. Doing so allows it to avoid race windows where the new inode could be seen by other clients without the crypto context attached. It also avoids the separate round trip to the server.
Refactor the fscrypt code a bit to allow us to create a new crypto context, attach it to the inode, and write it to the buffer, but without calling set_context on it. ceph can later use this to marshal the context into the attributes we send along with the create request. Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Acked-by: Eric Biggers Signed-off-by: Ilya Dryomov --- fs/crypto/policy.c | 35 +++++++++++++++++++++++++++++------ include/linux/fscrypt.h | 1 + 2 files changed, 30 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 5f858cee1e3b..a450189565e3 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -685,6 +685,32 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) return fscrypt_get_dummy_policy(dir->i_sb); } +/** + * fscrypt_context_for_new_inode() - create an encryption context for a new inode + * @ctx: where context should be written + * @inode: inode from which to fetch policy and nonce + * + * Given an in-core "prepared" (via fscrypt_prepare_new_inode) inode, + * generate a new context and write it to ctx. ctx _must_ be at least + * FSCRYPT_SET_CONTEXT_MAX_SIZE bytes. + * + * Return: size of the resulting context or a negative error code. + */ +int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) +{ + struct fscrypt_info *ci = inode->i_crypt_info; + + BUILD_BUG_ON(sizeof(union fscrypt_context) != + FSCRYPT_SET_CONTEXT_MAX_SIZE); + + /* fscrypt_prepare_new_inode() should have set up the key already. */ + if (WARN_ON_ONCE(!ci)) + return -ENOKEY; + + return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce); +} +EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); + /** * fscrypt_set_context() - Set the fscrypt context of a new inode * @inode: a new inode @@ -701,12 +727,9 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) union fscrypt_context ctx; int ctxsize; - /* fscrypt_prepare_new_inode() should have set up the key already. 
*/ - if (WARN_ON_ONCE(!ci)) - return -ENOKEY; - - BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); - ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce); + ctxsize = fscrypt_context_for_new_inode(&ctx, inode); + if (ctxsize < 0) + return ctxsize; /* * This may be the first time the inode number is available, so do any diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 5926a4081c6d..7d2f1e0f23b1 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -284,6 +284,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); +int fscrypt_context_for_new_inode(void *ctx, struct inode *inode); int fscrypt_set_context(struct inode *inode, void *fs_data); struct fscrypt_dummy_policy { -- cgit v1.2.3 From fea013e020e6ecc7be75bea0d61697b7e916b44d Mon Sep 17 00:00:00 2001 From: Luís Henriques Date: Tue, 24 May 2022 17:06:27 +0100 Subject: ceph: use correct index when encoding client supported features Feature bits have to be encoded into the correct locations. This hasn't been an issue so far because the only hole in the feature bits was in bit 10 (CEPHFS_FEATURE_RECLAIM_CLIENT), which is located in the 2nd byte. When adding more bits that go beyond this 2nd byte, the bug will show up.
[xiubli: remove incorrect comment for CEPHFS_FEATURES_CLIENT_SUPPORTED] Fixes: 9ba1e224538a ("ceph: allocate the correct amount of extra bytes for the session features") Signed-off-by: Luís Henriques Reviewed-by: Jeff Layton Signed-off-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 7 +++++-- fs/ceph/mds_client.h | 6 ------ 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 33f517d549ce..0aded10375fd 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1220,14 +1220,17 @@ static int encode_supported_features(void **p, void *end) if (count > 0) { size_t i; size_t size = FEATURE_BYTES(count); + unsigned long bit; if (WARN_ON_ONCE(*p + 4 + size > end)) return -ERANGE; ceph_encode_32(p, size); memset(*p, 0, size); - for (i = 0; i < count; i++) - ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8); + for (i = 0; i < count; i++) { + bit = feature_bits[i]; + ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8); + } *p += size; } else { if (WARN_ON_ONCE(*p + 4 > end)) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 1140aecd82ce..2a49e331987b 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -33,10 +33,6 @@ enum ceph_feature_type { CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT, }; -/* - * This will always have the highest feature bit value - * as the last element of the array. - */ #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 0, 1, 2, 3, 4, 5, 6, 7, \ CEPHFS_FEATURE_MIMIC, \ @@ -45,8 +41,6 @@ enum ceph_feature_type { CEPHFS_FEATURE_MULTI_RECONNECT, \ CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ - \ - CEPHFS_FEATURE_MAX, \ } #define CEPHFS_FEATURES_CLIENT_REQUIRED {} -- cgit v1.2.3 From 7c2e3d9194f78770fdfd688d0eecfe7132f83138 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 26 May 2022 14:07:21 +0800 Subject: ceph: remove useless CEPHFS_FEATURES_CLIENT_REQUIRED This macro was added but never be used. 
Checking the ceph code, there is another CEPHFS_FEATURES_MDS_REQUIRED, but it is always empty. We should clean up all this related code, which makes no sense and only introduces confusion. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Reviewed-by: Luís Henriques Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 2a49e331987b..4620167f58eb 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -42,7 +42,6 @@ enum ceph_feature_type { CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ } -#define CEPHFS_FEATURES_CLIENT_REQUIRED {} /* * Some lock dependencies: -- cgit v1.2.3 From 4f48d5da81ee7004a789c8aac2d0dfb2514c37f1 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 16 May 2022 11:23:19 +0800 Subject: fs/dcache: export d_same_name() helper Compare a dentry name with a case-exact name; return true if the names are the same, or false otherwise. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Reviewed-by: Luis Chamberlain Signed-off-by: Ilya Dryomov --- fs/dcache.c | 15 +++++++++++---- include/linux/dcache.h | 2 ++ 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 93f4f5ee07bf..a409312ee0df 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2247,10 +2247,16 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, } EXPORT_SYMBOL(d_add_ci); - -static inline bool d_same_name(const struct dentry *dentry, - const struct dentry *parent, - const struct qstr *name) +/** + * d_same_name - compare dentry name with case-exact name + * @parent: parent dentry + * @dentry: the negative dentry that was passed to the parent's lookup func + * @name: the case-exact name to be associated with the returned dentry + * + * Return: true if names are same, or false + */ +bool d_same_name(const struct dentry *dentry, const struct dentry *parent, + const struct qstr *name) { if (likely(!(parent->d_flags &
DCACHE_OP_COMPARE))) { if (dentry->d_name.len != name->len) @@ -2261,6 +2267,7 @@ static inline bool d_same_name(const struct dentry *dentry, dentry->d_name.len, dentry->d_name.name, name) == 0; } +EXPORT_SYMBOL_GPL(d_same_name); /** * __d_lookup_rcu - search for a dentry (racy, store-free) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f5bba51480b2..bb72361834de 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -233,6 +233,8 @@ extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, wait_queue_head_t *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); +extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent, + const struct qstr *name); extern struct dentry * d_exact_alias(struct dentry *, struct inode *); extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); -- cgit v1.2.3 From 4868e537fa867f82e38e37429d61d7bb8357d79b Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 10 May 2022 09:47:01 +0800 Subject: ceph: wait for the first reply of inflight async unlink In async unlink case the kclient won't wait for the first reply from MDS and just drop all the links and unhash the dentry and then succeeds immediately. For any new create/link/rename,etc requests followed by using the same file names we must wait for the first reply of the inflight unlink request, or the MDS possibly will fail these following requests with -EEXIST if the inflight async unlink request was delayed for some reasons. And the worst case is that for the non-async openc request it will successfully open the file if the CDentry hasn't been unlinked yet, but later the previous delayed async unlink request will remove the CDentry. That means the just created file is possibly deleted later by accident.
We need to wait for the inflight async unlink requests to finish when creating new files/directories by using the same file names. Link: https://tracker.ceph.com/issues/55332 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++------ fs/ceph/file.c | 6 +++- fs/ceph/mds_client.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/ceph/mds_client.h | 1 + fs/ceph/super.c | 3 ++ fs/ceph/super.h | 19 +++++++++---- 6 files changed, 167 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index eae417d71136..e7e2ebac330d 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -856,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (ceph_quota_is_max_files_exceeded(dir)) { err = -EDQUOT; goto out; @@ -918,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (ceph_quota_is_max_files_exceeded(dir)) { err = -EDQUOT; goto out; @@ -968,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; - int err = -EROFS; + int err; int op; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (ceph_snap(dir) == CEPH_SNAPDIR) { /* mkdir .snap/foo is a MKSNAP */ op = CEPH_MDS_OP_MKSNAP; @@ -980,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); op = CEPH_MDS_OP_MKDIR; } else { + err = -EROFS; goto out; } @@ -1037,6 +1050,10 @@ static int ceph_link(struct dentry 
*old_dentry, struct inode *dir, struct ceph_mds_request *req; int err; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; @@ -1071,9 +1088,27 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + struct dentry *dentry = req->r_dentry; + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_dentry_info *di = ceph_dentry(dentry); int result = req->r_err ? req->r_err : le32_to_cpu(req->r_reply_info.head->result); + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) + pr_warn("%s dentry %p:%pd async unlink bit is not set\n", + __func__, dentry, dentry); + + spin_lock(&fsc->async_unlink_conflict_lock); + hash_del_rcu(&di->hnode); + spin_unlock(&fsc->async_unlink_conflict_lock); + + spin_lock(&dentry->d_lock); + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; + wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT); + spin_unlock(&dentry->d_lock); + + synchronize_rcu(); + if (result == -EJUKEBOX) goto out; @@ -1081,7 +1116,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, if (result) { int pathlen = 0; u64 base = 0; - char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + char *path = ceph_mdsc_build_path(dentry, &pathlen, &base, 0); /* mark error on parent + clear complete */ @@ -1089,13 +1124,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, ceph_dir_clear_complete(req->r_parent); /* drop the dentry -- we don't know its status */ - if (!d_unhashed(req->r_dentry)) - d_drop(req->r_dentry); + if (!d_unhashed(dentry)) + d_drop(dentry); /* mark inode itself for an error (since metadata is bogus) */ mapping_set_error(req->r_old_inode->i_mapping, result); - pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n", + pr_warn("async unlink failure path=(%llx)%s result=%d!\n", base, IS_ERR(path) ? 
"<>" : path, result); ceph_mdsc_free_path(path, pathlen); } @@ -1180,6 +1215,8 @@ retry: if (try_async && op == CEPH_MDS_OP_UNLINK && (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { + struct ceph_dentry_info *di = ceph_dentry(dentry); + dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir), dentry->d_name.len, dentry->d_name.name, ceph_cap_string(req->r_dir_caps)); @@ -1187,6 +1224,16 @@ retry: req->r_callback = ceph_async_unlink_cb; req->r_old_inode = d_inode(dentry); ihold(req->r_old_inode); + + spin_lock(&dentry->d_lock); + di->flags |= CEPH_DENTRY_ASYNC_UNLINK; + spin_unlock(&dentry->d_lock); + + spin_lock(&fsc->async_unlink_conflict_lock); + hash_add_rcu(fsc->async_unlink_conflict, &di->hnode, + dentry->d_name.hash); + spin_unlock(&fsc->async_unlink_conflict_lock); + err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) { /* @@ -1195,10 +1242,20 @@ retry: */ drop_nlink(inode); d_delete(dentry); - } else if (err == -EJUKEBOX) { - try_async = false; - ceph_mdsc_put_request(req); - goto retry; + } else { + spin_lock(&fsc->async_unlink_conflict_lock); + hash_del_rcu(&di->hnode); + spin_unlock(&fsc->async_unlink_conflict_lock); + + spin_lock(&dentry->d_lock); + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; + spin_unlock(&dentry->d_lock); + + if (err == -EJUKEBOX) { + try_async = false; + ceph_mdsc_put_request(req); + goto retry; + } } } else { set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); @@ -1237,6 +1294,10 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir, (!ceph_quota_is_same_realm(old_dir, new_dir))) return -EXDEV; + err = ceph_wait_on_conflict_unlink(new_dentry); + if (err) + return err; + dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index da59e836a06e..0f3424dc618b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -569,7 +569,7 @@ static void 
ceph_async_create_cb(struct ceph_mds_client *mdsc, char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, &base, 0); - pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", + pr_warn("async create failure path=(%llx)%s result=%d!\n", base, IS_ERR(path) ? "<>" : path, result); ceph_mdsc_free_path(path, pathlen); @@ -740,6 +740,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > NAME_MAX) return -ENAMETOOLONG; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (flags & O_CREAT) { if (ceph_quota_is_max_files_exceeded(dir)) return -EDQUOT; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0aded10375fd..f6da80d110dc 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -456,7 +456,7 @@ static int ceph_parse_deleg_inos(void **p, void *end, dout("added delegated inode 0x%llx\n", start - 1); } else if (err == -EBUSY) { - pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n", + pr_warn("MDS delegated inode 0x%llx more than once.\n", start - 1); } else { return err; @@ -655,6 +655,79 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } +/* + * In async unlink case the kclient won't wait for the first reply + * from MDS and just drop all the links and unhash the dentry and then + * succeeds immediately. + * + * For any new create/link/rename,etc requests followed by using the + * same file names we must wait for the first reply of the inflight + * unlink request, or the MDS possibly will fail these following + * requests with -EEXIST if the inflight async unlink request was + * delayed for some reasons. + * + * And the worst case is that for the none async openc request it will + * successfully open the file if the CDentry hasn't been unlinked yet, + * but later the previous delayed async unlink request will remove the + * CDenty. 
That means the just created file is possiblly deleted later + * by accident. + * + * We need to wait for the inflight async unlink requests to finish + * when creating new files/directories by using the same file names. + */ +int ceph_wait_on_conflict_unlink(struct dentry *dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct dentry *pdentry = dentry->d_parent; + struct dentry *udentry, *found = NULL; + struct ceph_dentry_info *di; + struct qstr dname; + u32 hash = dentry->d_name.hash; + int err; + + dname.name = dentry->d_name.name; + dname.len = dentry->d_name.len; + + rcu_read_lock(); + hash_for_each_possible_rcu(fsc->async_unlink_conflict, di, + hnode, hash) { + udentry = di->dentry; + + spin_lock(&udentry->d_lock); + if (udentry->d_name.hash != hash) + goto next; + if (unlikely(udentry->d_parent != pdentry)) + goto next; + if (!hash_hashed(&di->hnode)) + goto next; + + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) + pr_warn("%s dentry %p:%pd async unlink bit is not set\n", + __func__, dentry, dentry); + + if (!d_same_name(udentry, pdentry, &dname)) + goto next; + + spin_unlock(&udentry->d_lock); + found = dget(udentry); + break; +next: + spin_unlock(&udentry->d_lock); + } + rcu_read_unlock(); + + if (likely(!found)) + return 0; + + dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__, + dentry, dentry, found, found); + + err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT, + TASK_KILLABLE); + dput(found); + return err; +} + /* * sessions diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4620167f58eb..d8ec2ac93da3 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -575,6 +575,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode) TASK_KILLABLE); } +extern int ceph_wait_on_conflict_unlink(struct dentry *dentry); extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); #endif diff 
--git a/fs/ceph/super.c b/fs/ceph/super.c index 40140805bdcf..5539f6c87a45 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -816,6 +816,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, if (!fsc->cap_wq) goto fail_inode_wq; + hash_init(fsc->async_unlink_conflict); + spin_lock_init(&fsc->async_unlink_conflict_lock); + spin_lock(&ceph_fsc_lock); list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list); spin_unlock(&ceph_fsc_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f59dac66955b..59469253592b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -99,6 +100,8 @@ struct ceph_mount_options { char *mon_addr; }; +#define CEPH_ASYNC_CREATE_CONFLICT_BITS 8 + struct ceph_fs_client { struct super_block *sb; @@ -124,6 +127,9 @@ struct ceph_fs_client { struct workqueue_struct *inode_wq; struct workqueue_struct *cap_wq; + DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS); + spinlock_t async_unlink_conflict_lock; + #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_dentry_lru, *debugfs_caps; struct dentry *debugfs_congestion_kb; @@ -280,7 +286,8 @@ struct ceph_dentry_info { struct dentry *dentry; struct ceph_mds_session *lease_session; struct list_head lease_list; - unsigned flags; + struct hlist_node hnode; + unsigned long flags; int lease_shared_gen; u32 lease_gen; u32 lease_seq; @@ -289,10 +296,12 @@ struct ceph_dentry_info { u64 offset; }; -#define CEPH_DENTRY_REFERENCED 1 -#define CEPH_DENTRY_LEASE_LIST 2 -#define CEPH_DENTRY_SHRINK_LIST 4 -#define CEPH_DENTRY_PRIMARY_LINK 8 +#define CEPH_DENTRY_REFERENCED (1 << 0) +#define CEPH_DENTRY_LEASE_LIST (1 << 1) +#define CEPH_DENTRY_SHRINK_LIST (1 << 2) +#define CEPH_DENTRY_PRIMARY_LINK (1 << 3) +#define CEPH_DENTRY_ASYNC_UNLINK_BIT (4) +#define CEPH_DENTRY_ASYNC_UNLINK (1 << CEPH_DENTRY_ASYNC_UNLINK_BIT) struct ceph_inode_xattrs_info { /* -- cgit v1.2.3 From 300e42a2e79e2270a00dbf9e4ddd4b101dd75a03 
Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 26 May 2022 13:21:31 +0800 Subject: ceph: add session already open notify support If the connection was accidentally closed due to a socket issue or something else, the clients will try to open the already-opened sessions, and the MDSes will send the session open reply one more time if the clients support the notify feature. When the clients retry to open the sessions the s_seq will be 0 by default, so we need to update it anyway. Link: https://tracker.ceph.com/issues/53911 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 25 ++++++++++++++++++++----- fs/ceph/mds_client.h | 5 ++++- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f6da80d110dc..9cfa7b775fdb 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3540,11 +3540,26 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_OPEN: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info("mds%d reconnect success\n", session->s_mds); - session->s_state = CEPH_MDS_SESSION_OPEN; - session->s_features = features; - renewed_caps(mdsc, session, 0); - if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) - metric_schedule_delayed(&mdsc->metric); + + if (session->s_state == CEPH_MDS_SESSION_OPEN) { + pr_notice("mds%d is already opened\n", session->s_mds); + } else { + session->s_state = CEPH_MDS_SESSION_OPEN; + session->s_features = features; + renewed_caps(mdsc, session, 0); + if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, + &session->s_features)) + metric_schedule_delayed(&mdsc->metric); + } + + /* + * The connection maybe broken and the session in client + * side has been reinitialized, need to update the seq + * anyway.
+ */ + if (!session->s_seq && seq) + session->s_seq = seq; + wake = 1; if (mdsc->stopping) __close_session(mdsc, session); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index d8ec2ac93da3..256e3eada6c1 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -29,8 +29,10 @@ enum ceph_feature_type { CEPHFS_FEATURE_MULTI_RECONNECT, CEPHFS_FEATURE_DELEG_INO, CEPHFS_FEATURE_METRIC_COLLECT, + CEPHFS_FEATURE_ALTERNATE_NAME, + CEPHFS_FEATURE_NOTIFY_SESSION_STATE, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_NOTIFY_SESSION_STATE, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ @@ -41,6 +43,7 @@ enum ceph_feature_type { CEPHFS_FEATURE_MULTI_RECONNECT, \ CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ + CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ } /* -- cgit v1.2.3 From 8266c4d7a7469c3fd45ee2b4ebc01aac311c6c48 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 27 May 2022 12:39:17 +0800 Subject: ceph: choose auth MDS for getxattr with the Xs caps And for the 'Xs' caps for getxattr we will also choose the auth MDS, because the MDS side code is buggy due to setxattr won't notify the replica MDSes when the values changed and the replica MDS will return the old values. Though we will fix it in MDS code, but this still makes sense for old ceph. Link: https://tracker.ceph.com/issues/55331 Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 56c53ab3618e..1834d6529f25 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2275,9 +2275,15 @@ int ceph_try_to_choose_auth_mds(struct inode *inode, int mask) * * This cost much when doing the Locker state transition and * usually will need to revoke caps from clients. 
+ * + * And for the 'Xs' caps for getxattr we will also choose the + * auth MDS, because the MDS side code is buggy due to setxattr + * won't notify the replica MDSes when the values changed and + * the replica MDS will return the old values. Though we will + * fix it in MDS code, but this still makes sense for old ceph. */ if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL)) - || (mask & CEPH_STAT_RSTAT)) + || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR))) return USE_AUTH_MDS; else return USE_ANY_MDS; -- cgit v1.2.3 From d93231a6bc8a452323d5fef16cca7107ce483a27 Mon Sep 17 00:00:00 2001 From: Luís Henriques Date: Fri, 3 Jun 2022 14:29:09 +0100 Subject: ceph: prevent a client from exceeding the MDS maximum xattr size The MDS tries to enforce a limit on the total key/values in extended attributes. However, this limit is enforced only if doing a synchronous operation (MDS_OP_SETXATTR) -- if we're buffering the xattrs, the MDS doesn't have a chance to enforce these limits. This patch adds support for decoding the xattrs maximum size setting that is distributed in the mdsmap. Then, when setting an xattr, the kernel client will revert to do a synchronous operation if that maximum size is exceeded. While there, fix a dout() that would trigger a printk warning: [ 98.718078] ------------[ cut here ]------------ [ 98.719012] precision 65536 too large [ 98.719039] WARNING: CPU: 1 PID: 3755 at lib/vsprintf.c:2703 vsnprintf+0x5e3/0x600 ... 
Link: https://tracker.ceph.com/issues/55725 Signed-off-by: Luís Henriques Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/mdsmap.c | 22 ++++++++++++++++++---- fs/ceph/xattr.c | 12 ++++++++---- include/linux/ceph/mdsmap.h | 1 + 3 files changed, 27 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 30387733765d..8d0a6d2c2da4 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -352,12 +352,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) __decode_and_drop_type(p, end, u8, bad_ext); } if (mdsmap_ev >= 8) { - u32 name_len; /* enabled */ ceph_decode_8_safe(p, end, m->m_enabled, bad_ext); - ceph_decode_32_safe(p, end, name_len, bad_ext); - ceph_decode_need(p, end, name_len, bad_ext); - *p += name_len; + /* fs_name */ + ceph_decode_skip_string(p, end, bad_ext); } /* damaged */ if (mdsmap_ev >= 9) { @@ -370,6 +368,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) } else { m->m_damaged = false; } + if (mdsmap_ev >= 17) { + /* balancer */ + ceph_decode_skip_string(p, end, bad_ext); + /* standby_count_wanted */ + ceph_decode_skip_32(p, end, bad_ext); + /* old_max_mds */ + ceph_decode_skip_32(p, end, bad_ext); + /* min_compat_client */ + ceph_decode_skip_8(p, end, bad_ext); + /* required_client_features */ + ceph_decode_skip_set(p, end, 64, bad_ext); + ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext); + } else { + /* This forces the usage of the (sync) SETXATTR Op */ + m->m_max_xattr_size = 0; + } bad_ext: dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f141f5246163..f31350cda960 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1086,7 +1086,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, flags |= CEPH_XATTR_REMOVE; } - dout("setxattr value=%.*s\n", (int)size, value); + dout("setxattr value 
size: %zu\n", size); /* do request */ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -1184,8 +1184,14 @@ int __ceph_setxattr(struct inode *inode, const char *name, spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); - if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) + required_blob_size = __get_required_blob_size(ci, name_len, val_len); + if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) || + (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) { + dout("%s do sync setxattr: version: %llu size: %d max: %llu\n", + __func__, ci->i_xattrs.version, required_blob_size, + mdsc->mdsmap->m_max_xattr_size); goto do_sync; + } if (!lock_snap_rwsem && !ci->i_head_snapc) { lock_snap_rwsem = true; @@ -1201,8 +1207,6 @@ retry: ceph_cap_string(issued)); __build_xattrs(inode); - required_blob_size = __get_required_blob_size(ci, name_len, val_len); - if (!ci->i_xattrs.prealloc_blob || required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { struct ceph_buffer *blob; diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 523fd0452856..4c3e0648dc27 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -25,6 +25,7 @@ struct ceph_mdsmap { u32 m_session_timeout; /* seconds */ u32 m_session_autoclose; /* seconds */ u64 m_max_file_size; + u64 m_max_xattr_size; /* maximum size for xattrs blob */ u32 m_max_mds; /* expected up:active mds number */ u32 m_num_active_mds; /* actual up:active mds number */ u32 possible_max_rank; /* possible max rank index */ -- cgit v1.2.3 From 58dd4385577ed7969b80cdc9e2a31575aba6c712 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Jun 2022 16:39:57 -0400 Subject: ceph: don't leak snap_rwsem in handle_cap_grant When handle_cap_grant is called on an IMPORT op, then the snap_rwsem is held and the function is expected to release it before returning. It currently fails to do that in all cases which could lead to a deadlock. 
Fixes: 6f05b30ea063 ("ceph: reset i_requested_max_size if file write is not wanted") Link: https://tracker.ceph.com/issues/55857 Signed-off-by: Jeff Layton Reviewed-by: Luís Henriques Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ac8fd5e7f540..2b1f22322e8f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3578,24 +3578,23 @@ static void handle_cap_grant(struct inode *inode, fill_inline = true; } - if (ci->i_auth_cap == cap && - le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { - if (newcaps & ~extra_info->issued) - wake = true; + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { + if (ci->i_auth_cap == cap) { + if (newcaps & ~extra_info->issued) + wake = true; + + if (ci->i_requested_max_size > max_size || + !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { + /* re-request max_size if necessary */ + ci->i_requested_max_size = 0; + wake = true; + } - if (ci->i_requested_max_size > max_size || - !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { - /* re-request max_size if necessary */ - ci->i_requested_max_size = 0; - wake = true; + ceph_kick_flushing_inode_caps(session, ci); } - - ceph_kick_flushing_inode_caps(session, ci); - spin_unlock(&ci->i_ceph_lock); up_read(&session->s_mdsc->snap_rwsem); - } else { - spin_unlock(&ci->i_ceph_lock); } + spin_unlock(&ci->i_ceph_lock); if (fill_inline) ceph_fill_inline_data(inode, NULL, extra_info->inline_data, -- cgit v1.2.3 From e82145033547dac360bf20e960cf9adefc50b72d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 7 Jun 2022 11:05:49 -0400 Subject: ceph: convert to generic_file_llseek There's no reason we need to lock the inode for write in order to handle an llseek. I suspect this should have been dropped in 2013 when we stopped doing vmtruncate in llseek. 
With that gone, ceph_llseek is functionally equivalent to generic_file_llseek, so just call that after getting the size. Signed-off-by: Jeff Layton Reviewed-by: Luís Henriques Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 52 +++++----------------------------------------------- 1 file changed, 5 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 0f3424dc618b..fefa6ded07c8 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1939,57 +1939,15 @@ out_unlocked: */ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { - struct inode *inode = file->f_mapping->host; - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - loff_t i_size; - loff_t ret; - - inode_lock(inode); - if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { + struct inode *inode = file_inode(file); + int ret; + ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); if (ret < 0) - goto out; - } - - i_size = i_size_read(inode); - switch (whence) { - case SEEK_END: - offset += i_size; - break; - case SEEK_CUR: - /* - * Here we special-case the lseek(fd, 0, SEEK_CUR) - * position-querying operation. 
Avoid rewriting the "same" - f_pos value back to the file because a concurrent read(), - write() or lseek() might have altered it - */ - if (offset == 0) { - ret = file->f_pos; - goto out; - } - offset += file->f_pos; - break; - case SEEK_DATA: - if (offset < 0 || offset >= i_size) { - ret = -ENXIO; - goto out; - } - break; - case SEEK_HOLE: - if (offset < 0 || offset >= i_size) { - ret = -ENXIO; - goto out; - } - offset = i_size; - break; + return ret; } - - ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size)); - -out: - inode_unlock(inode); - return ret; + return generic_file_llseek(file, offset, whence); } static inline void ceph_zero_partial_page( -- cgit v1.2.3 From 7467b04418d929c64cbaf75fc8d54db73e2b64df Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 7 Jun 2022 13:06:14 -0400 Subject: ceph: call netfs_subreq_terminated with was_async == false "was_async" is a bit misleadingly named. It's supposed to indicate whether it's safe to call blocking operations from the context you're calling it from, but it sounds like it's asking whether this was done via async operation. For ceph, it's always called from kernel thread context so it should be safe to set this to false.
Cc: David Howells Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d6e5916138e4..01fe75b8d146 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -237,7 +237,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) if (err >= 0 && err < subreq->len) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - netfs_subreq_terminated(subreq, err, true); + netfs_subreq_terminated(subreq, err, false); num_pages = calc_pages_for(osd_data->alignment, osd_data->length); ceph_put_page_vector(osd_data->pages, num_pages, false); -- cgit v1.2.3 From 020bc44a9fbf6946f42db503d11c9811f26dd9fd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 10 Jun 2022 11:40:13 -0400 Subject: ceph: switch back to testing for NULL folio->private in ceph_dirty_folio Willy requested that we change this back to warning on folio->private being non-NULL. He's trying to kill off the PG_private flag, and so we'd like to catch where it's non-NULL. Add a VM_WARN_ON_FOLIO (since it doesn't exist yet) and change over to using that instead of VM_BUG_ON_FOLIO along with testing the ->private pointer. [ xiubli: define VM_WARN_ON_FOLIO macro in case DEBUG_VM is disabled, as reported by kernel test robot ] Cc: Matthew Wilcox Signed-off-by: Jeff Layton Signed-off-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 +- include/linux/mmdebug.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 01fe75b8d146..31fc04eeb4d0 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) * Reference snap context in folio->private. Also set * PagePrivate so that we get invalidate_folio callback.
*/ - VM_BUG_ON_FOLIO(folio_test_private(folio), folio); + VM_WARN_ON_FOLIO(folio->private, folio); folio_attach_private(folio, snapc); return ceph_fscache_dirty_folio(mapping, folio); diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index d7285f8148a3..15ae78cd2853 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -54,6 +54,15 @@ void dump_mm(const struct mm_struct *mm); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_FOLIO(cond, folio) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_page(&folio->page, "VM_WARN_ON_FOLIO(" __stringify(cond)")");\ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) ({ \ static bool __section(".data.once") __warned; \ int __ret_warn_once = !!(cond); \ @@ -79,6 +88,7 @@ void dump_mm(const struct mm_struct *mm); #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) -- cgit v1.2.3 From e19feff96380e7a98ed55446ae08c3c52ce6a994 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 10 Jun 2022 10:12:49 +0800 Subject: ceph: make change_auth_cap_ses a global symbol Signed-off-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 4 ++-- fs/ceph/super.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2b1f22322e8f..2677199efbce 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -602,8 +602,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * @ci: inode to be moved * @session: new auth caps session */ -static void change_auth_cap_ses(struct ceph_inode_info *ci, - struct ceph_mds_session *session) +void change_auth_cap_ses(struct ceph_inode_info *ci, + struct ceph_mds_session *session) { lockdep_assert_held(&ci->i_ceph_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 59469253592b..3c940a8457de 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -767,6 +767,8 @@ extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc, extern void ceph_reservation_status(struct ceph_fs_client *client, int *total, int *avail, int *used, int *reserved, int *min); +extern void change_auth_cap_ses(struct ceph_inode_info *ci, + struct ceph_mds_session *session); -- cgit v1.2.3 From 0006164589ecc755cd6bbc46e466e32be20fe285 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 10 Jun 2022 09:53:21 +0800 Subject: ceph: update the auth cap when the async create req is forwarded For async create we will always try to choose the auth MDS of frag the dentry belonged to of the parent directory to send the request and ususally this works fine, but if the MDS migrated the directory to another MDS before it could be handled the request will be forwarded. And then the auth cap will be changed. We need to update the auth cap in this case before the request is forwarded. 
Link: https://tracker.ceph.com/issues/55857 Signed-off-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 14 +++++++++++++ fs/ceph/mds_client.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ceph/super.h | 2 ++ 3 files changed, 74 insertions(+) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index fefa6ded07c8..cec1111f58ab 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -612,6 +612,7 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, struct ceph_mds_reply_inode in = { }; struct ceph_mds_reply_info_in iinfo = { .in = &in }; struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); struct inode *inode; struct timespec64 now; struct ceph_string *pool_ns; @@ -714,6 +715,12 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, file->f_mode |= FMODE_CREATED; ret = finish_open(file, dentry, ceph_open); } + + spin_lock(&dentry->d_lock); + di->flags &= ~CEPH_DENTRY_ASYNC_CREATE; + wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT); + spin_unlock(&dentry->d_lock); + return ret; } @@ -790,9 +797,16 @@ retry: (req->r_dir_caps = try_prep_async_create(dir, dentry, &lo, &req->r_deleg_ino))) { + struct ceph_dentry_info *di = ceph_dentry(dentry); + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); req->r_callback = ceph_async_create_cb; + + spin_lock(&dentry->d_lock); + di->flags |= CEPH_DENTRY_ASYNC_CREATE; + spin_unlock(&dentry->d_lock); + err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) { err = ceph_finish_async_create(dir, dentry, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9cfa7b775fdb..80f8b9ec1a31 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2960,6 +2960,64 @@ static void __do_request(struct ceph_mds_client *mdsc, if (req->r_request_started == 0) /* note request start time */ req->r_request_started = jiffies; + /* + * For async create we 
will choose the auth MDS of frag in parent + * directory to send the request and ususally this works fine, but + * if the migrated the dirtory to another MDS before it could handle + * it the request will be forwarded. + * + * And then the auth cap will be changed. + */ + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) { + struct ceph_dentry_info *di = ceph_dentry(req->r_dentry); + struct ceph_inode_info *ci; + struct ceph_cap *cap; + + /* + * The request maybe handled very fast and the new inode + * hasn't been linked to the dentry yet. We need to wait + * for the ceph_finish_async_create(), which shouldn't be + * stuck too long or fail in thoery, to finish when forwarding + * the request. + */ + if (!d_inode(req->r_dentry)) { + err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT, + TASK_KILLABLE); + if (err) { + mutex_lock(&req->r_fill_mutex); + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + goto out_session; + } + } + + ci = ceph_inode(d_inode(req->r_dentry)); + + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) { + dout("do_request session changed for auth cap %d -> %d\n", + cap->session->s_mds, session->s_mds); + + /* Remove the auth cap from old session */ + spin_lock(&cap->session->s_cap_lock); + cap->session->s_nr_caps--; + list_del_init(&cap->session_caps); + spin_unlock(&cap->session->s_cap_lock); + + /* Add the auth cap to the new session */ + cap->mds = mds; + cap->session = session; + spin_lock(&session->s_cap_lock); + session->s_nr_caps++; + list_add_tail(&cap->session_caps, &session->s_caps); + spin_unlock(&session->s_cap_lock); + + change_auth_cap_ses(ci, session); + } + spin_unlock(&ci->i_ceph_lock); + } + err = __send_request(session, req, false); out_session: diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3c940a8457de..3b146c0fbb2b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -302,6 +302,8 @@ 
struct ceph_dentry_info { #define CEPH_DENTRY_PRIMARY_LINK (1 << 3) #define CEPH_DENTRY_ASYNC_UNLINK_BIT (4) #define CEPH_DENTRY_ASYNC_UNLINK (1 << CEPH_DENTRY_ASYNC_UNLINK_BIT) +#define CEPH_DENTRY_ASYNC_CREATE_BIT (5) +#define CEPH_DENTRY_ASYNC_CREATE (1 << CEPH_DENTRY_ASYNC_CREATE_BIT) struct ceph_inode_xattrs_info { /* -- cgit v1.2.3 From 4849077604f0126514d487836e7d87c3e53a753c Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 7 Jun 2022 10:13:53 +0800 Subject: ceph: don't get the inline data for new creating files If the 'i_inline_version' is 1, that means the file has just been created and there shouldn't be any inline data in it, so we should skip retrieving the inline data from the MDS. This could also help reduce the possibility of the deadlock issue introduced by the inline data and Fcr caps. Gradually we will remove the inline feature from kclient after ceph's scrub gains support for uninlining the inline data; for now this could help reduce the teuthology test failures. This could possibly also fix a bug where, for some old clients, if they couldn't explicitly uninline the inline data when writing, the inline version would always stay at 1, and we might always be reading nonexistent data from the inline data.
Signed-off-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 5 ++--- fs/ceph/caps.c | 2 +- fs/ceph/file.c | 5 ++--- fs/ceph/inode.c | 5 +++-- fs/ceph/super.h | 8 ++++++++ 5 files changed, 16 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 31fc04eeb4d0..2f886ec426a0 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -313,8 +313,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) int err = 0; u64 len = subreq->len; - if (ci->i_inline_version != CEPH_INLINE_NONE && - ceph_netfs_issue_op_inline(subreq)) + if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, @@ -1439,7 +1438,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) inode, off, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || - ci->i_inline_version == CEPH_INLINE_NONE) { + !ceph_has_inline_data(ci)) { CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); ceph_add_rw_context(fi, &rw_ctx); ret = filemap_fault(vmf); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2677199efbce..0acff406ba29 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3005,7 +3005,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got } if (S_ISREG(ci->netfs.inode.i_mode) && - ci->i_inline_version != CEPH_INLINE_NONE && + ceph_has_inline_data(ci) && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && i_size_read(inode) > 0) { struct page *page = diff --git a/fs/ceph/file.c b/fs/ceph/file.c index cec1111f58ab..ffb717e2c1df 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -241,8 +241,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, INIT_LIST_HEAD(&fi->rw_contexts); fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); - if ((file->f_mode & FMODE_WRITE) && - ci->i_inline_version != CEPH_INLINE_NONE) { + if ((file->f_mode & FMODE_WRITE) && 
ceph_has_inline_data(ci)) { ret = ceph_uninline_data(file); if (ret < 0) goto error; @@ -1650,7 +1649,7 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - if (ci->i_inline_version == CEPH_INLINE_NONE) { + if (!ceph_has_inline_data(ci)) { if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { ret = ceph_direct_read_write(iocb, to, NULL, NULL); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1834d6529f25..42351d7a0dd6 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1049,7 +1049,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, iinfo->inline_version >= ci->i_inline_version) { int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; ci->i_inline_version = iinfo->inline_version; - if (ci->i_inline_version != CEPH_INLINE_NONE && + if (ceph_has_inline_data(ci) && (locked_page || (info_caps & cache_caps))) fill_inline = true; } @@ -2327,7 +2327,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, if (inline_version == 0) { /* the reply is supposed to contain inline data */ err = -EINVAL; - } else if (inline_version == CEPH_INLINE_NONE) { + } else if (inline_version == CEPH_INLINE_NONE || + inline_version == 1) { err = -ENODATA; } else { err = req->r_reply_info.targeti.inline_len; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3b146c0fbb2b..40630e6f691c 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1231,6 +1231,14 @@ extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate); +static inline bool ceph_has_inline_data(struct ceph_inode_info *ci) +{ + if (ci->i_inline_version == CEPH_INLINE_NONE || + ci->i_inline_version == 1) /* initial version, no data */ + return false; + return true; +} + /* file.c */ extern const struct file_operations ceph_file_fops; -- cgit v1.2.3 From e027ddb6d3cce80945ab86358929460f91f5cf4f 
Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 23 Jun 2022 17:17:21 +0800 Subject: ceph: flush the dirty caps immediatelly when quota is approaching When the quota is approaching we need to notify it to the MDS as soon as possible, or the client could write to the directory more than expected. This will flush the dirty caps without delaying after each write, though this couldn't prevent the real size of a directory exceed the quota but could prevent it as soon as possible. Link: https://tracker.ceph.com/issues/56180 Signed-off-by: Xiubo Li Reviewed-by: Luís Henriques Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 5 +++-- fs/ceph/file.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0acff406ba29..53cfe026b3ea 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1978,14 +1978,15 @@ retry: } dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s" - " issued %s revoking %s retain %s %s%s\n", ceph_vinop(inode), + " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode), ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(issued), ceph_cap_string(revoking), ceph_cap_string(retain), (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", - (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); + (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "", + (flags & CHECK_CAPS_NOINVAL) ? 
" NOINVAL" : ""); /* * If we no longer need to hold onto old our caps, and we may diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ffb717e2c1df..cd025ff25bf0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1912,7 +1912,7 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); } dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", @@ -2529,7 +2529,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, /* Let the MDS know about dst file size change */ if (ceph_inode_set_size(dst_inode, dst_off) || ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) - ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH, + NULL); } /* Mark Fw dirty */ spin_lock(&dst_ci->i_ceph_lock); -- cgit v1.2.3 From 0c04a117d77b258febb1a69da7c0cb651d4a38cc Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 24 Jun 2022 16:43:49 +0800 Subject: ceph: make f_bsize always equal to f_frsize The f_frsize maybe changed in the quota size is less than the defualt 4MB. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 5539f6c87a45..3fc48b43cab0 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -72,15 +72,9 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = CEPH_SUPER_MAGIC; /* ?? */ /* - * express utilization in terms of large blocks to avoid + * Express utilization in terms of large blocks to avoid * overflow on 32-bit machines. - * - * NOTE: for the time being, we make bsize == frsize to humor - * not-yet-ancient versions of glibc that are broken. - * Someday, we will probably want to report a real block - * size... 
whatever that may mean for a network file system! */ - buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; /* @@ -95,6 +89,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); } + /* + * NOTE: for the time being, we make bsize == frsize to humor + * not-yet-ancient versions of glibc that are broken. + * Someday, we will probably want to report a real block + * size... whatever that may mean for a network file system! + */ + buf->f_bsize = buf->f_frsize; + buf->f_files = le64_to_cpu(st.num_objects); buf->f_ffree = -1; buf->f_namelen = NAME_MAX; -- cgit v1.2.3 From 7cb9994754f8a36ae9e5ec4597c5c4c2d6c03832 Mon Sep 17 00:00:00 2001 From: Hu Weiwen Date: Fri, 1 Jul 2022 10:52:27 +0800 Subject: ceph: don't truncate file in atomic_open Clear O_TRUNC from the flags sent in the MDS create request. `atomic_open' is called before permission check. We should not do any modification to the file here. The caller will do the truncation afterward. Fixes: 124e68e74099 ("ceph: file operations") Signed-off-by: Hu Weiwen Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index cd025ff25bf0..b4e978420802 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -749,6 +749,11 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_wait_on_conflict_unlink(dentry); if (err) return err; + /* + * Do not truncate the file, since atomic_open is called before the + * permission check. The caller will do the truncation afterward. + */ + flags &= ~O_TRUNC; if (flags & O_CREAT) { if (ceph_quota_is_max_files_exceeded(dir)) @@ -824,9 +829,7 @@ retry: } set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); - err = ceph_mdsc_do_request(mdsc, - (flags & (O_CREAT|O_TRUNC)) ? 
dir : NULL, - req); + err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req); if (err == -ENOENT) { dentry = ceph_handle_snapdir(req, dentry); if (IS_ERR(dentry)) { -- cgit v1.2.3 From c460f4e4bba2d3f8dc0b5bfa8995d6e8d2d527a1 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 5 Jul 2022 10:40:23 +0800 Subject: ceph: remove useless check for the folio The netfs_write_begin() won't set the folio if the return value is non-zero. Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 2f886ec426a0..de12715c237b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1326,16 +1326,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, int r; r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); - if (r == 0) - folio_wait_fscache(folio); - if (r < 0) { - if (folio) - folio_put(folio); - } else { - WARN_ON_ONCE(!folio_test_locked(folio)); - *pagep = &folio->page; - } - return r; + if (r < 0) + return r; + + folio_wait_fscache(folio); + WARN_ON_ONCE(!folio_test_locked(folio)); + *pagep = &folio->page; + return 0; } /* -- cgit v1.2.3 From a8af0d682ae0c9cf62dd0ad6afdb1480951d6a10 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 30 Jun 2022 16:21:50 -0400 Subject: libceph: clean up ceph_osdc_start_request prototype This function always returns 0, and ignores the nofail boolean. Drop the nofail argument, make the function void return and fix up the callers. 
Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 6 +++--- fs/ceph/addr.c | 33 +++++++++++++-------------------- fs/ceph/file.c | 32 +++++++++++++------------------- include/linux/ceph/osd_client.h | 5 ++--- net/ceph/osd_client.c | 15 ++++++--------- 5 files changed, 37 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ef9bc62e9afd..b4580b73479f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1297,7 +1297,7 @@ static void rbd_osd_submit(struct ceph_osd_request *osd_req) dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n", __func__, osd_req, obj_req, obj_req->ex.oe_objno, obj_req->ex.oe_off, obj_req->ex.oe_len); - ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); + ceph_osdc_start_request(osd_req->r_osdc, osd_req); } /* @@ -2081,7 +2081,7 @@ static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id, if (ret) return ret; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); return 0; } @@ -4768,7 +4768,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, if (ret) goto out_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) ceph_copy_from_page_vector(pages, buf, 0, ret); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index de12715c237b..ec76e77f8d4b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -337,6 +337,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) /* should always give us a page-aligned read */ WARN_ON_ONCE(page_off); len = err; + err = 0; osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); req->r_callback = finish_netfs_read; @@ -344,9 +345,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) req->r_inode = inode; ihold(inode); - err = ceph_osdc_start_request(req->r_osdc, req, false); - if (err) - 
iput(inode); + ceph_osdc_start_request(req->r_osdc, req); out: ceph_osdc_put_request(req); if (err) @@ -620,9 +619,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); req->r_mtime = inode->i_mtime; - err = ceph_osdc_start_request(osdc, req, true); - if (!err) - err = ceph_osdc_wait_request(osdc, req); + ceph_osdc_start_request(osdc, req); + err = ceph_osdc_wait_request(osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); @@ -1150,8 +1148,7 @@ new_request: } req->r_mtime = inode->i_mtime; - rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); - BUG_ON(rc); + ceph_osdc_start_request(&fsc->client->osdc, req); req = NULL; wbc->nr_to_write -= i; @@ -1692,9 +1689,8 @@ int ceph_uninline_data(struct file *file) } req->r_mtime = inode->i_mtime; - err = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!err) - err = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_start_request(&fsc->client->osdc, req); + err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_osdc_put_request(req); if (err < 0) goto out_unlock; @@ -1735,9 +1731,8 @@ int ceph_uninline_data(struct file *file) } req->r_mtime = inode->i_mtime; - err = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!err) - err = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_start_request(&fsc->client->osdc, req); + err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); @@ -1908,15 +1903,13 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 0, false, true); - err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); + ceph_osdc_start_request(&fsc->client->osdc, rd_req); wr_req->r_mtime = ci->netfs.inode.i_mtime; - err2 = 
ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); + ceph_osdc_start_request(&fsc->client->osdc, wr_req); - if (!err) - err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); - if (!err2) - err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); + err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); + err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); if (err >= 0 || err == -ENOENT) have |= POOL_READ; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index b4e978420802..c3caa9bf9755 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -985,9 +985,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, false, false); - ret = ceph_osdc_start_request(osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(osdc, req); + ceph_osdc_start_request(osdc, req); + ret = ceph_osdc_wait_request(osdc, req); ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, @@ -1250,7 +1249,7 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_inode = inode; req->r_priv = aio_req; - ret = ceph_osdc_start_request(req->r_osdc, req, false); + ceph_osdc_start_request(req->r_osdc, req); out: if (ret < 0) { req->r_result = ret; @@ -1387,9 +1386,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, continue; } - ret = ceph_osdc_start_request(req->r_osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_start_request(req->r_osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); if (write) ceph_update_write_metrics(metric, req->r_start_latency, @@ -1452,8 +1450,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, r_private_item); list_del_init(&req->r_private_item); if (ret >= 0) - ret = ceph_osdc_start_request(req->r_osdc, - req, false); + ceph_osdc_start_request(req->r_osdc, req); if (ret < 0) { req->r_result = ret; ceph_aio_complete_req(req); @@ -1566,9 +1563,8 @@ 
ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, false, true); req->r_mtime = mtime; - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_start_request(&fsc->client->osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, ret); @@ -2032,12 +2028,10 @@ static int ceph_zero_partial_object(struct inode *inode, } req->r_mtime = inode->i_mtime; - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) { - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); - if (ret == -ENOENT) - ret = 0; - } + ceph_osdc_start_request(&fsc->client->osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret == -ENOENT) + ret = 0; ceph_osdc_put_request(req); out: @@ -2339,7 +2333,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off if (IS_ERR(req)) ret = PTR_ERR(req); else { - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); ceph_update_copyfrom_metrics(&fsc->mdsc->metric, req->r_start_latency, diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cba8a6ffc329..fb6be72104df 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -507,9 +507,8 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, extern void ceph_osdc_get_request(struct ceph_osd_request *req); extern void ceph_osdc_put_request(struct ceph_osd_request *req); -extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail); +void ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); extern void ceph_osdc_cancel_request(struct ceph_osd_request *req); extern int ceph_osdc_wait_request(struct 
ceph_osd_client *osdc, struct ceph_osd_request *req); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9d82bb42e958..87b883c7bfd6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -4578,15 +4578,12 @@ bad: /* * Register request, send initial attempt. */ -int ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail) +void ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) { down_read(&osdc->lock); submit_request(req, false); up_read(&osdc->lock); - - return 0; } EXPORT_SYMBOL(ceph_osdc_start_request); @@ -4756,7 +4753,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, if (ret) goto out_put_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); linger_cancel(lreq); linger_put(lreq); ret = wait_request_timeout(req, opts->mount_timeout); @@ -4827,7 +4824,7 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, if (ret) goto out_put_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); out_put_req: @@ -5043,7 +5040,7 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, if (ret) goto out_put_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) { void *p = page_address(pages[0]); @@ -5120,7 +5117,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, if (ret) goto out_put_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) { ret = req->r_ops[0].rval; -- cgit v1.2.3