diff options
-rw-r--r-- | drivers/block/rbd.c | 6 | ||||
-rw-r--r-- | fs/ceph/addr.c | 59 | ||||
-rw-r--r-- | fs/ceph/caps.c | 38 | ||||
-rw-r--r-- | fs/ceph/dir.c | 79 | ||||
-rw-r--r-- | fs/ceph/file.c | 123 | ||||
-rw-r--r-- | fs/ceph/inode.c | 13 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 165 | ||||
-rw-r--r-- | fs/ceph/mds_client.h | 13 | ||||
-rw-r--r-- | fs/ceph/mdsmap.c | 22 | ||||
-rw-r--r-- | fs/ceph/super.c | 19 | ||||
-rw-r--r-- | fs/ceph/super.h | 31 | ||||
-rw-r--r-- | fs/ceph/xattr.c | 12 | ||||
-rw-r--r-- | fs/crypto/fname.c | 36 | ||||
-rw-r--r-- | fs/crypto/fscrypt_private.h | 9 | ||||
-rw-r--r-- | fs/crypto/hooks.c | 6 | ||||
-rw-r--r-- | fs/crypto/policy.c | 35 | ||||
-rw-r--r-- | fs/dcache.c | 15 | ||||
-rw-r--r-- | fs/inode.c | 10 | ||||
-rw-r--r-- | include/linux/ceph/ceph_fs.h | 8 | ||||
-rw-r--r-- | include/linux/ceph/mdsmap.h | 1 | ||||
-rw-r--r-- | include/linux/ceph/osd_client.h | 5 | ||||
-rw-r--r-- | include/linux/dcache.h | 2 | ||||
-rw-r--r-- | include/linux/fscrypt.h | 5 | ||||
-rw-r--r-- | include/linux/mmdebug.h | 10 | ||||
-rw-r--r-- | net/ceph/osd_client.c | 15 | ||||
-rw-r--r-- | net/ceph/osdmap.c | 32 | ||||
-rw-r--r-- | net/ceph/pagelist.c | 2 |
27 files changed, 538 insertions, 233 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0d8ec2fe5740..f9e39301c4af 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1297,7 +1297,7 @@ static void rbd_osd_submit(struct ceph_osd_request *osd_req) dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n", __func__, osd_req, obj_req, obj_req->ex.oe_objno, obj_req->ex.oe_off, obj_req->ex.oe_len); - ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); + ceph_osdc_start_request(osd_req->r_osdc, osd_req); } /* @@ -2081,7 +2081,7 @@ static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id, if (ret) return ret; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); return 0; } @@ -4768,7 +4768,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, if (ret) goto out_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) ceph_copy_from_page_vector(pages, buf, 0, ret); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 2c3a9b5b4b74..dcf701b05cc1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) * Reference snap context in folio->private. Also set * PagePrivate so that we get invalidate_folio callback. */ - VM_BUG_ON_FOLIO(folio_test_private(folio), folio); + VM_WARN_ON_FOLIO(folio->private, folio); folio_attach_private(folio, snapc); return ceph_fscache_dirty_folio(mapping, folio); @@ -237,7 +237,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) if (err >= 0 && err < subreq->len) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - netfs_subreq_terminated(subreq, err, true); + netfs_subreq_terminated(subreq, err, false); num_pages = calc_pages_for(osd_data->alignment, osd_data->length); ceph_put_page_vector(osd_data->pages, num_pages, false); @@ -313,8 +313,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) int err = 0; u64 len = subreq->len; - if (ci->i_inline_version != CEPH_INLINE_NONE && - ceph_netfs_issue_op_inline(subreq)) + if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, @@ -338,6 +337,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) /* should always give us a page-aligned read */ WARN_ON_ONCE(page_off); len = err; + err = 0; osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); req->r_callback = finish_netfs_read; @@ -345,9 +345,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) req->r_inode = inode; ihold(inode); - err = ceph_osdc_start_request(req->r_osdc, req, false); - if (err) - iput(inode); + ceph_osdc_start_request(req->r_osdc, req); out: ceph_osdc_put_request(req); if (err) @@ -621,9 +619,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); req->r_mtime = inode->i_mtime; - err = ceph_osdc_start_request(osdc, req, true); - if (!err) - err = ceph_osdc_wait_request(osdc, req); + ceph_osdc_start_request(osdc, req); + err = ceph_osdc_wait_request(osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); @@ -1151,8 +1148,7 @@ new_request: } req->r_mtime = inode->i_mtime; - rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); - BUG_ON(rc); + ceph_osdc_start_request(&fsc->client->osdc, req); req = NULL; wbc->nr_to_write -= i; @@ -1327,16 +1323,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, int r; r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); - if (r == 0) - folio_wait_fscache(folio); - if (r < 0) { - if (folio) - folio_put(folio); - } else { - WARN_ON_ONCE(!folio_test_locked(folio)); - *pagep = &folio->page; - } - return r; + if (r < 0) + return r; + + folio_wait_fscache(folio); + WARN_ON_ONCE(!folio_test_locked(folio)); + *pagep = &folio->page; + return 0; } /* @@ -1439,7 +1432,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) inode, off, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || - ci->i_inline_version == CEPH_INLINE_NONE) { + !ceph_has_inline_data(ci)) { CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); ceph_add_rw_context(fi, &rw_ctx); ret = filemap_fault(vmf); @@ -1696,9 +1689,8 @@ int ceph_uninline_data(struct file *file) } req->r_mtime = inode->i_mtime; - err = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!err) - err = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_start_request(&fsc->client->osdc, req); + err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_osdc_put_request(req); if (err < 0) goto out_unlock; @@ -1739,9 +1731,8 @@ int ceph_uninline_data(struct file *file) } req->r_mtime = inode->i_mtime; - err = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!err) - err = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_start_request(&fsc->client->osdc, req); + err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); @@ -1912,15 +1903,13 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 0, false, true); - err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); + ceph_osdc_start_request(&fsc->client->osdc, rd_req); wr_req->r_mtime = ci->netfs.inode.i_mtime; - err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); + ceph_osdc_start_request(&fsc->client->osdc, wr_req); - if (!err) - err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); - if (!err2) - err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); + err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); + err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); if (err >= 0 || err == -ENOENT) have |= POOL_READ; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ac8fd5e7f540..53cfe026b3ea 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -602,8 +602,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * @ci: inode to be moved * @session: new auth caps session */ -static void change_auth_cap_ses(struct ceph_inode_info *ci, - struct ceph_mds_session *session) +void change_auth_cap_ses(struct ceph_inode_info *ci, + struct ceph_mds_session *session) { lockdep_assert_held(&ci->i_ceph_lock); @@ -1978,14 +1978,15 @@ retry: } dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s" - " issued %s revoking %s retain %s %s%s\n", ceph_vinop(inode), + " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode), ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(issued), ceph_cap_string(revoking), ceph_cap_string(retain), (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", - (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); + (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "", + (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : ""); /* * If we no longer need to hold onto old our caps, and we may @@ -3005,7 +3006,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got } if (S_ISREG(ci->netfs.inode.i_mode) && - ci->i_inline_version != CEPH_INLINE_NONE && + ceph_has_inline_data(ci) && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && i_size_read(inode) > 0) { struct page *page = @@ -3578,24 +3579,23 @@ static void handle_cap_grant(struct inode *inode, fill_inline = true; } - if (ci->i_auth_cap == cap && - le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { - if (newcaps & ~extra_info->issued) - wake = true; + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { + if (ci->i_auth_cap == cap) { + if (newcaps & ~extra_info->issued) + wake = true; - if (ci->i_requested_max_size > max_size || - !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { - /* re-request max_size if necessary */ - ci->i_requested_max_size = 0; - wake = true; - } + if (ci->i_requested_max_size > max_size || + !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { + /* re-request max_size if necessary */ + ci->i_requested_max_size = 0; + wake = true; + } - ceph_kick_flushing_inode_caps(session, ci); - spin_unlock(&ci->i_ceph_lock); + ceph_kick_flushing_inode_caps(session, ci); + } up_read(&session->s_mdsc->snap_rwsem); - } else { - spin_unlock(&ci->i_ceph_lock); } + spin_unlock(&ci->i_ceph_lock); if (fill_inline) ceph_fill_inline_data(inode, NULL, extra_info->inline_data, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index eae417d71136..e7e2ebac330d 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -856,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (ceph_quota_is_max_files_exceeded(dir)) { err = -EDQUOT; goto out; @@ -918,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (ceph_quota_is_max_files_exceeded(dir)) { err = -EDQUOT; goto out; @@ -968,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; - int err = -EROFS; + int err; int op; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (ceph_snap(dir) == CEPH_SNAPDIR) { /* mkdir .snap/foo is a MKSNAP */ op = CEPH_MDS_OP_MKSNAP; @@ -980,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); op = CEPH_MDS_OP_MKDIR; } else { + err = -EROFS; goto out; } @@ -1037,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, struct ceph_mds_request *req; int err; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; @@ -1071,9 +1088,27 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + struct dentry *dentry = req->r_dentry; + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_dentry_info *di = ceph_dentry(dentry); int result = req->r_err ? req->r_err : le32_to_cpu(req->r_reply_info.head->result); + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) + pr_warn("%s dentry %p:%pd async unlink bit is not set\n", + __func__, dentry, dentry); + + spin_lock(&fsc->async_unlink_conflict_lock); + hash_del_rcu(&di->hnode); + spin_unlock(&fsc->async_unlink_conflict_lock); + + spin_lock(&dentry->d_lock); + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; + wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT); + spin_unlock(&dentry->d_lock); + + synchronize_rcu(); + if (result == -EJUKEBOX) goto out; @@ -1081,7 +1116,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, if (result) { int pathlen = 0; u64 base = 0; - char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + char *path = ceph_mdsc_build_path(dentry, &pathlen, &base, 0); /* mark error on parent + clear complete */ @@ -1089,13 +1124,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, ceph_dir_clear_complete(req->r_parent); /* drop the dentry -- we don't know its status */ - if (!d_unhashed(req->r_dentry)) - d_drop(req->r_dentry); + if (!d_unhashed(dentry)) + d_drop(dentry); /* mark inode itself for an error (since metadata is bogus) */ mapping_set_error(req->r_old_inode->i_mapping, result); - pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n", + pr_warn("async unlink failure path=(%llx)%s result=%d!\n", base, IS_ERR(path) ? "<<bad>>" : path, result); ceph_mdsc_free_path(path, pathlen); } @@ -1180,6 +1215,8 @@ retry: if (try_async && op == CEPH_MDS_OP_UNLINK && (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { + struct ceph_dentry_info *di = ceph_dentry(dentry); + dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir), dentry->d_name.len, dentry->d_name.name, ceph_cap_string(req->r_dir_caps)); @@ -1187,6 +1224,16 @@ retry: req->r_callback = ceph_async_unlink_cb; req->r_old_inode = d_inode(dentry); ihold(req->r_old_inode); + + spin_lock(&dentry->d_lock); + di->flags |= CEPH_DENTRY_ASYNC_UNLINK; + spin_unlock(&dentry->d_lock); + + spin_lock(&fsc->async_unlink_conflict_lock); + hash_add_rcu(fsc->async_unlink_conflict, &di->hnode, + dentry->d_name.hash); + spin_unlock(&fsc->async_unlink_conflict_lock); + err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) { /* @@ -1195,10 +1242,20 @@ retry: */ drop_nlink(inode); d_delete(dentry); - } else if (err == -EJUKEBOX) { - try_async = false; - ceph_mdsc_put_request(req); - goto retry; + } else { + spin_lock(&fsc->async_unlink_conflict_lock); + hash_del_rcu(&di->hnode); + spin_unlock(&fsc->async_unlink_conflict_lock); + + spin_lock(&dentry->d_lock); + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; + spin_unlock(&dentry->d_lock); + + if (err == -EJUKEBOX) { + try_async = false; + ceph_mdsc_put_request(req); + goto retry; + } } } else { set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); @@ -1237,6 +1294,10 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir, (!ceph_quota_is_same_realm(old_dir, new_dir))) return -EXDEV; + err = ceph_wait_on_conflict_unlink(new_dentry); + if (err) + return err; + dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 284d2fda663d..04fd34557de8 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -240,8 +240,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, INIT_LIST_HEAD(&fi->rw_contexts); fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); - if ((file->f_mode & FMODE_WRITE) && - ci->i_inline_version != CEPH_INLINE_NONE) { + if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) { ret = ceph_uninline_data(file); if (ret < 0) goto error; @@ -568,7 +567,7 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, &base, 0); - pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", + pr_warn("async create failure path=(%llx)%s result=%d!\n", base, IS_ERR(path) ? "<<bad>>" : path, result); ceph_mdsc_free_path(path, pathlen); @@ -611,6 +610,7 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, struct ceph_mds_reply_inode in = { }; struct ceph_mds_reply_info_in iinfo = { .in = &in }; struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); struct inode *inode; struct timespec64 now; struct ceph_string *pool_ns; @@ -709,6 +709,12 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, file->f_mode |= FMODE_CREATED; ret = finish_open(file, dentry, ceph_open); } + + spin_lock(&dentry->d_lock); + di->flags &= ~CEPH_DENTRY_ASYNC_CREATE; + wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT); + spin_unlock(&dentry->d_lock); + return ret; } @@ -735,6 +741,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > NAME_MAX) return -ENAMETOOLONG; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + /* + * Do not truncate the file, since atomic_open is called before the + * permission check. The caller will do the truncation afterward. + */ + flags &= ~O_TRUNC; + if (flags & O_CREAT) { if (ceph_quota_is_max_files_exceeded(dir)) return -EDQUOT; @@ -781,9 +796,16 @@ retry: (req->r_dir_caps = try_prep_async_create(dir, dentry, &lo, &req->r_deleg_ino))) { + struct ceph_dentry_info *di = ceph_dentry(dentry); + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); req->r_callback = ceph_async_create_cb; + + spin_lock(&dentry->d_lock); + di->flags |= CEPH_DENTRY_ASYNC_CREATE; + spin_unlock(&dentry->d_lock); + err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) { err = ceph_finish_async_create(dir, dentry, @@ -802,9 +824,7 @@ retry: } set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); - err = ceph_mdsc_do_request(mdsc, - (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, - req); + err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req); if (err == -ENOENT) { dentry = ceph_handle_snapdir(req, dentry); if (IS_ERR(dentry)) { @@ -960,9 +980,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, false, false); - ret = ceph_osdc_start_request(osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(osdc, req); + ceph_osdc_start_request(osdc, req); + ret = ceph_osdc_wait_request(osdc, req); ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, @@ -1225,7 +1244,7 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_inode = inode; req->r_priv = aio_req; - ret = ceph_osdc_start_request(req->r_osdc, req, false); + ceph_osdc_start_request(req->r_osdc, req); out: if (ret < 0) { req->r_result = ret; @@ -1362,9 +1381,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, continue; } - ret = ceph_osdc_start_request(req->r_osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_start_request(req->r_osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); if (write) ceph_update_write_metrics(metric, req->r_start_latency, @@ -1427,8 +1445,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, r_private_item); list_del_init(&req->r_private_item); if (ret >= 0) - ret = ceph_osdc_start_request(req->r_osdc, - req, false); + ceph_osdc_start_request(req->r_osdc, req); if (ret < 0) { req->r_result = ret; ceph_aio_complete_req(req); @@ -1541,9 +1558,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, false, true); req->r_mtime = mtime; - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_start_request(&fsc->client->osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, ret); @@ -1627,7 +1643,7 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - if (ci->i_inline_version == CEPH_INLINE_NONE) { + if (!ceph_has_inline_data(ci)) { if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { ret = ceph_direct_read_write(iocb, to, NULL, NULL); @@ -1890,7 +1906,7 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); } dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", @@ -1930,57 +1946,15 @@ out_unlocked: */ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { - struct inode *inode = file->f_mapping->host; - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - loff_t i_size; - loff_t ret; - - inode_lock(inode); - if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { + struct inode *inode = file_inode(file); + int ret; + ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); if (ret < 0) - goto out; - } - - i_size = i_size_read(inode); - switch (whence) { - case SEEK_END: - offset += i_size; - break; - case SEEK_CUR: - /* - * Here we special-case the lseek(fd, 0, SEEK_CUR) - * position-querying operation. Avoid rewriting the "same" - * f_pos value back to the file because a concurrent read(), - * write() or lseek() might have altered it - */ - if (offset == 0) { - ret = file->f_pos; - goto out; - } - offset += file->f_pos; - break; - case SEEK_DATA: - if (offset < 0 || offset >= i_size) { - ret = -ENXIO; - goto out; - } - break; - case SEEK_HOLE: - if (offset < 0 || offset >= i_size) { - ret = -ENXIO; - goto out; - } - offset = i_size; - break; + return ret; } - - ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size)); - -out: - inode_unlock(inode); - return ret; + return generic_file_llseek(file, offset, whence); } static inline void ceph_zero_partial_page( @@ -2049,12 +2023,10 @@ static int ceph_zero_partial_object(struct inode *inode, } req->r_mtime = inode->i_mtime; - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) { - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); - if (ret == -ENOENT) - ret = 0; - } + ceph_osdc_start_request(&fsc->client->osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret == -ENOENT) + ret = 0; ceph_osdc_put_request(req); out: @@ -2356,7 +2328,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off if (IS_ERR(req)) ret = PTR_ERR(req); else { - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); ceph_update_copyfrom_metrics(&fsc->mdsc->metric, req->r_start_latency, @@ -2549,7 +2521,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, /* Let the MDS know about dst file size change */ if (ceph_inode_set_size(dst_inode, dst_off) || ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) - ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH, + NULL); } /* Mark Fw dirty */ spin_lock(&dst_ci->i_ceph_lock); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 56c53ab3618e..42351d7a0dd6 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1049,7 +1049,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, iinfo->inline_version >= ci->i_inline_version) { int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; ci->i_inline_version = iinfo->inline_version; - if (ci->i_inline_version != CEPH_INLINE_NONE && + if (ceph_has_inline_data(ci) && (locked_page || (info_caps & cache_caps))) fill_inline = true; } @@ -2275,9 +2275,15 @@ int ceph_try_to_choose_auth_mds(struct inode *inode, int mask) * * This cost much when doing the Locker state transition and * usually will need to revoke caps from clients. + * + * And for the 'Xs' caps for getxattr we will also choose the + * auth MDS, because the MDS side code is buggy due to setxattr + * won't notify the replica MDSes when the values changed and + * the replica MDS will return the old values. Though we will + * fix it in MDS code, but this still makes sense for old ceph. */ if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL)) - || (mask & CEPH_STAT_RSTAT)) + || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR))) return USE_AUTH_MDS; else return USE_ANY_MDS; @@ -2321,7 +2327,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, if (inline_version == 0) { /* the reply is supposed to contain inline data */ err = -EINVAL; - } else if (inline_version == CEPH_INLINE_NONE) { + } else if (inline_version == CEPH_INLINE_NONE || + inline_version == 1) { err = -ENODATA; } else { err = req->r_reply_info.targeti.inline_len; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 33f517d549ce..80f8b9ec1a31 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -456,7 +456,7 @@ static int ceph_parse_deleg_inos(void **p, void *end, dout("added delegated inode 0x%llx\n", start - 1); } else if (err == -EBUSY) { - pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n", + pr_warn("MDS delegated inode 0x%llx more than once.\n", start - 1); } else { return err; @@ -655,6 +655,79 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } +/* + * In async unlink case the kclient won't wait for the first reply + * from MDS and just drop all the links and unhash the dentry and then + * succeeds immediately. + * + * For any new create/link/rename,etc requests followed by using the + * same file names we must wait for the first reply of the inflight + * unlink request, or the MDS possibly will fail these following + * requests with -EEXIST if the inflight async unlink request was + * delayed for some reasons. + * + * And the worst case is that for the none async openc request it will + * successfully open the file if the CDentry hasn't been unlinked yet, + * but later the previous delayed async unlink request will remove the + * CDenty. That means the just created file is possiblly deleted later + * by accident. + * + * We need to wait for the inflight async unlink requests to finish + * when creating new files/directories by using the same file names. + */ +int ceph_wait_on_conflict_unlink(struct dentry *dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct dentry *pdentry = dentry->d_parent; + struct dentry *udentry, *found = NULL; + struct ceph_dentry_info *di; + struct qstr dname; + u32 hash = dentry->d_name.hash; + int err; + + dname.name = dentry->d_name.name; + dname.len = dentry->d_name.len; + + rcu_read_lock(); + hash_for_each_possible_rcu(fsc->async_unlink_conflict, di, + hnode, hash) { + udentry = di->dentry; + + spin_lock(&udentry->d_lock); + if (udentry->d_name.hash != hash) + goto next; + if (unlikely(udentry->d_parent != pdentry)) + goto next; + if (!hash_hashed(&di->hnode)) + goto next; + + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) + pr_warn("%s dentry %p:%pd async unlink bit is not set\n", + __func__, dentry, dentry); + + if (!d_same_name(udentry, pdentry, &dname)) + goto next; + + spin_unlock(&udentry->d_lock); + found = dget(udentry); + break; +next: + spin_unlock(&udentry->d_lock); + } + rcu_read_unlock(); + + if (likely(!found)) + return 0; + + dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__, + dentry, dentry, found, found); + + err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT, + TASK_KILLABLE); + dput(found); + return err; +} + /* * sessions @@ -1220,14 +1293,17 @@ static int encode_supported_features(void **p, void *end) if (count > 0) { size_t i; size_t size = FEATURE_BYTES(count); + unsigned long bit; if (WARN_ON_ONCE(*p + 4 + size > end)) return -ERANGE; ceph_encode_32(p, size); memset(*p, 0, size); - for (i = 0; i < count; i++) - ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8); + for (i = 0; i < count; i++) { + bit = feature_bits[i]; + ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8); + } *p += size; } else { if (WARN_ON_ONCE(*p + 4 > end)) @@ -2884,6 +2960,64 @@ static void __do_request(struct ceph_mds_client *mdsc, if (req->r_request_started == 0) /* note request start time */ req->r_request_started = jiffies; + /* + * For async create we will choose the auth MDS of frag in parent + * directory to send the request and ususally this works fine, but + * if the migrated the dirtory to another MDS before it could handle + * it the request will be forwarded. + * + * And then the auth cap will be changed. + */ + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) { + struct ceph_dentry_info *di = ceph_dentry(req->r_dentry); + struct ceph_inode_info *ci; + struct ceph_cap *cap; + + /* + * The request maybe handled very fast and the new inode + * hasn't been linked to the dentry yet. We need to wait + * for the ceph_finish_async_create(), which shouldn't be + * stuck too long or fail in thoery, to finish when forwarding + * the request. + */ + if (!d_inode(req->r_dentry)) { + err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT, + TASK_KILLABLE); + if (err) { + mutex_lock(&req->r_fill_mutex); + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + goto out_session; + } + } + + ci = ceph_inode(d_inode(req->r_dentry)); + + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) { + dout("do_request session changed for auth cap %d -> %d\n", + cap->session->s_mds, session->s_mds); + + /* Remove the auth cap from old session */ + spin_lock(&cap->session->s_cap_lock); + cap->session->s_nr_caps--; + list_del_init(&cap->session_caps); + spin_unlock(&cap->session->s_cap_lock); + + /* Add the auth cap to the new session */ + cap->mds = mds; + cap->session = session; + spin_lock(&session->s_cap_lock); + session->s_nr_caps++; + list_add_tail(&cap->session_caps, &session->s_caps); + spin_unlock(&session->s_cap_lock); + + change_auth_cap_ses(ci, session); + } + spin_unlock(&ci->i_ceph_lock); + } + err = __send_request(session, req, false); out_session: @@ -3464,11 +3598,26 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_OPEN: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info("mds%d reconnect success\n", session->s_mds); - session->s_state = CEPH_MDS_SESSION_OPEN; - session->s_features = features; - renewed_caps(mdsc, session, 0); - if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) - metric_schedule_delayed(&mdsc->metric); + + if (session->s_state == CEPH_MDS_SESSION_OPEN) { + pr_notice("mds%d is already opened\n", session->s_mds); + } else { + session->s_state = CEPH_MDS_SESSION_OPEN; + session->s_features = features; + renewed_caps(mdsc, session, 0); + if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, + &session->s_features)) + metric_schedule_delayed(&mdsc->metric); + } + + /* + * The connection maybe broken and the session in client + * side has been reinitialized, need to update the seq + * anyway. + */ + if (!session->s_seq && seq) + session->s_seq = seq; + wake = 1; if (mdsc->stopping) __close_session(mdsc, session); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 1140aecd82ce..256e3eada6c1 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -29,14 +29,12 @@ enum ceph_feature_type { CEPHFS_FEATURE_MULTI_RECONNECT, CEPHFS_FEATURE_DELEG_INO, CEPHFS_FEATURE_METRIC_COLLECT, + CEPHFS_FEATURE_ALTERNATE_NAME, + CEPHFS_FEATURE_NOTIFY_SESSION_STATE, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_NOTIFY_SESSION_STATE, }; -/* - * This will always have the highest feature bit value - * as the last element of the array. - */ #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 0, 1, 2, 3, 4, 5, 6, 7, \ CEPHFS_FEATURE_MIMIC, \ @@ -45,10 +43,8 @@ enum ceph_feature_type { CEPHFS_FEATURE_MULTI_RECONNECT, \ CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ - \ - CEPHFS_FEATURE_MAX, \ + CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ } -#define CEPHFS_FEATURES_CLIENT_REQUIRED {} /* * Some lock dependencies: @@ -582,6 +578,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode) TASK_KILLABLE); } +extern int ceph_wait_on_conflict_unlink(struct dentry *dentry); extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); #endif diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 30387733765d..8d0a6d2c2da4 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -352,12 +352,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) __decode_and_drop_type(p, end, u8, bad_ext); } if (mdsmap_ev >= 8) { - u32 name_len; /* enabled */ ceph_decode_8_safe(p, end, m->m_enabled, bad_ext); - ceph_decode_32_safe(p, end, name_len, bad_ext); - ceph_decode_need(p, end, name_len, bad_ext); - *p += name_len; + /* fs_name */ + ceph_decode_skip_string(p, end, bad_ext); } /* damaged */ if (mdsmap_ev >= 9) { @@ -370,6 +368,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) } else { m->m_damaged = false; } + if (mdsmap_ev >= 17) { + /* balancer */ + ceph_decode_skip_string(p, end, bad_ext); + /* standby_count_wanted */ + ceph_decode_skip_32(p, end, bad_ext); + /* old_max_mds */ + ceph_decode_skip_32(p, end, bad_ext); + /* min_compat_client */ + ceph_decode_skip_8(p, end, bad_ext); + /* required_client_features */ + ceph_decode_skip_set(p, end, 64, bad_ext); + ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext); + } else { + /* This forces the usage of the (sync) SETXATTR Op */ + m->m_max_xattr_size = 0; + } bad_ext: dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 40140805bdcf..3fc48b43cab0 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -72,15 +72,9 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = CEPH_SUPER_MAGIC; /* ?? */ /* - * express utilization in terms of large blocks to avoid + * Express utilization in terms of large blocks to avoid * overflow on 32-bit machines. - * - * NOTE: for the time being, we make bsize == frsize to humor - * not-yet-ancient versions of glibc that are broken. - * Someday, we will probably want to report a real block - * size... whatever that may mean for a network file system! */ - buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; /* @@ -95,6 +89,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); } + /* + * NOTE: for the time being, we make bsize == frsize to humor + * not-yet-ancient versions of glibc that are broken. + * Someday, we will probably want to report a real block + * size... whatever that may mean for a network file system! + */ + buf->f_bsize = buf->f_frsize; + buf->f_files = le64_to_cpu(st.num_objects); buf->f_ffree = -1; buf->f_namelen = NAME_MAX; @@ -816,6 +818,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, if (!fsc->cap_wq) goto fail_inode_wq; + hash_init(fsc->async_unlink_conflict); + spin_lock_init(&fsc->async_unlink_conflict_lock); + spin_lock(&ceph_fsc_lock); list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list); spin_unlock(&ceph_fsc_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f59dac66955b..40630e6f691c 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -19,6 +19,7 @@ #include <linux/security.h> #include <linux/netfs.h> #include <linux/fscache.h> +#include <linux/hashtable.h> #include <linux/ceph/libceph.h> @@ -99,6 +100,8 @@ struct ceph_mount_options { char *mon_addr; }; +#define CEPH_ASYNC_CREATE_CONFLICT_BITS 8 + struct ceph_fs_client { struct super_block *sb; @@ -124,6 +127,9 @@ struct ceph_fs_client { struct workqueue_struct *inode_wq; struct workqueue_struct *cap_wq; + DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS); + spinlock_t async_unlink_conflict_lock; + #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_dentry_lru, *debugfs_caps; struct dentry *debugfs_congestion_kb; @@ -280,7 +286,8 @@ struct ceph_dentry_info { struct dentry *dentry; struct ceph_mds_session *lease_session; struct list_head lease_list; - unsigned flags; + struct hlist_node hnode; + unsigned long flags; int lease_shared_gen; u32 lease_gen; u32 lease_seq; @@ -289,10 +296,14 @@ struct ceph_dentry_info { u64 offset; }; -#define CEPH_DENTRY_REFERENCED 1 -#define CEPH_DENTRY_LEASE_LIST 2 -#define CEPH_DENTRY_SHRINK_LIST 4 -#define CEPH_DENTRY_PRIMARY_LINK 8 +#define CEPH_DENTRY_REFERENCED (1 << 0) +#define CEPH_DENTRY_LEASE_LIST (1 << 1) +#define CEPH_DENTRY_SHRINK_LIST (1 << 2) +#define CEPH_DENTRY_PRIMARY_LINK (1 << 3) +#define CEPH_DENTRY_ASYNC_UNLINK_BIT (4) +#define CEPH_DENTRY_ASYNC_UNLINK (1 << CEPH_DENTRY_ASYNC_UNLINK_BIT) +#define CEPH_DENTRY_ASYNC_CREATE_BIT (5) +#define CEPH_DENTRY_ASYNC_CREATE (1 << CEPH_DENTRY_ASYNC_CREATE_BIT) struct ceph_inode_xattrs_info { /* @@ -758,6 +769,8 @@ extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc, extern void ceph_reservation_status(struct ceph_fs_client *client, int *total, int *avail, int *used, int *reserved, int *min); +extern void change_auth_cap_ses(struct ceph_inode_info *ci, + struct ceph_mds_session *session); @@ -1218,6 +1231,14 @@ extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate); +static inline bool ceph_has_inline_data(struct ceph_inode_info *ci) +{ + if (ci->i_inline_version == CEPH_INLINE_NONE || + ci->i_inline_version == 1) /* initial version, no data */ + return false; + return true; +} + /* file.c */ extern const struct file_operations ceph_file_fops; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f141f5246163..f31350cda960 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1086,7 +1086,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, flags |= CEPH_XATTR_REMOVE; } - dout("setxattr value=%.*s\n", (int)size, value); + dout("setxattr value size: %zu\n", size); /* do request */ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -1184,8 +1184,14 @@ int __ceph_setxattr(struct inode *inode, const char *name, spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); - if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) + required_blob_size = __get_required_blob_size(ci, name_len, val_len); + if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) || + (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) { + dout("%s do sync setxattr: version: %llu size: %d max: %llu\n", + __func__, ci->i_xattrs.version, required_blob_size, + mdsc->mdsmap->m_max_xattr_size); goto do_sync; + } if (!lock_snap_rwsem && !ci->i_head_snapc) { lock_snap_rwsem = true; @@ -1201,8 +1207,6 @@ retry: ceph_cap_string(issued)); __build_xattrs(inode); - required_blob_size = __get_required_blob_size(ci, name_len, val_len); - if (!ci->i_xattrs.prealloc_blob || required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { struct ceph_buffer *blob; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 14e0ef5e9a20..12bd61d20f69 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -86,7 +86,8 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) /** * fscrypt_fname_encrypt() - encrypt a filename * @inode: inode of the parent directory (for regular filenames) - * or of the symlink (for symlink targets) + * or of the symlink (for symlink targets). Key must already be + * set up. * @iname: the filename to encrypt * @out: (output) the encrypted filename * @olen: size of the encrypted filename. It must be at least @iname->len. @@ -137,6 +138,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, return 0; } +EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt); /** * fname_decrypt() - decrypt a filename @@ -264,9 +266,9 @@ static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst) return bp - dst; } -bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, - u32 orig_len, u32 max_len, - u32 *encrypted_len_ret) +bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret) { int padding = 4 << (fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAGS_PAD_MASK); @@ -281,6 +283,29 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, } /** + * fscrypt_fname_encrypted_size() - calculate length of encrypted filename + * @inode: parent inode of dentry name being encrypted. Key must + * already be set up. + * @orig_len: length of the original filename + * @max_len: maximum length to return + * @encrypted_len_ret: where calculated length should be returned (on success) + * + * Filenames that are shorter than the maximum length may have their lengths + * increased slightly by encryption, due to padding that is applied. + * + * Return: false if the orig_len is greater than max_len. Otherwise, true and + * fill out encrypted_len_ret with the length (up to max_len). + */ +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret) +{ + return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, + orig_len, max_len, + encrypted_len_ret); +} +EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); + +/** * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames * @max_encrypted_len: maximum length of encrypted filenames the buffer will be * used to present @@ -435,8 +460,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; if (fscrypt_has_encryption_key(dir)) { - if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy, - iname->len, NAME_MAX, + if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index f5be777d8279..3afdaa084773 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -297,14 +297,11 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci); /* fname.c */ -int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, - u8 *out, unsigned int olen); -bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, - u32 orig_len, u32 max_len, - u32 *encrypted_len_ret); +bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret); /* hkdf.c */ - struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index af74599ae1cf..7c01025879b3 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -228,9 +228,9 @@ int fscrypt_prepare_symlink(struct inode *dir, const char *target, * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. */ - if (!fscrypt_fname_encrypted_size(policy, len, - max_len - sizeof(struct fscrypt_symlink_data), - &disk_link->len)) + if (!__fscrypt_fname_encrypted_size(policy, len, + max_len - sizeof(struct fscrypt_symlink_data), + &disk_link->len)) return -ENAMETOOLONG; disk_link->len += sizeof(struct fscrypt_symlink_data); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 8a054e6d1e68..80b8ca0f340b 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -694,6 +694,32 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) } /** + * fscrypt_context_for_new_inode() - create an encryption context for a new inode + * @ctx: where context should be written + * @inode: inode from which to fetch policy and nonce + * + * Given an in-core "prepared" (via fscrypt_prepare_new_inode) inode, + * generate a new context and write it to ctx. ctx _must_ be at least + * FSCRYPT_SET_CONTEXT_MAX_SIZE bytes. + * + * Return: size of the resulting context or a negative error code. + */ +int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) +{ + struct fscrypt_info *ci = inode->i_crypt_info; + + BUILD_BUG_ON(sizeof(union fscrypt_context) != + FSCRYPT_SET_CONTEXT_MAX_SIZE); + + /* fscrypt_prepare_new_inode() should have set up the key already. */ + if (WARN_ON_ONCE(!ci)) + return -ENOKEY; + + return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce); +} +EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); + +/** * fscrypt_set_context() - Set the fscrypt context of a new inode * @inode: a new inode * @fs_data: private data given by FS and passed to ->set_context() @@ -709,12 +735,9 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) union fscrypt_context ctx; int ctxsize; - /* fscrypt_prepare_new_inode() should have set up the key already. */ - if (WARN_ON_ONCE(!ci)) - return -ENOKEY; - - BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); - ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce); + ctxsize = fscrypt_context_for_new_inode(&ctx, inode); + if (ctxsize < 0) + return ctxsize; /* * This may be the first time the inode number is available, so do any diff --git a/fs/dcache.c b/fs/dcache.c index ea5cdec24ea7..c5dc32a59c76 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2248,10 +2248,16 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, } EXPORT_SYMBOL(d_add_ci); - -static inline bool d_same_name(const struct dentry *dentry, - const struct dentry *parent, - const struct qstr *name) +/** + * d_same_name - compare dentry name with case-exact name + * @parent: parent dentry + * @dentry: the negative dentry that was passed to the parent's lookup func + * @name: the case-exact name to be associated with the returned dentry + * + * Return: true if names are same, or false + */ +bool d_same_name(const struct dentry *dentry, const struct dentry *parent, + const struct qstr *name) { if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { if (dentry->d_name.len != name->len) @@ -2262,6 +2268,7 @@ static inline bool d_same_name(const struct dentry *dentry, dentry->d_name.len, dentry->d_name.name, name) == 0; } +EXPORT_SYMBOL_GPL(d_same_name); /** * __d_lookup_rcu - search for a dentry (racy, store-free) diff --git a/fs/inode.c b/fs/inode.c index 9c3cd540c665..6462276dfdf0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -422,6 +422,7 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); + INIT_LIST_HEAD(&inode->i_sb_list); __address_space_init_once(&inode->i_data); i_size_ordered_init(inode); } @@ -1021,7 +1022,6 @@ struct inode *new_inode_pseudo(struct super_block *sb) spin_lock(&inode->i_lock); inode->i_state = 0; spin_unlock(&inode->i_lock); - INIT_LIST_HEAD(&inode->i_sb_list); } return inode; } @@ -1165,7 +1165,6 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; - bool creating = inode->i_state & I_CREATING; again: spin_lock(&inode_hash_lock); @@ -1199,7 +1198,12 @@ again: inode->i_state |= I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); - if (!creating) + + /* + * Add inode to the sb list if it's not already. It has I_NEW at this + * point, so it should be safe to test i_sb_list locklessly. + */ + if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); unlock: spin_unlock(&inode_hash_lock); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 86bf82dbd8b8..49586ff26152 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -433,9 +433,9 @@ union ceph_mds_request_args { __le32 stripe_unit; /* layout for newly created file */ __le32 stripe_count; /* ... */ __le32 object_size; - __le32 file_replication; - __le32 mask; /* CEPH_CAP_* */ - __le32 old_size; + __le32 pool; + __le32 mask; /* CEPH_CAP_* */ + __le64 old_size; } __attribute__ ((packed)) open; struct { __le32 flags; @@ -768,7 +768,7 @@ struct ceph_mds_caps { __le32 xattr_len; __le64 xattr_version; - /* filelock */ + /* a union of non-export and export bodies. */ __le64 size, max_size, truncate_size; __le32 truncate_seq; struct ceph_timespec mtime, atime, ctime; diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 523fd0452856..4c3e0648dc27 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -25,6 +25,7 @@ struct ceph_mdsmap { u32 m_session_timeout; /* seconds */ u32 m_session_autoclose; /* seconds */ u64 m_max_file_size; + u64 m_max_xattr_size; /* maximum size for xattrs blob */ u32 m_max_mds; /* expected up:active mds number */ u32 m_num_active_mds; /* actual up:active mds number */ u32 possible_max_rank; /* possible max rank index */ diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cba8a6ffc329..fb6be72104df 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -507,9 +507,8 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, extern void ceph_osdc_get_request(struct ceph_osd_request *req); extern void ceph_osdc_put_request(struct ceph_osd_request *req); -extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail); +void ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); extern void ceph_osdc_cancel_request(struct ceph_osd_request *req); extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c73e5e327e76..92c78ed02b54 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -233,6 +233,8 @@ extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, wait_queue_head_t *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); +extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent, + const struct qstr *name); extern struct dentry * d_exact_alias(struct dentry *, struct inode *); extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index e60d57c99cb6..7d2f1e0f23b1 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -284,6 +284,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); +int fscrypt_context_for_new_inode(void *ctx, struct inode *inode); int fscrypt_set_context(struct inode *inode, void *fs_data); struct fscrypt_dummy_policy { @@ -327,6 +328,10 @@ void fscrypt_free_inode(struct inode *inode); int fscrypt_drop_inode(struct inode *inode); /* fname.c */ +int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen); +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret); int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, int lookup, struct fscrypt_name *fname); diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index d7285f8148a3..15ae78cd2853 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -54,6 +54,15 @@ void dump_mm(const struct mm_struct *mm); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_FOLIO(cond, folio) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_page(&folio->page, "VM_WARN_ON_FOLIO(" __stringify(cond)")");\ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) ({ \ static bool __section(".data.once") __warned; \ int __ret_warn_once = !!(cond); \ @@ -79,6 +88,7 @@ void dump_mm(const struct mm_struct *mm); #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9d82bb42e958..87b883c7bfd6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -4578,15 +4578,12 @@ bad: /* * Register request, send initial attempt. */ -int ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail) +void ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) { down_read(&osdc->lock); submit_request(req, false); up_read(&osdc->lock); - - return 0; } EXPORT_SYMBOL(ceph_osdc_start_request); @@ -4756,7 +4753,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, if (ret) goto out_put_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); linger_cancel(lreq); linger_put(lreq); ret = wait_request_timeout(req, opts->mount_timeout); @@ -4827,7 +4824,7 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, if (ret) goto out_put_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); out_put_req: @@ -5043,7 +5040,7 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, if (ret) goto out_put_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) { void *p = page_address(pages[0]); @@ -5120,7 +5117,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, if (ret) goto out_put_req; - ceph_osdc_start_request(osdc, req, false); + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) { ret = req->r_ops[0].rval; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 2823bb3cff55..295098873861 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -11,6 +11,22 @@ #include <linux/crush/hash.h> #include <linux/crush/mapper.h> +static __printf(2, 3) +void osdmap_info(const struct ceph_osdmap *map, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_INFO "%s (%pU e%u): %pV", KBUILD_MODNAME, &map->fsid, + map->epoch, &vaf); + + va_end(args); +} + char *ceph_osdmap_state_str(char *str, int len, u32 state) { if (!len) @@ -571,10 +587,10 @@ static struct crush_map *crush_decode(void *pbyval, void *end) goto bad; #endif r = kmalloc(struct_size(r, steps, yes), GFP_NOFS); - c->rules[i] = r; if (r == NULL) goto badmem; dout(" rule %d is at %p\n", i, r); + c->rules[i] = r; r->len = yes; ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); @@ -1566,7 +1582,7 @@ static int decode_new_primary_affinity(void **p, void *end, if (ret) return ret; - pr_info("osd%d primary-affinity 0x%x\n", osd, aff); + osdmap_info(map, "osd%d primary-affinity 0x%x\n", osd, aff); } return 0; @@ -1864,9 +1880,9 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, osd = ceph_decode_32(p); w = ceph_decode_32(p); BUG_ON(osd >= map->max_osd); - pr_info("osd%d weight 0x%x %s\n", osd, w, - w == CEPH_OSD_IN ? "(in)" : - (w == CEPH_OSD_OUT ? "(out)" : "")); + osdmap_info(map, "osd%d weight 0x%x %s\n", osd, w, + w == CEPH_OSD_IN ? "(in)" : + (w == CEPH_OSD_OUT ? "(out)" : "")); map->osd_weight[osd] = w; /* @@ -1898,10 +1914,10 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, BUG_ON(osd >= map->max_osd); if ((map->osd_state[osd] & CEPH_OSD_UP) && (xorstate & CEPH_OSD_UP)) - pr_info("osd%d down\n", osd); + osdmap_info(map, "osd%d down\n", osd); if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && (xorstate & CEPH_OSD_EXISTS)) { - pr_info("osd%d does not exist\n", osd); + osdmap_info(map, "osd%d does not exist\n", osd); ret = set_primary_affinity(map, osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); if (ret) @@ -1931,7 +1947,7 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr)); - pr_info("osd%d up\n", osd); + osdmap_info(map, "osd%d up\n", osd); map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; map->osd_addr[osd] = addr; } diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 65e34f78b05d..74622b278d57 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -96,7 +96,7 @@ int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len) EXPORT_SYMBOL(ceph_pagelist_append); /* Allocate enough pages for a pagelist to append the given amount - * of data without without allocating. + * of data without allocating. * Returns: 0 on success, -ENOMEM on error. */ int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space) |