diff options
Diffstat (limited to 'fs')
468 files changed, 13881 insertions, 25120 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index baf2b152229e..23cf9b2fbfe4 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -28,14 +28,18 @@ static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid) /** * v9fs_fid_add - add a fid to a dentry * @dentry: dentry that the fid is being added to - * @fid: fid to add + * @pfid: fid to add, NULLed out * */ -void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) +void v9fs_fid_add(struct dentry *dentry, struct p9_fid **pfid) { + struct p9_fid *fid = *pfid; + spin_lock(&dentry->d_lock); __add_fid(dentry, fid); spin_unlock(&dentry->d_lock); + + *pfid = NULL; } /** @@ -56,7 +60,7 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid) h = (struct hlist_head *)&inode->i_private; hlist_for_each_entry(fid, h, ilist) { if (uid_eq(fid->uid, uid)) { - refcount_inc(&fid->count); + p9_fid_get(fid); ret = fid; break; } @@ -68,15 +72,19 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid) /** * v9fs_open_fid_add - add an open fid to an inode * @inode: inode that the fid is being added to - * @fid: fid to add + * @pfid: fid to add, NULLed out * */ -void v9fs_open_fid_add(struct inode *inode, struct p9_fid *fid) +void v9fs_open_fid_add(struct inode *inode, struct p9_fid **pfid) { + struct p9_fid *fid = *pfid; + spin_lock(&inode->i_lock); hlist_add_head(&fid->ilist, (struct hlist_head *)&inode->i_private); spin_unlock(&inode->i_lock); + + *pfid = NULL; } @@ -104,7 +112,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) hlist_for_each_entry(fid, h, dlist) { if (any || uid_eq(fid->uid, uid)) { ret = fid; - refcount_inc(&ret->count); + p9_fid_get(ret); break; } } @@ -150,9 +158,9 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, { struct dentry *ds; const unsigned char **wnames, *uname; - int i, n, l, clone, access; + int i, n, l, access; struct v9fs_session_info *v9ses; - struct p9_fid *fid, *old_fid; + struct p9_fid *fid, *root_fid, *old_fid; v9ses = v9fs_dentry2v9ses(dentry); access = v9ses->flags & V9FS_ACCESS_MASK; @@ -169,17 +177,17 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, fid = v9fs_fid_find(ds, uid, any); if (fid) { /* Found the parent fid do a lookup with that */ - struct p9_fid *ofid = fid; + old_fid = fid; - fid = p9_client_walk(ofid, 1, &dentry->d_name.name, 1); - p9_client_clunk(ofid); + fid = p9_client_walk(old_fid, 1, &dentry->d_name.name, 1); + p9_fid_put(old_fid); goto fid_out; } up_read(&v9ses->rename_sem); /* start from the root and try to do a lookup */ - fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any); - if (!fid) { + root_fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any); + if (!root_fid) { /* the user is not attached to the fs yet */ if (access == V9FS_ACCESS_SINGLE) return ERR_PTR(-EPERM); @@ -194,12 +202,13 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, if (IS_ERR(fid)) return fid; - refcount_inc(&fid->count); - v9fs_fid_add(dentry->d_sb->s_root, fid); + root_fid = p9_fid_get(fid); + v9fs_fid_add(dentry->d_sb->s_root, &fid); } /* If we are root ourself just return that */ if (dentry->d_sb->s_root == dentry) - return fid; + return root_fid; + /* * Do a multipath walk with attached root. * When walking parent we need to make sure we @@ -211,19 +220,20 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, fid = ERR_PTR(n); goto err_out; } - old_fid = fid; - clone = 1; + fid = root_fid; + old_fid = root_fid; i = 0; while (i < n) { l = min(n - i, P9_MAXWELEM); /* * We need to hold rename lock when doing a multipath - * walk to ensure none of the patch component change + * walk to ensure none of the path components change */ - fid = p9_client_walk(fid, l, &wnames[i], clone); + fid = p9_client_walk(old_fid, l, &wnames[i], + old_fid == root_fid /* clone */); /* non-cloning walk will return the same fid */ if (fid != old_fid) { - p9_client_clunk(old_fid); + p9_fid_put(old_fid); old_fid = fid; } if (IS_ERR(fid)) { @@ -231,7 +241,6 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, goto err_out; } i += l; - clone = 0; } kfree(wnames); fid_out: @@ -239,11 +248,11 @@ fid_out: spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); - p9_client_clunk(fid); + p9_fid_put(fid); fid = ERR_PTR(-ENOENT); } else { __add_fid(dentry, fid); - refcount_inc(&fid->count); + p9_fid_get(fid); spin_unlock(&dentry->d_lock); } } @@ -300,7 +309,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry) fid = clone_fid(ofid); if (IS_ERR(fid)) goto error_out; - p9_client_clunk(ofid); + p9_fid_put(ofid); /* * writeback fid will only be used to write back the * dirty pages. We always request for the open fid in read-write @@ -309,7 +318,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry) */ err = p9_client_open(fid, O_RDWR); if (err < 0) { - p9_client_clunk(fid); + p9_fid_put(fid); fid = ERR_PTR(err); goto error_out; } diff --git a/fs/9p/fid.h b/fs/9p/fid.h index f7f33509e169..8a4e8cd12ca2 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -13,9 +13,9 @@ static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry) { return v9fs_fid_lookup(dentry->d_parent); } -void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); +void v9fs_fid_add(struct dentry *dentry, struct p9_fid **fid); struct p9_fid *v9fs_writeback_fid(struct dentry *dentry); -void v9fs_open_fid_add(struct inode *inode, struct p9_fid *fid); +void v9fs_open_fid_add(struct inode *inode, struct p9_fid **fid); static inline struct p9_fid *clone_fid(struct p9_fid *fid) { return IS_ERR(fid) ? fid : p9_client_walk(fid, 0, NULL, 1); @@ -29,7 +29,7 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry) return fid; nfid = clone_fid(fid); - p9_client_clunk(fid); + p9_fid_put(fid); return nfid; } #endif diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index d0833fa69faf..47b9a1122f34 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -73,7 +73,7 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) BUG_ON(!fid); } - refcount_inc(&fid->count); + p9_fid_get(fid); rreq->netfs_priv = fid; return 0; } @@ -86,7 +86,7 @@ static void v9fs_free_request(struct netfs_io_request *rreq) { struct p9_fid *fid = rreq->netfs_priv; - p9_client_clunk(fid); + p9_fid_put(fid); } /** diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 1c609e99d280..f89f01734587 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -54,7 +54,7 @@ static void v9fs_dentry_release(struct dentry *dentry) p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n", dentry, dentry); hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata) - p9_client_clunk(hlist_entry(p, struct p9_fid, dlist)); + p9_fid_put(hlist_entry(p, struct p9_fid, dlist)); dentry->d_fsdata = NULL; } @@ -85,7 +85,7 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) retval = v9fs_refresh_inode_dotl(fid, inode); else retval = v9fs_refresh_inode(fid, inode); - p9_client_clunk(fid); + p9_fid_put(fid); if (retval == -ENOENT) return 0; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 958680f7f23e..000fbaae9b18 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -218,7 +218,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) spin_lock(&inode->i_lock); hlist_del(&fid->ilist); spin_unlock(&inode->i_lock); - p9_client_clunk(fid); + p9_fid_put(fid); } if ((filp->f_mode & FMODE_WRITE)) { diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 2573c08f335c..aec43ba83799 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -63,15 +63,16 @@ int v9fs_file_open(struct inode *inode, struct file *file) err = p9_client_open(fid, omode); if (err < 0) { - p9_client_clunk(fid); + p9_fid_put(fid); return err; } if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))) generic_file_llseek(file, 0, SEEK_END); + + file->private_data = fid; } - file->private_data = fid; mutex_lock(&v9inode->v_mutex); if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && !v9inode->writeback_fid && @@ -95,10 +96,10 @@ int v9fs_file_open(struct inode *inode, struct file *file) if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) fscache_use_cookie(v9fs_inode_cookie(v9inode), file->f_mode & FMODE_WRITE); - v9fs_open_fid_add(inode, fid); + v9fs_open_fid_add(inode, &fid); return 0; out_error: - p9_client_clunk(file->private_data); + p9_fid_put(file->private_data); file->private_data = NULL; return err; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 3d8297714772..4d1a4a8d9277 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -399,10 +399,8 @@ void v9fs_evict_inode(struct inode *inode) fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); /* clunk the fid stashed in writeback_fid */ - if (v9inode->writeback_fid) { - p9_client_clunk(v9inode->writeback_fid); - v9inode->writeback_fid = NULL; - } + p9_fid_put(v9inode->writeback_fid); + v9inode->writeback_fid = NULL; } static int v9fs_test_inode(struct inode *inode, void *data) @@ -569,7 +567,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) if (v9fs_proto_dotl(v9ses)) retval = p9_client_unlinkat(dfid, dentry->d_name.name, v9fs_at_to_dotl_flags(flags)); - p9_client_clunk(dfid); + p9_fid_put(dfid); if (retval == -EOPNOTSUPP) { /* Try the one based on path */ v9fid = v9fs_fid_clone(dentry); @@ -633,14 +631,12 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - p9_client_clunk(dfid); - return ERR_PTR(err); + goto error; } err = p9_client_fcreate(ofid, name, perm, mode, extension); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); - p9_client_clunk(dfid); goto error; } @@ -651,8 +647,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - fid = NULL; - p9_client_clunk(dfid); goto error; } /* @@ -663,21 +657,17 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); - p9_client_clunk(dfid); goto error; } - v9fs_fid_add(dentry, fid); + v9fs_fid_add(dentry, &fid); d_instantiate(dentry, inode); } - p9_client_clunk(dfid); + p9_fid_put(dfid); return ofid; error: - if (ofid) - p9_client_clunk(ofid); - - if (fid) - p9_client_clunk(fid); - + p9_fid_put(dfid); + p9_fid_put(ofid); + p9_fid_put(fid); return ERR_PTR(err); } @@ -708,7 +698,7 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir, return PTR_ERR(fid); v9fs_invalidate_inode_attr(dir); - p9_client_clunk(fid); + p9_fid_put(fid); return 0; } @@ -744,7 +734,7 @@ static int v9fs_vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, } if (fid) - p9_client_clunk(fid); + p9_fid_put(fid); return err; } @@ -785,7 +775,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, */ name = dentry->d_name.name; fid = p9_client_walk(dfid, 1, &name, 1); - p9_client_clunk(dfid); + p9_fid_put(dfid); if (fid == ERR_PTR(-ENOENT)) inode = NULL; else if (IS_ERR(fid)) @@ -804,11 +794,11 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, res = d_splice_alias(inode, dentry); if (!IS_ERR(fid)) { if (!res) - v9fs_fid_add(dentry, fid); + v9fs_fid_add(dentry, &fid); else if (!IS_ERR(res)) - v9fs_fid_add(res, fid); + v9fs_fid_add(res, &fid); else - p9_client_clunk(fid); + p9_fid_put(fid); } return res; } @@ -847,7 +837,6 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, v9fs_proto_dotu(v9ses))); if (IS_ERR(fid)) { err = PTR_ERR(fid); - fid = NULL; goto error; } @@ -882,7 +871,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) fscache_use_cookie(v9fs_inode_cookie(v9inode), file->f_mode & FMODE_WRITE); - v9fs_open_fid_add(inode, fid); + v9fs_open_fid_add(inode, &fid); file->f_mode |= FMODE_CREATED; out: @@ -890,8 +879,7 @@ out: return err; error: - if (fid) - p9_client_clunk(fid); + p9_fid_put(fid); goto out; } @@ -939,9 +927,9 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, struct inode *old_inode; struct inode *new_inode; struct v9fs_session_info *v9ses; - struct p9_fid *oldfid, *dfid; - struct p9_fid *olddirfid; - struct p9_fid *newdirfid; + struct p9_fid *oldfid = NULL, *dfid = NULL; + struct p9_fid *olddirfid = NULL; + struct p9_fid *newdirfid = NULL; struct p9_wstat wstat; if (flags) @@ -958,21 +946,22 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, dfid = v9fs_parent_fid(old_dentry); olddirfid = clone_fid(dfid); - if (dfid && !IS_ERR(dfid)) - p9_client_clunk(dfid); + p9_fid_put(dfid); + dfid = NULL; if (IS_ERR(olddirfid)) { retval = PTR_ERR(olddirfid); - goto done; + goto error; } dfid = v9fs_parent_fid(new_dentry); newdirfid = clone_fid(dfid); - p9_client_clunk(dfid); + p9_fid_put(dfid); + dfid = NULL; if (IS_ERR(newdirfid)) { retval = PTR_ERR(newdirfid); - goto clunk_olddir; + goto error; } down_write(&v9ses->rename_sem); @@ -983,7 +972,7 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, retval = p9_client_rename(oldfid, newdirfid, new_dentry->d_name.name); if (retval != -EOPNOTSUPP) - goto clunk_newdir; + goto error_locked; } if (old_dentry->d_parent != new_dentry->d_parent) { /* @@ -992,14 +981,14 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n"); retval = -EXDEV; - goto clunk_newdir; + goto error_locked; } v9fs_blank_wstat(&wstat); wstat.muid = v9ses->uname; wstat.name = new_dentry->d_name.name; retval = p9_client_wstat(oldfid, &wstat); -clunk_newdir: +error_locked: if (!retval) { if (new_inode) { if (S_ISDIR(new_inode->i_mode)) @@ -1020,13 +1009,11 @@ clunk_newdir: d_move(old_dentry, new_dentry); } up_write(&v9ses->rename_sem); - p9_client_clunk(newdirfid); - -clunk_olddir: - p9_client_clunk(olddirfid); -done: - p9_client_clunk(oldfid); +error: + p9_fid_put(newdirfid); + p9_fid_put(olddirfid); + p9_fid_put(oldfid); return retval; } @@ -1060,7 +1047,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, return PTR_ERR(fid); st = p9_client_stat(fid); - p9_client_clunk(fid); + p9_fid_put(fid); if (IS_ERR(st)) return PTR_ERR(st); @@ -1136,7 +1123,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, retval = p9_client_wstat(fid, &wstat); if (use_dentry) - p9_client_clunk(fid); + p9_fid_put(fid); if (retval < 0) return retval; @@ -1261,7 +1248,7 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry, return ERR_CAST(fid); st = p9_client_stat(fid); - p9_client_clunk(fid); + p9_fid_put(fid); if (IS_ERR(st)) return ERR_CAST(st); @@ -1308,7 +1295,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, return PTR_ERR(fid); v9fs_invalidate_inode_attr(dir); - p9_client_clunk(fid); + p9_fid_put(fid); return 0; } @@ -1364,7 +1351,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, v9fs_refresh_inode(oldfid, d_inode(old_dentry)); v9fs_invalidate_inode_attr(dir); } - p9_client_clunk(oldfid); + p9_fid_put(oldfid); return retval; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index b6eb1160296c..5cfa4b4f070f 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -238,7 +238,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, struct inode *inode; struct p9_fid *fid = NULL; struct v9fs_inode *v9inode; - struct p9_fid *dfid, *ofid, *inode_fid; + struct p9_fid *dfid = NULL, *ofid = NULL, *inode_fid = NULL; struct v9fs_session_info *v9ses; struct posix_acl *pacl = NULL, *dacl = NULL; struct dentry *res = NULL; @@ -274,7 +274,6 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - p9_client_clunk(dfid); goto out; } @@ -286,38 +285,34 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", err); - p9_client_clunk(dfid); - goto error; + goto out; } err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), mode, gid, &qid); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", err); - p9_client_clunk(dfid); - goto error; + goto out; } v9fs_invalidate_inode_attr(dir); /* instantiate inode and assign the unopened fid to the dentry */ fid = p9_client_walk(dfid, 1, &name, 1); - p9_client_clunk(dfid); if (IS_ERR(fid)) { err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - fid = NULL; - goto error; + goto out; } inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); - goto error; + goto out; } /* Now set the ACL based on the default value */ v9fs_set_create_acl(inode, fid, dacl, pacl); - v9fs_fid_add(dentry, fid); + v9fs_fid_add(dentry, &fid); d_instantiate(dentry, inode); v9inode = V9FS_I(inode); @@ -336,7 +331,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode_fid)) { err = PTR_ERR(inode_fid); mutex_unlock(&v9inode->v_mutex); - goto err_clunk_old_fid; + goto out; } v9inode->writeback_fid = (void *) inode_fid; } @@ -344,25 +339,20 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, /* Since we are opening a file, assign the open fid to the file */ err = finish_open(file, dentry, generic_file_open); if (err) - goto err_clunk_old_fid; + goto out; file->private_data = ofid; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) fscache_use_cookie(v9fs_inode_cookie(v9inode), file->f_mode & FMODE_WRITE); - v9fs_open_fid_add(inode, ofid); + v9fs_open_fid_add(inode, &ofid); file->f_mode |= FMODE_CREATED; out: + p9_fid_put(dfid); + p9_fid_put(ofid); + p9_fid_put(fid); v9fs_put_acl(dacl, pacl); dput(res); return err; - -error: - if (fid) - p9_client_clunk(fid); -err_clunk_old_fid: - if (ofid) - p9_client_clunk(ofid); - goto out; } /** @@ -400,7 +390,6 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns, if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); - dfid = NULL; goto error; } @@ -422,7 +411,6 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns, err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - fid = NULL; goto error; } @@ -435,10 +423,9 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns, err); goto error; } - v9fs_fid_add(dentry, fid); + v9fs_fid_add(dentry, &fid); v9fs_set_create_acl(inode, fid, dacl, pacl); d_instantiate(dentry, inode); - fid = NULL; err = 0; } else { /* @@ -457,10 +444,9 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns, inc_nlink(dir); v9fs_invalidate_inode_attr(dir); error: - if (fid) - p9_client_clunk(fid); + p9_fid_put(fid); v9fs_put_acl(dacl, pacl); - p9_client_clunk(dfid); + p9_fid_put(dfid); return err; } @@ -489,7 +475,7 @@ v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns, */ st = p9_client_getattr_dotl(fid, P9_STATS_ALL); - p9_client_clunk(fid); + p9_fid_put(fid); if (IS_ERR(st)) return PTR_ERR(st); @@ -603,7 +589,7 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, retval = p9_client_setattr(fid, &p9attr); if (retval < 0) { if (use_dentry) - p9_client_clunk(fid); + p9_fid_put(fid); return retval; } @@ -619,12 +605,12 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, retval = v9fs_acl_chmod(inode, fid); if (retval < 0) { if (use_dentry) - p9_client_clunk(fid); + p9_fid_put(fid); return retval; } } if (use_dentry) - p9_client_clunk(fid); + p9_fid_put(fid); return 0; } @@ -743,7 +729,6 @@ v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir, err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - fid = NULL; goto error; } @@ -755,9 +740,8 @@ v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir, err); goto error; } - v9fs_fid_add(dentry, fid); + v9fs_fid_add(dentry, &fid); d_instantiate(dentry, inode); - fid = NULL; err = 0; } else { /* Not in cached mode. No need to populate inode with stat */ @@ -770,10 +754,8 @@ v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir, } error: - if (fid) - p9_client_clunk(fid); - - p9_client_clunk(dfid); + p9_fid_put(fid); + p9_fid_put(dfid); return err; } @@ -803,14 +785,14 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, oldfid = v9fs_fid_lookup(old_dentry); if (IS_ERR(oldfid)) { - p9_client_clunk(dfid); + p9_fid_put(dfid); return PTR_ERR(oldfid); } err = p9_client_link(dfid, oldfid, dentry->d_name.name); - p9_client_clunk(dfid); - p9_client_clunk(oldfid); + p9_fid_put(dfid); + p9_fid_put(oldfid); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); return err; @@ -826,7 +808,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, return PTR_ERR(fid); v9fs_refresh_inode_dotl(fid, d_inode(old_dentry)); - p9_client_clunk(fid); + p9_fid_put(fid); } ihold(d_inode(old_dentry)); d_instantiate(dentry, d_inode(old_dentry)); @@ -866,7 +848,6 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); - dfid = NULL; goto error; } @@ -891,7 +872,6 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - fid = NULL; goto error; } @@ -905,9 +885,8 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, goto error; } v9fs_set_create_acl(inode, fid, dacl, pacl); - v9fs_fid_add(dentry, fid); + v9fs_fid_add(dentry, &fid); d_instantiate(dentry, inode); - fid = NULL; err = 0; } else { /* @@ -923,10 +902,9 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, d_instantiate(dentry, inode); } error: - if (fid) - p9_client_clunk(fid); + p9_fid_put(fid); v9fs_put_acl(dacl, pacl); - p9_client_clunk(dfid); + p9_fid_put(dfid); return err; } @@ -956,7 +934,7 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry, if (IS_ERR(fid)) return ERR_CAST(fid); retval = p9_client_readlink(fid, &target); - p9_client_clunk(fid); + p9_fid_put(fid); if (retval) return ERR_PTR(retval); set_delayed_call(done, kfree_link, target); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 97e23b4e6982..2d9ee073d12c 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -184,13 +184,13 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = v9fs_get_acl(inode, fid); if (retval) goto release_sb; - v9fs_fid_add(root, fid); + v9fs_fid_add(root, &fid); p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n"); return dget(sb->s_root); clunk_fid: - p9_client_clunk(fid); + p9_fid_put(fid); v9fs_session_close(v9ses); free_session: kfree(v9ses); @@ -203,7 +203,7 @@ release_sb: * attached the fid to dentry so it won't get clunked * automatically. */ - p9_client_clunk(fid); + p9_fid_put(fid); deactivate_locked_super(sb); return ERR_PTR(retval); } @@ -270,7 +270,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) } res = simple_statfs(dentry, buf); done: - p9_client_clunk(fid); + p9_fid_put(fid); return res; } diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index a824441b95a2..1f9298a4bd42 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -44,7 +44,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, if (err) retval = err; } - p9_client_clunk(attr_fid); + p9_fid_put(attr_fid); return retval; } @@ -71,7 +71,7 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, if (IS_ERR(fid)) return PTR_ERR(fid); ret = v9fs_fid_xattr_get(fid, name, buffer, buffer_size); - p9_client_clunk(fid); + p9_fid_put(fid); return ret; } @@ -98,7 +98,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, if (IS_ERR(fid)) return PTR_ERR(fid); ret = v9fs_fid_xattr_set(fid, name, value, value_len, flags); - p9_client_clunk(fid); + p9_fid_put(fid); return ret; } @@ -128,7 +128,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, retval); else p9_client_write(fid, 0, &from, &retval); - err = p9_client_clunk(fid); + err = p9_fid_put(fid); if (!retval && err) retval = err; return retval; diff --git a/fs/Makefile b/fs/Makefile index 208a74e0b00e..93b80529f8e8 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -34,8 +34,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o -obj-$(CONFIG_IO_URING) += io_uring.o -obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ diff --git a/fs/affs/file.c b/fs/affs/file.c index cd00a4c68a12..cefa222f7881 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -526,7 +526,6 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create) struct inode *inode = page->mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh; - char *data; unsigned pos = 0; u32 bidx, boff, bsize; u32 tmp; @@ -545,15 +544,12 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create) return PTR_ERR(bh); tmp = min(bsize - boff, to - pos); BUG_ON(pos + tmp > to || tmp > bsize); - data = kmap_atomic(page); - memcpy(data + pos, AFFS_DATA(bh) + boff, tmp); - kunmap_atomic(data); + memcpy_to_page(page, pos, AFFS_DATA(bh) + boff, tmp); affs_brelse(bh); bidx++; pos += tmp; boff = 0; } - flush_dcache_page(page); return 0; } diff --git a/fs/afs/file.c b/fs/afs/file.c index 42118a4f3383..d1cfb235c4b9 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -375,7 +375,7 @@ static int afs_begin_cache_operation(struct netfs_io_request *rreq) } static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, - struct folio *folio, void **_fsdata) + struct folio **foliop, void **_fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index bbb2c210d139..97f50e9fd9eb 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -132,12 +132,6 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (IS_ERR(page)) return PTR_ERR(page); - if (PageError(page)) { - ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt); - put_page(page); - return ret; - } - buf = kmap(page); ret = -EINVAL; if (buf[size - 1] == '.') @@ -400,8 +400,8 @@ static const struct file_operations aio_ring_fops = { }; #if IS_ENABLED(CONFIG_MIGRATION) -static int aio_migratepage(struct address_space *mapping, struct page *new, - struct page *old, enum migrate_mode mode) +static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode) { struct kioctx *ctx; unsigned long flags; @@ -435,10 +435,10 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, goto out; } - idx = old->index; + idx = src->index; if (idx < (pgoff_t)ctx->nr_pages) { - /* Make sure the old page hasn't already been changed */ - if (ctx->ring_pages[idx] != old) + /* Make sure the old folio hasn't already been changed */ + if (ctx->ring_pages[idx] != &src->page) rc = -EAGAIN; } else rc = -EINVAL; @@ -447,27 +447,27 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, goto out_unlock; /* Writeback must be complete */ - BUG_ON(PageWriteback(old)); - get_page(new); + BUG_ON(folio_test_writeback(src)); + folio_get(dst); - rc = migrate_page_move_mapping(mapping, new, old, 1); + rc = folio_migrate_mapping(mapping, dst, src, 1); if (rc != MIGRATEPAGE_SUCCESS) { - put_page(new); + folio_put(dst); goto out_unlock; } /* Take completion_lock to prevent other writes to the ring buffer - * while the old page is copied to the new. This prevents new + * while the old folio is copied to the new. This prevents new * events from being lost. */ spin_lock_irqsave(&ctx->completion_lock, flags); - migrate_page_copy(new, old); - BUG_ON(ctx->ring_pages[idx] != old); - ctx->ring_pages[idx] = new; + folio_migrate_copy(dst, src); + BUG_ON(ctx->ring_pages[idx] != &src->page); + ctx->ring_pages[idx] = &dst->page; spin_unlock_irqrestore(&ctx->completion_lock, flags); - /* The old page is no longer accessible. */ - put_page(old); + /* The old folio is no longer accessible. */ + folio_put(src); out_unlock: mutex_unlock(&ctx->ring_lock); @@ -475,13 +475,13 @@ out: spin_unlock(&mapping->private_lock); return rc; } +#else +#define aio_migrate_folio NULL #endif static const struct address_space_operations aio_ctx_aops = { .dirty_folio = noop_dirty_folio, -#if IS_ENABLED(CONFIG_MIGRATION) - .migratepage = aio_migratepage, -#endif + .migrate_folio = aio_migrate_folio, }; static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) @@ -1475,7 +1475,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; - req->ki_flags = iocb_flags(req->ki_filp); + req->ki_flags = req->ki_filp->f_iocb_flags; if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { diff --git a/fs/attr.c b/fs/attr.c index dbe996b0dedf..1552a5f23d6b 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -22,7 +22,7 @@ * chown_ok - verify permissions to chown inode * @mnt_userns: user namespace of the mount @inode was found from * @inode: inode to check permissions on - * @uid: uid to chown @inode to + * @ia_vfsuid: uid to chown @inode to * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then @@ -31,15 +31,15 @@ * performed on the raw inode simply passs init_user_ns. */ static bool chown_ok(struct user_namespace *mnt_userns, - const struct inode *inode, - kuid_t uid) + const struct inode *inode, vfsuid_t ia_vfsuid) { - kuid_t kuid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid)) + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid()) && + vfsuid_eq(ia_vfsuid, vfsuid)) return true; if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (uid_eq(kuid, INVALID_UID) && + if (!vfsuid_valid(vfsuid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; @@ -49,7 +49,7 @@ static bool chown_ok(struct user_namespace *mnt_userns, * chgrp_ok - verify permissions to chgrp inode * @mnt_userns: user namespace of the mount @inode was found from * @inode: inode to check permissions on - * @gid: gid to chown @inode to + * @ia_vfsgid: gid to chown @inode to * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then @@ -58,21 +58,19 @@ static bool chown_ok(struct user_namespace *mnt_userns, * performed on the raw inode simply passs init_user_ns. */ static bool chgrp_ok(struct user_namespace *mnt_userns, - const struct inode *inode, kgid_t gid) + const struct inode *inode, vfsgid_t ia_vfsgid) { - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) { - kgid_t mapped_gid; - - if (gid_eq(gid, inode->i_gid)) + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) { + if (vfsgid_eq(ia_vfsgid, vfsgid)) return true; - mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid); - if (in_group_p(mapped_gid)) + if (vfsgid_in_group_p(ia_vfsgid)) return true; } if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (gid_eq(kgid, INVALID_GID) && + if (!vfsgid_valid(vfsgid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; @@ -120,28 +118,29 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, goto kill_priv; /* Make sure a caller can chown. */ - if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid)) + if ((ia_valid & ATTR_UID) && + !chown_ok(mnt_userns, inode, attr->ia_vfsuid)) return -EPERM; /* Make sure caller can chgrp. */ - if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid)) + if ((ia_valid & ATTR_GID) && + !chgrp_ok(mnt_userns, inode, attr->ia_vfsgid)) return -EPERM; /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { - kgid_t mapped_gid; + vfsgid_t vfsgid; if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; if (ia_valid & ATTR_GID) - mapped_gid = mapped_kgid_fs(mnt_userns, - i_user_ns(inode), attr->ia_gid); + vfsgid = attr->ia_vfsgid; else - mapped_gid = i_gid_into_mnt(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); /* Also check the setgid bit! */ - if (!in_group_p(mapped_gid) && + if (!vfsgid_in_group_p(vfsgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -184,6 +183,8 @@ EXPORT_SYMBOL(setattr_prepare); */ int inode_newsize_ok(const struct inode *inode, loff_t offset) { + if (offset < 0) + return -EINVAL; if (inode->i_size < offset) { unsigned long limit; @@ -219,9 +220,7 @@ EXPORT_SYMBOL(inode_newsize_ok); * setattr_copy must be called with i_mutex held. * * setattr_copy updates the inode's metadata with that specified - * in attr on idmapped mounts. If file ownership is changed setattr_copy - * doesn't map ia_uid and ia_gid. It will asssume the caller has already - * provided the intended values. Necessary permission checks to determine + * in attr on idmapped mounts. Necessary permission checks to determine * whether or not the S_ISGID property needs to be removed are performed with * the correct idmapped mount permission helpers. * Noticeably missing is inode size update, which is more complex @@ -242,10 +241,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -254,8 +251,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (!in_group_p(kgid) && + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (!vfsgid_in_group_p(vfsgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; @@ -306,9 +303,6 @@ EXPORT_SYMBOL(may_setattr); * retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * - * If file ownership is changed notify_change() doesn't map ia_uid and - * ia_gid. It will asssume the caller has already provided the intended values. - * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. Also, passing NULL is fine for callers holding @@ -397,23 +391,25 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, * namespace of the superblock. */ if (ia_valid & ATTR_UID && - !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid)) + !vfsuid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + attr->ia_vfsuid)) return -EOVERFLOW; if (ia_valid & ATTR_GID && - !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid)) + !vfsgid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + attr->ia_vfsgid)) return -EOVERFLOW; /* Don't allow modifications of files with invalid uids or * gids unless those uids & gids are being made valid. */ if (!(ia_valid & ATTR_UID) && - !uid_valid(i_uid_into_mnt(mnt_userns, inode))) + !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode))) return -EOVERFLOW; if (!(ia_valid & ATTR_GID) && - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) return -EOVERFLOW; - error = security_inode_setattr(dentry, attr); + error = security_inode_setattr(mnt_userns, dentry, attr); if (error) return error; error = try_break_deleg(inode, delegated_inode); diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 918826eaceea..d5a44fa88acf 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -51,8 +51,6 @@ extern struct file_system_type autofs_fs_type; */ struct autofs_info { struct dentry *dentry; - struct inode *inode; - int flags; struct completion expire_complete; @@ -148,6 +146,11 @@ static inline int autofs_oz_mode(struct autofs_sb_info *sbi) task_pgrp(current) == sbi->oz_pgrp); } +static inline bool autofs_empty(struct autofs_info *ino) +{ + return ino->count < 2; +} + struct inode *autofs_get_inode(struct super_block *, umode_t); void autofs_free_ino(struct autofs_info *); diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index b3fefd6237c3..038b3d2d9f57 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -371,7 +371,7 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; } - if (simple_empty(dentry)) + if (autofs_empty(ino)) return NULL; /* Case 2: tree mount, expire iff entire tree is not busy */ diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 9edf243713eb..affa70360b1f 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -20,6 +20,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) INIT_LIST_HEAD(&ino->expiring); ino->last_used = jiffies; ino->sbi = sbi; + ino->count = 1; } return ino; } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 91fe4548c256..ca03c1cae2be 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -10,6 +10,7 @@ #include "autofs_i.h" +static int autofs_dir_permission(struct user_namespace *, struct inode *, int); static int autofs_dir_symlink(struct user_namespace *, struct inode *, struct dentry *, const char *); static int autofs_dir_unlink(struct inode *, struct dentry *); @@ -50,6 +51,7 @@ const struct file_operations autofs_dir_operations = { const struct inode_operations autofs_dir_inode_operations = { .lookup = autofs_lookup, + .permission = autofs_dir_permission, .unlink = autofs_dir_unlink, .symlink = autofs_dir_symlink, .mkdir = autofs_dir_mkdir, @@ -77,6 +79,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry); @@ -93,7 +96,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file) * it. */ spin_lock(&sbi->lookup_lock); - if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) { + if (!path_is_mountpoint(&file->f_path) && autofs_empty(ino)) { spin_unlock(&sbi->lookup_lock); return -ENOENT; } @@ -288,9 +291,26 @@ static struct dentry *autofs_mountpoint_changed(struct path *path) struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - /* - * If this is an indirect mount the dentry could have gone away - * as a result of an expire and a new one created. + /* If this is an indirect mount the dentry could have gone away + * and a new one created. + * + * This is unusual and I can't remember the case for which it + * was originally added now. But an example of how this can + * happen is an autofs indirect mount that has the "browse" + * option set and also has the "symlink" option in the autofs + * map entry. In this case the daemon will remove the browse + * directory and create a symlink as the mount leaving the + * struct path stale. + * + * Another not so obvious case is when a mount in an autofs + * indirect mount that uses the "nobrowse" option is being + * expired at the same time as a path walk. If the mount has + * been umounted but the mount point directory seen before + * becoming unhashed (during a lockless path walk) when a stat + * family system call is made the mount won't be re-mounted as + * it should. In this case the mount point that's been removed + * (by the daemon) will be stale and the a new mount point + * dentry created. */ if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; @@ -362,7 +382,7 @@ static struct vfsmount *autofs_d_automount(struct path *path) * the mount never trigger mounts themselves (they have an * autofs trigger mount mounted on them). But v4 pseudo direct * mounts do need the leaves to trigger mounts. In this case - * we have no choice but to use the list_empty() check and + * we have no choice but to use the autofs_empty() check and * require user space behave. */ if (sbi->version > 4) { @@ -371,7 +391,7 @@ static struct vfsmount *autofs_d_automount(struct path *path) goto done; } } else { - if (!simple_empty(dentry)) { + if (!autofs_empty(ino)) { spin_unlock(&sbi->fs_lock); goto done; } @@ -426,9 +446,8 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk) if (rcu_walk) { /* We don't need fs_lock in rcu_walk mode, - * just testing 'AUTOFS_INFO_NO_RCU' is enough. - * simple_empty() takes a spinlock, so leave it - * to last. + * just testing 'AUTOFS_INF_WANT_EXPIRE' is enough. + * * We only return -EISDIR when certain this isn't * a mount-trap. */ @@ -441,9 +460,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk) inode = d_inode_rcu(dentry); if (inode && S_ISLNK(inode->i_mode)) return -EISDIR; - if (list_empty(&dentry->d_subdirs)) - return 0; - if (!simple_empty(dentry)) + if (!autofs_empty(ino)) return -EISDIR; return 0; } @@ -463,7 +480,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk) * we can avoid needless calls ->d_automount() and avoid * an incorrect ELOOP error return. */ - if ((!path_is_mountpoint(path) && !simple_empty(dentry)) || + if ((!path_is_mountpoint(path) && !autofs_empty(ino)) || (d_really_is_positive(dentry) && d_is_symlink(dentry))) status = -EISDIR; } @@ -526,11 +543,30 @@ static struct dentry *autofs_lookup(struct inode *dir, return NULL; } +static int autofs_dir_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) +{ + if (mask & MAY_WRITE) { + struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); + + if (!autofs_oz_mode(sbi)) + return -EACCES; + + /* autofs_oz_mode() needs to allow path walks when the + * autofs mount is catatonic but the state of an autofs + * file system needs to be preserved over restarts. + */ + if (sbi->flags & AUTOFS_SBI_CATATONIC) + return -EACCES; + } + + return generic_permission(mnt_userns, inode, mask); +} + static int autofs_dir_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, const char *symname) { - struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; @@ -539,16 +575,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns, pr_debug("%s <- %pd\n", symname, dentry); - if (!autofs_oz_mode(sbi)) - return -EACCES; - - /* autofs_oz_mode() needs to allow path walks when the - * autofs mount is catatonic but the state of an autofs - * file system needs to be preserved over restarts. - */ - if (sbi->flags & AUTOFS_SBI_CATATONIC) - return -EACCES; - BUG_ON(!ino); autofs_clean_ino(ino); @@ -571,7 +597,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns, d_add(dentry, inode); dget(dentry); - ino->count++; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; @@ -601,17 +626,6 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; - if (!autofs_oz_mode(sbi)) - return -EACCES; - - /* autofs_oz_mode() needs to allow path walks when the - * autofs mount is catatonic but the state of an autofs - * file system needs to be preserved over restarts. - */ - if (sbi->flags & AUTOFS_SBI_CATATONIC) - return -EACCES; - - ino->count--; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count--; dput(ino->dentry); @@ -683,16 +697,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) pr_debug("dentry %p, removing %pd\n", dentry, dentry); - if (!autofs_oz_mode(sbi)) - return -EACCES; - - /* autofs_oz_mode() needs to allow path walks when the - * autofs mount is catatonic but the state of an autofs - * file system needs to be preserved over restarts. - */ - if (sbi->flags & AUTOFS_SBI_CATATONIC) - return -EACCES; - if (ino->count != 1) return -ENOTEMPTY; @@ -704,7 +708,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) if (sbi->version < 5) autofs_clear_leaf_automount_flags(dentry); - ino->count--; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count--; dput(ino->dentry); @@ -726,16 +729,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns, struct autofs_info *p_ino; struct inode *inode; - if (!autofs_oz_mode(sbi)) - return -EACCES; - - /* autofs_oz_mode() needs to allow path walks when the - * autofs mount is catatonic but the state of an autofs - * file system needs to be preserved over restarts. - */ - if (sbi->flags & AUTOFS_SBI_CATATONIC) - return -EACCES; - pr_debug("dentry %p, creating %pd\n", dentry, dentry); BUG_ON(!ino); @@ -753,7 +746,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns, autofs_set_leaf_automount_flags(dentry); dget(dentry); - ino->count++; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; inc_nlink(dir); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index be383fa46b12..32749fcee090 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -108,8 +108,7 @@ static const struct export_operations befs_export_operations = { * passes it the address of befs_get_block, for mapping file * positions to disk blocks. */ -static int -befs_read_folio(struct file *file, struct folio *folio) +static int befs_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, befs_get_block); } @@ -470,13 +469,12 @@ befs_destroy_inodecache(void) */ static int befs_symlink_read_folio(struct file *unused, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct befs_inode_info *befs_ino = BEFS_I(inode); befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; - char *link = page_address(page); + char *link = folio_address(folio); if (len == 0 || len > PAGE_SIZE) { befs_error(sb, "Long symlink with illegal length"); @@ -489,12 +487,12 @@ static int befs_symlink_read_folio(struct file *unused, struct folio *folio) goto fail; } link[len - 1] = '\0'; - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; fail: - SetPageError(page); - unlock_page(page); + folio_set_error(folio); + folio_unlock(folio); return -EIO; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 07960529b360..6e2596ddae10 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -13,7 +13,6 @@ struct btrfs_fs_info; struct btrfs_workqueue; struct btrfs_work; typedef void (*btrfs_func_t)(struct btrfs_work *arg); -typedef void (*btrfs_work_func_t)(struct work_struct *arg); struct btrfs_work { btrfs_func_t func; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ebc392ea1d74..d385357e19b6 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2028,10 +2028,29 @@ out: return ret; } +static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct btrfs_data_container *inodes = ctx; + const size_t c = 3 * sizeof(u64); + + if (inodes->bytes_left >= c) { + inodes->bytes_left -= c; + inodes->val[inodes->elem_cnt] = inum; + inodes->val[inodes->elem_cnt + 1] = offset; + inodes->val[inodes->elem_cnt + 2] = root; + inodes->elem_cnt += 3; + } else { + inodes->bytes_missing += c - inodes->bytes_left; + inodes->bytes_left = 0; + inodes->elem_missed += 3; + } + + return 0; +} + int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset) + void *ctx, bool ignore_offset) { int ret; u64 extent_item_pos; @@ -2049,17 +2068,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, search_commit_root, - iterate, ctx, ignore_offset); + build_ino_list, ctx, ignore_offset); return ret; } -typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off, - struct extent_buffer *eb, void *ctx); +static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, struct inode_fs_paths *ipath); -static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) +static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath) { int ret = 0; int slot; @@ -2068,6 +2085,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, u32 name_len; u64 parent = 0; int found = 0; + struct btrfs_root *fs_root = ipath->fs_root; + struct btrfs_path *path = ipath->btrfs_path; struct extent_buffer *eb; struct btrfs_inode_ref *iref; struct btrfs_key found_key; @@ -2103,8 +2122,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, "following ref at offset %u for inode %llu in tree %llu", cur, found_key.objectid, fs_root->root_key.objectid); - ret = iterate(parent, name_len, - (unsigned long)(iref + 1), eb, ctx); + ret = inode_to_path(parent, name_len, + (unsigned long)(iref + 1), eb, ipath); if (ret) break; len = sizeof(*iref) + name_len; @@ -2118,15 +2137,15 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, return ret; } -static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) +static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath) { int ret; int slot; u64 offset = 0; u64 parent; int found = 0; + struct btrfs_root *fs_root = ipath->fs_root; + struct btrfs_path *path = ipath->btrfs_path; struct extent_buffer *eb; struct btrfs_inode_extref *extref; u32 item_size; @@ -2162,8 +2181,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, extref = (struct btrfs_inode_extref *)(ptr + cur_offset); parent = btrfs_inode_extref_parent(eb, extref); name_len = btrfs_inode_extref_name_len(eb, extref); - ret = iterate(parent, name_len, - (unsigned long)&extref->name, eb, ctx); + ret = inode_to_path(parent, name_len, + (unsigned long)&extref->name, eb, ipath); if (ret) break; @@ -2180,34 +2199,13 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, return ret; } -static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, iterate_irefs_t *iterate, - void *ctx) -{ - int ret; - int found_refs = 0; - - ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx); - if (!ret) - ++found_refs; - else if (ret != -ENOENT) - return ret; - - ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx); - if (ret == -ENOENT && found_refs) - return 0; - - return ret; -} - /* * returns 0 if the path could be dumped (probably truncated) * returns <0 in case of an error */ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, - struct extent_buffer *eb, void *ctx) + struct extent_buffer *eb, struct inode_fs_paths *ipath) { - struct inode_fs_paths *ipath = ctx; char *fspath; char *fspath_min; int i = ipath->fspath->elem_cnt; @@ -2248,8 +2246,20 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, */ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) { - return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, - inode_to_path, ipath); + int ret; + int found_refs = 0; + + ret = iterate_inode_refs(inum, ipath); + if (!ret) + ++found_refs; + else if (ret != -ENOENT) + return ret; + + ret = iterate_inode_extrefs(inum, ipath); + if (ret == -ENOENT && found_refs) + return 0; + + return ret; } struct btrfs_data_container *init_data_container(u32 total_bytes) diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ba454032dbe2..2759de7d324c 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -35,8 +35,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, bool ignore_offset); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx, + struct btrfs_path *path, void *ctx, bool ignore_offset); int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ede389f2602d..c3aecfb0a71d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1051,8 +1051,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, < block_group->zone_unusable); WARN_ON(block_group->space_info->disk_total < block_group->length * factor); + WARN_ON(block_group->zone_is_active && + block_group->space_info->active_total_bytes + < block_group->length); } block_group->space_info->total_bytes -= block_group->length; + if (block_group->zone_is_active) + block_group->space_info->active_total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); block_group->space_info->bytes_zone_unusable -= @@ -1816,11 +1821,10 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, stripe_nr = physical - map->stripes[i].physical; stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); - if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { stripe_nr = stripe_nr * map->num_stripes + i; stripe_nr = div_u64(stripe_nr, map->sub_stripes); - } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_nr = stripe_nr * map->num_stripes + i; } /* * The remaining case would be for RAID56, multiply by @@ -2108,7 +2112,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, trace_btrfs_add_block_group(info, cache, 0); btrfs_update_space_info(info, cache->flags, cache->length, cache->used, cache->bytes_super, - cache->zone_unusable, &space_info); + cache->zone_unusable, cache->zone_is_active, + &space_info); cache->space_info = space_info; @@ -2178,7 +2183,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) } btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, - 0, 0, &space_info); + 0, 0, false, &space_info); bg->space_info = space_info; link_block_group(bg); @@ -2559,7 +2564,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran trace_btrfs_add_block_group(fs_info, cache, 1); btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, cache->bytes_super, cache->zone_unusable, - &cache->space_info); + cache->zone_is_active, &cache->space_info); btrfs_update_global_block_rsv(fs_info); link_block_group(cache); @@ -2659,6 +2664,14 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; + /* + * We have allocated a new chunk. We also need to activate that chunk to + * grant metadata tickets for zoned filesystem. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); + if (ret < 0) + goto out; + ret = inc_block_group_ro(cache, 0); if (ret == -ETXTBSY) goto unlock_out; @@ -3761,6 +3774,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, * attempt. */ wait_for_alloc = true; + force = CHUNK_ALLOC_NO_FORCE; spin_unlock(&space_info->lock); mutex_lock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->chunk_mutex); @@ -3884,6 +3898,14 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, ret = PTR_ERR(bg); } else { /* + * We have a new chunk. We also need to activate it for + * zoned filesystem. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, info, true); + if (ret < 0) + return; + + /* * If we fail to add the chunk item here, we end up * trying again at phase 2 of chunk allocation, at * btrfs_create_pending_block_groups(). So ignore diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 3ac668ace50a..35e0e860cc0b 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -104,6 +104,7 @@ struct btrfs_block_group { unsigned int relocating_repair:1; unsigned int chunk_item_inserted:1; unsigned int zone_is_active:1; + unsigned int zoned_data_reloc_ongoing:1; int disk_cache_state; diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index b3ee49b0b1e8..06be0644dd37 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -118,7 +118,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; block_rsv->reserved = block_rsv->size; - block_rsv->full = 1; + block_rsv->full = true; } else { num_bytes = 0; } @@ -142,7 +142,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, bytes_to_add = min(num_bytes, bytes_to_add); dest->reserved += bytes_to_add; if (dest->reserved >= dest->size) - dest->full = 1; + dest->full = true; num_bytes -= bytes_to_add; } spin_unlock(&dest->lock); @@ -171,7 +171,7 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, return 0; } -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); @@ -180,7 +180,7 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv, - unsigned short type) + enum btrfs_rsv_type type) { btrfs_init_block_rsv(rsv, type); rsv->space_info = btrfs_find_space_info(fs_info, @@ -188,7 +188,7 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type) + enum btrfs_rsv_type type) { struct btrfs_block_rsv *block_rsv; @@ -304,7 +304,7 @@ int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes) if (block_rsv->reserved >= num_bytes) { block_rsv->reserved -= num_bytes; if (block_rsv->reserved < block_rsv->size) - block_rsv->full = 0; + block_rsv->full = false; ret = 0; } spin_unlock(&block_rsv->lock); @@ -319,7 +319,7 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, if (update_size) block_rsv->size += num_bytes; else if (block_rsv->reserved >= block_rsv->size) - block_rsv->full = 1; + block_rsv->full = true; spin_unlock(&block_rsv->lock); } @@ -341,7 +341,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, } global_rsv->reserved -= num_bytes; if (global_rsv->reserved < global_rsv->size) - global_rsv->full = 0; + global_rsv->full = false; spin_unlock(&global_rsv->lock); btrfs_block_rsv_add_bytes(dest, num_bytes, true); @@ -408,10 +408,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) btrfs_try_granting_tickets(fs_info, sinfo); } - if (block_rsv->reserved == block_rsv->size) - block_rsv->full = 1; - else - block_rsv->full = 0; + block_rsv->full = (block_rsv->reserved == block_rsv->size); if (block_rsv->size >= sinfo->total_bytes) sinfo->force_alloc = CHUNK_ALLOC_FORCE; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 3b67ff08d434..0c183709be00 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -9,7 +9,7 @@ enum btrfs_reserve_flush_enum; /* * Types of block reserves */ -enum { +enum btrfs_rsv_type { BTRFS_BLOCK_RSV_GLOBAL, BTRFS_BLOCK_RSV_DELALLOC, BTRFS_BLOCK_RSV_TRANS, @@ -25,9 +25,10 @@ struct btrfs_block_rsv { u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - unsigned short full; - unsigned short type; - unsigned short failfast; + bool full; + bool failfast; + /* Block reserve type, one of BTRFS_BLOCK_RSV_* */ + enum btrfs_rsv_type type:8; /* * Qgroup equivalent for @size @reserved @@ -49,13 +50,13 @@ struct btrfs_block_rsv { u64 qgroup_rsv_reserved; }; -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type); void btrfs_init_root_block_rsv(struct btrfs_root *root); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type); + enum btrfs_rsv_type type); void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv, - unsigned short type); + enum btrfs_rsv_type type); void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 33811e896623..b160b8e124e0 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -279,19 +279,31 @@ static inline void btrfs_insert_inode_hash(struct inode *inode) __insert_inode_hash(inode, h); } +#if BITS_PER_LONG == 32 + +/* + * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so + * we use the inode's location objectid which is a u64 to avoid truncation. + */ static inline u64 btrfs_ino(const struct btrfs_inode *inode) { u64 ino = inode->location.objectid; - /* - * !ino: btree_inode - * type == BTRFS_ROOT_ITEM_KEY: subvol dir - */ - if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY) + /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */ + if (inode->location.type == BTRFS_ROOT_ITEM_KEY) ino = inode->vfs_inode.i_ino; return ino; } +#else + +static inline u64 btrfs_ino(const struct btrfs_inode *inode) +{ + return inode->vfs_inode.i_ino; +} + +#endif + static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) { i_size_write(&inode->vfs_inode, size); @@ -305,8 +317,7 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) if (root == root->fs_info->tree_root && btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) return true; - if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID) - return true; + return false; } diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 5d20137b7b67..98c6e5feab19 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -152,7 +152,7 @@ struct btrfsic_block { struct btrfsic_block *next_in_same_bio; void *orig_bio_private; bio_end_io_t *orig_bio_end_io; - int submit_bio_bh_rw; + blk_opf_t submit_bio_bh_rw; u64 flush_gen; /* only valid if !never_written */ }; @@ -1681,7 +1681,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, u64 dev_bytenr, char **mapped_datav, unsigned int num_pages, struct bio *bio, int *bio_is_patched, - int submit_bio_bh_rw) + blk_opf_t submit_bio_bh_rw) { int is_metadata; struct btrfsic_block *block; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index f4564f32f6d9..e84d22c5c6a8 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -136,109 +136,14 @@ static int compression_decompress(int type, struct list_head *ws, static int btrfs_decompress_bio(struct compressed_bio *cb); -static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, - unsigned long disk_size) -{ - return sizeof(struct compressed_bio) + - (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size; -} - -static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, - u64 disk_start) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - const u32 csum_size = fs_info->csum_size; - const u32 sectorsize = fs_info->sectorsize; - struct page *page; - unsigned int i; - char *kaddr; - u8 csum[BTRFS_CSUM_SIZE]; - struct compressed_bio *cb = bio->bi_private; - u8 *cb_sum = cb->sums; - - if ((inode->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) - return 0; - - shash->tfm = fs_info->csum_shash; - - for (i = 0; i < cb->nr_pages; i++) { - u32 pg_offset; - u32 bytes_left = PAGE_SIZE; - page = cb->compressed_pages[i]; - - /* Determine the remaining bytes inside the page first */ - if (i == cb->nr_pages - 1) - bytes_left = cb->compressed_len - i * PAGE_SIZE; - - /* Hash through the page sector by sector */ - for (pg_offset = 0; pg_offset < bytes_left; - pg_offset += sectorsize) { - kaddr = kmap_atomic(page); - crypto_shash_digest(shash, kaddr + pg_offset, - sectorsize, csum); - kunmap_atomic(kaddr); - - if (memcmp(&csum, cb_sum, csum_size) != 0) { - btrfs_print_data_csum_error(inode, disk_start, - csum, cb_sum, cb->mirror_num); - if (btrfs_bio(bio)->device) - btrfs_dev_stat_inc_and_print( - btrfs_bio(bio)->device, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - return -EIO; - } - cb_sum += csum_size; - disk_start += sectorsize; - } - } - return 0; -} - -/* - * Reduce bio and io accounting for a compressed_bio with its corresponding bio. - * - * Return true if there is no pending bio nor io. - * Return false otherwise. - */ -static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - unsigned int bi_size = 0; - bool last_io = false; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * At endio time, bi_iter.bi_size doesn't represent the real bio size. - * Thus here we have to iterate through all segments to grab correct - * bio size. - */ - bio_for_each_segment_all(bvec, bio, iter_all) - bi_size += bvec->bv_len; - - if (bio->bi_status) - cb->status = bio->bi_status; - - ASSERT(bi_size && bi_size <= cb->compressed_len); - last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits, - &cb->pending_sectors); - /* - * Here we must wake up the possible error handler after all other - * operations on @cb finished, or we can race with - * finish_compressed_bio_*() which may free @cb. - */ - wake_up_var(cb); - - return last_io; -} - static void finish_compressed_bio_read(struct compressed_bio *cb) { unsigned int index; struct page *page; + if (cb->status == BLK_STS_OK) + cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); + /* Release the compressed pages */ for (index = 0; index < cb->nr_pages; index++) { page = cb->compressed_pages[index]; @@ -247,85 +152,63 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) } /* Do io completion on the original bio */ - if (cb->status != BLK_STS_OK) { + if (cb->status != BLK_STS_OK) cb->orig_bio->bi_status = cb->status; - bio_endio(cb->orig_bio); - } else { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * We have verified the checksum already, set page checked so - * the end_io handlers know about it - */ - ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) { - u64 bvec_start = page_offset(bvec->bv_page) + - bvec->bv_offset; - - btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb), - bvec->bv_page, bvec_start, - bvec->bv_len); - } - - bio_endio(cb->orig_bio); - } + bio_endio(cb->orig_bio); /* Finally free the cb struct */ kfree(cb->compressed_pages); kfree(cb); } -/* when we finish reading compressed pages from the disk, we - * decompress them and then run the bio end_io routines on the - * decompressed pages (in the inode address space). - * - * This allows the checksumming and other IO error handling routines - * to work normally - * - * The compressed pages are freed here, and it must be run - * in process context +/* + * Verify the checksums and kick off repair if needed on the uncompressed data + * before decompressing it into the original bio and freeing the uncompressed + * pages. */ static void end_compressed_bio_read(struct bio *bio) { struct compressed_bio *cb = bio->bi_private; - struct inode *inode; - unsigned int mirror = btrfs_bio(bio)->mirror_num; - int ret = 0; - - if (!dec_and_test_compressed_bio(cb, bio)) - goto out; - - /* - * Record the correct mirror_num in cb->orig_bio so that - * read-repair can work properly. - */ - btrfs_bio(cb->orig_bio)->mirror_num = mirror; - cb->mirror_num = mirror; - - /* - * Some IO in this cb have failed, just skip checksum as there - * is no way it could be correct. - */ - if (cb->status != BLK_STS_OK) - goto csum_failed; - - inode = cb->inode; - ret = check_compressed_csum(BTRFS_I(inode), bio, - bio->bi_iter.bi_sector << 9); - if (ret) - goto csum_failed; + struct inode *inode = cb->inode; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *bi = BTRFS_I(inode); + bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + blk_status_t status = bio->bi_status; + struct btrfs_bio *bbio = btrfs_bio(bio); + struct bvec_iter iter; + struct bio_vec bv; + u32 offset; + + btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { + u64 start = bbio->file_offset + offset; + + if (!status && + (!csum || !btrfs_check_data_csum(inode, bbio, offset, + bv.bv_page, bv.bv_offset))) { + clean_io_failure(fs_info, &bi->io_failure_tree, + &bi->io_tree, start, bv.bv_page, + btrfs_ino(bi), bv.bv_offset); + } else { + int ret; + + refcount_inc(&cb->pending_ios); + ret = btrfs_repair_one_sector(inode, bbio, offset, + bv.bv_page, bv.bv_offset, + btrfs_submit_data_read_bio); + if (ret) { + refcount_dec(&cb->pending_ios); + status = errno_to_blk_status(ret); + } + } + } - /* ok, we're the last bio for this extent, lets start - * the decompression. - */ - ret = btrfs_decompress_bio(cb); + if (status) + cb->status = status; -csum_failed: - if (ret) - cb->status = errno_to_blk_status(ret); - finish_compressed_bio_read(cb); -out: + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_read(cb); + btrfs_bio_free_csum(bbio); bio_put(bio); } @@ -403,6 +286,14 @@ static void finish_compressed_bio_write(struct compressed_bio *cb) kfree(cb); } +static void btrfs_finish_compressed_write_work(struct work_struct *work) +{ + struct compressed_bio *cb = + container_of(work, struct compressed_bio, write_end_work); + + finish_compressed_bio_write(cb); +} + /* * Do the cleanup once all the compressed pages hit the disk. This will clear * writeback on the file pages and free the compressed pages. @@ -414,29 +305,18 @@ static void end_compressed_bio_write(struct bio *bio) { struct compressed_bio *cb = bio->bi_private; - if (!dec_and_test_compressed_bio(cb, bio)) - goto out; + if (bio->bi_status) + cb->status = bio->bi_status; - btrfs_record_physical_zoned(cb->inode, cb->start, bio); + if (refcount_dec_and_test(&cb->pending_ios)) { + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - finish_compressed_bio_write(cb); -out: + btrfs_record_physical_zoned(cb->inode, cb->start, bio); + queue_work(fs_info->compressed_write_workers, &cb->write_end_work); + } bio_put(bio); } -static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, - struct bio *bio, int mirror_num) -{ - blk_status_t ret; - - ASSERT(bio->bi_iter.bi_size); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) - return ret; - ret = btrfs_map_bio(fs_info, bio, mirror_num); - return ret; -} - /* * Allocate a compressed_bio, which will be used to read/write on-disk * (aka, compressed) * data. @@ -455,7 +335,7 @@ static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, - unsigned int opf, bio_end_io_t endio_func, + blk_opf_t opf, bio_end_io_t endio_func, u64 *next_stripe_start) { struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); @@ -487,7 +367,7 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte return ERR_PTR(ret); } *next_stripe_start = disk_bytenr + geom.len; - + refcount_inc(&cb->pending_ios); return bio; } @@ -505,7 +385,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int compressed_len, struct page **compressed_pages, unsigned int nr_pages, - unsigned int write_flags, + blk_opf_t write_flags, struct cgroup_subsys_state *blkcg_css, bool writeback) { @@ -514,26 +394,25 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct compressed_bio *cb; u64 cur_disk_bytenr = disk_start; u64 next_stripe_start; - blk_status_t ret; + blk_status_t ret = BLK_STS_OK; int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; const bool use_append = btrfs_use_zone_append(inode, disk_start); - const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; + const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); - cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); + cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); + refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; cb->len = len; - cb->mirror_num = 0; cb->compressed_pages = compressed_pages; cb->compressed_len = compressed_len; cb->writeback = writeback; - cb->orig_bio = NULL; + INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; if (blkcg_css) @@ -554,8 +433,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, &next_stripe_start); if (IS_ERR(bio)) { ret = errno_to_blk_status(PTR_ERR(bio)); - bio = NULL; - goto finish_cb; + break; } if (blkcg_css) bio->bi_opf |= REQ_CGROUP_PUNT; @@ -599,44 +477,25 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (submit) { if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, start, true); - if (ret) - goto finish_cb; + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + break; + } } - ret = submit_compressed_bio(fs_info, bio, 0); - if (ret) - goto finish_cb; + ASSERT(bio->bi_iter.bi_size); + btrfs_submit_bio(fs_info, bio, 0); bio = NULL; } cond_resched(); } - if (blkcg_css) - kthread_associate_blkcg(NULL); - - return 0; -finish_cb: if (blkcg_css) kthread_associate_blkcg(NULL); - if (bio) { - bio->bi_status = ret; - bio_endio(bio); - } - /* Last byte of @cb is submitted, endio will free @cb */ - if (cur_disk_bytenr == disk_start + compressed_len) - return ret; - - wait_var_event(cb, refcount_read(&cb->pending_sectors) == - (disk_start + compressed_len - cur_disk_bytenr) >> - fs_info->sectorsize_bits); - /* - * Even with previous bio ended, we should still have io not yet - * submitted, thus need to finish manually. - */ - ASSERT(refcount_read(&cb->pending_sectors)); - /* Now we are the only one referring @cb, can finish it safely. */ - finish_compressed_bio_write(cb); + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_write(cb); return ret; } @@ -765,7 +624,6 @@ static noinline int add_ra_bio_pages(struct inode *inode, int zeros; zeros = PAGE_SIZE - zero_offset; memzero_page(page, zero_offset, zeros); - flush_dcache_page(page); } } @@ -819,7 +677,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, blk_status_t ret; int ret2; int i; - u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; @@ -837,17 +694,15 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); compressed_len = em->block_len; - cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); + cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) { ret = BLK_STS_RESOURCE; goto out; } - refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); + refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = inode; - cb->mirror_num = mirror_num; - sums = cb->sums; cb->start = em->orig_start; em_len = em->len; @@ -893,9 +748,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, REQ_OP_READ, end_compressed_bio_read, &next_stripe_start); if (IS_ERR(comp_bio)) { - ret = errno_to_blk_status(PTR_ERR(comp_bio)); - comp_bio = NULL; - goto finish_cb; + cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); + break; } } /* @@ -931,22 +785,33 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, submit = true; if (submit) { - unsigned int nr_sectors; + /* Save the original iter for read repair */ + if (bio_op(comp_bio) == REQ_OP_READ) + btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); - if (ret) - goto finish_cb; + /* + * Save the initial offset of this chunk, as there + * is no direct correlation between compressed pages and + * the original file offset. The field is only used for + * priting error messages. + */ + btrfs_bio(comp_bio)->file_offset = file_offset; - nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, - fs_info->sectorsize); - sums += fs_info->csum_size * nr_sectors; + ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); + if (ret) { + comp_bio->bi_status = ret; + bio_endio(comp_bio); + break; + } - ret = submit_compressed_bio(fs_info, comp_bio, mirror_num); - if (ret) - goto finish_cb; + ASSERT(comp_bio->bi_iter.bi_size); + btrfs_submit_bio(fs_info, comp_bio, mirror_num); comp_bio = NULL; } } + + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_read(cb); return; fail: @@ -964,25 +829,6 @@ out: bio->bi_status = ret; bio_endio(bio); return; -finish_cb: - if (comp_bio) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); - } - /* All bytes of @cb is submitted, endio will free @cb */ - if (cur_disk_byte == disk_bytenr + compressed_len) - return; - - wait_var_event(cb, refcount_read(&cb->pending_sectors) == - (disk_bytenr + compressed_len - cur_disk_byte) >> - fs_info->sectorsize_bits); - /* - * Even with previous bio ended, we should still have io not yet - * submitted, thus need to finish @cb manually. - */ - ASSERT(refcount_read(&cb->pending_sectors)); - /* Now we are the only one referring @cb, can finish it safely. */ - finish_compressed_bio_read(cb); } /* @@ -1481,7 +1327,6 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, ASSERT(copy_start - decompressed < buf_len); memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + copy_start - decompressed, copy_len); - flush_dcache_page(bvec.bv_page); cur_offset += copy_len; bio_advance(orig_bio, copy_len); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 2707404389a5..1aa02903de69 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -30,8 +30,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of sectors with unfinished IO (unsubmitted or unfinished) */ - refcount_t pending_sectors; + /* Number of outstanding bios */ + refcount_t pending_ios; /* Number of compressed pages in the array */ unsigned int nr_pages; @@ -59,16 +59,12 @@ struct compressed_bio { /* IO errors */ blk_status_t status; - int mirror_num; - /* for reads, this is the bio we are copying the data into */ - struct bio *orig_bio; - - /* - * the start of a variable length array of checksums only - * used by reads - */ - u8 sums[]; + union { + /* For reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + struct work_struct write_end_work; + }; }; static inline unsigned int btrfs_compress_type(unsigned int type_level) @@ -99,7 +95,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int compressed_len, struct page **compressed_pages, unsigned int nr_pages, - unsigned int write_flags, + blk_opf_t write_flags, struct cgroup_subsys_state *blkcg_css, bool writeback); void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0e49b1a0c071..4db85b9dc7ed 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -107,14 +107,6 @@ struct btrfs_ioctl_encoded_io_args; #define BTRFS_STAT_CURR 0 #define BTRFS_STAT_PREV 1 -/* - * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size - */ -static inline u32 count_max_extents(u64 size) -{ - return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); -} - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -230,6 +222,13 @@ struct btrfs_root_backup { #define BTRFS_SUPER_INFO_SIZE 4096 /* + * The reserved space at the beginning of each device. + * It covers the primary super block and leaves space for potential use by other + * tools like bootloaders or to lower potential damage of accidental overwrite. + */ +#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) + +/* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc */ @@ -248,8 +247,12 @@ struct btrfs_super_block { __le64 chunk_root; __le64 log_root; - /* this will help find the new super based on the log root */ - __le64 log_root_transid; + /* + * This member has never been utilized since the very beginning, thus + * it's always 0 regardless of kernel version. We always use + * generation + 1 to read log tree root. So here we mark it deprecated. + */ + __le64 __unused_log_root_transid; __le64 total_bytes; __le64 bytes_used; __le64 root_dir_objectid; @@ -635,6 +638,9 @@ enum { /* Indicate we have half completed snapshot deletions pending. */ BTRFS_FS_UNFINISHED_DROPS, + /* Indicate we have to finish a zone to do next allocation. */ + BTRFS_FS_NEED_ZONE_FINISH, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -656,6 +662,18 @@ enum btrfs_exclusive_operation { BTRFS_EXCLOP_SWAP_ACTIVATE, }; +/* Store data about transaction commits, exported via sysfs. */ +struct btrfs_commit_stats { + /* Total number of commits */ + u64 commit_count; + /* The maximum commit duration so far in ns */ + u64 max_commit_dur; + /* The last commit duration in ns */ + u64 last_commit_dur; + /* The total commit duration in ns */ + u64 total_commit_dur; +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -675,9 +693,8 @@ struct btrfs_fs_info { rwlock_t global_root_lock; struct rb_root global_root_tree; - /* The xarray that holds all the FS roots */ - spinlock_t fs_roots_lock; - struct xarray fs_roots; + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; /* block group cache stuff */ rwlock_t block_group_cache_lock; @@ -851,11 +868,11 @@ struct btrfs_fs_info { struct btrfs_workqueue *hipri_workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; - struct btrfs_workqueue *endio_workers; - struct btrfs_workqueue *endio_meta_workers; - struct btrfs_workqueue *endio_raid56_workers; + struct workqueue_struct *endio_workers; + struct workqueue_struct *endio_meta_workers; + struct workqueue_struct *endio_raid56_workers; struct workqueue_struct *rmw_workers; - struct btrfs_workqueue *endio_meta_write_workers; + struct workqueue_struct *compressed_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *caching_workers; @@ -995,10 +1012,10 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Extent buffer xarray */ + /* Extent buffer radix tree */ spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ - struct xarray extent_buffers; + struct radix_tree_root buffer_radix; /* next backup root to be overwritten */ int backup_root_index; @@ -1033,6 +1050,12 @@ struct btrfs_fs_info { u32 csums_per_leaf; u32 stripesize; + /* + * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular + * filesystem, on zoned it depends on the device constraints. + */ + u64 max_extent_size; + /* Block groups and devices containing active swapfiles. */ spinlock_t swapfile_pins_lock; struct rb_root swapfile_pins; @@ -1048,6 +1071,8 @@ struct btrfs_fs_info { */ u64 zone_size; + /* Max size to emit ZONE_APPEND write command */ + u64 max_zone_append_size; struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; @@ -1064,6 +1089,11 @@ struct btrfs_fs_info { spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; + /* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */ + wait_queue_head_t zone_finish_wait; + + /* Updates are not protected by any lock */ + struct btrfs_commit_stats commit_stats; #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; @@ -1119,8 +1149,7 @@ enum { */ BTRFS_ROOT_SHAREABLE, BTRFS_ROOT_TRACK_DIRTY, - /* The root is tracked in fs_info::fs_roots */ - BTRFS_ROOT_REGISTERED, + BTRFS_ROOT_IN_RADIX, BTRFS_ROOT_ORPHAN_ITEM_INSERTED, BTRFS_ROOT_DEFRAG_RUNNING, BTRFS_ROOT_FORCE_COW, @@ -1224,10 +1253,10 @@ struct btrfs_root { struct rb_root inode_tree; /* - * Xarray that keeps track of delayed nodes of every inode, protected - * by inode_lock + * radix tree that keeps track of delayed nodes of every inode, + * protected by inode_lock */ - struct xarray delayed_nodes; + struct radix_tree_root delayed_nodes_tree; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later @@ -1330,6 +1359,8 @@ struct btrfs_replace_extent_info { * existing extent into a file range. */ bool is_new_extent; + /* Indicate if we should update the inode's mtime and ctime. */ + bool update_times; /* Meaningful only if is_new_extent is true. */ int qgroup_reserved; /* @@ -2475,8 +2506,6 @@ BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, chunk_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, - log_root_transid, 64); BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, log_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, @@ -2733,8 +2762,16 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); +static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, + u64 offset) +{ + u64 offset_in_sectors = offset >> fs_info->sectorsize_bits; + + return csums + offset_in_sectors * fs_info->csum_size; +} + /* - * Take the number of bytes to be checksummmed and figure out how many leaves + * Take the number of bytes to be checksummed and figure out how many leaves * it would require to store the csums for that many bytes. */ static inline u64 btrfs_csum_bytes_to_leaves( @@ -3251,11 +3288,18 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ -void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type); +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -3305,9 +3349,9 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, struct inode *dir); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - unsigned *bits); + u32 bits); void btrfs_clear_delalloc_extent(struct inode *inode, - struct extent_state *state, unsigned *bits); + struct extent_state *state, u32 bits); void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct inode *inode, @@ -3353,6 +3397,12 @@ int btrfs_writepage_cow_fixup(struct page *page); void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate); +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type); +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, + struct page **pages); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, struct btrfs_ioctl_encoded_io_args *encoded); ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, @@ -4009,6 +4059,19 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) return fs_info->zone_size > 0; } +/* + * Count how many fs_info->max_extent_size cover the @size + */ +static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (!fs_info) + return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); +#endif + + return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); +} + static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) { return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 36ab0859a263..1e8f17ff829e 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -273,7 +273,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 disk_num_bytes, u64 *meta_reserve, u64 *qgroup_reserve) { - u64 nr_extents = count_max_extents(num_bytes); + u64 nr_extents = count_max_extents(fs_info, num_bytes); u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); @@ -350,7 +350,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, * needs to free the reservation we just made. */ spin_lock(&inode->lock); - nr_extents = count_max_extents(num_bytes); + nr_extents = count_max_extents(fs_info, num_bytes); btrfs_mod_outstanding_extents(inode, nr_extents); inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); @@ -413,7 +413,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) unsigned num_extents; spin_lock(&inode->lock); - num_extents = count_max_extents(num_bytes); + num_extents = count_max_extents(fs_info, num_bytes); btrfs_mod_outstanding_extents(inode, -num_extents); btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 66779ab3ed4a..e7f34871a132 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -52,18 +52,6 @@ static inline void btrfs_init_delayed_node( INIT_LIST_HEAD(&delayed_node->p_list); } -static inline int btrfs_is_continuous_delayed_item( - struct btrfs_delayed_item *item1, - struct btrfs_delayed_item *item2) -{ - if (item1->key.type == BTRFS_DIR_INDEX_KEY && - item1->key.objectid == item2->key.objectid && - item1->key.type == item2->key.type && - item1->key.offset + 1 == item2->key.offset) - return 1; - return 0; -} - static struct btrfs_delayed_node *btrfs_get_delayed_node( struct btrfs_inode *btrfs_inode) { @@ -78,7 +66,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( } spin_lock(&root->inode_lock); - node = xa_load(&root->delayed_nodes, ino); + node = radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { @@ -90,9 +78,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( /* * It's possible that we're racing into the middle of removing - * this node from the xarray. In this case, the refcount + * this node from the radix tree. In this case, the refcount * was zero and it should never go back to one. Just return - * NULL like it was never in the xarray at all; our release + * NULL like it was never in the radix at all; our release * function is in the process of removing it. * * Some implementations of refcount_inc refuse to bump the @@ -100,7 +88,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * here, refcount_inc() may decide to just WARN_ONCE() instead * of actually bumping the refcount. * - * If this node is properly in the xarray, we want to bump the + * If this node is properly in the radix, we want to bump the * refcount twice, once for the inode and once for this get * operation. */ @@ -128,30 +116,36 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( u64 ino = btrfs_ino(btrfs_inode); int ret; - do { - node = btrfs_get_delayed_node(btrfs_inode); - if (node) - return node; +again: + node = btrfs_get_delayed_node(btrfs_inode); + if (node) + return node; - node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); - if (!node) - return ERR_PTR(-ENOMEM); - btrfs_init_delayed_node(node, root, ino); + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + btrfs_init_delayed_node(node, root, ino); - /* Cached in the inode and can be accessed */ - refcount_set(&node->refs, 2); + /* cached in the btrfs inode and can be accessed */ + refcount_set(&node->refs, 2); - spin_lock(&root->inode_lock); - ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS); - if (ret) { - spin_unlock(&root->inode_lock); - kmem_cache_free(delayed_node_cache, node); - if (ret != -EBUSY) - return ERR_PTR(ret); - } - } while (ret); + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + kmem_cache_free(delayed_node_cache, node); + return ERR_PTR(ret); + } + + spin_lock(&root->inode_lock); + ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); + if (ret == -EEXIST) { + spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); + radix_tree_preload_end(); + goto again; + } btrfs_inode->delayed_node = node; spin_unlock(&root->inode_lock); + radix_tree_preload_end(); return node; } @@ -270,7 +264,8 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. */ ASSERT(refcount_read(&delayed_node->refs) == 0); - xa_erase(&root->delayed_nodes, delayed_node->inode_id); + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, delayed_node); } @@ -391,8 +386,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( } static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, - struct btrfs_delayed_item *ins, - int action) + struct btrfs_delayed_item *ins) { struct rb_node **p, *node; struct rb_node *parent_node = NULL; @@ -401,9 +395,9 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, int cmp; bool leftmost = true; - if (action == BTRFS_DELAYED_INSERTION_ITEM) + if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; - else if (action == BTRFS_DELAYED_DELETION_ITEM) + else if (ins->ins_or_del == BTRFS_DELAYED_DELETION_ITEM) root = &delayed_node->del_root; else BUG(); @@ -429,32 +423,19 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, rb_link_node(node, parent_node, p); rb_insert_color_cached(node, root, leftmost); ins->delayed_node = delayed_node; - ins->ins_or_del = action; - if (ins->key.type == BTRFS_DIR_INDEX_KEY && - action == BTRFS_DELAYED_INSERTION_ITEM && + /* Delayed items are always for dir index items. */ + ASSERT(ins->key.type == BTRFS_DIR_INDEX_KEY); + + if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM && ins->key.offset >= delayed_node->index_cnt) - delayed_node->index_cnt = ins->key.offset + 1; + delayed_node->index_cnt = ins->key.offset + 1; delayed_node->count++; atomic_inc(&delayed_node->root->fs_info->delayed_root->items); return 0; } -static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node, - struct btrfs_delayed_item *item) -{ - return __btrfs_add_delayed_item(node, item, - BTRFS_DELAYED_INSERTION_ITEM); -} - -static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, - struct btrfs_delayed_item *item) -{ - return __btrfs_add_delayed_item(node, item, - BTRFS_DELAYED_DELETION_ITEM); -} - static void finish_one_item(struct btrfs_delayed_root *delayed_root) { int seq = atomic_inc_return(&delayed_root->items_seq); @@ -566,7 +547,13 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, num_bytes, 1); - item->bytes_reserved = num_bytes; + /* + * For insertions we track reserved metadata space by accounting + * for the number of leaves that will be used, based on the delayed + * node's index_items_size field. + */ + if (item->ins_or_del == BTRFS_DELAYED_DELETION_ITEM) + item->bytes_reserved = num_bytes; } return ret; @@ -592,6 +579,21 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL); } +static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node, + unsigned int num_leaves) +{ + struct btrfs_fs_info *fs_info = node->root->fs_info; + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves); + + /* There are no space reservations during log replay, bail out. */ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return; + + trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id, + bytes, 0); + btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL); +} + static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -665,22 +667,53 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, } /* - * Insert a single delayed item or a batch of delayed items that have consecutive - * keys if they exist. + * Insert a single delayed item or a batch of delayed items, as many as possible + * that fit in a leaf. The delayed items (dir index keys) are sorted by their key + * in the rbtree, and if there's a gap between two consecutive dir index items, + * then it means at some point we had delayed dir indexes to add but they got + * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them + * into the subvolume tree. Dir index keys also have their offsets coming from a + * monotonically increasing counter, so we can't get new keys with an offset that + * fits within a gap between delayed dir index items. */ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_delayed_item *first_item) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_delayed_node *node = first_item->delayed_node; LIST_HEAD(item_list); struct btrfs_delayed_item *curr; struct btrfs_delayed_item *next; - const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info); + const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_item_batch batch; int total_size; char *ins_data = NULL; int ret; + bool continuous_keys_only = false; + + lockdep_assert_held(&node->mutex); + + /* + * During normal operation the delayed index offset is continuously + * increasing, so we can batch insert all items as there will not be any + * overlapping keys in the tree. + * + * The exception to this is log replay, where we may have interleaved + * offsets in the tree, so our batch needs to be continuous keys only in + * order to ensure we do not end up with out of order items in our leaf. + */ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + continuous_keys_only = true; + + /* + * For delayed items to insert, we track reserved metadata bytes based + * on the number of leaves that we will use. + * See btrfs_insert_delayed_dir_index() and + * btrfs_delayed_item_reserve_metadata()). + */ + ASSERT(first_item->bytes_reserved == 0); list_add_tail(&first_item->tree_list, &item_list); batch.total_data_size = first_item->data_len; @@ -692,9 +725,19 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, int next_size; next = __btrfs_next_delayed_item(curr); - if (!next || !btrfs_is_continuous_delayed_item(curr, next)) + if (!next) + break; + + /* + * We cannot allow gaps in the key space if we're doing log + * replay. + */ + if (continuous_keys_only && + (next->key.offset != curr->key.offset + 1)) break; + ASSERT(next->bytes_reserved == 0); + next_size = next->data_len + sizeof(struct btrfs_item); if (total_size + next_size > max_size) break; @@ -751,9 +794,41 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, */ btrfs_release_path(path); + ASSERT(node->index_item_leaves > 0); + + /* + * For normal operations we will batch an entire leaf's worth of delayed + * items, so if there are more items to process we can decrement + * index_item_leaves by 1 as we inserted 1 leaf's worth of items. + * + * However for log replay we may not have inserted an entire leaf's + * worth of items, we may have not had continuous items, so decrementing + * here would mess up the index_item_leaves accounting. For this case + * only clean up the accounting when there are no items left. + */ + if (next && !continuous_keys_only) { + /* + * We inserted one batch of items into a leaf a there are more + * items to flush in a future batch, now release one unit of + * metadata space from the delayed block reserve, corresponding + * the leaf we just flushed to. + */ + btrfs_delayed_item_release_leaves(node, 1); + node->index_item_leaves--; + } else if (!next) { + /* + * There are no more items to insert. We can have a number of + * reserved leaves > 1 here - this happens when many dir index + * items are added and then removed before they are flushed (file + * names with a very short life, never span a transaction). So + * release all remaining leaves. + */ + btrfs_delayed_item_release_leaves(node, node->index_item_leaves); + node->index_item_leaves = 0; + } + list_for_each_entry_safe(curr, next, &item_list, tree_list) { list_del(&curr->tree_list); - btrfs_delayed_item_release_metadata(root, curr); btrfs_release_delayed_item(curr); } out: @@ -789,62 +864,75 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *item) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; - struct extent_buffer *leaf; - struct btrfs_key key; - struct list_head head; - int nitems, i, last_item; - int ret = 0; + struct extent_buffer *leaf = path->nodes[0]; + LIST_HEAD(batch_list); + int nitems, slot, last_slot; + int ret; + u64 total_reserved_size = item->bytes_reserved; - BUG_ON(!path->nodes[0]); + ASSERT(leaf != NULL); - leaf = path->nodes[0]; + slot = path->slots[0]; + last_slot = btrfs_header_nritems(leaf) - 1; + /* + * Our caller always gives us a path pointing to an existing item, so + * this can not happen. + */ + ASSERT(slot <= last_slot); + if (WARN_ON(slot > last_slot)) + return -ENOENT; - i = path->slots[0]; - last_item = btrfs_header_nritems(leaf) - 1; - if (i > last_item) - return -ENOENT; /* FIXME: Is errno suitable? */ + nitems = 1; + curr = item; + list_add_tail(&curr->tree_list, &batch_list); - next = item; - INIT_LIST_HEAD(&head); - btrfs_item_key_to_cpu(leaf, &key, i); - nitems = 0; /* - * count the number of the dir index items that we can delete in batch + * Keep checking if the next delayed item matches the next item in the + * leaf - if so, we can add it to the batch of items to delete from the + * leaf. */ - while (btrfs_comp_cpu_keys(&next->key, &key) == 0) { - list_add_tail(&next->tree_list, &head); - nitems++; + while (slot < last_slot) { + struct btrfs_key key; - curr = next; next = __btrfs_next_delayed_item(curr); if (!next) break; - if (!btrfs_is_continuous_delayed_item(curr, next)) - break; - - i++; - if (i > last_item) + slot++; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_comp_cpu_keys(&next->key, &key) != 0) break; - btrfs_item_key_to_cpu(leaf, &key, i); + nitems++; + curr = next; + list_add_tail(&curr->tree_list, &batch_list); + total_reserved_size += curr->bytes_reserved; } - if (!nitems) - return 0; - ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); if (ret) - goto out; + return ret; + + /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */ + if (total_reserved_size > 0) { + /* + * Check btrfs_delayed_item_reserve_metadata() to see why we + * don't need to release/reserve qgroup space. + */ + trace_btrfs_space_reservation(fs_info, "delayed_item", + item->key.objectid, total_reserved_size, + 0); + btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, + total_reserved_size, NULL); + } - list_for_each_entry_safe(curr, next, &head, tree_list) { - btrfs_delayed_item_release_metadata(root, curr); + list_for_each_entry_safe(curr, next, &batch_list, tree_list) { list_del(&curr->tree_list); btrfs_release_delayed_item(curr); } -out: - return ret; + return 0; } static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, @@ -852,43 +940,52 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_delayed_node *node) { - struct btrfs_delayed_item *curr, *prev; int ret = 0; -do_again: - mutex_lock(&node->mutex); - curr = __btrfs_first_delayed_deletion_item(node); - if (!curr) - goto delete_fail; + while (ret == 0) { + struct btrfs_delayed_item *item; + + mutex_lock(&node->mutex); + item = __btrfs_first_delayed_deletion_item(node); + if (!item) { + mutex_unlock(&node->mutex); + break; + } + + ret = btrfs_search_slot(trans, root, &item->key, path, -1, 1); + if (ret > 0) { + /* + * There's no matching item in the leaf. This means we + * have already deleted this item in a past run of the + * delayed items. We ignore errors when running delayed + * items from an async context, through a work queue job + * running btrfs_async_run_delayed_root(), and don't + * release delayed items that failed to complete. This + * is because we will retry later, and at transaction + * commit time we always run delayed items and will + * then deal with errors if they fail to run again. + * + * So just release delayed items for which we can't find + * an item in the tree, and move to the next item. + */ + btrfs_release_path(path); + btrfs_release_delayed_item(item); + ret = 0; + } else if (ret == 0) { + ret = btrfs_batch_delete_items(trans, root, path, item); + btrfs_release_path(path); + } - ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); - if (ret < 0) - goto delete_fail; - else if (ret > 0) { /* - * can't find the item which the node points to, so this node - * is invalid, just drop it. + * We unlock and relock on each iteration, this is to prevent + * blocking other tasks for too long while we are being run from + * the async context (work queue job). Those tasks are typically + * running system calls like creat/mkdir/rename/unlink/etc which + * need to add delayed items to this delayed node. */ - prev = curr; - curr = __btrfs_next_delayed_item(prev); - btrfs_release_delayed_item(prev); - ret = 0; - btrfs_release_path(path); - if (curr) { - mutex_unlock(&node->mutex); - goto do_again; - } else - goto delete_fail; + mutex_unlock(&node->mutex); } - btrfs_batch_delete_items(trans, root, path, curr); - btrfs_release_path(path); - mutex_unlock(&node->mutex); - goto do_again; - -delete_fail: - btrfs_release_path(path); - mutex_unlock(&node->mutex); return ret; } @@ -1347,9 +1444,13 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_disk_key *disk_key, u8 type, u64 index) { + struct btrfs_fs_info *fs_info = trans->fs_info; + const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *delayed_item; struct btrfs_dir_item *dir_item; + bool reserve_leaf_space; + u32 data_len; int ret; delayed_node = btrfs_get_or_create_delayed_node(dir); @@ -1365,6 +1466,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, delayed_item->key.objectid = btrfs_ino(dir); delayed_item->key.type = BTRFS_DIR_INDEX_KEY; delayed_item->key.offset = index; + delayed_item->ins_or_del = BTRFS_DELAYED_INSERTION_ITEM; dir_item = (struct btrfs_dir_item *)delayed_item->data; dir_item->location = *disk_key; @@ -1374,15 +1476,52 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, btrfs_set_stack_dir_type(dir_item, type); memcpy((char *)(dir_item + 1), name, name_len); - ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item); - /* - * we have reserved enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); + data_len = delayed_item->data_len + sizeof(struct btrfs_item); mutex_lock(&delayed_node->mutex); - ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); + + if (delayed_node->index_item_leaves == 0 || + delayed_node->curr_index_batch_size + data_len > leaf_data_size) { + delayed_node->curr_index_batch_size = data_len; + reserve_leaf_space = true; + } else { + delayed_node->curr_index_batch_size += data_len; + reserve_leaf_space = false; + } + + if (reserve_leaf_space) { + ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, + delayed_item); + /* + * Space was reserved for a dir index item insertion when we + * started the transaction, so getting a failure here should be + * impossible. + */ + if (WARN_ON(ret)) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_item(delayed_item); + goto release_node; + } + + delayed_node->index_item_leaves++; + } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + + /* + * Adding the new dir index item does not require touching another + * leaf, so we can release 1 unit of metadata that was previously + * reserved when starting the transaction. This applies only to + * the case where we had a transaction start and excludes the + * transaction join case (when replaying log trees). + */ + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, bytes, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); + ASSERT(trans->bytes_reserved >= bytes); + trans->bytes_reserved -= bytes; + } + + ret = __btrfs_add_delayed_item(delayed_node, delayed_item); if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", @@ -1410,8 +1549,37 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, return 1; } - btrfs_delayed_item_release_metadata(node->root, item); + /* + * For delayed items to insert, we track reserved metadata bytes based + * on the number of leaves that we will use. + * See btrfs_insert_delayed_dir_index() and + * btrfs_delayed_item_reserve_metadata()). + */ + ASSERT(item->bytes_reserved == 0); + ASSERT(node->index_item_leaves > 0); + + /* + * If there's only one leaf reserved, we can decrement this item from the + * current batch, otherwise we can not because we don't know which leaf + * it belongs to. With the current limit on delayed items, we rarely + * accumulate enough dir index items to fill more than one leaf (even + * when using a leaf size of 4K). + */ + if (node->index_item_leaves == 1) { + const u32 data_len = item->data_len + sizeof(struct btrfs_item); + + ASSERT(node->curr_index_batch_size >= data_len); + node->curr_index_batch_size -= data_len; + } + btrfs_release_delayed_item(item); + + /* If we now have no more dir index items, we can release all leaves. */ + if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) { + btrfs_delayed_item_release_leaves(node, node->index_item_leaves); + node->index_item_leaves = 0; + } + mutex_unlock(&node->mutex); return 0; } @@ -1444,6 +1612,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, } item->key = item_key; + item->ins_or_del = BTRFS_DELAYED_DELETION_ITEM; ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item); /* @@ -1458,7 +1627,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, } mutex_lock(&node->mutex); - ret = __btrfs_add_delayed_deletion_item(node, item); + ret = __btrfs_add_delayed_item(node, item); if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", @@ -1826,12 +1995,17 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) mutex_lock(&delayed_node->mutex); curr_item = __btrfs_first_delayed_insertion_item(delayed_node); while (curr_item) { - btrfs_delayed_item_release_metadata(root, curr_item); prev_item = curr_item; curr_item = __btrfs_next_delayed_item(prev_item); btrfs_release_delayed_item(prev_item); } + if (delayed_node->index_item_leaves > 0) { + btrfs_delayed_item_release_leaves(delayed_node, + delayed_node->index_item_leaves); + delayed_node->index_item_leaves = 0; + } + curr_item = __btrfs_first_delayed_deletion_item(delayed_node); while (curr_item) { btrfs_delayed_item_release_metadata(root, curr_item); @@ -1863,35 +2037,34 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { - unsigned long index = 0; - struct btrfs_delayed_node *delayed_node; + u64 inode_id = 0; struct btrfs_delayed_node *delayed_nodes[8]; + int i, n; while (1) { - int n = 0; - spin_lock(&root->inode_lock); - if (xa_empty(&root->delayed_nodes)) { + n = radix_tree_gang_lookup(&root->delayed_nodes_tree, + (void **)delayed_nodes, inode_id, + ARRAY_SIZE(delayed_nodes)); + if (!n) { spin_unlock(&root->inode_lock); - return; + break; } - xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) { + inode_id = delayed_nodes[n - 1]->inode_id + 1; + for (i = 0; i < n; i++) { /* * Don't increase refs in case the node is dead and * about to be removed from the tree in the loop below */ - if (refcount_inc_not_zero(&delayed_node->refs)) { - delayed_nodes[n] = delayed_node; - n++; - } - if (n >= ARRAY_SIZE(delayed_nodes)) - break; + if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) + delayed_nodes[i] = NULL; } - index++; spin_unlock(&root->inode_lock); - for (int i = 0; i < n; i++) { + for (i = 0; i < n; i++) { + if (!delayed_nodes[i]) + continue; __btrfs_kill_delayed_node(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i]); } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index b2412160c5bc..9795dc295a18 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -58,6 +58,17 @@ struct btrfs_delayed_node { u64 index_cnt; unsigned long flags; int count; + /* + * The size of the next batch of dir index items to insert (if this + * node is from a directory inode). Protected by @mutex. + */ + u32 curr_index_batch_size; + /* + * Number of leaves reserved for inserting dir index items (if this + * node belongs to a directory inode). This may be larger then the + * actual number of leaves we end up using. Protected by @mutex. + */ + u32 index_item_leaves; }; struct btrfs_delayed_item { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 99f37fca2e96..36a3debe9493 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -132,7 +132,7 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; - delayed_rsv->full = 0; + delayed_rsv->full = false; spin_unlock(&delayed_rsv->lock); trans->delayed_ref_updates = 0; } @@ -175,7 +175,7 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, if (num_bytes) delayed_refs_rsv->reserved += num_bytes; if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) - delayed_refs_rsv->full = 1; + delayed_refs_rsv->full = true; spin_unlock(&delayed_refs_rsv->lock); if (num_bytes) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index a7dd6ba25e99..f43196a893ca 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -587,7 +587,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, ASSERT(!IS_ERR(em)); map = em->map_lookup; - num_extents = cur_extent = 0; + num_extents = 0; + cur_extent = 0; for (i = 0; i < map->num_stripes; i++) { /* We have more device extent to copy */ if (srcdev != map->stripes[i].dev) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4ba005c41983..4c3166f3c725 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -5,6 +5,7 @@ #include <linux/fs.h> #include <linux/blkdev.h> +#include <linux/radix-tree.h> #include <linux/writeback.h> #include <linux/workqueue.h> #include <linux/kthread.h> @@ -50,7 +51,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static void end_workqueue_fn(struct btrfs_work *work); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); @@ -63,40 +63,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); -/* - * btrfs_end_io_wq structs are used to do processing in task context when an IO - * is complete. This is used during reads to verify checksums, and it is used - * by writes to insert metadata for new file extents after IO is complete. - */ -struct btrfs_end_io_wq { - struct bio *bio; - bio_end_io_t *end_io; - void *private; - struct btrfs_fs_info *info; - blk_status_t status; - enum btrfs_wq_endio_type metadata; - struct btrfs_work work; -}; - -static struct kmem_cache *btrfs_end_io_wq_cache; - -int __init btrfs_end_io_wq_init(void) -{ - btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", - sizeof(struct btrfs_end_io_wq), - 0, - SLAB_MEM_SPREAD, - NULL); - if (!btrfs_end_io_wq_cache) - return -ENOMEM; - return 0; -} - -void __cold btrfs_end_io_wq_exit(void) -{ - kmem_cache_destroy(btrfs_end_io_wq_cache); -} - static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) { if (fs_info->csum_shash) @@ -255,8 +221,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, goto out; } btrfs_err_rl(eb->fs_info, - "parent transid verify failed on %llu wanted %llu found %llu", - eb->start, +"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", + eb->start, eb->read_mirror, parent_transid, btrfs_header_generation(eb)); ret = 1; clear_extent_buffer_uptodate(eb); @@ -485,7 +451,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, fs_info->nodesize); - /* A dirty eb shouldn't disappear from extent_buffers */ + /* A dirty eb shouldn't disappear from buffer_radix */ if (WARN_ON(!eb)) return -EUCLEAN; @@ -586,21 +552,23 @@ static int validate_extent_buffer(struct extent_buffer *eb) found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu", - eb->start, found_start); + btrfs_err_rl(fs_info, + "bad tree block start, mirror %u want %llu have %llu", + eb->read_mirror, eb->start, found_start); ret = -EIO; goto out; } if (check_tree_block_fsid(eb)) { - btrfs_err_rl(fs_info, "bad fsid on block %llu", - eb->start); + btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", + eb->start, eb->read_mirror); ret = -EIO; goto out; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "bad tree block level %d on %llu", - (int)btrfs_header_level(eb), eb->start); + btrfs_err(fs_info, + "bad tree block level, mirror %u level %d on logical %llu", + eb->read_mirror, btrfs_header_level(eb), eb->start); ret = -EIO; goto out; } @@ -611,8 +579,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, - "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", - eb->start, +"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d", + eb->start, eb->read_mirror, CSUM_FMT_VALUE(csum_size, header_csum), CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb)); @@ -637,8 +605,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) set_extent_buffer_uptodate(eb); else btrfs_err(fs_info, - "block=%llu read time tree block corruption detected", - eb->start); + "read time tree block corruption detected on logical %llu mirror %u", + eb->start, eb->read_mirror); out: return ret; } @@ -739,58 +707,6 @@ err: return ret; } -static void end_workqueue_bio(struct bio *bio) -{ - struct btrfs_end_io_wq *end_io_wq = bio->bi_private; - struct btrfs_fs_info *fs_info; - struct btrfs_workqueue *wq; - - fs_info = end_io_wq->info; - end_io_wq->status = bio->bi_status; - - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) - wq = fs_info->endio_meta_write_workers; - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) - wq = fs_info->endio_freespace_worker; - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - wq = fs_info->endio_raid56_workers; - else - wq = fs_info->endio_write_workers; - } else { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - wq = fs_info->endio_raid56_workers; - else if (end_io_wq->metadata) - wq = fs_info->endio_meta_workers; - else - wq = fs_info->endio_workers; - } - - btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); - btrfs_queue_work(wq, &end_io_wq->work); -} - -blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - enum btrfs_wq_endio_type metadata) -{ - struct btrfs_end_io_wq *end_io_wq; - - end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); - if (!end_io_wq) - return BLK_STS_RESOURCE; - - end_io_wq->private = bio->bi_private; - end_io_wq->end_io = bio->bi_end_io; - end_io_wq->info = info; - end_io_wq->status = 0; - end_io_wq->bio = bio; - end_io_wq->metadata = metadata; - - bio->bi_private = end_io_wq; - bio->bi_end_io = end_workqueue_bio; - return 0; -} - static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; @@ -815,7 +731,6 @@ static void run_one_async_done(struct btrfs_work *work) { struct async_submit_bio *async; struct inode *inode; - blk_status_t ret; async = container_of(work, struct async_submit_bio, work); inode = async->inode; @@ -833,11 +748,7 @@ static void run_one_async_done(struct btrfs_work *work) * This changes nothing when cgroups aren't in use. */ async->bio->bi_opf |= REQ_CGROUP_PUNT; - ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); - if (ret) { - async->bio->bi_status = ret; - bio_endio(async->bio); - } + btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -848,16 +759,23 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start) +/* + * Submit bio to an async queue. + * + * Retrun: + * - true if the work has been succesfuly submitted + * - false in case of error + */ +bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, + extent_submit_bio_start_t *submit_bio_start) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) - return BLK_STS_RESOURCE; + return false; async->inode = inode; async->bio = bio; @@ -875,7 +793,7 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, btrfs_queue_work(fs_info->hipri_workers, &async->work); else btrfs_queue_work(fs_info->workers, &async->work); - return 0; + return true; } static blk_status_t btree_csum_one_bio(struct bio *bio) @@ -901,7 +819,7 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, { /* * when we're called for a write, we're already in the async - * submission context. Just jump into btrfs_map_bio + * submission context. Just jump into btrfs_submit_bio. */ return btree_csum_one_bio(bio); } @@ -923,57 +841,54 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); blk_status_t ret; + bio->bi_opf |= REQ_META; + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - /* - * called for a read, do the setup so that checksum validation - * can happen in the async kernel threads - */ - ret = btrfs_bio_wq_end_io(fs_info, bio, - BTRFS_WQ_ENDIO_METADATA); - if (!ret) - ret = btrfs_map_bio(fs_info, bio, mirror_num); - } else if (!should_async_write(fs_info, BTRFS_I(inode))) { - ret = btree_csum_one_bio(bio); - if (!ret) - ret = btrfs_map_bio(fs_info, bio, mirror_num); - } else { - /* - * kthread helpers are used to submit writes so that - * checksumming can happen in parallel across all CPUs - */ - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - btree_submit_bio_start); + btrfs_submit_bio(fs_info, bio, mirror_num); + return; } + /* + * Kthread helpers are used to submit writes so that checksumming can + * happen in parallel across all CPUs. + */ + if (should_async_write(fs_info, BTRFS_I(inode)) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start)) + return; + + ret = btree_csum_one_bio(bio); if (ret) { bio->bi_status = ret; bio_endio(bio); + return; } + + btrfs_submit_bio(fs_info, bio, mirror_num); } #ifdef CONFIG_MIGRATION -static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode) +static int btree_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { /* * we can't safely write a btree page from here, * we haven't done the locking hook */ - if (PageDirty(page)) + if (folio_test_dirty(src)) return -EAGAIN; /* * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ - if (page_has_private(page) && - !try_to_release_page(page, GFP_KERNEL)) + if (folio_get_private(src) && + !filemap_release_folio(src, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page, mode); + return migrate_folio(mapping, dst, src, mode); } +#else +#define btree_migrate_folio NULL #endif - static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1073,10 +988,8 @@ static const struct address_space_operations btree_aops = { .writepages = btree_writepages, .release_folio = btree_release_folio, .invalidate_folio = btree_invalidate_folio, -#ifdef CONFIG_MIGRATION - .migratepage = btree_migratepage, -#endif - .dirty_folio = btree_dirty_folio, + .migrate_folio = btree_migrate_folio, + .dirty_folio = btree_dirty_folio, }; struct extent_buffer *btrfs_find_create_tree_block( @@ -1158,7 +1071,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; - xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); + INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); btrfs_init_root_block_rsv(root); @@ -1210,9 +1123,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&root->leak_list); - spin_lock(&fs_info->fs_roots_lock); + spin_lock(&fs_info->fs_roots_radix_lock); list_add_tail(&root->leak_list, &fs_info->allocated_roots); - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); #endif } @@ -1659,11 +1572,12 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, { struct btrfs_root *root; - spin_lock(&fs_info->fs_roots_lock); - root = xa_load(&fs_info->fs_roots, (unsigned long)root_id); + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_id); if (root) root = btrfs_grab_root(root); - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return root; } @@ -1705,14 +1619,20 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, { int ret; - spin_lock(&fs_info->fs_roots_lock); - ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid, - root, GFP_NOFS); + ret = radix_tree_preload(GFP_NOFS); + if (ret) + return ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); if (ret == 0) { btrfs_grab_root(root); - set_bit(BTRFS_ROOT_REGISTERED, &root->state); + set_bit(BTRFS_ROOT_IN_RADIX, &root->state); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); return ret; } @@ -1864,7 +1784,7 @@ again: fail: /* * If our caller provided us an anonymous device, then it's his - * responsability to free it in case we fail. So we have to set our + * responsibility to free it in case we fail. So we have to set our * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() * and once again by our caller. */ @@ -1947,25 +1867,6 @@ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, return root; } -/* - * called by the kthread helper functions to finally call the bio end_io - * functions. This is where read checksum verification actually happens - */ -static void end_workqueue_fn(struct btrfs_work *work) -{ - struct bio *bio; - struct btrfs_end_io_wq *end_io_wq; - - end_io_wq = container_of(work, struct btrfs_end_io_wq, work); - bio = end_io_wq->bio; - - bio->bi_status = end_io_wq->status; - bio->bi_private = end_io_wq->private; - bio->bi_end_io = end_io_wq->end_io; - bio_endio(bio); - kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); -} - static int cleaner_kthread(void *arg) { struct btrfs_fs_info *fs_info = arg; @@ -2272,10 +2173,14 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->delalloc_workers); btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); - btrfs_destroy_workqueue(fs_info->endio_workers); - btrfs_destroy_workqueue(fs_info->endio_raid56_workers); + if (fs_info->endio_workers) + destroy_workqueue(fs_info->endio_workers); + if (fs_info->endio_raid56_workers) + destroy_workqueue(fs_info->endio_raid56_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); + if (fs_info->compressed_write_workers) + destroy_workqueue(fs_info->compressed_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); @@ -2289,8 +2194,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) * the queues used for metadata I/O, since tasks from those other work * queues can do metadata I/O operations. */ - btrfs_destroy_workqueue(fs_info->endio_meta_workers); - btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); + if (fs_info->endio_meta_workers) + destroy_workqueue(fs_info->endio_meta_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -2342,9 +2247,9 @@ void btrfs_put_root(struct btrfs_root *root) btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); #ifdef CONFIG_BTRFS_DEBUG - spin_lock(&root->fs_info->fs_roots_lock); + spin_lock(&root->fs_info->fs_roots_radix_lock); list_del_init(&root->leak_list); - spin_unlock(&root->fs_info->fs_roots_lock); + spin_unlock(&root->fs_info->fs_roots_radix_lock); #endif kfree(root); } @@ -2352,21 +2257,28 @@ void btrfs_put_root(struct btrfs_root *root) void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root; - unsigned long index = 0; + int ret; + struct btrfs_root *gang[8]; + int i; while (!list_empty(&fs_info->dead_roots)) { - root = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); - list_del(&root->root_list); + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); - if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) - btrfs_drop_and_free_fs_root(fs_info, root); - btrfs_put_root(root); + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) + btrfs_drop_and_free_fs_root(fs_info, gang[0]); + btrfs_put_root(gang[0]); } - xa_for_each(&fs_info->fs_roots, index, root) { - btrfs_drop_and_free_fs_root(fs_info, root); + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_drop_and_free_fs_root(fs_info, gang[i]); } } @@ -2413,7 +2325,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); - memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); + BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID; + BTRFS_I(inode)->location.type = 0; + BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); btrfs_insert_inode_hash(inode); } @@ -2462,25 +2376,18 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->fixup_workers = btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); - /* - * endios are largely parallel and should have a very - * low idle thresh - */ fs_info->endio_workers = - btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4); + alloc_workqueue("btrfs-endio", flags, max_active); fs_info->endio_meta_workers = - btrfs_alloc_workqueue(fs_info, "endio-meta", flags, - max_active, 4); - fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, - max_active, 2); + alloc_workqueue("btrfs-endio-meta", flags, max_active); fs_info->endio_raid56_workers = - btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, - max_active, 4); + alloc_workqueue("btrfs-endio-raid56", flags, max_active); fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); + fs_info->compressed_write_workers = + alloc_workqueue("btrfs-compressed-write", flags, max_active); fs_info->endio_freespace_worker = btrfs_alloc_workqueue(fs_info, "freespace-write", flags, max_active, 0); @@ -2495,7 +2402,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) if (!(fs_info->workers && fs_info->hipri_workers && fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && - fs_info->endio_meta_write_workers && + fs_info->compressed_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && @@ -2522,6 +2429,9 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) fs_info->csum_shash = csum_shash; + btrfs_info(fs_info, "using %s (%s) checksum algorithm", + btrfs_super_csum_name(csum_type), + crypto_shash_driver_name(csum_shash)); return 0; } @@ -3134,8 +3044,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { - xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC); - xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -3143,7 +3053,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); - spin_lock_init(&fs_info->fs_roots_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); @@ -3240,6 +3150,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); init_waitqueue_head(&fs_info->delayed_iputs_wait); + init_waitqueue_head(&fs_info->zone_finish_wait); /* Usable values until the real ones are cached from the superblock */ fs_info->nodesize = 4096; @@ -3247,6 +3158,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; + fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE; + spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; @@ -3374,7 +3287,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) /* * btrfs_find_orphan_roots() is responsible for finding all the dead * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load - * them into the fs_info->fs_roots. This must be done before + * them into the fs_info->fs_roots_radix tree. This must be done before * calling btrfs_orphan_cleanup() on the tree root. If we don't do it * first, then btrfs_orphan_cleanup() will delete a dead root's orphan * item before the root's tree is deleted - this means that if we unmount @@ -3578,16 +3491,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->compress_type = BTRFS_COMPRESS_ZLIB; - /* - * Flag our filesystem as having big metadata blocks if they are bigger - * than the page size. - */ - if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { - if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - btrfs_info(fs_info, - "flagging fs with big metadata feature"); - features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - } /* Set up fs_info before parsing mount options */ nodesize = btrfs_super_nodesize(disk_super); @@ -3625,8 +3528,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; - if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - btrfs_info(fs_info, "has skinny extents"); + /* + * Flag our filesystem as having big metadata blocks if they are bigger + * than the page size. + */ + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) + features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; /* * mixed block groups end up with duplicate but slightly offset @@ -3655,6 +3562,20 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device err = -EINVAL; goto fail_alloc; } + /* + * We have unsupported RO compat features, although RO mounted, we + * should not cause any metadata write, including log replay. + * Or we could screw up whatever the new feature requires. + */ + if (unlikely(features && btrfs_super_log_root(disk_super) && + !btrfs_test_opt(fs_info, NOLOGREPLAY))) { + btrfs_err(fs_info, +"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", + features); + err = -EINVAL; + goto fail_alloc; + } + if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; @@ -4499,11 +4420,12 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, { bool drop_ref = false; - spin_lock(&fs_info->fs_roots_lock); - xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid); - if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state)) + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) drop_ref = true; - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); @@ -4519,48 +4441,50 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { - struct btrfs_root *roots[8]; - unsigned long index = 0; - int i; + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i = 0; int err = 0; - int grabbed; + unsigned int ret = 0; while (1) { - struct btrfs_root *root; - - spin_lock(&fs_info->fs_roots_lock); - if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) { - spin_unlock(&fs_info->fs_roots_lock); - return err; + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) { + spin_unlock(&fs_info->fs_roots_radix_lock); + break; } + root_objectid = gang[ret - 1]->root_key.objectid + 1; - grabbed = 0; - xa_for_each_start(&fs_info->fs_roots, index, root, index) { - /* Avoid grabbing roots in dead_roots */ - if (btrfs_root_refs(&root->root_item) > 0) - roots[grabbed++] = btrfs_grab_root(root); - if (grabbed >= ARRAY_SIZE(roots)) - break; + for (i = 0; i < ret; i++) { + /* Avoid to grab roots in dead_roots */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* grab all the search result for later use */ + gang[i] = btrfs_grab_root(gang[i]); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); - for (i = 0; i < grabbed; i++) { - if (!roots[i]) + for (i = 0; i < ret; i++) { + if (!gang[i]) continue; - index = roots[i]->root_key.objectid; - err = btrfs_orphan_cleanup(roots[i]); + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); if (err) - goto out; - btrfs_put_root(roots[i]); + break; + btrfs_put_root(gang[i]); } - index++; + root_objectid++; } -out: - /* Release the roots that remain uncleaned due to error */ - for (; i < grabbed; i++) { - if (roots[i]) - btrfs_put_root(roots[i]); + /* release the uncleaned roots due to error */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_root(gang[i]); } return err; } @@ -4879,28 +4803,31 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) { - unsigned long index = 0; - int grabbed = 0; - struct btrfs_root *roots[8]; + struct btrfs_root *gang[8]; + u64 root_objectid = 0; + int ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang))) != 0) { + int i; - spin_lock(&fs_info->fs_roots_lock); - while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index, - ULONG_MAX, 8, XA_PRESENT))) { - for (int i = 0; i < grabbed; i++) - roots[i] = btrfs_grab_root(roots[i]); - spin_unlock(&fs_info->fs_roots_lock); + for (i = 0; i < ret; i++) + gang[i] = btrfs_grab_root(gang[i]); + spin_unlock(&fs_info->fs_roots_radix_lock); - for (int i = 0; i < grabbed; i++) { - if (!roots[i]) + for (i = 0; i < ret; i++) { + if (!gang[i]) continue; - index = roots[i]->root_key.objectid; - btrfs_free_log(NULL, roots[i]); - btrfs_put_root(roots[i]); + root_objectid = gang[i]->root_key.objectid; + btrfs_free_log(NULL, gang[i]); + btrfs_put_root(gang[i]); } - index++; - spin_lock(&fs_info->fs_roots_lock); + root_objectid++; + spin_lock(&fs_info->fs_roots_radix_lock); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log_root_tree(NULL, fs_info); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4ee8c42c9f78..8993b428e09c 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -17,13 +17,6 @@ */ #define BTRFS_BDEV_BLOCKSIZE (4096) -enum btrfs_wq_endio_type { - BTRFS_WQ_ENDIO_DATA, - BTRFS_WQ_ENDIO_METADATA, - BTRFS_WQ_ENDIO_FREE_SPACE, - BTRFS_WQ_ENDIO_RAID56, -}; - static inline u64 btrfs_sb_offset(int mirror) { u64 start = SZ_16K; @@ -121,11 +114,9 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key); -blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - enum btrfs_wq_endio_type metadata); -blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start); +bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, + extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, @@ -145,8 +136,6 @@ int btree_lock_page_hook(struct page *page, void *data, int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); -int __init btrfs_end_io_wq_init(void); -void __cold btrfs_end_io_wq_exit(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_set_buffer_lockdep_class(u64 objectid, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0867c5cd6e01..ea3ec1e761e8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1269,7 +1269,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, return ret; } -static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) +static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) { struct btrfs_device *dev = stripe->dev; struct btrfs_fs_info *fs_info = dev->fs_info; @@ -1316,76 +1316,60 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 discarded_bytes = 0; u64 end = bytenr + num_bytes; u64 cur = bytenr; - struct btrfs_io_context *bioc = NULL; /* - * Avoid races with device replace and make sure our bioc has devices - * associated to its stripes that don't go away while we are discarding. + * Avoid races with device replace and make sure the devices in the + * stripes don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); while (cur < end) { - struct btrfs_io_stripe *stripe; + struct btrfs_discard_stripe *stripes; + unsigned int num_stripes; int i; num_bytes = end - cur; - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur, - &num_bytes, &bioc, 0); - /* - * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or - * -EOPNOTSUPP. For any such error, @num_bytes is not updated, - * thus we can't continue anyway. - */ - if (ret < 0) - goto out; + stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); + if (IS_ERR(stripes)) { + ret = PTR_ERR(stripes); + if (ret == -EOPNOTSUPP) + ret = 0; + break; + } - stripe = bioc->stripes; - for (i = 0; i < bioc->num_stripes; i++, stripe++) { + for (i = 0; i < num_stripes; i++) { + struct btrfs_discard_stripe *stripe = stripes + i; u64 bytes; - struct btrfs_device *device = stripe->dev; - if (!device->bdev) { + if (!stripe->dev->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, + &stripe->dev->dev_state)) continue; ret = do_discard_extent(stripe, &bytes); - if (!ret) { - discarded_bytes += bytes; - } else if (ret != -EOPNOTSUPP) { + if (ret) { /* - * Logic errors or -ENOMEM, or -EIO, but - * unlikely to happen. - * - * And since there are two loops, explicitly - * go to out to avoid confusion. + * Keep going if discard is not supported by the + * device. */ - btrfs_put_bioc(bioc); - goto out; + if (ret != -EOPNOTSUPP) + break; + ret = 0; + } else { + discarded_bytes += bytes; } - - /* - * Just in case we get back EOPNOTSUPP for some reason, - * just ignore the return value so we don't screw up - * people calling discard_extent. - */ - ret = 0; } - btrfs_put_bioc(bioc); + kfree(stripes); + if (ret) + break; cur += num_bytes; } -out: btrfs_bio_counter_dec(fs_info); - if (actual_bytes) *actual_bytes = discarded_bytes; - - - if (ret == -EOPNOTSUPP) - ret = 0; return ret; } @@ -3832,7 +3816,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, block_group->start == fs_info->data_reloc_bg || fs_info->data_reloc_bg == 0); - if (block_group->ro) { + if (block_group->ro || block_group->zoned_data_reloc_ongoing) { ret = 1; goto out; } @@ -3894,8 +3878,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; - if (ret && ffe_ctl->for_data_reloc) + if (ret && ffe_ctl->for_data_reloc && + fs_info->data_reloc_bg == block_group->start) { + /* + * Do not allow further allocations from this block group. + * Compared to increasing the ->ro, setting the + * ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable an allocation to avoid an allocation of + * regular (non-relocation data) extent. With mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily break the write pointer. + */ + block_group->zoned_data_reloc_ongoing = 1; fs_info->data_reloc_bg = 0; + } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); @@ -3965,23 +3965,63 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, } } -static bool can_allocate_chunk(struct btrfs_fs_info *fs_info, - struct find_free_extent_ctl *ffe_ctl) +static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + /* If we can activate new zone, just allocate a chunk and use it */ + if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) + return 0; + + /* + * We already reached the max active zones. Try to finish one block + * group to make a room for a new block group. This is only possible + * for a data block group because btrfs_zone_finish() may need to wait + * for a running transaction which can cause a deadlock for metadata + * allocation. + */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + int ret = btrfs_zone_finish_one_bg(fs_info); + + if (ret == 1) + return 0; + else if (ret < 0) + return ret; + } + + /* + * If we have enough free space left in an already active block group + * and we can't activate any other zone now, do not allow allocating a + * new chunk and let find_free_extent() retry with a smaller size. + */ + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) + return -ENOSPC; + + /* + * Even min_alloc_size is not left in any block groups. Since we cannot + * activate a new block group, allocating it may not help. Let's tell a + * caller to try again and hope it progress something by writing some + * parts of the region. That is only possible for data block groups, + * where a part of the region can be written. + */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) + return -EAGAIN; + + /* + * We cannot activate a new block group and no enough space left in any + * block groups. So, allocating a new block group may not help. But, + * there is nothing to do anyway, so let's go with it. + */ + return 0; +} + +static int can_allocate_chunk(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: - return true; + return 0; case BTRFS_EXTENT_ALLOC_ZONED: - /* - * If we have enough free space left in an already - * active block group and we can't activate any other - * zone now, do not allow allocating a new chunk and - * let find_free_extent() retry with a smaller size. - */ - if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && - !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) - return false; - return true; + return can_allocate_chunk_zoned(fs_info, ffe_ctl); default: BUG(); } @@ -4063,8 +4103,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, int exist = 0; /*Check if allocation policy allows to create a new chunk */ - if (!can_allocate_chunk(fs_info, ffe_ctl)) - return -ENOSPC; + ret = can_allocate_chunk(fs_info, ffe_ctl); + if (ret) + return ret; trans = current->journal_info; if (trans) @@ -5813,7 +5854,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_qgroup_convert_reserved_meta(root, INT_MAX); btrfs_qgroup_free_meta_all_pertrans(root); - if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); else btrfs_put_root(root); @@ -5976,7 +6017,7 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, */ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) { - u64 start = SZ_1M, len = 0, end = 0; + u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; int ret; *trimmed = 0; @@ -6020,8 +6061,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) break; } - /* Ensure we skip the reserved area in the first 1M */ - start = max_t(u64, start, SZ_1M); + /* Ensure we skip the reserved space on each device. */ + start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); /* * If find_first_clear_extent_bit find a range that spans the diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8f6b544ae616..bfae67c593c5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -144,6 +144,7 @@ struct tree_entry { */ struct btrfs_bio_ctrl { struct bio *bio; + int mirror_num; enum btrfs_compression_type compress_type; u32 len_to_stripe_boundary; u32 len_to_oe_boundary; @@ -178,61 +179,56 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, return ret; } -static void submit_one_bio(struct bio *bio, int mirror_num, - enum btrfs_compression_type compress_type) +static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { - struct extent_io_tree *tree = bio->bi_private; + struct bio *bio; + struct bio_vec *bv; + struct inode *inode; + int mirror_num; + + if (!bio_ctrl->bio) + return; - bio->bi_private = NULL; + bio = bio_ctrl->bio; + bv = bio_first_bvec_all(bio); + inode = bv->bv_page->mapping->host; + mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); - if (is_data_inode(tree->private_data)) - btrfs_submit_data_bio(tree->private_data, bio, mirror_num, - compress_type); - else - btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num); - /* - * Above submission hooks will handle the error by ending the bio, - * which will do the cleanup properly. So here we should not return - * any error, or the caller of submit_extent_page() will do cleanup - * again, causing problems. - */ -} + btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; -/* Cleanup unsubmitted bios */ -static void end_write_bio(struct extent_page_data *epd, int ret) -{ - struct bio *bio = epd->bio_ctrl.bio; + if (!is_data_inode(inode)) + btrfs_submit_metadata_bio(inode, bio, mirror_num); + else if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_submit_data_write_bio(inode, bio, mirror_num); + else + btrfs_submit_data_read_bio(inode, bio, mirror_num, + bio_ctrl->compress_type); - if (bio) { - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); - epd->bio_ctrl.bio = NULL; - } + /* The bio is owned by the bi_end_io handler now */ + bio_ctrl->bio = NULL; } /* - * Submit bio from extent page data via submit_one_bio - * - * Return 0 if everything is OK. - * Return <0 for error. + * Submit or fail the current bio in an extent_page_data structure. */ -static void flush_write_bio(struct extent_page_data *epd) +static void submit_write_bio(struct extent_page_data *epd, int ret) { struct bio *bio = epd->bio_ctrl.bio; - if (bio) { - submit_one_bio(bio, 0, 0); - /* - * Clean up of epd->bio is handled by its endio function. - * And endio is either triggered by successful bio execution - * or the error handler of submit bio hook. - * So at this point, no matter what happened, we don't need - * to clean up epd->bio. - */ + if (!bio) + return; + + if (ret) { + ASSERT(ret < 0); + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + /* The bio is owned by the bi_end_io handler now */ epd->bio_ctrl.bio = NULL; + } else { + submit_one_bio(&epd->bio_ctrl); } } @@ -376,131 +372,121 @@ void free_extent_state(struct extent_state *state) } } -static struct rb_node *tree_insert(struct rb_root *root, - struct rb_node *search_start, - u64 offset, - struct rb_node *node, - struct rb_node ***p_in, - struct rb_node **parent_in) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct tree_entry *entry; - - if (p_in && parent_in) { - p = *p_in; - parent = *parent_in; - goto do_insert; - } - - p = search_start ? &search_start : &root->rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (offset < entry->start) - p = &(*p)->rb_left; - else if (offset > entry->end) - p = &(*p)->rb_right; - else - return parent; - } - -do_insert: - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - /** * Search @tree for an entry that contains @offset. Such entry would have * entry->start <= offset && entry->end >= offset. * * @tree: the tree to search * @offset: offset that should fall within an entry in @tree - * @next_ret: pointer to the first entry whose range ends after @offset - * @prev_ret: pointer to the first entry whose range begins before @offset - * @p_ret: pointer where new node should be anchored (used when inserting an + * @node_ret: pointer where new node should be anchored (used when inserting an * entry in the tree) * @parent_ret: points to entry which would have been the parent of the entry, * containing @offset * - * This function returns a pointer to the entry that contains @offset byte - * address. If no such entry exists, then NULL is returned and the other - * pointer arguments to the function are filled, otherwise the found entry is - * returned and other pointers are left untouched. + * Return a pointer to the entry that contains @offset byte address and don't change + * @node_ret and @parent_ret. + * + * If no such entry exists, return pointer to entry that ends before @offset + * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. */ -static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **next_ret, - struct rb_node **prev_ret, - struct rb_node ***p_ret, - struct rb_node **parent_ret) +static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***node_ret, + struct rb_node **parent_ret) { struct rb_root *root = &tree->state; - struct rb_node **n = &root->rb_node; + struct rb_node **node = &root->rb_node; struct rb_node *prev = NULL; - struct rb_node *orig_prev = NULL; struct tree_entry *entry; - struct tree_entry *prev_entry = NULL; - while (*n) { - prev = *n; + while (*node) { + prev = *node; entry = rb_entry(prev, struct tree_entry, rb_node); - prev_entry = entry; if (offset < entry->start) - n = &(*n)->rb_left; + node = &(*node)->rb_left; else if (offset > entry->end) - n = &(*n)->rb_right; + node = &(*node)->rb_right; else - return *n; + return *node; } - if (p_ret) - *p_ret = n; + if (node_ret) + *node_ret = node; if (parent_ret) *parent_ret = prev; - if (next_ret) { - orig_prev = prev; - while (prev && offset > prev_entry->end) { - prev = rb_next(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *next_ret = prev; - prev = orig_prev; + /* Search neighbors until we find the first one past the end */ + while (prev && offset > entry->end) { + prev = rb_next(prev); + entry = rb_entry(prev, struct tree_entry, rb_node); } - if (prev_ret) { - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - while (prev && offset < prev_entry->start) { - prev = rb_prev(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *prev_ret = prev; - } - return NULL; + return prev; } -static inline struct rb_node * -tree_search_for_insert(struct extent_io_tree *tree, - u64 offset, - struct rb_node ***p_ret, - struct rb_node **parent_ret) +/* + * Inexact rb-tree search, return the next entry if @offset is not found + */ +static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset) { - struct rb_node *next= NULL; - struct rb_node *ret; - - ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret); - if (!ret) - return next; - return ret; + return tree_search_for_insert(tree, offset, NULL, NULL); } -static inline struct rb_node *tree_search(struct extent_io_tree *tree, - u64 offset) +/** + * Search offset in the tree or fill neighbor rbtree node pointers. + * + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @next_ret: pointer to the first entry whose range ends after @offset + * @prev_ret: pointer to the first entry whose range begins before @offset + * + * Return a pointer to the entry that contains @offset byte address. If no + * such entry exists, then return NULL and fill @prev_ret and @next_ret. + * Otherwise return the found entry and other pointers are left untouched. + */ +static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree, + u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) { - return tree_search_for_insert(tree, offset, NULL, NULL); + struct rb_root *root = &tree->state; + struct rb_node **node = &root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct tree_entry *entry; + + ASSERT(prev_ret); + ASSERT(next_ret); + + while (*node) { + prev = *node; + entry = rb_entry(prev, struct tree_entry, rb_node); + + if (offset < entry->start) + node = &(*node)->rb_left; + else if (offset > entry->end) + node = &(*node)->rb_right; + else + return *node; + } + + orig_prev = prev; + while (prev && offset > entry->end) { + prev = rb_next(prev); + entry = rb_entry(prev, struct tree_entry, rb_node); + } + *next_ret = prev; + prev = orig_prev; + + entry = rb_entry(prev, struct tree_entry, rb_node); + while (prev && offset < entry->start) { + prev = rb_prev(prev); + entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + + return NULL; } /* @@ -554,7 +540,7 @@ static void merge_state(struct extent_io_tree *tree, } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, u32 *bits, + struct extent_state *state, u32 bits, struct extent_changeset *changeset); /* @@ -568,37 +554,56 @@ static void set_state_bits(struct extent_io_tree *tree, * probably isn't what you want to call (see set/clear_extent_bit). */ static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, u64 start, u64 end, - struct rb_node ***p, - struct rb_node **parent, - u32 *bits, struct extent_changeset *changeset) + struct extent_state *state, + u32 bits, struct extent_changeset *changeset) { - struct rb_node *node; - - if (end < start) { - btrfs_err(tree->fs_info, - "insert state: end < start %llu %llu", end, start); - WARN_ON(1); - } - state->start = start; - state->end = end; + struct rb_node **node; + struct rb_node *parent; + const u64 end = state->end; set_state_bits(tree, state, bits, changeset); - node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); - if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); - btrfs_err(tree->fs_info, - "found node %llu %llu on insert of %llu %llu", - found->start, found->end, start, end); - return -EEXIST; + node = &tree->state.rb_node; + while (*node) { + struct tree_entry *entry; + + parent = *node; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (end < entry->start) { + node = &(*node)->rb_left; + } else if (end > entry->end) { + node = &(*node)->rb_right; + } else { + btrfs_err(tree->fs_info, + "found node %llu %llu on insert of %llu %llu", + entry->start, entry->end, state->start, end); + return -EEXIST; + } } + + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + merge_state(tree, state); return 0; } /* + * Insert state to @tree to the location given by @node and @parent. + */ +static void insert_state_fast(struct extent_io_tree *tree, + struct extent_state *state, struct rb_node **node, + struct rb_node *parent, unsigned bits, + struct extent_changeset *changeset) +{ + set_state_bits(tree, state, bits, changeset); + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + merge_state(tree, state); +} + +/* * split a given extent state struct in two, inserting the preallocated * struct 'prealloc' as the newly created second half. 'split' indicates an * offset inside 'orig' where it should be split. @@ -615,7 +620,8 @@ static int insert_state(struct extent_io_tree *tree, static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct extent_state *prealloc, u64 split) { - struct rb_node *node; + struct rb_node *parent = NULL; + struct rb_node **node; if (tree->private_data && is_data_inode(tree->private_data)) btrfs_split_delalloc_extent(tree->private_data, orig, split); @@ -625,12 +631,27 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, prealloc->state = orig->state; orig->start = split; - node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, - &prealloc->rb_node, NULL, NULL); - if (node) { - free_extent_state(prealloc); - return -EEXIST; + parent = &orig->rb_node; + node = &parent; + while (*node) { + struct tree_entry *entry; + + parent = *node; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (prealloc->end < entry->start) { + node = &(*node)->rb_left; + } else if (prealloc->end > entry->end) { + node = &(*node)->rb_right; + } else { + free_extent_state(prealloc); + return -EEXIST; + } } + + rb_link_node(&prealloc->rb_node, parent, node); + rb_insert_color(&prealloc->rb_node, &tree->state); + return 0; } @@ -652,11 +673,11 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - u32 *bits, int wake, + u32 bits, int wake, struct extent_changeset *changeset) { struct extent_state *next; - u32 bits_to_clear = *bits & ~EXTENT_CTLBITS; + u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { @@ -818,8 +839,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - state = clear_state_bit(tree, state, &bits, wake, - changeset); + state = clear_state_bit(tree, state, bits, wake, changeset); goto next; } goto search_again; @@ -840,13 +860,13 @@ hit_next: if (wake) wake_up(&state->wq); - clear_state_bit(tree, prealloc, &bits, wake, changeset); + clear_state_bit(tree, prealloc, bits, wake, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, &bits, wake, changeset); + state = clear_state_bit(tree, state, bits, wake, changeset); next: if (last_end == (u64)-1) goto out; @@ -937,9 +957,9 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - u32 *bits, struct extent_changeset *changeset) + u32 bits, struct extent_changeset *changeset) { - u32 bits_to_set = *bits & ~EXTENT_CTLBITS; + u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; if (tree->private_data && is_data_inode(tree->private_data)) @@ -1033,11 +1053,9 @@ again: if (!node) { prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); - err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits, changeset); - if (err) - extent_io_tree_panic(tree, err); - + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, changeset); cache_state(prealloc, cached_state); prealloc = NULL; goto out; @@ -1060,7 +1078,7 @@ hit_next: goto out; } - set_state_bits(tree, state, &bits, changeset); + set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -1116,7 +1134,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits, changeset); + set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -1150,8 +1168,9 @@ hit_next: * Avoid to free 'prealloc' if it can be merged with * the later extent. */ - err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits, changeset); + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -1179,7 +1198,7 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits, changeset); + set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; @@ -1274,10 +1293,9 @@ again: err = -ENOMEM; goto out; } - err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits, NULL); - if (err) - extent_io_tree_panic(tree, err); + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, NULL); cache_state(prealloc, cached_state); prealloc = NULL; goto out; @@ -1294,9 +1312,9 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - set_state_bits(tree, state, &bits, NULL); + set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0, NULL); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1335,10 +1353,9 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits, NULL); + set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0, - NULL); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1372,8 +1389,9 @@ hit_next: * Avoid to free 'prealloc' if it can be merged with * the later extent. */ - err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits, NULL); + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1398,9 +1416,9 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits, NULL); + set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); + clear_state_bit(tree, prealloc, clear_bits, 0, NULL); prealloc = NULL; goto out; } @@ -1674,7 +1692,7 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, /* Find first extent with bits cleared */ while (1) { - node = __etree_search(tree, start, &next, &prev, NULL, NULL); + node = tree_search_prev_next(tree, start, &prev, &next); if (!node && !next && !prev) { /* * Tree is completely empty, send full range and let @@ -2007,10 +2025,12 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, u64 *end) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; - u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; + /* The sanity tests may not set a valid fs_info. */ + u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; bool found; @@ -2418,6 +2438,20 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) return ret; } +static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) +{ + if (cur_mirror == failrec->num_copies) + return cur_mirror + 1 - failrec->num_copies; + return cur_mirror + 1; +} + +static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) +{ + if (cur_mirror == 1) + return failrec->num_copies; + return cur_mirror - 1; +} + /* * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record @@ -2430,7 +2464,7 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, u64 private; struct io_failure_record *failrec; struct extent_state *state; - int num_copies; + int mirror; int ret; private = 0; @@ -2454,20 +2488,19 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, EXTENT_LOCKED); spin_unlock(&io_tree->lock); - if (state && state->start <= failrec->start && - state->end >= failrec->start + failrec->len - 1) { - num_copies = btrfs_num_copies(fs_info, failrec->logical, - failrec->len); - if (num_copies > 1) { - repair_io_failure(fs_info, ino, start, failrec->len, - failrec->logical, page, pg_offset, - failrec->failed_mirror); - } - } + if (!state || state->start > failrec->start || + state->end < failrec->start + failrec->len - 1) + goto out; + + mirror = failrec->this_mirror; + do { + mirror = prev_mirror(failrec, mirror); + repair_io_failure(fs_info, ino, start, failrec->len, + failrec->logical, page, pg_offset, mirror); + } while (mirror != failrec->failed_mirror); out: free_io_failure(failure_tree, io_tree, failrec); - return 0; } @@ -2506,17 +2539,16 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) } static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - u64 start) + struct btrfs_bio *bbio, + unsigned int bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 start = bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct extent_map *em; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; const u32 sectorsize = fs_info->sectorsize; int ret; - u64 logical; failrec = get_state_failrec(failure_tree, start); if (!IS_ERR(failrec)) { @@ -2528,7 +2560,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode * (e.g. with a list for failed_mirror) to make * clean_io_failure() clean all those errors at once. */ - + ASSERT(failrec->this_mirror == bbio->mirror_num); + ASSERT(failrec->len == fs_info->sectorsize); return failrec; } @@ -2538,41 +2571,28 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode failrec->start = start; failrec->len = sectorsize; - failrec->this_mirror = 0; - failrec->compress_type = BTRFS_COMPRESS_NONE; + failrec->failed_mirror = bbio->mirror_num; + failrec->this_mirror = bbio->mirror_num; + failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (!em) { - read_unlock(&em_tree->lock); - kfree(failrec); - return ERR_PTR(-EIO); - } + btrfs_debug(fs_info, + "new io failure record logical %llu start %llu", + failrec->logical, start); - if (em->start > start || em->start + em->len <= start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - if (!em) { + failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); + if (failrec->num_copies == 1) { + /* + * We only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. No + * matter what the error is, it is very likely to persist. + */ + btrfs_debug(fs_info, + "cannot repair logical %llu num_copies %d", + failrec->logical, failrec->num_copies); kfree(failrec); return ERR_PTR(-EIO); } - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->compress_type = em->compress_type; - } - - btrfs_debug(fs_info, - "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", - logical, start, failrec->len); - - failrec->logical = logical; - free_extent_map(em); - /* Set the bits in the private failure tree */ ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, EXTENT_LOCKED | EXTENT_DIRTY); @@ -2589,65 +2609,16 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -static bool btrfs_check_repairable(struct inode *inode, - struct io_failure_record *failrec, - int failed_mirror) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int num_copies; - - num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); - if (num_copies == 1) { - /* - * we only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. no - * matter what the error is, it is very likely to persist. - */ - btrfs_debug(fs_info, - "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return false; - } - - /* The failure record should only contain one sector */ - ASSERT(failrec->len == fs_info->sectorsize); - - /* - * There are two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk - * - * Since we're only doing repair for one sector, we only need to get - * a good copy of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. - */ - ASSERT(failed_mirror); - failrec->failed_mirror = failed_mirror; - failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; - - if (failrec->this_mirror > num_copies) { - btrfs_debug(fs_info, - "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return false; - } - - return true; -} - -int btrfs_repair_one_sector(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, int failed_mirror, +int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, + u32 bio_offset, struct page *page, unsigned int pgoff, submit_bio_hook_t *submit_bio_hook) { + u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio); + struct bio *failed_bio = &failed_bbio->bio; const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; struct btrfs_bio *repair_bbio; @@ -2657,12 +2628,24 @@ int btrfs_repair_one_sector(struct inode *inode, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, start); + failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset); if (IS_ERR(failrec)) return PTR_ERR(failrec); - - if (!btrfs_check_repairable(inode, failrec, failed_mirror)) { + /* + * There are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + * + * Since we're only doing repair for one sector, we only need to get + * a good copy of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. + */ + failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); + if (failrec->this_mirror == failrec->failed_mirror) { + btrfs_debug(fs_info, + "failed to repair num_copies %d this_mirror %d failed_mirror %d", + failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); free_io_failure(failure_tree, tree, failrec); return -EIO; } @@ -2695,7 +2678,7 @@ int btrfs_repair_one_sector(struct inode *inode, * will be handled by the endio on the repair_bio, so we can't return an * error here. */ - submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->compress_type); + submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0); return BLK_STS_OK; } @@ -2727,21 +2710,35 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } -static blk_status_t submit_data_read_repair(struct inode *inode, - struct bio *failed_bio, - u32 bio_offset, struct page *page, - unsigned int pgoff, - u64 start, u64 end, - int failed_mirror, - unsigned int error_bitmap) +static void end_sector_io(struct page *page, u64 offset, bool uptodate) +{ + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct extent_state *cached = NULL; + + end_page_read(page, uptodate, offset, sectorsize); + if (uptodate) + set_extent_uptodate(&inode->io_tree, offset, + offset + sectorsize - 1, &cached, GFP_ATOMIC); + unlock_extent_cached_atomic(&inode->io_tree, offset, + offset + sectorsize - 1, &cached); +} + +static void submit_data_read_repair(struct inode *inode, + struct btrfs_bio *failed_bbio, + u32 bio_offset, const struct bio_vec *bvec, + unsigned int error_bitmap) { + const unsigned int pgoff = bvec->bv_offset; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct page *page = bvec->bv_page; + const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; + const u64 end = start + bvec->bv_len - 1; const u32 sectorsize = fs_info->sectorsize; const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; - int error = 0; int i; - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); /* This repair is only for data */ ASSERT(is_data_inode(inode)); @@ -2753,12 +2750,11 @@ static blk_status_t submit_data_read_repair(struct inode *inode, * We only get called on buffered IO, thus page must be mapped and bio * must not be cloned. */ - ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED)); + ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); /* Iterate through all the sectors in the range */ for (i = 0; i < nr_bits; i++) { const unsigned int offset = i * sectorsize; - struct extent_state *cached = NULL; bool uptodate = false; int ret; @@ -2771,10 +2767,9 @@ static blk_status_t submit_data_read_repair(struct inode *inode, goto next; } - ret = btrfs_repair_one_sector(inode, failed_bio, - bio_offset + offset, - page, pgoff + offset, start + offset, - failed_mirror, btrfs_submit_data_bio); + ret = btrfs_repair_one_sector(inode, failed_bbio, + bio_offset + offset, page, pgoff + offset, + btrfs_submit_data_read_bio); if (!ret) { /* * We have submitted the read repair, the page release @@ -2785,24 +2780,12 @@ static blk_status_t submit_data_read_repair(struct inode *inode, continue; } /* - * Repair failed, just record the error but still continue. - * Or the remaining sectors will not be properly unlocked. + * Continue on failed repair, otherwise the remaining sectors + * will not be properly unlocked. */ - if (!error) - error = ret; next: - end_page_read(page, uptodate, start + offset, sectorsize); - if (uptodate) - set_extent_uptodate(&BTRFS_I(inode)->io_tree, - start + offset, - start + offset + sectorsize - 1, - &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree, - start + offset, - start + offset + sectorsize - 1, - &cached); + end_sector_io(page, start + offset, uptodate); } - return errno_to_blk_status(error); } /* lots and lots of room for performance fixes in the end_bio funcs */ @@ -2966,7 +2949,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) } /* - * Find extent buffer for a given bytenr. + * Find extent buffer for a givne bytenr. * * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking * in endio context. @@ -2985,9 +2968,11 @@ static struct extent_buffer *find_extent_buffer_readpage( return (struct extent_buffer *)page->private; } - /* For subpage case, we need to lookup extent buffer xarray */ - eb = xa_load(&fs_info->extent_buffers, - bytenr >> fs_info->sectorsize_bits); + /* For subpage case, we need to lookup buffer radix tree */ + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + bytenr >> fs_info->sectorsize_bits); + rcu_read_unlock(); ASSERT(eb); return eb; } @@ -3015,7 +3000,6 @@ static void end_bio_extent_readpage(struct bio *bio) */ u32 bio_offset = 0; int mirror; - int ret; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -3026,6 +3010,7 @@ static void end_bio_extent_readpage(struct bio *bio) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; unsigned int error_bitmap = (unsigned int)-1; + bool repair = false; u64 start; u64 end; u32 len; @@ -3063,57 +3048,23 @@ static void end_bio_extent_readpage(struct bio *bio) if (is_data_inode(inode)) { error_bitmap = btrfs_verify_data_csum(bbio, bio_offset, page, start, end); - ret = error_bitmap; + if (error_bitmap) + uptodate = false; } else { - ret = btrfs_validate_metadata_buffer(bbio, - page, start, end, mirror); + if (btrfs_validate_metadata_buffer(bbio, + page, start, end, mirror)) + uptodate = false; } - if (ret) - uptodate = false; - else - clean_io_failure(BTRFS_I(inode)->root->fs_info, - failure_tree, tree, start, - page, - btrfs_ino(BTRFS_I(inode)), 0); } - if (likely(uptodate)) - goto readpage_ok; - - if (is_data_inode(inode)) { - /* - * If we failed to submit the IO at all we'll have a - * mirror_num == 0, in which case we need to just mark - * the page with an error and unlock it and carry on. - */ - if (mirror == 0) - goto readpage_ok; - - /* - * submit_data_read_repair() will handle all the good - * and bad sectors, we just continue to the next bvec. - */ - submit_data_read_repair(inode, bio, bio_offset, page, - start - page_offset(page), - start, end, mirror, - error_bitmap); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - continue; - } else { - struct extent_buffer *eb; - - eb = find_extent_buffer_readpage(fs_info, page, start); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = mirror; - atomic_dec(&eb->io_pages); - } -readpage_ok: if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; + clean_io_failure(BTRFS_I(inode)->root->fs_info, + failure_tree, tree, start, page, + btrfs_ino(BTRFS_I(inode)), 0); + /* * Zero out the remaining part if this range straddles * i_size. @@ -3130,14 +3081,44 @@ readpage_ok: zero_user_segment(page, zero_start, offset_in_page(end) + 1); } + } else if (is_data_inode(inode)) { + /* + * Only try to repair bios that actually made it to a + * device. If the bio failed to be submitted mirror + * is 0 and we need to fail it without retrying. + * + * This also includes the high level bios for compressed + * extents - these never make it to a device and repair + * is already handled on the lower compressed bio. + */ + if (mirror > 0) + repair = true; + } else { + struct extent_buffer *eb; + + eb = find_extent_buffer_readpage(fs_info, page, start); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = mirror; + atomic_dec(&eb->io_pages); + } + + if (repair) { + /* + * submit_data_read_repair() will handle all the good + * and bad sectors, we just continue to the next bvec. + */ + submit_data_read_repair(inode, bbio, bio_offset, bvec, + error_bitmap); + } else { + /* Update page status and unlock */ + end_page_read(page, uptodate, start, len); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, PageUptodate(page)); } + ASSERT(bio_offset + len > bio_offset); bio_offset += len; - /* Update page status and unlock */ - end_page_read(page, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); @@ -3206,19 +3187,6 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) return bio; } -struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio) -{ - struct btrfs_bio *bbio; - struct bio *new; - - /* Bio allocation backed by a bioset does not fail */ - new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(new); - btrfs_bio_init(bbio); - bbio->iter = bio->bi_iter; - return new; -} - struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) { struct bio *bio; @@ -3357,7 +3325,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, static int alloc_new_bio(struct btrfs_inode *inode, struct btrfs_bio_ctrl *bio_ctrl, struct writeback_control *wbc, - unsigned int opf, + blk_opf_t opf, bio_end_io_t end_io_func, u64 disk_bytenr, u32 offset, u64 file_offset, enum btrfs_compression_type compress_type) @@ -3378,7 +3346,6 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio_ctrl->bio = bio; bio_ctrl->compress_type = compress_type; bio->bi_end_io = end_io_func; - bio->bi_private = &inode->io_tree; bio->bi_opf = opf; ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) @@ -3437,13 +3404,12 @@ error: * @prev_bio_flags: flags of previous bio to see if we can merge the current one * @compress_type: compress type for current bio */ -static int submit_extent_page(unsigned int opf, +static int submit_extent_page(blk_opf_t opf, struct writeback_control *wbc, struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, size_t size, unsigned long pg_offset, bio_end_io_t end_io_func, - int mirror_num, enum btrfs_compression_type compress_type, bool force_bio_submit) { @@ -3455,10 +3421,8 @@ static int submit_extent_page(unsigned int opf, ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); - if (force_bio_submit && bio_ctrl->bio) { - submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); - bio_ctrl->bio = NULL; - } + if (force_bio_submit) + submit_one_bio(bio_ctrl); while (cur < pg_offset + size) { u32 offset = cur - pg_offset; @@ -3498,8 +3462,7 @@ static int submit_extent_page(unsigned int opf, if (added < size - offset) { /* The bio should contain some page(s) */ ASSERT(bio_ctrl->bio->bi_iter.bi_size); - submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); - bio_ctrl->bio = NULL; + submit_one_bio(bio_ctrl); } cur += added; } @@ -3615,7 +3578,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, */ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, - unsigned int read_flags, u64 *prev_em_start) + blk_opf_t read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3647,7 +3610,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (zero_offset) { iosize = PAGE_SIZE - zero_offset; memzero_page(page, zero_offset, iosize); - flush_dcache_page(page); } } begin_page_read(fs_info, page); @@ -3662,7 +3624,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, iosize = PAGE_SIZE - pg_offset; memzero_page(page, pg_offset, iosize); - flush_dcache_page(page); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, @@ -3746,7 +3707,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct extent_state *cached = NULL; memzero_page(page, pg_offset, iosize); - flush_dcache_page(page); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); @@ -3779,10 +3739,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, bio_ctrl, page, disk_bytenr, iosize, - pg_offset, - end_bio_extent_readpage, 0, - this_bio_flag, - force_bio_submit); + pg_offset, end_bio_extent_readpage, + this_bio_flag, force_bio_submit); if (ret) { /* * We have to unlock the remaining range, or the page @@ -3815,8 +3773,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio) * If btrfs_do_readpage() failed we will want to submit the assembled * bio to do the cleanup. */ - if (bio_ctrl.bio) - submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); + submit_one_bio(&bio_ctrl); return ret; } @@ -3983,8 +3940,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, int saved_ret = 0; int ret = 0; int nr = 0; - u32 opf = REQ_OP_WRITE; - const unsigned int write_flags = wbc_to_write_flags(wbc); + enum req_op op = REQ_OP_WRITE; + const blk_opf_t write_flags = wbc_to_write_flags(wbc); bool has_error = false; bool compressed; @@ -4058,7 +4015,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, iosize = min(min(em_end, end + 1), dirty_range_end) - cur; if (btrfs_use_zone_append(inode, em->block_start)) - opf = REQ_OP_ZONE_APPEND; + op = REQ_OP_ZONE_APPEND; free_extent_map(em); em = NULL; @@ -4094,12 +4051,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ btrfs_page_clear_dirty(fs_info, page, cur, iosize); - ret = submit_extent_page(opf | write_flags, wbc, + ret = submit_extent_page(op | write_flags, wbc, &epd->bio_ctrl, page, disk_bytenr, iosize, cur - page_offset(page), end_bio_extent_writepage, - 0, 0, false); + 0, false); if (ret) { has_error = true; if (!saved_ret) @@ -4164,10 +4121,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, return 0; } - if (page->index == end_index) { + if (page->index == end_index) memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); - flush_dcache_page(page); - } ret = set_page_extent_mapped(page); if (ret < 0) { @@ -4276,7 +4231,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - flush_write_bio(epd); + submit_write_bio(epd, 0); flush = 1; btrfs_tree_lock(eb); } @@ -4286,7 +4241,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!epd->sync_io) return 0; if (!flush) { - flush_write_bio(epd); + submit_write_bio(epd, 0); flush = 1; } while (1) { @@ -4333,7 +4288,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - flush_write_bio(epd); + submit_write_bio(epd, 0); flush = 1; } lock_page(p); @@ -4435,8 +4390,8 @@ static struct extent_buffer *find_extent_buffer_nolock( struct extent_buffer *eb; rcu_read_lock(); - eb = xa_load(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); return eb; @@ -4575,7 +4530,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; - unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + blk_opf_t write_flags = wbc_to_write_flags(wbc); bool no_dirty_ebs = false; int ret; @@ -4594,7 +4549,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, &epd->bio_ctrl, page, eb->start, eb->len, eb->start - page_offset(page), - end_bio_subpage_eb_writepage, 0, 0, false); + end_bio_subpage_eb_writepage, 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); set_btree_ioerr(page, eb); @@ -4620,7 +4575,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, { u64 disk_bytenr = eb->start; int i, num_pages; - unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + blk_opf_t write_flags = wbc_to_write_flags(wbc); int ret = 0; prepare_eb_write(eb); @@ -4635,7 +4590,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, &epd->bio_ctrl, p, disk_bytenr, PAGE_SIZE, 0, end_bio_extent_buffer_writepage, - 0, 0, false); + 0, false); if (ret) { set_btree_ioerr(p, eb); if (PageWriteback(p)) @@ -4749,7 +4704,7 @@ static int submit_eb_subpage(struct page *page, cleanup: /* We hit error, end bio for the submitted extent buffers */ - end_write_bio(epd, ret); + submit_write_bio(epd, ret); return ret; } @@ -4928,10 +4883,6 @@ retry: index = 0; goto retry; } - if (ret < 0) { - end_write_bio(&epd, ret); - goto out; - } /* * If something went wrong, don't allow any metadata write bio to be * submitted. @@ -4958,21 +4909,17 @@ retry: * Now such dirty tree block will not be cleaned by any dirty * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. - */ - if (!BTRFS_FS_ERROR(fs_info)) { - flush_write_bio(&epd); - } else { - ret = -EROFS; - end_write_bio(&epd, ret); - } -out: - btrfs_zoned_meta_io_unlock(fs_info); - /* + * * We can get ret > 0 from submit_extent_page() indicating how many ebs * were submitted. Reset it to 0 to avoid false alerts for the caller. */ if (ret > 0) ret = 0; + if (!ret && BTRFS_FS_ERROR(fs_info)) + ret = -EROFS; + submit_write_bio(&epd, ret); + + btrfs_zoned_meta_io_unlock(fs_info); return ret; } @@ -5074,7 +5021,7 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - flush_write_bio(epd); + submit_write_bio(epd, 0); lock_page(page); } @@ -5085,7 +5032,7 @@ retry: if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) - flush_write_bio(epd); + submit_write_bio(epd, 0); wait_on_page_writeback(page); } @@ -5125,7 +5072,7 @@ retry: * page in our current bio, and thus deadlock, so flush the * write bio here. */ - flush_write_bio(epd); + submit_write_bio(epd, 0); goto retry; } @@ -5136,26 +5083,6 @@ retry: return ret; } -int extent_write_full_page(struct page *page, struct writeback_control *wbc) -{ - int ret; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, - .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, - }; - - ret = __extent_writepage(page, wbc, &epd); - ASSERT(ret <= 0); - if (ret < 0) { - end_write_bio(&epd, ret); - return ret; - } - - flush_write_bio(&epd); - return ret; -} - /* * Submit the pages in the range to bio for call sites which delalloc range has * already been ran (aka, ordered extent inserted) and all pages are still @@ -5213,10 +5140,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) cur = cur_end + 1; } - if (!found_error) - flush_write_bio(&epd); - else - end_write_bio(&epd, ret); + submit_write_bio(&epd, found_error ? ret : 0); wbc_detach_inode(&wbc_writepages); if (found_error) @@ -5241,13 +5165,8 @@ int extent_writepages(struct address_space *mapping, */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); + submit_write_bio(&epd, ret); btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); - ASSERT(ret <= 0); - if (ret < 0) { - end_write_bio(&epd, ret); - return ret; - } - flush_write_bio(&epd); return ret; } @@ -5269,9 +5188,7 @@ void extent_readahead(struct readahead_control *rac) if (em_cached) free_extent_map(em_cached); - - if (bio_ctrl.bio) - submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); + submit_one_bio(&bio_ctrl); } /* @@ -6128,22 +6045,24 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, if (!eb) return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; - - do { - ret = xa_insert(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits, - eb, GFP_NOFS); - if (ret == -ENOMEM) { - exists = ERR_PTR(ret); +again: + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); + goto free_eb; + } + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) goto free_eb; - } - if (ret == -EBUSY) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - } - } while (ret); - + else + goto again; + } check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6201,7 +6120,7 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) return -EINVAL; } if (fs_info->nodesize >= PAGE_SIZE && - !IS_ALIGNED(start, PAGE_SIZE)) { + !PAGE_ALIGNED(start)) { btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u", start, fs_info->nodesize); @@ -6318,22 +6237,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - - do { - ret = xa_insert(&fs_info->extent_buffers, - start >> fs_info->sectorsize_bits, - eb, GFP_NOFS); - if (ret == -ENOMEM) { - exists = ERR_PTR(ret); +again: + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); + goto free_eb; + } + + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) goto free_eb; - } - if (ret == -EBUSY) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - } - } while (ret); - + else + goto again; + } /* add one reference for the tree */ check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6378,8 +6300,10 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); - xa_erase(&fs_info->extent_buffers, - eb->start >> fs_info->sectorsize_bits); + spin_lock(&fs_info->buffer_lock); + radix_tree_delete(&fs_info->buffer_radix, + eb->start >> fs_info->sectorsize_bits); + spin_unlock(&fs_info->buffer_lock); } else { spin_unlock(&eb->refs_lock); } @@ -6598,7 +6522,9 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { + .mirror_num = mirror_num, + }; int ret = 0; ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); @@ -6630,11 +6556,10 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); - ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl, + ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, page, eb->start, eb->len, eb->start - page_offset(page), - end_bio_extent_readpage, mirror_num, 0, - true); + end_bio_extent_readpage, 0, true); if (ret) { /* * In the endio function, if we hit something wrong we will @@ -6643,10 +6568,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } - if (bio_ctrl.bio) { - submit_one_bio(bio_ctrl.bio, mirror_num, 0); - bio_ctrl.bio = NULL; - } + submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) return ret; @@ -6666,7 +6588,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) int all_uptodate = 1; int num_pages; unsigned long num_reads = 0; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { + .mirror_num = mirror_num, + }; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -6737,10 +6661,10 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, + err = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, page, page_offset(page), PAGE_SIZE, 0, end_bio_extent_readpage, - mirror_num, 0, false); + 0, false); if (err) { /* * We failed to submit the bio so it's the @@ -6757,10 +6681,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } - if (bio_ctrl.bio) { - submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.compress_type); - bio_ctrl.bio = NULL; - } + submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) return ret; @@ -7324,25 +7245,42 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } +#define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) { - struct extent_buffer *eb; - unsigned long index; + struct extent_buffer *gang[GANG_LOOKUP_SIZE]; + struct extent_buffer *found = NULL; u64 page_start = page_offset(page); + u64 cur = page_start; ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - xa_for_each_start(&fs_info->extent_buffers, index, eb, - page_start >> fs_info->sectorsize_bits) { - if (in_range(eb->start, page_start, PAGE_SIZE)) - return eb; - else if (eb->start >= page_start + PAGE_SIZE) - /* Already beyond page end */ - return NULL; + while (cur < page_start + PAGE_SIZE) { + int ret; + int i; + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, + (void **)gang, cur >> fs_info->sectorsize_bits, + min_t(unsigned int, GANG_LOOKUP_SIZE, + PAGE_SIZE / fs_info->nodesize)); + if (ret == 0) + goto out; + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + goto out; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + goto out; + } + } + cur = gang[ret - 1]->start + gang[ret - 1]->len; } - return NULL; +out: + return found; } static int try_release_subpage_extent_buffer(struct page *page) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 23d4103c8831..4bc72a87b9a9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -57,6 +57,7 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) +struct btrfs_bio; struct btrfs_root; struct btrfs_inode; struct btrfs_io_bio; @@ -142,15 +143,10 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) struct extent_map_tree; -typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); - int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); -int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); @@ -247,7 +243,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); -struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); @@ -266,15 +261,13 @@ struct io_failure_record { u64 start; u64 len; u64 logical; - enum btrfs_compression_type compress_type; int this_mirror; int failed_mirror; + int num_copies; }; -int btrfs_repair_one_sector(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, int failed_mirror, +int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, + u32 bio_offset, struct page *page, unsigned int pgoff, submit_bio_hook_t *submit_bio_hook); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1fd827b99c1b..66c822182ecc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1848,7 +1848,6 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { - const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC); struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1902,15 +1901,6 @@ relock: } /* - * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw() - * calls generic_write_sync() (through iomap_dio_complete()), because - * that results in calling fsync (btrfs_sync_file()) which will try to - * lock the inode in exclusive/write mode. - */ - if (is_sync_write) - iocb->ki_flags &= ~IOCB_DSYNC; - - /* * The iov_iter can be mapped to the same file range we are writing to. * If that's the case, then we will deadlock in the iomap code, because * it first calls our callback btrfs_dio_iomap_begin(), which will create @@ -1965,17 +1955,24 @@ again: btrfs_inode_unlock(inode, ilock_flags); /* - * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do - * the fsync (call generic_write_sync()). + * If 'err' is -ENOTBLK or we have not written all data, then it means + * we must fallback to buffered IO. */ - if (is_sync_write) - iocb->ki_flags |= IOCB_DSYNC; - - /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) goto out; buffered: + /* + * If we are in a NOWAIT context, then return -EAGAIN to signal the caller + * it must retry the operation in a context where blocking is acceptable, + * since we currently don't have NOWAIT semantics support for buffered IO + * and may block there for many reasons (reserving space for example). + */ + if (iocb->ki_flags & IOCB_NOWAIT) { + err = -EAGAIN; + goto out; + } + pos = iocb->ki_pos; written_buffered = btrfs_buffered_write(iocb, from); if (written_buffered < 0) { @@ -2038,7 +2035,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; - const bool sync = iocb->ki_flags & IOCB_DSYNC; + const bool sync = iocb_is_dsync(iocb); /* * If the fs flips readonly due to some impossible error, although we @@ -2058,9 +2055,11 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, num_written = btrfs_encoded_write(iocb, from, encoded); num_sync = encoded->len; } else if (iocb->ki_flags & IOCB_DIRECT) { - num_written = num_sync = btrfs_direct_write(iocb, from); + num_written = btrfs_direct_write(iocb, from); + num_sync = num_written; } else { - num_written = num_sync = btrfs_buffered_write(iocb, from); + num_written = btrfs_buffered_write(iocb, from); + num_sync = num_written; } btrfs_set_inode_last_sub_trans(inode); @@ -2308,7 +2307,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; } /* we've logged all the items and now have a consistent @@ -2323,25 +2322,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); - if (ret != BTRFS_NO_LOG_SYNC) { + if (ret == BTRFS_NO_LOG_SYNC) { + ret = btrfs_end_transaction(trans); + goto out; + } + + /* We successfully logged the inode, attempt to sync the log. */ + if (!ret) { + ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { - ret = btrfs_sync_log(trans, root, &ctx); - if (!ret) { - ret = btrfs_end_transaction(trans); - goto out; - } - } - if (!full_sync) { - ret = btrfs_wait_ordered_range(inode, start, len); - if (ret) { - btrfs_end_transaction(trans); - goto out; - } + ret = btrfs_end_transaction(trans); + goto out; } - ret = btrfs_commit_transaction(trans); - } else { + } + + /* + * At this point we need to commit the transaction because we had + * btrfs_need_log_full_commit() or some other error. + * + * If we didn't do a full sync we have to stop the trans handle, wait on + * the ordered extents, start it again and commit the transaction. If + * we attempt to wait on the ordered extents here we could deadlock with + * something like fallocate() that is holding the extent lock trying to + * start a transaction while some other thread is trying to commit the + * transaction while we (fsync) are currently holding the transaction + * open. + */ + if (!full_sync) { ret = btrfs_end_transaction(trans); + if (ret) + goto out; + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) + goto out; + + /* + * This is safe to use here because we're only interested in + * making sure the transaction that had the ordered extents is + * committed. We aren't waiting on anything past this point, + * we're purely getting the transaction and committing it. + */ + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + + /* + * We committed the transaction and there's no currently + * running transaction, this means everything we care + * about made it to disk and we are done. + */ + if (ret == -ENOENT) + ret = 0; + goto out; + } } + + ret = btrfs_commit_transaction(trans); out: ASSERT(list_empty(&ctx.list)); err = file_check_and_advance_wb_err(file); @@ -2697,7 +2733,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, goto out; } rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); - rsv->failfast = 1; + rsv->failfast = true; /* * 1 - update the inode @@ -2719,7 +2755,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); + if (WARN_ON(ret)) + goto out_trans; trans->block_rsv = rsv; cur_offset = start; @@ -2803,6 +2840,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, extent_info->file_offset += replace_len; } + /* + * We are releasing our handle on the transaction, balance the + * dirty pages of the btree inode and flush delayed items, and + * then get a new transaction handle, which may now point to a + * new transaction in case someone else may have committed the + * transaction we used to replace/drop file extent items. So + * bump the inode's iversion and update mtime and ctime except + * if we are called from a dedupe context. This is because a + * power failure/crash may happen after the transaction is + * committed and before we finish replacing/dropping all the + * file extent items we need. + */ + inode_inc_iversion(&inode->vfs_inode); + + if (!extent_info || extent_info->update_times) { + inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode); + inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime; + } + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -2819,7 +2875,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); /* shouldn't happen */ + if (WARN_ON(ret)) + break; trans->block_rsv = rsv; cur_offset = drop_args.drop_end; @@ -3042,7 +3099,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b1ae3ba2ca2c..996da650ecdc 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3536,7 +3536,8 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group, * data, keep it dense. */ if (btrfs_test_opt(fs_info, SSD_SPREAD)) { - cont1_bytes = min_bytes = bytes + empty_size; + cont1_bytes = bytes + empty_size; + min_bytes = cont1_bytes; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; min_bytes = fs_info->sectorsize; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81737eff92f3..f0c97d25b4a0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -114,21 +114,17 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); -static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock); + unsigned long *nr_written, int unlock, + u64 *done_offset); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type); -static void __endio_write_update_ordered(struct btrfs_inode *inode, - const u64 offset, const u64 bytes, - const bool uptodate); - /* * btrfs_inode_lock - lock inode i_rwsem based on arguments passed * @@ -195,11 +191,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = page_offset(locked_page); - u64 page_end = page_start + PAGE_SIZE - 1; - + u64 page_start, page_end; struct page *page; + if (locked_page) { + page_start = page_offset(locked_page); + page_end = page_start + PAGE_SIZE - 1; + } + while (index <= end_index) { /* * For locked page, we will call end_extent_writepage() on it @@ -212,7 +211,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * btrfs_mark_ordered_io_finished() would skip the accounting * for the page range, and the ordered extent will never finish. */ - if (index == (page_offset(locked_page) >> PAGE_SHIFT)) { + if (locked_page && index == (page_start >> PAGE_SHIFT)) { index++; continue; } @@ -223,7 +222,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, /* * Here we just clear all Ordered bits for every page in the - * range, then __endio_write_update_ordered() will handle + * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, @@ -231,20 +230,23 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, put_page(page); } - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE) - return; - /* - * In case this page belongs to the delalloc range being instantiated - * then skip it, since the first page of a range is going to be - * properly cleaned up by the caller of run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; + if (locked_page) { + /* The locked page covers the full range, nothing needs to be done */ + if (bytes + offset <= page_start + PAGE_SIZE) + return; + /* + * In case this page belongs to the delalloc range being + * instantiated then skip it, since the first page of a range is + * going to be properly cleaned up by the caller of + * run_delalloc_range + */ + if (page_start >= offset && page_end <= (offset + bytes - 1)) { + bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; + offset = page_offset(locked_page) + PAGE_SIZE; + } } - return __endio_write_update_ordered(inode, offset, bytes, false); + return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } static int btrfs_dirty_inode(struct inode *inode); @@ -332,9 +334,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); - kaddr = kmap_atomic(cpage); + kaddr = kmap_local_page(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); i++; ptr += cur_size; @@ -345,9 +347,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, } else { page = find_get_page(inode->vfs_inode.i_mapping, 0); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); write_extent_buffer(leaf, kaddr, ptr, size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); put_page(page); } btrfs_mark_buffer_dirty(leaf); @@ -485,7 +487,7 @@ struct async_chunk { struct page *locked_page; u64 start; u64 end; - unsigned int write_flags; + blk_opf_t write_flags; struct list_head extents; struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; @@ -560,8 +562,8 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, * will unlock the full page. */ if (fs_info->sectorsize < PAGE_SIZE) { - if (!IS_ALIGNED(start, PAGE_SIZE) || - !IS_ALIGNED(end + 1, PAGE_SIZE)) + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(end + 1)) return 0; } @@ -678,8 +680,8 @@ again: * Thus we must also check against @actual_end, not just @end. */ if (blocksize < PAGE_SIZE) { - if (!IS_ALIGNED(start, PAGE_SIZE) || - !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE)) + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(round_up(actual_end, blocksize))) goto cleanup_and_bail_uncompressed; } @@ -920,15 +922,25 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, * can directly submit them without interruption. */ ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, 0); + &nr_written, 0, NULL); /* Inline extent inserted, page gets unlocked and everything is done */ if (page_started) { ret = 0; goto out; } if (ret < 0) { - if (locked_page) + btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); + if (locked_page) { + const u64 page_start = page_offset(locked_page); + const u64 page_end = page_start + PAGE_SIZE - 1; + + btrfs_page_set_error(inode->root->fs_info, locked_page, + page_start, PAGE_SIZE); + set_page_writeback(locked_page); + end_page_writeback(locked_page); + end_extent_writepage(locked_page, ret, page_start, page_end); unlock_page(locked_page); + } goto out; } @@ -1133,15 +1145,39 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * *page_started is set to one if we unlock locked_page and do everything * required to start IO on it. It may be clean and already done with * IO when we return. + * + * When unlock == 1, we unlock the pages in successfully allocated regions. + * When unlock == 0, we leave them locked for writing them out. + * + * However, we unlock all the pages except @locked_page in case of failure. + * + * In summary, page locking state will be as follow: + * + * - page_started == 1 (return value) + * - All the pages are unlocked. IO is started. + * - Note that this can happen only on success + * - unlock == 1 + * - All the pages except @locked_page are unlocked in any case + * - unlock == 0 + * - On success, all the pages are locked for writing out them + * - On failure, all the pages except @locked_page are unlocked + * + * When a failure happens in the second or later iteration of the + * while-loop, the ordered extents created in previous iterations are kept + * intact. So, the caller must clean them up by calling + * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for + * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock) + unsigned long *nr_written, int unlock, + u64 *done_offset) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 alloc_hint = 0; + u64 orig_start = start; u64 num_bytes; unsigned long ram_size; u64 cur_alloc_size = 0; @@ -1329,18 +1365,62 @@ out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: + /* + * If done_offset is non-NULL and ret == -EAGAIN, we expect the + * caller to write out the successfully allocated region and retry. + */ + if (done_offset && ret == -EAGAIN) { + if (orig_start < start) + *done_offset = start - 1; + else + *done_offset = start; + return ret; + } else if (ret == -EAGAIN) { + /* Convert to -ENOSPC since the caller cannot retry. */ + ret = -ENOSPC; + } + + /* + * Now, we have three regions to clean up: + * + * |-------(1)----|---(2)---|-------------(3)----------| + * `- orig_start `- start `- start + cur_alloc_size `- end + * + * We process each region below. + */ + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + + /* + * For the range (1). We have already instantiated the ordered extents + * for this region. They are cleaned up by + * btrfs_cleanup_ordered_extents() in e.g, + * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are + * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | + * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup + * function. + * + * However, in case of unlock == 0, we still need to unlock the pages + * (except @locked_page) to ensure all the pages are unlocked. + */ + if (!unlock && orig_start < start) { + if (!locked_page) + mapping_set_error(inode->vfs_inode.i_mapping, ret); + extent_clear_unlock_delalloc(inode, orig_start, start - 1, + locked_page, 0, page_ops); + } + /* - * If we reserved an extent for our delalloc range (or a subrange) and - * failed to create the respective ordered extent, then it means that - * when we reserved the extent we decremented the extent's size from - * the data space_info's bytes_may_use counter and incremented the - * space_info's bytes_reserved counter by the same amount. We must make - * sure extent_clear_unlock_delalloc() does not try to decrement again - * the data space_info's bytes_may_use counter, therefore we do not pass - * it the flag EXTENT_CLEAR_DATA_RESV. + * For the range (2). If we reserved an extent for our delalloc range + * (or a subrange) and failed to create the respective ordered extent, + * then it means that when we reserved the extent we decremented the + * extent's size from the data space_info's bytes_may_use counter and + * incremented the space_info's bytes_reserved counter by the same + * amount. We must make sure extent_clear_unlock_delalloc() does not try + * to decrement again the data space_info's bytes_may_use counter, + * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, @@ -1350,12 +1430,19 @@ out_unlock: page_ops); start += cur_alloc_size; if (start >= end) - goto out; + return ret; } + + /* + * For the range (3). We never touched the region. In addition to the + * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data + * space_info's bytes_may_use counter, reserved in + * btrfs_check_data_free_space(). + */ extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); - goto out; + return ret; } /* @@ -1435,7 +1522,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, int i; bool should_compress; unsigned nofs_flag; - const unsigned int write_flags = wbc_to_write_flags(wbc); + const blk_opf_t write_flags = wbc_to_write_flags(wbc); unlock_extent(&inode->io_tree, start, end); @@ -1538,19 +1625,42 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, u64 end, int *page_started, unsigned long *nr_written) { + u64 done_offset = end; int ret; + bool locked_page_done = false; - ret = cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 0); - if (ret) - return ret; + while (start <= end) { + ret = cow_file_range(inode, locked_page, start, end, page_started, + nr_written, 0, &done_offset); + if (ret && ret != -EAGAIN) + return ret; - if (*page_started) - return 0; + if (*page_started) { + ASSERT(ret == 0); + return 0; + } + + if (ret == 0) + done_offset = end; + + if (done_offset == start) { + struct btrfs_fs_info *info = inode->root->fs_info; + + wait_var_event(&info->zone_finish_wait, + !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags)); + continue; + } + + if (!locked_page_done) { + __set_page_dirty_nobuffers(locked_page); + account_page_redirty(locked_page); + } + locked_page_done = true; + extent_write_locked_range(&inode->vfs_inode, start, done_offset); + + start = done_offset + 1; + } - __set_page_dirty_nobuffers(locked_page); - account_page_redirty(locked_page); - extent_write_locked_range(&inode->vfs_inode, start, end); *page_started = 1; return 0; @@ -1642,7 +1752,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, } return cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 1); + nr_written, 1, NULL); } struct can_nocow_file_extent_args { @@ -2115,7 +2225,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page page_started, nr_written); else ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + page_started, nr_written, 1, NULL); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); ret = cow_file_range_async(inode, wbc, locked_page, start, end, @@ -2131,6 +2241,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 size; /* not delalloc, ignore it */ @@ -2138,7 +2249,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, return; size = orig->end - orig->start + 1; - if (size > BTRFS_MAX_EXTENT_SIZE) { + if (size > fs_info->max_extent_size) { u32 num_extents; u64 new_size; @@ -2147,10 +2258,10 @@ void btrfs_split_delalloc_extent(struct inode *inode, * applies here, just in reverse. */ new_size = orig->end - split + 1; - num_extents = count_max_extents(new_size); + num_extents = count_max_extents(fs_info, new_size); new_size = split - orig->start; - num_extents += count_max_extents(new_size); - if (count_max_extents(size) >= num_extents) + num_extents += count_max_extents(fs_info, new_size); + if (count_max_extents(fs_info, size) >= num_extents) return; } @@ -2167,6 +2278,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 new_size, old_size; u32 num_extents; @@ -2180,7 +2292,7 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ - if (new_size <= BTRFS_MAX_EXTENT_SIZE) { + if (new_size <= fs_info->max_extent_size) { spin_lock(&BTRFS_I(inode)->lock); btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); @@ -2206,10 +2318,10 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, * this case. */ old_size = other->end - other->start + 1; - num_extents = count_max_extents(old_size); + num_extents = count_max_extents(fs_info, old_size); old_size = new->end - new->start + 1; - num_extents += count_max_extents(old_size); - if (count_max_extents(new_size) >= num_extents) + num_extents += count_max_extents(fs_info, old_size); + if (count_max_extents(fs_info, new_size) >= num_extents) return; spin_lock(&BTRFS_I(inode)->lock); @@ -2274,21 +2386,21 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * list of inodes that have pending delalloc work to be done. */ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - unsigned *bits) + u32 bits) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) + if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - u32 num_extents = count_max_extents(len); + u32 num_extents = count_max_extents(fs_info, len); bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); spin_lock(&BTRFS_I(inode)->lock); @@ -2303,7 +2415,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; - if (*bits & EXTENT_DEFRAG) + if (bits & EXTENT_DEFRAG) BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags)) @@ -2312,7 +2424,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, } if (!(state->state & EXTENT_DELALLOC_NEW) && - (*bits & EXTENT_DELALLOC_NEW)) { + (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - state->start; @@ -2325,14 +2437,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, * accounting happens. */ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, - struct extent_state *state, unsigned *bits) + struct extent_state *state, u32 bits) { struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); u64 len = state->end + 1 - state->start; - u32 num_extents = count_max_extents(len); + u32 num_extents = count_max_extents(fs_info, len); - if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) { + if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; spin_unlock(&inode->lock); @@ -2343,7 +2455,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); @@ -2356,7 +2468,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * don't need to call delalloc_release_metadata if there is an * error. */ - if (*bits & EXTENT_CLEAR_META_RESV && + if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len, false); @@ -2366,7 +2478,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && - (*bits & EXTENT_CLEAR_DATA_RESV)) + (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, @@ -2381,11 +2493,11 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, } if ((state->state & EXTENT_DELALLOC_NEW) && - (*bits & EXTENT_DELALLOC_NEW)) { + (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&inode->lock); ASSERT(inode->new_delalloc_bytes >= len); inode->new_delalloc_bytes -= len; - if (*bits & EXTENT_ADD_INODE_BYTES) + if (bits & EXTENT_ADD_INODE_BYTES) inode_add_bytes(&inode->vfs_inode, len); spin_unlock(&inode->lock); } @@ -2580,95 +2692,78 @@ out: return errno_to_blk_status(ret); } -/* - * extent_io.c submission hook. This does the right thing for csum calculation - * on write, or reading the csums from the tree before a read. - * - * Rules about async/sync submit, - * a) read: sync submit - * - * b) write without checksum: sync submit - * - * c) write with checksum: - * c-1) if bio is issued by fsync: sync submit - * (sync_writers != 0) - * - * c-2) if root is reloc root: sync submit - * (only in case of buffered IO) - * - * c-3) otherwise: async submit - */ -void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type) +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; - blk_status_t ret = 0; - int skip_sum; - int async = !atomic_read(&BTRFS_I(inode)->sync_writers); - - skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); - - if (btrfs_is_free_space_inode(BTRFS_I(inode))) - metadata = BTRFS_WQ_ENDIO_FREE_SPACE; + struct btrfs_inode *bi = BTRFS_I(inode); + blk_status_t ret; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct page *page = bio_first_bvec_all(bio)->bv_page; - loff_t file_offset = page_offset(page); - - ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); + ret = extract_ordered_extent(bi, bio, + page_offset(bio_first_bvec_all(bio)->bv_page)); if (ret) goto out; } - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); - if (ret) - goto out; - - if (compress_type != BTRFS_COMPRESS_NONE) { - /* - * btrfs_submit_compressed_read will handle completing - * the bio if there were any errors, so just return - * here. - */ - btrfs_submit_compressed_read(inode, bio, mirror_num); + /* + * If we need to checksum, and the I/O is not issued by fsync and + * friends, that is ->sync_writers != 0, defer the submission to a + * workqueue to parallelize it. + * + * Csum items for reloc roots have already been cloned at this point, + * so they are handled as part of the no-checksum case. + */ + if (!(bi->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(bi->root)) { + if (!atomic_read(&bi->sync_writers) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, + btrfs_submit_bio_start)) return; - } else { - /* - * Lookup bio sums does extra checks around whether we - * need to csum or not, which is why we ignore skip_sum - * here. - */ - ret = btrfs_lookup_bio_sums(inode, bio, NULL); - if (ret) - goto out; - } - goto mapit; - } else if (async && !skip_sum) { - /* csum items have already been cloned */ - if (btrfs_is_data_reloc_root(root)) - goto mapit; - /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, - 0, btrfs_submit_bio_start); - goto out; - } else if (!skip_sum) { - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); + + ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); if (ret) goto out; } + btrfs_submit_bio(fs_info, bio, mirror_num); + return; +out: + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + } +} -mapit: - ret = btrfs_map_bio(fs_info, bio, mirror_num); +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + blk_status_t ret; -out: + if (compress_type != BTRFS_COMPRESS_NONE) { + /* + * btrfs_submit_compressed_read will handle completing the bio + * if there were any errors, so just return here. + */ + btrfs_submit_compressed_read(inode, bio, mirror_num); + return; + } + + /* Save the original iter for read repair */ + btrfs_bio(bio)->iter = bio->bi_iter; + + /* + * Lookup bio sums does extra checks around whether we need to csum or + * not, which is why we ignore skip_sum here. + */ + ret = btrfs_lookup_bio_sums(inode, bio, NULL); if (ret) { bio->bi_status = ret; bio_endio(bio); + return; } + + btrfs_submit_bio(fs_info, bio, mirror_num); } /* @@ -3075,8 +3170,10 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) - num_bytes = ram_bytes = oe->truncated_len; + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { + num_bytes = oe->truncated_len; + ram_bytes = num_bytes; + } btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); @@ -3102,7 +3199,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * an ordered extent if the range of bytes in the file it covers are * fully written. */ -static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) { struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); struct btrfs_root *root = inode->root; @@ -3195,6 +3292,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset, ordered_extent->file_offset + logical_len); + btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); @@ -3309,65 +3408,71 @@ out: return ret; } -static void finish_ordered_fn(struct btrfs_work *work) -{ - struct btrfs_ordered_extent *ordered_extent; - ordered_extent = container_of(work, struct btrfs_ordered_extent, work); - btrfs_finish_ordered_io(ordered_extent); -} - void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate) { trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, - finish_ordered_fn, uptodate); + btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate); +} + +/* + * Verify the checksum for a single sector without any extra action that depend + * on the type of I/O. + */ +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected) +{ + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + char *kaddr; + + ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); + + shash->tfm = fs_info->csum_shash; + + kaddr = kmap_local_page(page) + pgoff; + crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); + kunmap_local(kaddr); + + if (memcmp(csum, csum_expected, fs_info->csum_size)) + return -EIO; + return 0; } /* * check_data_csum - verify checksum of one sector of uncompressed data * @inode: inode - * @io_bio: btrfs_io_bio which contains the csum + * @bbio: btrfs_bio which contains the csum * @bio_offset: offset to the beginning of the bio (in bytes) * @page: page where is the data to be verified * @pgoff: offset inside the page - * @start: logical offset in the file * * The length of such check is always one sector size. + * + * When csum mismatch is detected, we will also report the error and fill the + * corrupted range with zero. (Thus it needs the extra parameters) */ -static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff, - u64 start) +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - char *kaddr; u32 len = fs_info->sectorsize; - const u32 csum_size = fs_info->csum_size; - unsigned int offset_sectors; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; ASSERT(pgoff + len <= PAGE_SIZE); - offset_sectors = bio_offset >> fs_info->sectorsize_bits; - csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size; + csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); - kaddr = kmap_atomic(page); - shash->tfm = fs_info->csum_shash; - - crypto_shash_digest(shash, kaddr + pgoff, len, csum); - kunmap_atomic(kaddr); - - if (memcmp(csum, csum_expected, csum_size)) + if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) goto zeroit; - return 0; + zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, - bbio->mirror_num); + btrfs_print_data_csum_error(BTRFS_I(inode), + bbio->file_offset + bio_offset, + csum, csum_expected, bbio->mirror_num); if (bbio->device) btrfs_dev_stat_inc_and_print(bbio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -3399,11 +3504,6 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 pg_off; unsigned int result = 0; - if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) { - btrfs_page_clear_checked(fs_info, page, start, end + 1 - start); - return 0; - } - /* * This only happens for NODATASUM or compressed read. * Normally this should be covered by above check for compressed read @@ -3436,8 +3536,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, EXTENT_NODATASUM); continue; } - ret = check_data_csum(inode, bbio, bio_offset, page, pg_off, - page_offset(page) + pg_off); + ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> root->fs_info->sectorsize_bits; @@ -3576,7 +3675,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) u64 last_objectid = 0; int ret = 0, nr_unlink = 0; - /* Bail out if the cleanup is already running. */ if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; @@ -3659,17 +3757,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * * btrfs_find_orphan_roots() ran before us, which has * found all deleted roots and loaded them into - * fs_info->fs_roots. So here we can find if an + * fs_info->fs_roots_radix. So here we can find if an * orphan item corresponds to a deleted root by looking - * up the root from that xarray. + * up the root from that radix tree. */ - spin_lock(&fs_info->fs_roots_lock); - dead_root = xa_load(&fs_info->fs_roots, - (unsigned long)found_key.objectid); + spin_lock(&fs_info->fs_roots_radix_lock); + dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)found_key.objectid); if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) is_dead_root = 1; - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); if (is_dead_root) { /* prevent this orphan from being found again */ @@ -3909,7 +4007,7 @@ cache_index: * cache. * * This is required for both inode re-read from disk and delayed inode - * in the delayed_nodes xarray. + * in delayed_nodes_tree. */ if (BTRFS_I(inode)->last_trans == fs_info->generation) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, @@ -4227,7 +4325,7 @@ skip_backref: /* * If we are in a rename context, we don't need to update anything in the * log. That will be done later during the rename by btrfs_log_new_name(). - * Besides that, doing it here would only cause extra unncessary btree + * Besides that, doing it here would only cause extra unnecessary btree * operations on the log tree, increasing latency for applications. */ if (!rename_ctx) { @@ -4255,8 +4353,9 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = - dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime; + dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime; ret = btrfs_update_inode(trans, root, dir); out: return ret; @@ -4418,7 +4517,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); inode_inc_iversion(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = current_time(dir); + dir->i_ctime = dir->i_mtime; ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); if (ret) btrfs_abort_transaction(trans, ret); @@ -4857,7 +4957,6 @@ again: else memzero_page(page, (block_start - page_offset(page)) + offset, len); - flush_dcache_page(page); } btrfs_page_clear_checked(fs_info, page, block_start, block_end + 1 - block_start); @@ -5060,9 +5159,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) */ if (newsize != oldsize) { inode_inc_iversion(inode); - if (!(mask & (ATTR_CTIME | ATTR_MTIME))) - inode->i_ctime = inode->i_mtime = - current_time(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; + } } if (newsize > oldsize) { @@ -5370,7 +5470,7 @@ void btrfs_evict_inode(struct inode *inode) if (!rsv) goto no_delete; rsv->size = btrfs_calc_metadata_size(fs_info, 1); - rsv->failfast = 1; + rsv->failfast = true; btrfs_i_size_write(BTRFS_I(inode), 0); @@ -5762,14 +5862,14 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret != -ENOENT) inode = ERR_PTR(ret); else - inode = new_simple_dir(dir->i_sb, &location, sub_root); + inode = new_simple_dir(dir->i_sb, &location, root); } else { inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); - } - if (root != sub_root) btrfs_put_root(sub_root); - if (!IS_ERR(inode) && root != sub_root) { + if (IS_ERR(inode)) + return inode; + down_read(&fs_info->cleanup_work_sem); if (!sb_rdonly(inode->i_sb)) ret = btrfs_orphan_cleanup(sub_root); @@ -6365,7 +6465,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(path); + /* + * We don't need the path anymore, plus inheriting properties, adding + * ACLs, security xattrs, orphan item or adding the link, will result in + * allocating yet another path. So just free our path. + */ + btrfs_free_path(path); + path = NULL; if (args->subvol) { struct inode *parent; @@ -6422,8 +6528,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, goto discard; } - ret = 0; - goto out; + return 0; discard: /* @@ -7505,7 +7610,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - *map = em = em2; + *map = em2; + em = em2; } if (IS_ERR(em2)) { @@ -7587,8 +7693,12 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, const u64 data_alloc_len = length; bool unlock_extents = false; + /* + * Cap the size of reads to that usually seen in buffered I/O as we need + * to allocate a contiguous array for the checksums. + */ if (!write) - len = min_t(u64, len, fs_info->sectorsize); + len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); lockstart = start; lockend = start + len - 1; @@ -7679,7 +7789,19 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em); - ret = -ENOTBLK; + /* + * If we are in a NOWAIT context, return -EAGAIN in order to + * fallback to buffered IO. This is not only because we can + * block with buffered IO (no support for NOWAIT semantics at + * the moment) but also to avoid returning short reads to user + * space - this happens if we were able to read some data from + * previous non-compressed extents and then when we fallback to + * buffered IO, at btrfs_file_read_iter() by calling + * filemap_read(), we fail to fault in pages for the read buffer, + * in which case filemap_read() returns a short read (the number + * of bytes previously read is > 0, so it does not return -EFAULT). + */ + ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; goto unlock_err; } @@ -7811,8 +7933,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, pos += submitted; length -= submitted; if (write) - __endio_write_update_ordered(BTRFS_I(inode), pos, - length, false); + btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, + pos, length, false); else unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); @@ -7834,10 +7956,9 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) return; if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - __endio_write_update_ordered(BTRFS_I(dip->inode), - dip->file_offset, - dip->bytes, - !dip->bio.bi_status); + btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL, + dip->file_offset, dip->bytes, + !dip->bio.bi_status); } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, dip->file_offset, @@ -7857,12 +7978,8 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, BUG_ON(bio_op(bio) == REQ_OP_WRITE); - if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA)) - return; - refcount_inc(&dip->refs); - if (btrfs_map_bio(fs_info, bio, mirror_num)) - refcount_dec(&dip->refs); + btrfs_submit_bio(fs_info, bio, mirror_num); } static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, @@ -7871,56 +7988,35 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - const u32 sectorsize = fs_info->sectorsize; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - struct bio_vec bvec; - struct bvec_iter iter; - u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; + struct bvec_iter iter; + struct bio_vec bv; + u32 offset; + + btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { + u64 start = bbio->file_offset + offset; + + if (uptodate && + (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page, + bv.bv_offset))) { + clean_io_failure(fs_info, failure_tree, io_tree, start, + bv.bv_page, btrfs_ino(BTRFS_I(inode)), + bv.bv_offset); + } else { + int ret; - __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) { - unsigned int i, nr_sectors, pgoff; - - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - pgoff = bvec.bv_offset; - for (i = 0; i < nr_sectors; i++) { - u64 start = bbio->file_offset + bio_offset; - - ASSERT(pgoff < PAGE_SIZE); - if (uptodate && - (!csum || !check_data_csum(inode, bbio, - bio_offset, bvec.bv_page, - pgoff, start))) { - clean_io_failure(fs_info, failure_tree, io_tree, - start, bvec.bv_page, - btrfs_ino(BTRFS_I(inode)), - pgoff); - } else { - int ret; - - ret = btrfs_repair_one_sector(inode, &bbio->bio, - bio_offset, bvec.bv_page, pgoff, - start, bbio->mirror_num, - submit_dio_repair_bio); - if (ret) - err = errno_to_blk_status(ret); - } - ASSERT(bio_offset + sectorsize > bio_offset); - bio_offset += sectorsize; - pgoff += sectorsize; + ret = btrfs_repair_one_sector(inode, bbio, offset, + bv.bv_page, bv.bv_offset, + submit_dio_repair_bio); + if (ret) + err = errno_to_blk_status(ret); } } - return err; -} -static void __endio_write_update_ordered(struct btrfs_inode *inode, - const u64 offset, const u64 bytes, - const bool uptodate) -{ - btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, - finish_ordered_fn, uptodate); + return err; } static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, @@ -7955,51 +8051,43 @@ static void btrfs_end_dio_bio(struct bio *bio) btrfs_dio_private_put(dip); } -static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, - struct inode *inode, u64 file_offset, int async_submit) +static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + u64 file_offset, int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; - bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; blk_status_t ret; - /* Check btrfs_submit_bio_hook() for rules about async submit. */ - if (async_submit) - async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); - - if (!write) { - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) - goto err; - } + /* Save the original iter for read repair */ + if (btrfs_op(bio) == BTRFS_MAP_READ) + btrfs_bio(bio)->iter = bio->bi_iter; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) goto map; - if (write && async_submit) { - ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset, - btrfs_submit_bio_start_direct_io); - goto err; - } else if (write) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + /* Check btrfs_submit_data_write_bio() for async submit rules */ + if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && + btrfs_wq_submit_bio(inode, bio, 0, file_offset, + btrfs_submit_bio_start_direct_io)) + return; + /* * If we aren't doing async submit, calculate the csum of the * bio now. */ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); - if (ret) - goto err; + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + return; + } } else { - u64 csum_offset; - - csum_offset = file_offset - dip->file_offset; - csum_offset >>= fs_info->sectorsize_bits; - csum_offset *= fs_info->csum_size; - btrfs_bio(bio)->csum = dip->csums + csum_offset; + btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, + file_offset - dip->file_offset); } map: - ret = btrfs_map_bio(fs_info, bio, 0); -err: - return ret; + btrfs_submit_bio(fs_info, bio, 0); } static void btrfs_submit_direct(const struct iomap_iter *iter, @@ -8112,14 +8200,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, async_submit = 1; } - status = btrfs_submit_dio_bio(bio, inode, file_offset, - async_submit); - if (status) { - bio_put(bio); - if (submit_len > 0) - refcount_dec(&dip->refs); - goto out_err_em; - } + btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); dio_data->submitted += clone_len; clone_offset += clone_len; @@ -8152,7 +8233,8 @@ ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_befo struct btrfs_dio_data data; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); + IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC, + &data, done_before); } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -8167,31 +8249,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } -static int btrfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - int ret; - - if (current->flags & PF_MEMALLOC) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - /* - * If we are under memory pressure we will call this directly from the - * VM, we need to make sure we have the inode referenced for the ordered - * extent. If not just return like we didn't do anything. - */ - if (!igrab(inode)) { - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; - } - ret = extent_write_full_page(page, wbc); - btrfs_add_delayed_iput(inode); - return ret; -} - static int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -8255,30 +8312,24 @@ static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) } #ifdef CONFIG_MIGRATION -static int btrfs_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, +static int btrfs_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - int ret; + int ret = filemap_migrate_folio(mapping, dst, src, mode); - ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) - attach_page_private(newpage, detach_page_private(page)); - - if (PageOrdered(page)) { - ClearPageOrdered(page); - SetPageOrdered(newpage); + if (folio_test_ordered(src)) { + folio_clear_ordered(src); + folio_set_ordered(dst); } - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } +#else +#define btrfs_migrate_folio NULL #endif static void btrfs_invalidate_folio(struct folio *folio, size_t offset, @@ -8495,7 +8546,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * Reserving delalloc space after obtaining the page lock can lead to * deadlock. For example, if a dirty page is locked by this function * and the call to btrfs_delalloc_reserve_space() ends up triggering - * dirty page write out, then the btrfs_writepage() function could + * dirty page write out, then the btrfs_writepages() function could * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ @@ -8586,10 +8637,9 @@ again: else zero_start = PAGE_SIZE; - if (zero_start != PAGE_SIZE) { + if (zero_start != PAGE_SIZE) memzero_page(page, zero_start, PAGE_SIZE - zero_start); - flush_dcache_page(page); - } + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); @@ -8672,7 +8722,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) if (!rsv) return -ENOMEM; rsv->size = min_size; - rsv->failfast = 1; + rsv->failfast = true; /* * 1 for the truncate slack space @@ -9193,8 +9243,10 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; + old_dir->i_mtime = ctime; + old_dir->i_ctime = ctime; + new_dir->i_mtime = ctime; + new_dir->i_ctime = ctime; old_inode->i_ctime = ctime; new_inode->i_ctime = ctime; @@ -9457,9 +9509,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - old_inode->i_ctime = current_time(old_dir); + old_dir->i_mtime = current_time(old_dir); + old_dir->i_ctime = old_dir->i_mtime; + new_dir->i_mtime = old_dir->i_mtime; + new_dir->i_ctime = old_dir->i_mtime; + old_inode->i_ctime = old_dir->i_mtime; if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -9547,15 +9601,21 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + int ret; + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) - return btrfs_rename_exchange(old_dir, old_dentry, new_dir, - new_dentry); + ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, + new_dentry); + else + ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, + new_dentry, flags); + + btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); - return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, - new_dentry, flags); + return ret; } struct btrfs_delalloc_work { @@ -9897,6 +9957,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; extent_info.is_new_extent = true; + extent_info.update_times = true; extent_info.qgroup_reserved = qgroup_released; extent_info.insertions = 0; @@ -10174,9 +10235,8 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) } } -static int btrfs_encoded_io_compression_from_extent( - struct btrfs_fs_info *fs_info, - int compress_type) +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type) { switch (compress_type) { case BTRFS_COMPRESS_NONE: @@ -10299,7 +10359,6 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { struct btrfs_encoded_read_private *priv = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; @@ -10309,19 +10368,9 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, return ret; } - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) { - btrfs_bio_free_csum(bbio); - return ret; - } - atomic_inc(&priv->pending); - ret = btrfs_map_bio(fs_info, bio, mirror_num); - if (ret) { - atomic_dec(&priv->pending); - btrfs_bio_free_csum(bbio); - } - return ret; + btrfs_submit_bio(fs_info, bio, mirror_num); + return BLK_STS_OK; } static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) @@ -10333,7 +10382,6 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) u32 sectorsize = fs_info->sectorsize; struct bio_vec *bvec; struct bvec_iter_all iter_all; - u64 start = priv->file_offset; u32 bio_offset = 0; if (priv->skip_csum || !uptodate) @@ -10346,10 +10394,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) pgoff = bvec->bv_offset; for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - if (check_data_csum(&inode->vfs_inode, bbio, bio_offset, - bvec->bv_page, pgoff, start)) + if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset, + bvec->bv_page, pgoff)) return BLK_STS_IOERR; - start += sectorsize; bio_offset += sectorsize; pgoff += sectorsize; } @@ -10381,11 +10428,9 @@ static void btrfs_encoded_read_endio(struct bio *bio) bio_put(bio); } -static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, - u64 disk_bytenr, - u64 disk_io_size, - struct page **pages) +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, struct page **pages) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { @@ -10616,7 +10661,8 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ret = -ENOBUFS; goto out_em; } - disk_io_size = count = em->block_len; + disk_io_size = em->block_len; + count = em->block_len; encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - em->orig_start; ret = btrfs_encoded_io_compression_from_extent(fs_info, @@ -10779,15 +10825,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = -ENOMEM; goto out_pages; } - kaddr = kmap(pages[i]); + kaddr = kmap_local_page(pages[i]); if (copy_from_iter(kaddr, bytes, from) != bytes) { - kunmap(pages[i]); + kunmap_local(kaddr); ret = -EFAULT; goto out_pages; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); - kunmap(pages[i]); + kunmap_local(kaddr); } for (;;) { @@ -11416,15 +11462,12 @@ static const struct file_operations btrfs_dir_file_operations = { */ static const struct address_space_operations btrfs_aops = { .read_folio = btrfs_read_folio, - .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, .direct_IO = noop_direct_IO, .invalidate_folio = btrfs_invalidate_folio, .release_folio = btrfs_release_folio, -#ifdef CONFIG_MIGRATION - .migratepage = btrfs_migratepage, -#endif + .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0f79af919bc4..fe0cc816b4eb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1230,16 +1230,18 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, return em; } -static u32 get_extent_max_capacity(const struct extent_map *em) +static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, + const struct extent_map *em) { if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) return BTRFS_MAX_COMPRESSED; - return BTRFS_MAX_EXTENT_SIZE; + return fs_info->max_extent_size; } static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, u32 extent_thresh, u64 newer_than, bool locked) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *next; bool ret = false; @@ -1263,7 +1265,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, * If the next extent is at its max capacity, defragging current extent * makes no sense, as the total number of extents won't change. */ - if (next->len >= get_extent_max_capacity(em)) + if (next->len >= get_extent_max_capacity(fs_info, em)) goto out; /* Skip older extent */ if (next->generation < newer_than) @@ -1400,6 +1402,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, bool locked, struct list_head *target_list, u64 *last_scanned_ret) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; bool last_is_target = false; u64 cur = start; int ret = 0; @@ -1484,7 +1487,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * Skip extents already at its max capacity, this is mostly for * compressed extents, which max cap is only 128K. */ - if (em->len >= get_extent_max_capacity(em)) + if (em->len >= get_extent_max_capacity(fs_info, em)) goto next; /* @@ -4243,26 +4246,6 @@ out: return ret; } -static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) -{ - struct btrfs_data_container *inodes = ctx; - const size_t c = 3 * sizeof(u64); - - if (inodes->bytes_left >= c) { - inodes->bytes_left -= c; - inodes->val[inodes->elem_cnt] = inum; - inodes->val[inodes->elem_cnt + 1] = offset; - inodes->val[inodes->elem_cnt + 2] = root; - inodes->elem_cnt += 3; - } else { - inodes->bytes_missing += c - inodes->bytes_left; - inodes->bytes_left = 0; - inodes->elem_missed += 3; - } - - return 0; -} - static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, void __user *arg, int version) { @@ -4312,7 +4295,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - build_ino_list, inodes, ignore_offset); + inodes, ignore_offset); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -4355,13 +4338,79 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->balance_lock); } +/** + * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as + * required. + * + * @fs_info: the filesystem + * @excl_acquired: ptr to boolean value which is set to false in case balance + * is being resumed + * + * Return 0 on success in which case both fs_info::balance is acquired as well + * as exclusive ops are blocked. In case of failure return an error code. + */ +static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired) +{ + int ret; + + /* + * Exclusive operation is locked. Three possibilities: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ + while (1) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + *excl_acquired = true; + mutex_lock(&fs_info->balance_mutex); + return 0; + } + + mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* This is either (2) or (3) */ + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (2) */ + ret = -EINPROGRESS; + goto out_failure; + + } else { + mutex_unlock(&fs_info->balance_mutex); + /* + * Lock released to allow other waiters to + * continue, we'll reexamine the status again. + */ + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (3) */ + *excl_acquired = false; + return 0; + } + } + } else { + /* This is (1) */ + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_failure; + } + + mutex_unlock(&fs_info->balance_mutex); + } + +out_failure: + mutex_unlock(&fs_info->balance_mutex); + *excl_acquired = false; + return ret; +} + static long btrfs_ioctl_balance(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; - bool need_unlock; /* for mut. excl. ops lock */ + bool need_unlock = true; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -4378,53 +4427,12 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) goto out; } -again: - if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { - mutex_lock(&fs_info->balance_mutex); - need_unlock = true; - goto locked; - } - - /* - * mut. excl. ops lock is locked. Three possibilities: - * (1) some other op is running - * (2) balance is running - * (3) balance is paused -- special case (think resume) - */ - mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - /* this is either (2) or (3) */ - if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - mutex_unlock(&fs_info->balance_mutex); - /* - * Lock released to allow other waiters to continue, - * we'll reexamine the status again. - */ - mutex_lock(&fs_info->balance_mutex); - - if (fs_info->balance_ctl && - !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - /* this is (3) */ - need_unlock = false; - goto locked; - } - - mutex_unlock(&fs_info->balance_mutex); - goto again; - } else { - /* this is (2) */ - mutex_unlock(&fs_info->balance_mutex); - ret = -EINPROGRESS; - goto out; - } - } else { - /* this is (1) */ - mutex_unlock(&fs_info->balance_mutex); - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + ret = btrfs_try_lock_balance(fs_info, &need_unlock); + if (ret) goto out; - } -locked: + lockdep_assert_held(&fs_info->balance_mutex); + if (bargs->flags & BTRFS_BALANCE_RESUME) { if (!fs_info->balance_ctl) { ret = -ENOTCONN; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 313d9d685adb..33461b4f9c8b 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -45,7 +45,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne start_ns = ktime_get_ns(); down_read_nested(&eb->lock, nest); - eb->lock_owner = current->pid; trace_btrfs_tree_read_lock(eb, start_ns); } @@ -62,7 +61,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) int btrfs_try_tree_read_lock(struct extent_buffer *eb) { if (down_read_trylock(&eb->lock)) { - eb->lock_owner = current->pid; trace_btrfs_try_tree_read_lock(eb); return 1; } @@ -90,7 +88,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb) { trace_btrfs_tree_read_unlock(eb); - eb->lock_owner = 0; up_read(&eb->lock); } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 430ad36b8b08..89bc5f825e0a 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -155,7 +155,7 @@ static int copy_compressed_data_to_page(char *compressed_data, out_pages[*cur_out / PAGE_SIZE] = cur_page; } - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); write_compress_length(kaddr + offset_in_page(*cur_out), compressed_size); *cur_out += LZO_LEN; @@ -167,7 +167,7 @@ static int copy_compressed_data_to_page(char *compressed_data, u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, orig_out + compressed_size - *cur_out); - kunmap(cur_page); + kunmap_local(kaddr); if ((*cur_out / PAGE_SIZE) >= max_nr_page) return -E2BIG; @@ -180,7 +180,7 @@ static int copy_compressed_data_to_page(char *compressed_data, return -ENOMEM; out_pages[*cur_out / PAGE_SIZE] = cur_page; } - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); memcpy(kaddr + offset_in_page(*cur_out), compressed_data + *cur_out - orig_out, copy_len); @@ -202,7 +202,7 @@ static int copy_compressed_data_to_page(char *compressed_data, *cur_out += sector_bytes_left; out: - kunmap(cur_page); + kunmap_local(kaddr); return 0; } @@ -248,12 +248,12 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap(page_in); + data_in = kmap_local_page(page_in); ret = lzo1x_1_compress(data_in + offset_in_page(cur_in), in_len, workspace->cbuf, &out_len, workspace->mem); - kunmap(page_in); + kunmap_local(data_in); if (ret < 0) { pr_debug("BTRFS: lzo in loop returned %d\n", ret); ret = -EIO; @@ -310,7 +310,6 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - char *kaddr; struct page *cur_page; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); @@ -318,11 +317,8 @@ static void copy_compressed_segment(struct compressed_bio *cb, ASSERT(copy_len); cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; - kaddr = kmap(cur_page); - memcpy(dest + *cur_in - orig_in, - kaddr + offset_in_page(*cur_in), - copy_len); - kunmap(cur_page); + memcpy_from_page(dest + *cur_in - orig_in, cur_page, + offset_in_page(*cur_in), copy_len); *cur_in += copy_len; } @@ -342,9 +338,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - kaddr = kmap(cb->compressed_pages[0]); + kaddr = kmap_local_page(cb->compressed_pages[0]); len_in = read_compress_length(kaddr); - kunmap(cb->compressed_pages[0]); + kunmap_local(kaddr); cur_in += LZO_LEN; /* @@ -378,9 +374,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) (cur_in + LZO_LEN - 1) / sectorsize); cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; ASSERT(cur_page); - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); - kunmap(cur_page); + kunmap_local(kaddr); cur_in += LZO_LEN; if (seg_len > WORKSPACE_CBUF_LENGTH) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1957b14b329a..1952ac85222c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -272,25 +272,30 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, spin_unlock_irq(&tree->lock); } +static void finish_ordered_fn(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered_extent; + + ordered_extent = container_of(work, struct btrfs_ordered_extent, work); + btrfs_finish_ordered_io(ordered_extent); +} + /* * Mark all ordered extents io inside the specified range finished. * - * @page: The invovled page for the opeartion. + * @page: The involved page for the operation. * For uncompressed buffered IO, the page status also needs to be * updated to indicate whether the pending ordered io is finished. * Can be NULL for direct IO and compressed write. * For these cases, callers are ensured they won't execute the * endio function twice. - * @finish_func: The function to be executed when all the IO of an ordered - * extent are finished. * * This function is called for endio, thus the range must have ordered - * extent(s) coveri it. + * extent(s) covering it. */ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, - u64 num_bytes, btrfs_func_t finish_func, - bool uptodate) + struct page *page, u64 file_offset, + u64 num_bytes, bool uptodate) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -401,8 +406,9 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); cond_wake_up(&entry->wait); refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_mark_finished(inode, entry); spin_unlock_irqrestore(&tree->lock, flags); - btrfs_init_work(&entry->work, finish_func, NULL, NULL); + btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &entry->work); spin_lock_irqsave(&tree->lock, flags); } @@ -473,6 +479,7 @@ out: if (finished && cached && entry) { *cached = entry; refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_dec_test_pending(inode, entry); } spin_unlock_irqrestore(&tree->lock, flags); return finished; @@ -807,8 +814,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) entry = NULL; - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup(inode, entry); + } out: spin_unlock_irqrestore(&tree->lock, flags); return entry; @@ -848,8 +857,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( break; } out: - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_range(inode, entry); + } spin_unlock_irq(&tree->lock); return entry; } @@ -878,6 +889,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, ASSERT(list_empty(&ordered->log_list)); list_add_tail(&ordered->log_list, list); refcount_inc(&ordered->refs); + trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); } spin_unlock_irq(&tree->lock); } @@ -901,6 +913,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_first(inode, entry); out: spin_unlock_irq(&tree->lock); return entry; @@ -975,8 +988,11 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( /* No ordered extent in the range */ entry = NULL; out: - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_first_range(inode, entry); + } + spin_unlock_irq(&tree->lock); return entry; } @@ -1055,6 +1071,8 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret = 0; + trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered); + spin_lock_irq(&tree->lock); /* Remove from tree once */ node = &ordered->rb_node; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index ecad67a2c745..87792f85e2c4 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -180,13 +180,14 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) t->last = NULL; } +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); + void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page *page, u64 file_offset, - u64 num_bytes, btrfs_func_t finish_func, - bool uptodate); + u64 num_bytes, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index a5b623ee6fac..2feb5c20641a 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -63,137 +63,6 @@ struct sector_ptr { unsigned int uptodate:8; }; -enum btrfs_rbio_ops { - BTRFS_RBIO_WRITE, - BTRFS_RBIO_READ_REBUILD, - BTRFS_RBIO_PARITY_SCRUB, - BTRFS_RBIO_REBUILD_MISSING, -}; - -struct btrfs_raid_bio { - struct btrfs_io_context *bioc; - - /* while we're doing rmw on a stripe - * we put it into a hash table so we can - * lock the stripe and merge more rbios - * into it. - */ - struct list_head hash_list; - - /* - * LRU list for the stripe cache - */ - struct list_head stripe_cache; - - /* - * for scheduling work in the helper threads - */ - struct work_struct work; - - /* - * bio list and bio_list_lock are used - * to add more bios into the stripe - * in hopes of avoiding the full rmw - */ - struct bio_list bio_list; - spinlock_t bio_list_lock; - - /* also protected by the bio_list_lock, the - * plug list is used by the plugging code - * to collect partial bios while plugged. The - * stripe locking code also uses it to hand off - * the stripe lock to the next pending IO - */ - struct list_head plug_list; - - /* - * flags that tell us if it is safe to - * merge with this bio - */ - unsigned long flags; - - /* - * set if we're doing a parity rebuild - * for a read from higher up, which is handled - * differently from a parity rebuild as part of - * rmw - */ - enum btrfs_rbio_ops operation; - - /* Size of each individual stripe on disk */ - u32 stripe_len; - - /* How many pages there are for the full stripe including P/Q */ - u16 nr_pages; - - /* How many sectors there are for the full stripe including P/Q */ - u16 nr_sectors; - - /* Number of data stripes (no p/q) */ - u8 nr_data; - - /* Numer of all stripes (including P/Q) */ - u8 real_stripes; - - /* How many pages there are for each stripe */ - u8 stripe_npages; - - /* How many sectors there are for each stripe */ - u8 stripe_nsectors; - - /* First bad stripe, -1 means no corruption */ - s8 faila; - - /* Second bad stripe (for RAID6 use) */ - s8 failb; - - /* Stripe number that we're scrubbing */ - u8 scrubp; - - /* - * size of all the bios in the bio_list. This - * helps us decide if the rbio maps to a full - * stripe or not - */ - int bio_list_bytes; - - int generic_bio_cnt; - - refcount_t refs; - - atomic_t stripes_pending; - - atomic_t error; - /* - * these are two arrays of pointers. We allocate the - * rbio big enough to hold them both and setup their - * locations when the rbio is allocated - */ - - /* pointers to pages that we allocated for - * reading/writing stripes directly from the disk (including P/Q) - */ - struct page **stripe_pages; - - /* Pointers to the sectors in the bio_list, for faster lookup */ - struct sector_ptr *bio_sectors; - - /* - * For subpage support, we need to map each sector to above - * stripe_pages. - */ - struct sector_ptr *stripe_sectors; - - /* Bitmap to record which horizontal stripe has data */ - unsigned long *dbitmap; - - /* allocated with real_stripes-many pointers for finish_*() calls */ - void **finish_pointers; - - /* Allocated with stripe_nsectors-many bits for finish_*() calls */ - unsigned long *finish_pbitmap; -}; - static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); static void rmw_work(struct work_struct *work); @@ -347,6 +216,24 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) } } +static void steal_rbio_page(struct btrfs_raid_bio *src, + struct btrfs_raid_bio *dest, int page_nr) +{ + const u32 sectorsize = src->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int i; + + if (dest->stripe_pages[page_nr]) + __free_page(dest->stripe_pages[page_nr]); + dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; + src->stripe_pages[page_nr] = NULL; + + /* Also update the sector->uptodate bits. */ + for (i = sectors_per_page * page_nr; + i < sectors_per_page * page_nr + sectors_per_page; i++) + dest->stripe_sectors[i].uptodate = true; +} + /* * Stealing an rbio means taking all the uptodate pages from the stripe array * in the source rbio and putting them into the destination rbio. @@ -358,7 +245,6 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; struct page *s; - struct page *d; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; @@ -368,12 +254,7 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) if (!s || !full_page_sectors_uptodate(src, i)) continue; - d = dest->stripe_pages[i]; - if (d) - __free_page(d); - - dest->stripe_pages[i] = s; - src->stripe_pages[i] = NULL; + steal_rbio_page(src, dest, i); } index_stripe_sectors(dest); index_stripe_sectors(src); @@ -391,6 +272,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest, { bio_list_merge(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; + /* Also inherit the bitmaps from @victim. */ + bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, + dest->stripe_nsectors); dest->generic_bio_cnt += victim->generic_bio_cnt; bio_list_init(&victim->bio_list); } @@ -590,9 +474,9 @@ static int rbio_is_full(struct btrfs_raid_bio *rbio) int ret = 1; spin_lock_irqsave(&rbio->bio_list_lock, flags); - if (size != rbio->nr_data * rbio->stripe_len) + if (size != rbio->nr_data * BTRFS_STRIPE_LEN) ret = 0; - BUG_ON(size > rbio->nr_data * rbio->stripe_len); + BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); spin_unlock_irqrestore(&rbio->bio_list_lock, flags); return ret; @@ -932,6 +816,12 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) if (rbio->generic_bio_cnt) btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt); + /* + * Clear the data bitmap, as the rbio may be cached for later usage. + * do this before before unlock_stripe() so there will be no new bio + * for this bio. + */ + bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); /* * At this moment, rbio->bio_list is empty, however since rbio does not @@ -1023,29 +913,30 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, * this does not allocate any pages for rbio->pages. */ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, - struct btrfs_io_context *bioc, - u32 stripe_len) + struct btrfs_io_context *bioc) { const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; - const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT; + const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; const unsigned int num_pages = stripe_npages * real_stripes; - const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits; + const unsigned int stripe_nsectors = + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; - int nr_data = 0; void *p; - ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE)); /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); + /* + * Our current stripe len should be fixed to 64k thus stripe_nsectors + * (at most 16) should be no larger than BITS_PER_LONG. + */ + ASSERT(stripe_nsectors <= BITS_PER_LONG); rbio = kzalloc(sizeof(*rbio) + sizeof(*rbio->stripe_pages) * num_pages + sizeof(*rbio->bio_sectors) * num_sectors + sizeof(*rbio->stripe_sectors) * num_sectors + - sizeof(*rbio->finish_pointers) * real_stripes + - sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) + - sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors), + sizeof(*rbio->finish_pointers) * real_stripes, GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -1056,7 +947,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); rbio->bioc = bioc; - rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; rbio->nr_sectors = num_sectors; rbio->real_stripes = real_stripes; @@ -1081,18 +971,11 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, CONSUME_ALLOC(rbio->bio_sectors, num_sectors); CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); CONSUME_ALLOC(rbio->finish_pointers, real_stripes); - CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors)); - CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors)); #undef CONSUME_ALLOC - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) - nr_data = real_stripes - 1; - else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) - nr_data = real_stripes - 2; - else - BUG(); + ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); + rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); - rbio->nr_data = nr_data; return rbio; } @@ -1135,8 +1018,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, struct sector_ptr *sector, unsigned int stripe_nr, unsigned int sector_nr, - unsigned long bio_max_len, - unsigned int opf) + enum req_op op) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio *last = bio_list->tail; @@ -1180,8 +1062,9 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL), - opf, GFP_NOFS); + bio = bio_alloc(stripe->dev->bdev, + max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), + op, GFP_NOFS); bio->bi_iter.bi_sector = disk_start >> 9; bio->bi_private = rbio; @@ -1215,9 +1098,6 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->raid_map[0]; - if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_bio(bio)->iter; - bio_for_each_segment(bvec, bio, iter) { u32 bvec_offset; @@ -1252,6 +1132,34 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) spin_unlock_irq(&rbio->bio_list_lock); } +static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, + struct raid56_bio_trace_info *trace_info) +{ + const struct btrfs_io_context *bioc = rbio->bioc; + int i; + + ASSERT(bioc); + + /* We rely on bio->bi_bdev to find the stripe number. */ + if (!bio->bi_bdev) + goto not_found; + + for (i = 0; i < bioc->num_stripes; i++) { + if (bio->bi_bdev != bioc->stripes[i].dev->bdev) + continue; + trace_info->stripe_nr = i; + trace_info->devid = bioc->stripes[i].dev->devid; + trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + bioc->stripes[i].physical; + return; + } + +not_found: + trace_info->devid = -1; + trace_info->offset = -1; + trace_info->stripe_nr = -1; +} + /* * this is called from one of two situations. We either * have a full stripe from the higher layers, or we've read all @@ -1266,7 +1174,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; int nr_data = rbio->nr_data; + /* The total sector number inside the full stripe. */ + int total_sector_nr; int stripe; + /* Sector number inside a stripe. */ int sectornr; bool has_qstripe; struct bio_list bio_list; @@ -1282,6 +1193,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else BUG(); + /* We should have at least one data sector. */ + ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); + /* at this point we either have a full stripe, * or we've read the full stripe from the drive. * recalculate the parity and write the new results. @@ -1348,55 +1262,71 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } /* - * time to start writing. Make bios for everything from the - * higher layers (the bio_list in our rbio) and our p/q. Ignore - * everything else. + * Start writing. Make bios for everything from the higher layers (the + * bio_list in our rbio) and our P/Q. Ignore everything else. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; - - if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) - continue; - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, rbio->stripe_len, - REQ_OP_WRITE); - if (ret) - goto cleanup; + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); } + + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_WRITE); + if (ret) + goto cleanup; } if (likely(!bioc->num_tgtdevs)) goto write_data; - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - if (!bioc->tgtdev_map[stripe]) - continue; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; - if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) - continue; - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } + if (!bioc->tgtdev_map[stripe]) { + /* + * We can skip the whole stripe completely, note + * total_sector_nr will be increased by one anyway. + */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; + continue; + } - ret = rbio_add_io_sector(rbio, &bio_list, sector, - rbio->bioc->tgtdev_map[stripe], - sectornr, rbio->stripe_len, - REQ_OP_WRITE); - if (ret) - goto cleanup; + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); } + + ret = rbio_add_io_sector(rbio, &bio_list, sector, + rbio->bioc->tgtdev_map[stripe], + sectornr, REQ_OP_WRITE); + if (ret) + goto cleanup; } write_data: @@ -1406,6 +1336,12 @@ write_data: while ((bio = bio_list_pop(&bio_list))) { bio->bi_end_io = raid_write_end_io; + if (trace_raid56_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_write_stripe(rbio, bio, &trace_info); + } submit_bio(bio); } return; @@ -1433,7 +1369,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->bioc->num_stripes; i++) { stripe = &rbio->bioc->stripes[i]; - if (in_range(physical, stripe->physical, rbio->stripe_len) && + if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { return i; } @@ -1455,7 +1391,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->nr_data; i++) { u64 stripe_start = rbio->bioc->raid_map[i]; - if (in_range(logical, stripe_start, rbio->stripe_len)) + if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) return i; } return -1; @@ -1552,15 +1488,7 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) } } -/* - * end io for the read phase of the rmw cycle. All the bios here are physical - * stripe bios we've read from the disk so we can recalculate the parity of the - * stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid_rmw_end_io(struct bio *bio) +static void raid56_bio_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -1571,23 +1499,34 @@ static void raid_rmw_end_io(struct bio *bio) bio_put(bio); - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + if (atomic_dec_and_test(&rbio->stripes_pending)) + queue_work(rbio->bioc->fs_info->endio_raid56_workers, + &rbio->end_io_work); +} - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - goto cleanup; +/* + * End io handler for the read phase of the RMW cycle. All the bios here are + * physical stripe bios we've read from the disk so we can recalculate the + * parity of the stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid56_rmw_end_io_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); + + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { + rbio_orig_end_io(rbio, BLK_STS_IOERR); + return; + } /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first + * This will normally call finish_rmw to start our write but if there + * are any failed stripes we'll reconstruct from parity first. */ validate_rbio_for_rmw(rbio); - return; - -cleanup: - - rbio_orig_end_io(rbio, BLK_STS_IOERR); } /* @@ -1598,9 +1537,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) { int bios_to_read = 0; struct bio_list bio_list; + const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; int ret; - int sectornr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -1612,38 +1551,34 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; + /* Build a list of bios to read all the missing data sectors. */ + for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; - /* - * We want to find all the sectors missing from the - * rbio and read them from the disk. If * sector_in_rbio() - * finds a page in the bio list we don't need to read - * it off the stripe. - */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) - continue; + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_in_rbio() finds a page + * in the bio list we don't need to read it off the stripe. + */ + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) + continue; - sector = rbio_stripe_sector(rbio, stripe, sectornr); - /* - * The bio cache may have handed us an uptodate page. - * If so, be happy and use it. - */ - if (sector->uptodate) - continue; + sector = rbio_stripe_sector(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate page. If so, + * use it. + */ + if (sector->uptodate) + continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); - if (ret) - goto cleanup; - } + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, REQ_OP_READ); + if (ret) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); @@ -1662,11 +1597,16 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_rmw_end_io; + bio->bi_end_io = raid56_bio_end_io; - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_read_partial_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_read_partial(rbio, bio, &trace_info); + } submit_bio(bio); } /* the actual write will happen once the reads are done */ @@ -1833,27 +1773,53 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) run_plug(plug); } +/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ +static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) +{ + const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; + const u64 full_stripe_start = rbio->bioc->raid_map[0]; + const u32 orig_len = orig_bio->bi_iter.bi_size; + const u32 sectorsize = fs_info->sectorsize; + u64 cur_logical; + + ASSERT(orig_logical >= full_stripe_start && + orig_logical + orig_len <= full_stripe_start + + rbio->nr_data * BTRFS_STRIPE_LEN); + + bio_list_add(&rbio->bio_list, orig_bio); + rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; + + /* Update the dbitmap. */ + for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; + cur_logical += sectorsize) { + int bit = ((u32)(cur_logical - full_stripe_start) >> + fs_info->sectorsize_bits) % rbio->stripe_nsectors; + + set_bit(bit, &rbio->dbitmap); + } +} + /* * our main entry point for writes from the rest of the FS. */ -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len) +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; - int ret; + int ret = 0; - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { btrfs_put_bioc(bioc); - return PTR_ERR(rbio); + ret = PTR_ERR(rbio); + goto out_dec_counter; } - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->operation = BTRFS_RBIO_WRITE; + rbio_add_bio(rbio, bio); - btrfs_bio_counter_inc_noblocked(fs_info); rbio->generic_bio_cnt = 1; /* @@ -1863,8 +1829,8 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri if (rbio_is_full(rbio)) { ret = full_stripe_write(rbio); if (ret) - btrfs_bio_counter_dec(fs_info); - return ret; + goto out_dec_counter; + return; } cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); @@ -1875,13 +1841,18 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); - ret = 0; } else { ret = __raid56_parity_write(rbio); if (ret) - btrfs_bio_counter_dec(fs_info); + goto out_dec_counter; } - return ret; + + return; + +out_dec_counter: + btrfs_bio_counter_dec(fs_info); + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); } /* @@ -1939,7 +1910,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * which we have data when doing parity scrub. */ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(sectornr, rbio->dbitmap)) + !test_bit(sectornr, &rbio->dbitmap)) continue; /* @@ -2108,25 +2079,13 @@ cleanup_io: } /* - * This is called only for stripes we've read from disk to - * reconstruct the parity. + * This is called only for stripes we've read from disk to reconstruct the + * parity. */ -static void raid_recover_end_io(struct bio *bio) +static void raid_recover_end_io_work(struct work_struct *work) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - /* - * we only read stripe pages off the disk, set them - * up to date if there were no errors - */ - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); if (atomic_read(&rbio->error) > rbio->bioc->max_errors) rbio_orig_end_io(rbio, BLK_STS_IOERR); @@ -2147,8 +2106,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int sectornr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -2160,33 +2118,31 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) atomic_set(&rbio->error, 0); /* - * read everything that hasn't failed. Thanks to the - * stripe cache, it is possible that some or all of these - * pages are going to be uptodate. + * Read everything that hasn't failed. However this time we will + * not trust any cached sector. + * As we may read out some stale data but higher layer is not reading + * that stale part. + * + * So here we always re-read everything in recovery path. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + struct sector_ptr *sector; + if (rbio->faila == stripe || rbio->failb == stripe) { atomic_inc(&rbio->error); + /* Skip the current stripe. */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; continue; } - - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; - - /* - * the rmw code may have already read this - * page in - */ - sector = rbio_stripe_sector(rbio, stripe, sectornr); - if (sector->uptodate) - continue; - - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); - if (ret < 0) - goto cleanup; - } + sector = rbio_stripe_sector(rbio, stripe, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); + if (ret < 0) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); @@ -2209,11 +2165,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_recover_end_io; + bio->bi_end_io = raid56_bio_end_io; - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_scrub_read_recover_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + } submit_bio(bio); } @@ -2236,28 +2197,27 @@ cleanup: * so we assume the bio they send down corresponds to a failed part * of the drive. */ -int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u32 stripe_len, int mirror_num, int generic_io) +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num, bool generic_io) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - int ret; if (generic_io) { ASSERT(bioc->mirror_num == mirror_num); btrfs_bio(bio)->mirror_num = mirror_num; + } else { + btrfs_get_bioc(bioc); } - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - if (generic_io) - btrfs_put_bioc(bioc); - return PTR_ERR(rbio); + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + goto out_end_bio; } rbio->operation = BTRFS_RBIO_READ_REBUILD; - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; + rbio_add_bio(rbio, bio); rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { @@ -2265,18 +2225,13 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", __func__, bio->bi_iter.bi_sector << 9, (u64)bio->bi_iter.bi_size, bioc->map_type); - if (generic_io) - btrfs_put_bioc(bioc); kfree(rbio); - return -EIO; + bio->bi_status = BLK_STS_IOERR; + goto out_end_bio; } - if (generic_io) { - btrfs_bio_counter_inc_noblocked(fs_info); + if (generic_io) rbio->generic_bio_cnt = 1; - } else { - btrfs_get_bioc(bioc); - } /* * Loop retry: @@ -2295,24 +2250,20 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio->failb--; } - ret = lock_stripe_add(rbio); + if (lock_stripe_add(rbio)) + return; /* - * __raid56_parity_recover will end the bio with - * any errors it hits. We don't want to return - * its error value up the stack because our caller - * will end up calling bio_endio with any nonzero - * return - */ - if (ret == 0) - __raid56_parity_recover(rbio); - /* - * our rbio has been added to the list of - * rbios that will be handled after the - * currently lock owner is done + * This adds our rbio to the list of rbios that will be handled after + * the current lock owner is done. */ - return 0; + __raid56_parity_recover(rbio); + return; +out_end_bio: + btrfs_bio_counter_dec(fs_info); + btrfs_put_bioc(bioc); + bio_endio(bio); } static void rmw_work(struct work_struct *work) @@ -2343,14 +2294,14 @@ static void read_rebuild_work(struct work_struct *work) struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u32 stripe_len, struct btrfs_device *scrub_dev, + struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ -2374,7 +2325,7 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, } ASSERT(i < rbio->real_stripes); - bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); /* * We have already increased bio_counter when getting bioc, record it @@ -2395,7 +2346,7 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, ASSERT(logical >= rbio->bioc->raid_map[0]); ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + - rbio->stripe_len * rbio->nr_data); + BTRFS_STRIPE_LEN * rbio->nr_data); stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); index = stripe_offset / sectorsize; rbio->bio_sectors[index].page = page; @@ -2409,23 +2360,22 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - int stripe; - int sectornr; - - for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - struct page *page; - int index = (stripe * rbio->stripe_nsectors + sectornr) * - sectorsize >> PAGE_SHIFT; + int total_sector_nr; - if (rbio->stripe_pages[index]) - continue; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct page *page; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[index] = page; - } + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + if (rbio->stripe_pages[index]) + continue; + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + rbio->stripe_pages[index] = page; } index_stripe_sectors(rbio); return 0; @@ -2437,7 +2387,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; - unsigned long *pbitmap = rbio->finish_pbitmap; + unsigned long *pbitmap = &rbio->finish_pbitmap; int nr_data = rbio->nr_data; int stripe; int sectornr; @@ -2460,7 +2410,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { is_replace = 1; - bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors); + bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); } /* @@ -2497,7 +2447,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, /* Map the parity stripe just once */ pointers[nr_data] = kmap_local_page(p_sector.page); - for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; void *parity; @@ -2525,7 +2475,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, memcpy(parity, pointers[rbio->scrubp], sectorsize); else /* Parity is right, needn't writeback */ - bitmap_clear(rbio->dbitmap, sectornr, 1); + bitmap_clear(&rbio->dbitmap, sectornr, 1); kunmap_local(parity); for (stripe = nr_data - 1; stripe >= 0; stripe--) @@ -2547,12 +2497,12 @@ writeback: * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. */ - for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, - sectornr, rbio->stripe_len, REQ_OP_WRITE); + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2566,7 +2516,7 @@ writeback: sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, bioc->tgtdev_map[rbio->scrubp], - sectornr, rbio->stripe_len, REQ_OP_WRITE); + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2584,6 +2534,12 @@ submit_write: while ((bio = bio_list_pop(&bio_list))) { bio->bi_end_io = raid_write_end_io; + if (trace_raid56_scrub_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_write_stripe(rbio, bio, &trace_info); + } submit_bio(bio); } return; @@ -2671,24 +2627,14 @@ cleanup: * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid56_parity_scrub_end_io(struct bio *bio) +static void raid56_parity_scrub_end_io_work(struct work_struct *work) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first + * This will normally call finish_rmw to start our write, but if there + * are any failed stripes we'll reconstruct from parity first */ validate_rbio_for_parity_scrub(rbio); } @@ -2698,8 +2644,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int sectornr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -2709,37 +2654,38 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) goto cleanup; atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for_each_set_bit(sectornr , rbio->dbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; - /* - * We want to find all the sectors missing from the - * rbio and read them from the disk. If * sector_in_rbio() - * finds a sector in the bio list we don't need to read - * it off the stripe. - */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) - continue; + /* Build a list of bios to read all the missing parts. */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int stripe = total_sector_nr / rbio->stripe_nsectors; + struct sector_ptr *sector; - sector = rbio_stripe_sector(rbio, stripe, sectornr); - /* - * The bio cache may have handed us an uptodate sector. - * If so, be happy and use it. - */ - if (sector->uptodate) - continue; + /* No data in the vertical stripe, no need to read. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); - if (ret) - goto cleanup; - } + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_in_rbio() finds a sector + * in the bio list we don't need to read it off the stripe. + */ + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) + continue; + + sector = rbio_stripe_sector(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate sector. If so, + * use it. + */ + if (sector->uptodate) + continue; + + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); + if (ret) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); @@ -2758,11 +2704,16 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_parity_scrub_end_io; + bio->bi_end_io = raid56_bio_end_io; - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_scrub_read_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read(rbio, bio, &trace_info); + } submit_bio(bio); } /* the actual write will happen once the reads are done */ @@ -2797,13 +2748,12 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) /* The following code is used for dev replace of a missing RAID 5/6 device. */ struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 length) +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - rbio = alloc_rbio(fs_info, bioc, length); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index aaad08aefd7d..6f48f9e4c869 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,45 +7,179 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H -static inline int nr_parity_stripes(const struct map_lookup *map) -{ - if (map->type & BTRFS_BLOCK_GROUP_RAID5) - return 1; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - return 2; - else - return 0; -} +#include <linux/workqueue.h> +#include "volumes.h" + +enum btrfs_rbio_ops { + BTRFS_RBIO_WRITE, + BTRFS_RBIO_READ_REBUILD, + BTRFS_RBIO_PARITY_SCRUB, + BTRFS_RBIO_REBUILD_MISSING, +}; + +struct btrfs_raid_bio { + struct btrfs_io_context *bioc; + + /* + * While we're doing RMW on a stripe we put it into a hash table so we + * can lock the stripe and merge more rbios into it. + */ + struct list_head hash_list; + + /* LRU list for the stripe cache */ + struct list_head stripe_cache; + + /* For scheduling work in the helper threads */ + struct work_struct work; + + /* + * bio_list and bio_list_lock are used to add more bios into the stripe + * in hopes of avoiding the full RMW + */ + struct bio_list bio_list; + spinlock_t bio_list_lock; + + /* + * Also protected by the bio_list_lock, the plug list is used by the + * plugging code to collect partial bios while plugged. The stripe + * locking code also uses it to hand off the stripe lock to the next + * pending IO. + */ + struct list_head plug_list; + + /* Flags that tell us if it is safe to merge with this bio. */ + unsigned long flags; + + /* + * Set if we're doing a parity rebuild for a read from higher up, which + * is handled differently from a parity rebuild as part of RMW. + */ + enum btrfs_rbio_ops operation; + + /* How many pages there are for the full stripe including P/Q */ + u16 nr_pages; + + /* How many sectors there are for the full stripe including P/Q */ + u16 nr_sectors; + + /* Number of data stripes (no p/q) */ + u8 nr_data; + + /* Numer of all stripes (including P/Q) */ + u8 real_stripes; + + /* How many pages there are for each stripe */ + u8 stripe_npages; + + /* How many sectors there are for each stripe */ + u8 stripe_nsectors; + + /* First bad stripe, -1 means no corruption */ + s8 faila; + + /* Second bad stripe (for RAID6 use) */ + s8 failb; + + /* Stripe number that we're scrubbing */ + u8 scrubp; + + /* + * Size of all the bios in the bio_list. This helps us decide if the + * rbio maps to a full stripe or not. + */ + int bio_list_bytes; + + int generic_bio_cnt; + + refcount_t refs; + + atomic_t stripes_pending; + + atomic_t error; + + struct work_struct end_io_work; + + /* Bitmap to record which horizontal stripe has data */ + unsigned long dbitmap; + + /* Allocated with stripe_nsectors-many bits for finish_*() calls */ + unsigned long finish_pbitmap; + + /* + * These are two arrays of pointers. We allocate the rbio big enough + * to hold them both and setup their locations when the rbio is + * allocated. + */ + + /* + * Pointers to pages that we allocated for reading/writing stripes + * directly from the disk (including P/Q). + */ + struct page **stripe_pages; + + /* Pointers to the sectors in the bio_list, for faster lookup */ + struct sector_ptr *bio_sectors; + + /* + * For subpage support, we need to map each sector to above + * stripe_pages. + */ + struct sector_ptr *stripe_sectors; + + /* Allocated with real_stripes-many pointers for finish_*() calls */ + void **finish_pointers; +}; + +/* + * For trace event usage only. Records useful debug info for each bio submitted + * by RAID56 to each physical device. + * + * No matter signed or not, (-1) is always the one indicating we can not grab + * the proper stripe number. + */ +struct raid56_bio_trace_info { + u64 devid; + + /* The offset inside the stripe. (<= STRIPE_LEN) */ + u32 offset; + + /* + * Stripe number. + * 0 is the first data stripe, and nr_data for P stripe, + * nr_data + 1 for Q stripe. + * >= real_stripes for + */ + u8 stripe_nr; +}; static inline int nr_data_stripes(const struct map_lookup *map) { - return map->num_stripes - nr_parity_stripes(map); + return map->num_stripes - btrfs_nr_parity_stripes(map->type); } + #define RAID5_P_STRIPE ((u64)-2) #define RAID6_Q_STRIPE ((u64)-1) #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ ((x) == RAID6_Q_STRIPE)) -struct btrfs_raid_bio; struct btrfs_device; -int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u32 stripe_len, int mirror_num, int generic_io); -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len); +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num, bool generic_io); +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, unsigned int pgoff, u64 logical); struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, - struct btrfs_io_context *bioc, u32 stripe_len, + struct btrfs_io_context *bioc, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 length); +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc); void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index c39f8b3a5a4a..9acf47b11fe6 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -5,6 +5,7 @@ #include "compression.h" #include "ctree.h" #include "delalloc-space.h" +#include "disk-io.h" #include "reflink.h" #include "transaction.h" #include "subpage.h" @@ -22,8 +23,10 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, int ret; inode_inc_iversion(inode); - if (!no_time_update) - inode->i_mtime = inode->i_ctime = current_time(inode); + if (!no_time_update) { + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; + } /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. @@ -110,7 +113,6 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (comp_type == BTRFS_COMPRESS_NONE) { memcpy_to_page(page, offset_in_page(file_offset), data_start, datal); - flush_dcache_page(page); } else { ret = btrfs_decompress(comp_type, data_start, page, offset_in_page(file_offset), @@ -132,10 +134,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * * So what's in the range [500, 4095] corresponds to zeroes. */ - if (datal < block_size) { + if (datal < block_size) memzero_page(page, datal, block_size - datal); - flush_dcache_page(page); - } btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); btrfs_page_clear_checked(fs_info, page, file_offset, block_size); @@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int ret; const u64 len = olen_aligned; u64 last_dest_end = destoff; + u64 prev_extent_end = off; ret = -ENOMEM; buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); @@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, key.offset = off; while (1) { - u64 next_key_min_offset = key.offset + 1; struct btrfs_file_extent_item *extent; u64 extent_gen; int type; @@ -431,14 +431,21 @@ process_slot: * The first search might have left us at an extent item that * ends before our target range's start, can happen if we have * holes and NO_HOLES feature enabled. + * + * Subsequent searches may leave us on a file range we have + * processed before - this happens due to a race with ordered + * extent completion for a file range that is outside our source + * range, but that range was part of a file extent item that + * also covered a leading part of our source range. */ - if (key.offset + datal <= off) { + if (key.offset + datal <= prev_extent_end) { path->slots[0]++; goto process_slot; } else if (key.offset >= off + len) { break; } - next_key_min_offset = key.offset + datal; + + prev_extent_end = key.offset + datal; size = btrfs_item_size(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); @@ -489,6 +496,7 @@ process_slot: clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; + clone_info.update_times = !no_time_update; ret = btrfs_replace_file_extents(BTRFS_I(inode), path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); @@ -550,7 +558,7 @@ process_slot: break; btrfs_release_path(path); - key.offset = next_key_min_offset; + key.offset = prev_extent_end; if (fatal_signal_pending(current)) { ret = -EINTR; @@ -650,7 +658,8 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { - const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; + const u64 bs = fs_info->sb->s_blocksize; int ret; /* @@ -661,6 +670,8 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + btrfs_btree_balance_dirty(fs_info); + return ret; } @@ -770,6 +781,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, round_down(destoff, PAGE_SIZE), round_up(destoff + len, PAGE_SIZE) - 1); + btrfs_btree_balance_dirty(fs_info); + return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e7b0323e6efd..3afe5fa50a63 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -135,15 +135,13 @@ struct scrub_parity { struct work_struct work; /* Mark the parity blocks which have data */ - unsigned long *dbitmap; + unsigned long dbitmap; /* * Mark the parity blocks which have data, but errors happen when * read data or check data */ - unsigned long *ebitmap; - - unsigned long bitmap[]; + unsigned long ebitmap; }; struct scrub_ctx { @@ -1218,7 +1216,6 @@ static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, u64 *raid_map, - u64 mapped_length, int nstripes, int mirror, int *stripe_index, u64 *stripe_offset) @@ -1233,7 +1230,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, continue; if (logical >= raid_map[i] && - logical < raid_map[i] + mapped_length) + logical < raid_map[i] + BTRFS_STRIPE_LEN) break; } @@ -1337,7 +1334,6 @@ leave_nomem: scrub_stripe_index_and_offset(logical, bioc->map_type, bioc->raid_map, - mapped_length, bioc->num_stripes - bioc->num_tgtdevs, mirror_index, @@ -1380,19 +1376,12 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, struct scrub_sector *sector) { DECLARE_COMPLETION_ONSTACK(done); - int ret; - int mirror_num; bio->bi_iter.bi_sector = sector->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; - - mirror_num = sector->sblock->sectors[0]->mirror_num; - ret = raid56_parity_recover(bio, sector->recover->bioc, - sector->recover->map_length, - mirror_num, 0); - if (ret) - return ret; + raid56_parity_recover(bio, sector->recover->bioc, + sector->sblock->sectors[0]->mirror_num, false); wait_for_completion_io(&done); return blk_status_to_errno(bio->bi_status); @@ -2197,7 +2186,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; - rbio = raid56_alloc_missing_rbio(bio, bioc, length); + rbio = raid56_alloc_missing_rbio(bio, bioc); if (!rbio) goto rbio_out; @@ -2406,13 +2395,13 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, u64 start, u32 len) { - __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); + __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len); } static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, u64 start, u32 len) { - __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); + __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len); } static void scrub_block_complete(struct scrub_block *sblock) @@ -2763,7 +2752,7 @@ static void scrub_free_parity(struct scrub_parity *sparity) struct scrub_sector *curr, *next; int nbits; - nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); + nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors); if (nbits) { spin_lock(&sctx->stat_lock); sctx->stat.read_errors += nbits; @@ -2795,8 +2784,8 @@ static void scrub_parity_bio_endio(struct bio *bio) struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; if (bio->bi_status) - bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, - sparity->nsectors); + bitmap_or(&sparity->ebitmap, &sparity->ebitmap, + &sparity->dbitmap, sparity->nsectors); bio_put(bio); @@ -2814,8 +2803,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) u64 length; int ret; - if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap, - sparity->nsectors)) + if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap, + &sparity->ebitmap, sparity->nsectors)) goto out; length = sparity->logic_end - sparity->logic_start; @@ -2831,9 +2820,9 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; - rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length, + rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, sparity->scrub_dev, - sparity->dbitmap, + &sparity->dbitmap, sparity->nsectors); if (!rbio) goto rbio_out; @@ -2847,7 +2836,7 @@ rbio_out: bioc_out: btrfs_bio_counter_dec(fs_info); btrfs_put_bioc(bioc); - bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, + bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2856,11 +2845,6 @@ out: scrub_free_parity(sparity); } -static inline int scrub_calc_parity_bitmap_len(int nsectors) -{ - return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long); -} - static void scrub_parity_get(struct scrub_parity *sparity) { refcount_inc(&sparity->refs); @@ -3131,7 +3115,6 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int ret; struct scrub_parity *sparity; int nsectors; - int bitmap_len; path = btrfs_alloc_path(); if (!path) { @@ -3145,9 +3128,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, ASSERT(map->stripe_len <= U32_MAX); nsectors = map->stripe_len >> fs_info->sectorsize_bits; - bitmap_len = scrub_calc_parity_bitmap_len(nsectors); - sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, - GFP_NOFS); + ASSERT(nsectors <= BITS_PER_LONG); + sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS); if (!sparity) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -3165,8 +3147,6 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->logic_end = logic_end; refcount_set(&sparity->refs, 1); INIT_LIST_HEAD(&sparity->sectors_list); - sparity->dbitmap = sparity->bitmap; - sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; ret = 0; for (cur_logical = logic_start; cur_logical < logic_end; @@ -3429,20 +3409,22 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct map_lookup *map, + struct extent_map *em, struct btrfs_device *scrub_dev, - int stripe_index, u64 dev_extent_len) + int stripe_index) { struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root; struct btrfs_root *csum_root; struct blk_plug plug; + struct map_lookup *map = em->map_lookup; const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; int ret; u64 physical = map->stripes[stripe_index].physical; - const u64 physical_end = physical + dev_extent_len; + const u64 dev_stripe_len = btrfs_calc_stripe_length(em); + const u64 physical_end = physical + dev_stripe_len; u64 logical; u64 logic_end; /* The logical increment after finishing one stripe */ @@ -3569,8 +3551,8 @@ next: physical += map->stripe_len; spin_lock(&sctx->stat_lock); if (stop_loop) - sctx->stat.last_physical = map->stripes[stripe_index].physical + - dev_extent_len; + sctx->stat.last_physical = + map->stripes[stripe_index].physical + dev_stripe_len; else sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); @@ -3639,8 +3621,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, for (i = 0; i < map->num_stripes; ++i) { if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, bg, map, scrub_dev, i, - dev_extent_len); + ret = scrub_stripe(sctx, bg, em, scrub_dev, i); if (ret) goto out; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fa56890ff81f..e7671afcee4f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -10,12 +10,14 @@ #include <linux/mount.h> #include <linux/xattr.h> #include <linux/posix_acl_xattr.h> +#include <linux/radix-tree.h> #include <linux/vmalloc.h> #include <linux/string.h> #include <linux/compat.h> #include <linux/crc32c.h> #include "send.h" +#include "ctree.h" #include "backref.h" #include "locking.h" #include "disk-io.h" @@ -81,8 +83,12 @@ struct send_ctx { char *send_buf; u32 send_size; u32 send_max_size; - u64 total_send_size; - u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; + /* + * Whether BTRFS_SEND_A_DATA attribute was already added to current + * command (since protocol v2, data must be the last attribute). + */ + bool put_data; + struct page **send_buf_pages; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ /* Protocol version compatibility requested */ u32 proto; @@ -112,14 +118,14 @@ struct send_ctx { */ u64 cur_ino; u64 cur_inode_gen; - int cur_inode_new; - int cur_inode_new_gen; - int cur_inode_deleted; u64 cur_inode_size; u64 cur_inode_mode; u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; + bool cur_inode_new; + bool cur_inode_new_gen; + bool cur_inode_deleted; bool ignore_cur_inode; u64 send_progress; @@ -127,7 +133,7 @@ struct send_ctx { struct list_head new_refs; struct list_head deleted_refs; - struct xarray name_cache; + struct radix_tree_root name_cache; struct list_head name_cache_list; int name_cache_size; @@ -234,6 +240,9 @@ struct send_ctx { * Indexed by the inode number of the directory to be deleted. */ struct rb_root orphan_dirs; + + struct rb_root rbtree_new_refs; + struct rb_root rbtree_deleted_refs; }; struct pending_dir_move { @@ -268,13 +277,14 @@ struct orphan_dir_info { struct name_cache_entry { struct list_head list; /* - * On 32bit kernels, xarray has only 32bit indices, but we need to - * handle 64bit inums. We use the lower 32bit of the 64bit inum to store - * it in the tree. If more than one inum would fall into the same entry, - * we use inum_aliases to store the additional entries. inum_aliases is - * also used to store entries with the same inum but different generations. + * radix_tree has only 32bit entries but we need to handle 64bit inums. + * We use the lower 32bit of the 64bit inum to store it in the tree. If + * more then one inum would fall into the same entry, we use radix_list + * to store the additional entries. radix_list is also used to store + * entries where two entries have the same inum but different + * generations. */ - struct list_head inum_aliases; + struct list_head radix_list; u64 ino; u64 gen; u64 parent_ino; @@ -333,8 +343,8 @@ __maybe_unused static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) { switch (sctx->proto) { - case 1: return cmd < __BTRFS_SEND_C_MAX_V1; - case 2: return cmd < __BTRFS_SEND_C_MAX_V2; + case 1: return cmd <= BTRFS_SEND_C_MAX_V1; + case 2: return cmd <= BTRFS_SEND_C_MAX_V2; default: return false; } } @@ -575,15 +585,10 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) while (pos < len) { ret = kernel_write(filp, buf + pos, len - pos, off); - /* TODO handle that correctly */ - /*if (ret == -ERESTARTSYS) { - continue; - }*/ if (ret < 0) return ret; - if (ret == 0) { + if (ret == 0) return -EIO; - } pos += ret; } @@ -596,6 +601,9 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) int total_len = sizeof(*hdr) + len; int left = sctx->send_max_size - sctx->send_size; + if (WARN_ON_ONCE(sctx->put_data)) + return -EINVAL; + if (unlikely(left < total_len)) return -EOVERFLOW; @@ -616,6 +624,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ } +TLV_PUT_DEFINE_INT(32) TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, @@ -691,8 +700,7 @@ static int send_header(struct send_ctx *sctx) struct btrfs_stream_header hdr; strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); - hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); - + hdr.version = cpu_to_le32(sctx->proto); return write_buf(sctx->send_filp, &hdr, sizeof(hdr), &sctx->send_off); } @@ -732,9 +740,8 @@ static int send_cmd(struct send_ctx *sctx) ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); - sctx->total_send_size += sctx->send_size; - sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size; sctx->send_size = 0; + sctx->put_data = false; return ret; } @@ -840,7 +847,7 @@ out: */ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, - u64 *gid, u64 *rdev) + u64 *gid, u64 *rdev, u64 *fileattr) { int ret; struct btrfs_inode_item *ii; @@ -870,6 +877,12 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, *gid = btrfs_inode_gid(path->nodes[0], ii); if (rdev) *rdev = btrfs_inode_rdev(path->nodes[0], ii); + /* + * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's + * otherwise logically split to 32/32 parts. + */ + if (fileattr) + *fileattr = btrfs_inode_flags(path->nodes[0], ii); return ret; } @@ -877,7 +890,7 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, static int get_inode_info(struct btrfs_root *root, u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, u64 *gid, - u64 *rdev) + u64 *rdev, u64 *fileattr) { struct btrfs_path *path; int ret; @@ -886,7 +899,7 @@ static int get_inode_info(struct btrfs_root *root, if (!path) return -ENOMEM; ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, - rdev); + rdev, fileattr); btrfs_free_path(path); return ret; } @@ -1632,7 +1645,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) u64 right_gen; ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; left_ret = ret; @@ -1641,7 +1654,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) right_ret = -ENOENT; } else { ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; right_ret = ret; @@ -1804,7 +1817,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, if (dir_gen) { ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0) goto out; } @@ -1876,7 +1889,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, */ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1904,7 +1917,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (other_inode > sctx->send_progress || is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, NULL, - who_gen, who_mode, NULL, NULL, NULL); + who_gen, who_mode, NULL, NULL, NULL, NULL); if (ret < 0) goto out; @@ -1943,7 +1956,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, if (dir != BTRFS_FIRST_FREE_OBJECTID) { ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1966,7 +1979,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, } ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); if (ret < 0) goto out; @@ -2024,9 +2037,9 @@ out: } /* - * Insert a name cache entry. On 32bit kernels the xarray index is 32bit, + * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, * so we need to do some special handling in case we have clashes. This function - * takes care of this with the help of name_cache_entry::inum_aliases. + * takes care of this with the help of name_cache_entry::radix_list. * In case of error, nce is kfreed. */ static int name_cache_insert(struct send_ctx *sctx, @@ -2035,7 +2048,8 @@ static int name_cache_insert(struct send_ctx *sctx, int ret = 0; struct list_head *nce_head; - nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); if (!nce_head) { @@ -2044,14 +2058,14 @@ static int name_cache_insert(struct send_ctx *sctx, } INIT_LIST_HEAD(nce_head); - ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL); + ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); if (ret < 0) { kfree(nce_head); kfree(nce); return ret; } } - list_add_tail(&nce->inum_aliases, nce_head); + list_add_tail(&nce->radix_list, nce_head); list_add_tail(&nce->list, &sctx->name_cache_list); sctx->name_cache_size++; @@ -2063,14 +2077,15 @@ static void name_cache_delete(struct send_ctx *sctx, { struct list_head *nce_head; - nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); if (!nce_head) { btrfs_err(sctx->send_root->fs_info, "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", nce->ino, sctx->name_cache_size); } - list_del(&nce->inum_aliases); + list_del(&nce->radix_list); list_del(&nce->list); sctx->name_cache_size--; @@ -2078,7 +2093,7 @@ static void name_cache_delete(struct send_ctx *sctx, * We may not get to the final release of nce_head if the lookup fails */ if (nce_head && list_empty(nce_head)) { - xa_erase(&sctx->name_cache, (unsigned long)nce->ino); + radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); kfree(nce_head); } } @@ -2089,11 +2104,11 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, struct list_head *nce_head; struct name_cache_entry *cur; - nce_head = xa_load(&sctx->name_cache, (unsigned long)ino); + nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); if (!nce_head) return NULL; - list_for_each_entry(cur, nce_head, inum_aliases) { + list_for_each_entry(cur, nce_head, radix_list) { if (cur->ino == ino && cur->gen == gen) return cur; } @@ -2180,7 +2195,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, /* * If the inode is not existent yet, add the orphan name and return 1. * This should only happen for the parent dir that we determine in - * __record_new_ref + * record_new_ref_if_needed(). */ ret = is_inode_existent(sctx, ino, gen); if (ret < 0) @@ -2495,6 +2510,39 @@ out: return ret; } +static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret = 0; + struct fs_path *p; + + if (sctx->proto < 2) + return 0; + + btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) { struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; @@ -2574,7 +2622,8 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); - /* TODO Add otime support when the otime patches get into upstream */ + if (sctx->proto >= 2) + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime); ret = send_cmd(sctx); @@ -2608,7 +2657,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) if (ino != sctx->cur_ino) { ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, - NULL, NULL, &rdev); + NULL, NULL, &rdev, NULL); if (ret < 0) goto out; } else { @@ -2747,48 +2796,50 @@ struct recorded_ref { u64 dir; u64 dir_gen; int name_len; + struct rb_node node; + struct rb_root *root; }; -static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) +static struct recorded_ref *recorded_ref_alloc(void) { - ref->full_path = path; - ref->name = (char *)kbasename(ref->full_path->start); - ref->name_len = ref->full_path->end - ref->name; + struct recorded_ref *ref; + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) + return NULL; + RB_CLEAR_NODE(&ref->node); + INIT_LIST_HEAD(&ref->list); + return ref; } -/* - * We need to process new refs before deleted refs, but compare_tree gives us - * everything mixed. So we first record all refs and later process them. - * This function is a helper to record one ref. - */ -static int __record_ref(struct list_head *head, u64 dir, - u64 dir_gen, struct fs_path *path) +static void recorded_ref_free(struct recorded_ref *ref) { - struct recorded_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_KERNEL); if (!ref) - return -ENOMEM; + return; + if (!RB_EMPTY_NODE(&ref->node)) + rb_erase(&ref->node, ref->root); + list_del(&ref->list); + fs_path_free(ref->full_path); + kfree(ref); +} - ref->dir = dir; - ref->dir_gen = dir_gen; - set_ref_path(ref, path); - list_add_tail(&ref->list, head); - return 0; +static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) +{ + ref->full_path = path; + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; } static int dup_ref(struct recorded_ref *ref, struct list_head *list) { struct recorded_ref *new; - new = kmalloc(sizeof(*ref), GFP_KERNEL); + new = recorded_ref_alloc(); if (!new) return -ENOMEM; new->dir = ref->dir; new->dir_gen = ref->dir_gen; - new->full_path = NULL; - INIT_LIST_HEAD(&new->list); list_add_tail(&new->list, list); return 0; } @@ -2799,9 +2850,7 @@ static void __free_recorded_refs(struct list_head *head) while (!list_empty(head)) { cur = list_entry(head->next, struct recorded_ref, list); - fs_path_free(cur->full_path); - list_del(&cur->list); - kfree(cur); + recorded_ref_free(cur); } } @@ -3311,7 +3360,7 @@ finish: * The parent inode might have been deleted in the send snapshot */ ret = get_inode_info(sctx->send_root, cur->dir, NULL, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); if (ret == -ENOENT) { ret = 0; continue; @@ -3486,11 +3535,11 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, } ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, - &left_gen, NULL, NULL, NULL, NULL); + &left_gen, NULL, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, - &right_gen, NULL, NULL, NULL, NULL); + &right_gen, NULL, NULL, NULL, NULL, NULL); if (ret < 0) { if (ret == -ENOENT) ret = 0; @@ -3621,7 +3670,7 @@ static int is_ancestor(struct btrfs_root *root, } ret = get_inode_info(root, parent, NULL, &parent_gen, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = check_ino_in_path(root, ino1, ino1_gen, @@ -3713,7 +3762,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, ret = get_inode_info(sctx->parent_root, ino, NULL, &parent_ino_gen, NULL, NULL, NULL, - NULL); + NULL, NULL); if (ret < 0) goto out; if (ino_gen == parent_ino_gen) { @@ -4307,185 +4356,169 @@ out: return ret; } -static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name, - void *ctx, struct list_head *refs) +static int rbtree_ref_comp(const void *k, const struct rb_node *node) +{ + const struct recorded_ref *data = k; + const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); + int result; + + if (data->dir > ref->dir) + return 1; + if (data->dir < ref->dir) + return -1; + if (data->dir_gen > ref->dir_gen) + return 1; + if (data->dir_gen < ref->dir_gen) + return -1; + if (data->name_len > ref->name_len) + return 1; + if (data->name_len < ref->name_len) + return -1; + result = strcmp(data->name, ref->name); + if (result > 0) + return 1; + if (result < 0) + return -1; + return 0; +} + +static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node); + + return rbtree_ref_comp(entry, parent) < 0; +} + +static int record_ref_in_tree(struct rb_root *root, struct list_head *refs, + struct fs_path *name, u64 dir, u64 dir_gen, + struct send_ctx *sctx) { int ret = 0; - struct send_ctx *sctx = ctx; - struct fs_path *p; - u64 gen; + struct fs_path *path = NULL; + struct recorded_ref *ref = NULL; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + path = fs_path_alloc(); + if (!path) { + ret = -ENOMEM; + goto out; + } - ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, - NULL, NULL); - if (ret < 0) + ref = recorded_ref_alloc(); + if (!ref) { + ret = -ENOMEM; goto out; + } - ret = get_cur_path(sctx, dir, gen, p); + ret = get_cur_path(sctx, dir, dir_gen, path); if (ret < 0) goto out; - ret = fs_path_add_path(p, name); + ret = fs_path_add_path(path, name); if (ret < 0) goto out; - ret = __record_ref(refs, dir, gen, p); - + ref->dir = dir; + ref->dir_gen = dir_gen; + set_ref_path(ref, path); + list_add_tail(&ref->list, refs); + rb_add(&ref->node, root, rbtree_ref_less); + ref->root = root; out: - if (ret) - fs_path_free(p); + if (ret) { + if (path && (!ref || !ref->full_path)) + fs_path_free(path); + recorded_ref_free(ref); + } return ret; } -static int __record_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) -{ - struct send_ctx *sctx = ctx; - return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs); -} - - -static int __record_deleted_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) +static int record_new_ref_if_needed(int num, u64 dir, int index, + struct fs_path *name, void *ctx) { + int ret = 0; struct send_ctx *sctx = ctx; - return record_ref(sctx->parent_root, dir, name, ctx, - &sctx->deleted_refs); -} - -static int record_new_ref(struct send_ctx *sctx) -{ - int ret; + struct rb_node *node = NULL; + struct recorded_ref data; + struct recorded_ref *ref; + u64 dir_gen; - ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_new_ref, sctx); + ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL, NULL); if (ret < 0) goto out; - ret = 0; + data.dir = dir; + data.dir_gen = dir_gen; + set_ref_path(&data, name); + node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp); + if (node) { + ref = rb_entry(node, struct recorded_ref, node); + recorded_ref_free(ref); + } else { + ret = record_ref_in_tree(&sctx->rbtree_new_refs, + &sctx->new_refs, name, dir, dir_gen, + sctx); + } out: return ret; } -static int record_deleted_ref(struct send_ctx *sctx) +static int record_deleted_ref_if_needed(int num, u64 dir, int index, + struct fs_path *name, void *ctx) { - int ret; + int ret = 0; + struct send_ctx *sctx = ctx; + struct rb_node *node = NULL; + struct recorded_ref data; + struct recorded_ref *ref; + u64 dir_gen; - ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_deleted_ref, sctx); + ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL, NULL); if (ret < 0) goto out; - ret = 0; + data.dir = dir; + data.dir_gen = dir_gen; + set_ref_path(&data, name); + node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp); + if (node) { + ref = rb_entry(node, struct recorded_ref, node); + recorded_ref_free(ref); + } else { + ret = record_ref_in_tree(&sctx->rbtree_deleted_refs, + &sctx->deleted_refs, name, dir, + dir_gen, sctx); + } out: return ret; } -struct find_ref_ctx { - u64 dir; - u64 dir_gen; - struct btrfs_root *root; - struct fs_path *name; - int found_idx; -}; - -static int __find_iref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx_) -{ - struct find_ref_ctx *ctx = ctx_; - u64 dir_gen; - int ret; - - if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) && - strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) { - /* - * To avoid doing extra lookups we'll only do this if everything - * else matches. - */ - ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); - if (ret) - return ret; - if (dir_gen != ctx->dir_gen) - return 0; - ctx->found_idx = num; - return 1; - } - return 0; -} - -static int find_iref(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *key, - u64 dir, u64 dir_gen, struct fs_path *name) +static int record_new_ref(struct send_ctx *sctx) { int ret; - struct find_ref_ctx ctx; - - ctx.dir = dir; - ctx.name = name; - ctx.dir_gen = dir_gen; - ctx.found_idx = -1; - ctx.root = root; - ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - return ret; - - if (ctx.found_idx == -1) - return -ENOENT; - - return ctx.found_idx; -} - -static int __record_changed_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) -{ - u64 dir_gen; - int ret; - struct send_ctx *sctx = ctx; - - ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); - if (ret) - return ret; - - ret = find_iref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, dir, dir_gen, name); - if (ret == -ENOENT) - ret = __record_new_ref(num, dir, index, name, sctx); - else if (ret > 0) - ret = 0; + goto out; + ret = 0; +out: return ret; } -static int __record_changed_deleted_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) +static int record_deleted_ref(struct send_ctx *sctx) { - u64 dir_gen; int ret; - struct send_ctx *sctx = ctx; - - ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); - if (ret) - return ret; - ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, - dir, dir_gen, name); - if (ret == -ENOENT) - ret = __record_deleted_ref(num, dir, index, name, sctx); - else if (ret > 0) - ret = 0; + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, record_deleted_ref_if_needed, + sctx); + if (ret < 0) + goto out; + ret = 0; +out: return ret; } @@ -4494,11 +4527,11 @@ static int record_changed_ref(struct send_ctx *sctx) int ret = 0; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_changed_new_ref, sctx); + sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) goto out; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); + sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) goto out; ret = 0; @@ -4529,10 +4562,10 @@ static int process_all_refs(struct send_ctx *sctx, if (cmd == BTRFS_COMPARE_TREE_NEW) { root = sctx->send_root; - cb = __record_new_ref; + cb = record_new_ref_if_needed; } else if (cmd == BTRFS_COMPARE_TREE_DELETED) { root = sctx->parent_root; - cb = __record_deleted_ref; + cb = record_deleted_ref_if_needed; } else { btrfs_err(sctx->send_root->fs_info, "Wrong command %d in process_all_refs", cmd); @@ -4860,14 +4893,28 @@ static inline u64 max_send_read_size(const struct send_ctx *sctx) static int put_data_header(struct send_ctx *sctx, u32 len) { - struct btrfs_tlv_header *hdr; + if (WARN_ON_ONCE(sctx->put_data)) + return -EINVAL; + sctx->put_data = true; + if (sctx->proto >= 2) { + /* + * Since v2, the data attribute header doesn't include a length, + * it is implicitly to the end of the command. + */ + if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len) + return -EOVERFLOW; + put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size); + sctx->send_size += sizeof(__le16); + } else { + struct btrfs_tlv_header *hdr; - if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) - return -EOVERFLOW; - hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); - put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); - put_unaligned_le16(len, &hdr->tlv_len); - sctx->send_size += sizeof(*hdr); + if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + return -EOVERFLOW; + hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); + put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + sctx->send_size += sizeof(*hdr); + } return 0; } @@ -5010,7 +5057,7 @@ static int send_clone(struct send_ctx *sctx, if (clone_root->root == sctx->send_root) { ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, - &gen, NULL, NULL, NULL, NULL); + &gen, NULL, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); @@ -5137,17 +5184,214 @@ tlv_put_failure: return ret; } -static int send_extent_data(struct send_ctx *sctx, - const u64 offset, - const u64 len) +static int send_encoded_inline_extent(struct send_ctx *sctx, + struct btrfs_path *path, u64 offset, + u64 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct fs_path *fspath; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u64 ram_bytes; + size_t inline_size; + int ret; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + fspath = fs_path_alloc(); + if (!fspath) { + ret = -ENOMEM; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei); + inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, + min(key.offset + ram_bytes - offset, len)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset); + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, ei)); + if (ret < 0) + goto out; + TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); + + ret = put_data_header(sctx, inline_size); + if (ret < 0) + goto out; + read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, + btrfs_file_extent_inline_start(ei), inline_size); + sctx->send_size += inline_size; + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(fspath); + iput(inode); + return ret; +} + +static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, + u64 offset, u64 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct fs_path *fspath; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u64 disk_bytenr, disk_num_bytes; + u32 data_offset; + struct btrfs_cmd_header *hdr; + u32 crc; + int ret; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + fspath = fs_path_alloc(); + if (!fspath) { + ret = -ENOMEM; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei); + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, + min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset, + len)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, + btrfs_file_extent_ram_bytes(leaf, ei)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, + offset - key.offset + btrfs_file_extent_offset(leaf, ei)); + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, ei)); + if (ret < 0) + goto out; + TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); + TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0); + + ret = put_data_header(sctx, disk_num_bytes); + if (ret < 0) + goto out; + + /* + * We want to do I/O directly into the send buffer, so get the next page + * boundary in the send buffer. This means that there may be a gap + * between the beginning of the command and the file data. + */ + data_offset = ALIGN(sctx->send_size, PAGE_SIZE); + if (data_offset > sctx->send_max_size || + sctx->send_max_size - data_offset < disk_num_bytes) { + ret = -EOVERFLOW; + goto out; + } + + /* + * Note that send_buf is a mapping of send_buf_pages, so this is really + * reading into send_buf. + */ + ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, + disk_bytenr, disk_num_bytes, + sctx->send_buf_pages + + (data_offset >> PAGE_SHIFT)); + if (ret) + goto out; + + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr)); + hdr->crc = 0; + crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); + hdr->crc = cpu_to_le32(crc); + + ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, + &sctx->send_off); + if (!ret) { + ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset, + disk_num_bytes, &sctx->send_off); + } + sctx->send_size = 0; + sctx->put_data = false; + +tlv_put_failure: +out: + fs_path_free(fspath); + iput(inode); + return ret; +} + +static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, + const u64 offset, const u64 len) { const u64 end = offset + len; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *ei; u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { + bool is_inline = (btrfs_file_extent_type(leaf, ei) == + BTRFS_FILE_EXTENT_INLINE); + + /* + * Send the compressed extent unless the compressed data is + * larger than the decompressed data. This can happen if we're + * not sending the entire extent, either because it has been + * partially overwritten/truncated or because this is a part of + * the extent that we couldn't clone in clone_range(). + */ + if (is_inline && + btrfs_file_extent_inline_item_len(leaf, + path->slots[0]) <= len) { + return send_encoded_inline_extent(sctx, path, offset, + len); + } else if (!is_inline && + btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) { + return send_encoded_extent(sctx, path, offset, len); + } + } + if (sctx->cur_inode == NULL) { struct btrfs_root *root = sctx->send_root; @@ -5285,12 +5529,9 @@ out: return ret; } -static int clone_range(struct send_ctx *sctx, - struct clone_root *clone_root, - const u64 disk_byte, - u64 data_offset, - u64 offset, - u64 len) +static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, + struct clone_root *clone_root, const u64 disk_byte, + u64 data_offset, u64 offset, u64 len) { struct btrfs_path *path; struct btrfs_key key; @@ -5314,7 +5555,7 @@ static int clone_range(struct send_ctx *sctx, */ if (clone_root->offset == 0 && len == sctx->send_root->fs_info->sectorsize) - return send_extent_data(sctx, offset, len); + return send_extent_data(sctx, dst_path, offset, len); path = alloc_path_for_send(); if (!path) @@ -5325,7 +5566,8 @@ static int clone_range(struct send_ctx *sctx, * accept clones from these extents. */ ret = __get_inode_info(clone_root->root, path, clone_root->ino, - &clone_src_i_size, NULL, NULL, NULL, NULL, NULL); + &clone_src_i_size, NULL, NULL, NULL, NULL, NULL, + NULL); btrfs_release_path(path); if (ret < 0) goto out; @@ -5411,7 +5653,8 @@ static int clone_range(struct send_ctx *sctx, if (hole_len > len) hole_len = len; - ret = send_extent_data(sctx, offset, hole_len); + ret = send_extent_data(sctx, dst_path, offset, + hole_len); if (ret < 0) goto out; @@ -5484,14 +5727,16 @@ static int clone_range(struct send_ctx *sctx, if (ret < 0) goto out; } - ret = send_extent_data(sctx, offset + slen, + ret = send_extent_data(sctx, dst_path, + offset + slen, clone_len - slen); } else { ret = send_clone(sctx, offset, clone_len, clone_root); } } else { - ret = send_extent_data(sctx, offset, clone_len); + ret = send_extent_data(sctx, dst_path, offset, + clone_len); } if (ret < 0) @@ -5523,7 +5768,7 @@ next: } if (len > 0) - ret = send_extent_data(sctx, offset, len); + ret = send_extent_data(sctx, dst_path, offset, len); else ret = 0; out: @@ -5554,10 +5799,10 @@ static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei); - ret = clone_range(sctx, clone_root, disk_byte, data_offset, - offset, end - offset); + ret = clone_range(sctx, path, clone_root, disk_byte, + data_offset, offset, end - offset); } else { - ret = send_extent_data(sctx, offset, end - offset); + ret = send_extent_data(sctx, path, offset, end - offset); } sctx->cur_inode_next_write_offset = end; return ret; @@ -6017,11 +6262,14 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) u64 left_mode; u64 left_uid; u64 left_gid; + u64 left_fileattr; u64 right_mode; u64 right_uid; u64 right_gid; + u64 right_fileattr; int need_chmod = 0; int need_chown = 0; + bool need_fileattr = false; int need_truncate = 1; int pending_move = 0; int refs_processed = 0; @@ -6055,7 +6303,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, - &left_mode, &left_uid, &left_gid, NULL); + &left_mode, &left_uid, &left_gid, NULL, &left_fileattr); if (ret < 0) goto out; @@ -6070,7 +6318,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &old_size, NULL, &right_mode, &right_uid, - &right_gid, NULL); + &right_gid, NULL, &right_fileattr); if (ret < 0) goto out; @@ -6078,6 +6326,8 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) need_chown = 1; if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) need_chmod = 1; + if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr) + need_fileattr = true; if ((old_size == sctx->cur_inode_size) || (sctx->cur_inode_size > old_size && sctx->cur_inode_next_write_offset == sctx->cur_inode_size)) @@ -6121,6 +6371,12 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (ret < 0) goto out; } + if (need_fileattr) { + ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_fileattr); + if (ret < 0) + goto out; + } ret = send_capabilities(sctx); if (ret < 0) @@ -6161,8 +6417,13 @@ static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name, { struct parent_paths_ctx *ppctx = ctx; - return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx, - ppctx->refs); + /* + * Pass 0 as the generation for the directory, we don't care about it + * here as we have no new references to add, we just want to delete all + * references for an inode. + */ + return record_ref_in_tree(&ppctx->sctx->rbtree_deleted_refs, ppctx->refs, + name, dir, 0, ppctx->sctx); } /* @@ -6216,9 +6477,7 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx) ret = send_unlink(sctx, ref->full_path); if (ret < 0) goto out; - fs_path_free(ref->full_path); - list_del(&ref->list); - kfree(ref); + recorded_ref_free(ref); } ret = 0; out: @@ -6265,7 +6524,7 @@ static int changed_inode(struct send_ctx *sctx, close_current_inode(sctx); sctx->cur_ino = key->objectid; - sctx->cur_inode_new_gen = 0; + sctx->cur_inode_new_gen = false; sctx->cur_inode_last_extent = (u64)-1; sctx->cur_inode_next_write_offset = 0; sctx->ignore_cur_inode = false; @@ -6306,7 +6565,7 @@ static int changed_inode(struct send_ctx *sctx, */ if (left_gen != right_gen && sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) - sctx->cur_inode_new_gen = 1; + sctx->cur_inode_new_gen = true; } /* @@ -6338,8 +6597,8 @@ static int changed_inode(struct send_ctx *sctx, if (result == BTRFS_COMPARE_TREE_NEW) { sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 1; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6350,8 +6609,8 @@ static int changed_inode(struct send_ctx *sctx, ret = send_create_inode_if_needed(sctx); } else if (result == BTRFS_COMPARE_TREE_DELETED) { sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_deleted = 1; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; sctx->cur_inode_size = btrfs_inode_size( sctx->right_path->nodes[0], right_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6369,8 +6628,8 @@ static int changed_inode(struct send_ctx *sctx, * First, process the inode as if it was deleted. */ sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_deleted = 1; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; sctx->cur_inode_size = btrfs_inode_size( sctx->right_path->nodes[0], right_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6384,8 +6643,8 @@ static int changed_inode(struct send_ctx *sctx, * Now process the inode as if it was new. */ sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 1; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6417,9 +6676,9 @@ static int changed_inode(struct send_ctx *sctx, goto out; } else { sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_new_gen = 0; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = false; + sctx->cur_inode_new_gen = false; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6532,12 +6791,12 @@ static int dir_changed(struct send_ctx *sctx, u64 dir) int ret; ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); if (ret) return ret; ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret) return ret; @@ -7518,7 +7777,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - xa_init_flags(&sctx->name_cache, GFP_KERNEL); + INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); INIT_LIST_HEAD(&sctx->name_cache_list); sctx->flags = arg->flags; @@ -7533,6 +7792,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) } else { sctx->proto = 1; } + if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) { + ret = -EINVAL; + goto out; + } sctx->send_filp = fget(arg->send_fd); if (!sctx->send_filp) { @@ -7552,8 +7815,31 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) sctx->clone_roots_cnt = arg->clone_sources_count; - sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); + if (sctx->proto >= 2) { + u32 send_buf_num_pages; + + sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE); + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } + send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT; + sctx->send_buf_pages = kcalloc(send_buf_num_pages, + sizeof(*sctx->send_buf_pages), + GFP_KERNEL); + if (!sctx->send_buf_pages) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < send_buf_num_pages; i++) { + sctx->send_buf_pages[i] = + vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT)); + } + } else { + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1; + sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); + } if (!sctx->send_buf) { ret = -ENOMEM; goto out; @@ -7562,6 +7848,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; + sctx->rbtree_new_refs = RB_ROOT; + sctx->rbtree_deleted_refs = RB_ROOT; sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), arg->clone_sources_count + 1, @@ -7746,6 +8034,7 @@ out: fput(sctx->send_filp); kvfree(sctx->clone_roots); + kfree(sctx->send_buf_pages); kvfree(sctx->send_buf); name_cache_free(sctx); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 08602fdd600a..4bb4e6a638cb 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -7,12 +7,19 @@ #ifndef BTRFS_SEND_H #define BTRFS_SEND_H -#include "ctree.h" +#include <linux/types.h> #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" -#define BTRFS_SEND_STREAM_VERSION 1 +#define BTRFS_SEND_STREAM_VERSION 2 -#define BTRFS_SEND_BUF_SIZE SZ_64K +/* + * In send stream v1, no command is larger than 64K. In send stream v2, no limit + * should be assumed. + */ +#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K + +struct inode; +struct btrfs_ioctl_send_args; enum btrfs_tlv_type { BTRFS_TLV_U8, @@ -46,87 +53,117 @@ struct btrfs_tlv_header { /* commands */ enum btrfs_send_cmd { - BTRFS_SEND_C_UNSPEC, + BTRFS_SEND_C_UNSPEC = 0, /* Version 1 */ - BTRFS_SEND_C_SUBVOL, - BTRFS_SEND_C_SNAPSHOT, + BTRFS_SEND_C_SUBVOL = 1, + BTRFS_SEND_C_SNAPSHOT = 2, - BTRFS_SEND_C_MKFILE, - BTRFS_SEND_C_MKDIR, - BTRFS_SEND_C_MKNOD, - BTRFS_SEND_C_MKFIFO, - BTRFS_SEND_C_MKSOCK, - BTRFS_SEND_C_SYMLINK, + BTRFS_SEND_C_MKFILE = 3, + BTRFS_SEND_C_MKDIR = 4, + BTRFS_SEND_C_MKNOD = 5, + BTRFS_SEND_C_MKFIFO = 6, + BTRFS_SEND_C_MKSOCK = 7, + BTRFS_SEND_C_SYMLINK = 8, - BTRFS_SEND_C_RENAME, - BTRFS_SEND_C_LINK, - BTRFS_SEND_C_UNLINK, - BTRFS_SEND_C_RMDIR, + BTRFS_SEND_C_RENAME = 9, + BTRFS_SEND_C_LINK = 10, + BTRFS_SEND_C_UNLINK = 11, + BTRFS_SEND_C_RMDIR = 12, - BTRFS_SEND_C_SET_XATTR, - BTRFS_SEND_C_REMOVE_XATTR, + BTRFS_SEND_C_SET_XATTR = 13, + BTRFS_SEND_C_REMOVE_XATTR = 14, - BTRFS_SEND_C_WRITE, - BTRFS_SEND_C_CLONE, + BTRFS_SEND_C_WRITE = 15, + BTRFS_SEND_C_CLONE = 16, - BTRFS_SEND_C_TRUNCATE, - BTRFS_SEND_C_CHMOD, - BTRFS_SEND_C_CHOWN, - BTRFS_SEND_C_UTIMES, + BTRFS_SEND_C_TRUNCATE = 17, + BTRFS_SEND_C_CHMOD = 18, + BTRFS_SEND_C_CHOWN = 19, + BTRFS_SEND_C_UTIMES = 20, - BTRFS_SEND_C_END, - BTRFS_SEND_C_UPDATE_EXTENT, - __BTRFS_SEND_C_MAX_V1, + BTRFS_SEND_C_END = 21, + BTRFS_SEND_C_UPDATE_EXTENT = 22, + BTRFS_SEND_C_MAX_V1 = 22, /* Version 2 */ - __BTRFS_SEND_C_MAX_V2, + BTRFS_SEND_C_FALLOCATE = 23, + BTRFS_SEND_C_FILEATTR = 24, + BTRFS_SEND_C_ENCODED_WRITE = 25, + BTRFS_SEND_C_MAX_V2 = 25, /* End */ - __BTRFS_SEND_C_MAX, + BTRFS_SEND_C_MAX = 25, }; -#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) /* attributes in send stream */ enum { - BTRFS_SEND_A_UNSPEC, - - BTRFS_SEND_A_UUID, - BTRFS_SEND_A_CTRANSID, - - BTRFS_SEND_A_INO, - BTRFS_SEND_A_SIZE, - BTRFS_SEND_A_MODE, - BTRFS_SEND_A_UID, - BTRFS_SEND_A_GID, - BTRFS_SEND_A_RDEV, - BTRFS_SEND_A_CTIME, - BTRFS_SEND_A_MTIME, - BTRFS_SEND_A_ATIME, - BTRFS_SEND_A_OTIME, - - BTRFS_SEND_A_XATTR_NAME, - BTRFS_SEND_A_XATTR_DATA, - - BTRFS_SEND_A_PATH, - BTRFS_SEND_A_PATH_TO, - BTRFS_SEND_A_PATH_LINK, - - BTRFS_SEND_A_FILE_OFFSET, - BTRFS_SEND_A_DATA, - - BTRFS_SEND_A_CLONE_UUID, - BTRFS_SEND_A_CLONE_CTRANSID, - BTRFS_SEND_A_CLONE_PATH, - BTRFS_SEND_A_CLONE_OFFSET, - BTRFS_SEND_A_CLONE_LEN, - - __BTRFS_SEND_A_MAX, + BTRFS_SEND_A_UNSPEC = 0, + + /* Version 1 */ + BTRFS_SEND_A_UUID = 1, + BTRFS_SEND_A_CTRANSID = 2, + + BTRFS_SEND_A_INO = 3, + BTRFS_SEND_A_SIZE = 4, + BTRFS_SEND_A_MODE = 5, + BTRFS_SEND_A_UID = 6, + BTRFS_SEND_A_GID = 7, + BTRFS_SEND_A_RDEV = 8, + BTRFS_SEND_A_CTIME = 9, + BTRFS_SEND_A_MTIME = 10, + BTRFS_SEND_A_ATIME = 11, + BTRFS_SEND_A_OTIME = 12, + + BTRFS_SEND_A_XATTR_NAME = 13, + BTRFS_SEND_A_XATTR_DATA = 14, + + BTRFS_SEND_A_PATH = 15, + BTRFS_SEND_A_PATH_TO = 16, + BTRFS_SEND_A_PATH_LINK = 17, + + BTRFS_SEND_A_FILE_OFFSET = 18, + /* + * As of send stream v2, this attribute is special: it must be the last + * attribute in a command, its header contains only the type, and its + * length is implicitly the remaining length of the command. + */ + BTRFS_SEND_A_DATA = 19, + + BTRFS_SEND_A_CLONE_UUID = 20, + BTRFS_SEND_A_CLONE_CTRANSID = 21, + BTRFS_SEND_A_CLONE_PATH = 22, + BTRFS_SEND_A_CLONE_OFFSET = 23, + BTRFS_SEND_A_CLONE_LEN = 24, + + BTRFS_SEND_A_MAX_V1 = 24, + + /* Version 2 */ + BTRFS_SEND_A_FALLOCATE_MODE = 25, + + /* + * File attributes from the FS_*_FL namespace (i_flags, xflags), + * translated to BTRFS_INODE_* bits (BTRFS_INODE_FLAG_MASK) and stored + * in btrfs_inode_item::flags (represented by btrfs_inode::flags and + * btrfs_inode::ro_flags). + */ + BTRFS_SEND_A_FILEATTR = 26, + + BTRFS_SEND_A_UNENCODED_FILE_LEN = 27, + BTRFS_SEND_A_UNENCODED_LEN = 28, + BTRFS_SEND_A_UNENCODED_OFFSET = 29, + /* + * COMPRESSION and ENCRYPTION default to NONE (0) if omitted from + * BTRFS_SEND_C_ENCODED_WRITE. + */ + BTRFS_SEND_A_COMPRESSION = 30, + BTRFS_SEND_A_ENCRYPTION = 31, + BTRFS_SEND_A_MAX_V2 = 31, + + /* End */ + BTRFS_SEND_A_MAX = 31, }; -#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) -#ifdef __KERNEL__ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); -#endif #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2dd8754cb990..d0cbeb7ae81c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -9,6 +9,7 @@ #include "ordered-data.h" #include "transaction.h" #include "block-group.h" +#include "zoned.h" /* * HOW DOES SPACE RESERVATION WORK @@ -187,6 +188,37 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) */ #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) +/* + * Calculate chunk size depending on volume type (regular or zoned). + */ +static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) +{ + if (btrfs_is_zoned(fs_info)) + return fs_info->zone_size; + + ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + return SZ_1G; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + return SZ_32M; + + /* Handle BTRFS_BLOCK_GROUP_METADATA */ + if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G) + return SZ_1G; + + return SZ_256M; +} + +/* + * Update default chunk size. + */ +void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size) +{ + WRITE_ONCE(space_info->chunk_size, chunk_size); +} + static int create_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -208,6 +240,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; + btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); if (btrfs_is_zoned(info)) space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; @@ -263,7 +296,7 @@ out: void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, u64 bytes_readonly, u64 bytes_zone_unusable, - struct btrfs_space_info **space_info) + bool active, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; int factor; @@ -274,6 +307,8 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, ASSERT(found); spin_lock(&found->lock); found->total_bytes += total_bytes; + if (active) + found->active_total_bytes += total_bytes; found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; @@ -337,6 +372,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, return avail; } +static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + /* + * On regular filesystem, all total_bytes are always writable. On zoned + * filesystem, there may be a limitation imposed by max_active_zones. + * For metadata allocation, we cannot finish an existing active block + * group to avoid a deadlock. Thus, we need to consider only the active + * groups to be writable for metadata space. + */ + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return space_info->total_bytes; + + return space_info->active_total_bytes; +} + int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) @@ -349,9 +400,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - avail = calc_available_free_space(fs_info, space_info, flush); + if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) + avail = 0; + else + avail = calc_available_free_space(fs_info, space_info, flush); - if (used + bytes < space_info->total_bytes + avail) + if (used + bytes < writable_total_bytes(fs_info, space_info) + avail) return 1; return 0; } @@ -387,7 +441,7 @@ again: ticket = list_first_entry(head, struct reserve_ticket, list); /* Check and see if our ticket can be satisfied now. */ - if ((used + ticket->bytes <= space_info->total_bytes) || + if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(fs_info, @@ -671,6 +725,18 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case ALLOC_CHUNK: case ALLOC_CHUNK_FORCE: + /* + * For metadata space on zoned filesystem, reaching here means we + * don't have enough space left in active_total_bytes. Try to + * activate a block group first, because we may have inactive + * block group already allocated. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false); + if (ret < 0) + break; + else if (ret == 1) + break; + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -681,6 +747,23 @@ static void flush_space(struct btrfs_fs_info *fs_info, (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); + + /* + * For metadata space on zoned filesystem, allocating a new chunk + * is not enough. We still need to activate the block * group. + * Active the newly allocated block group by (maybe) finishing + * a block group. + */ + if (ret == 1) { + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); + /* + * Revert to the original ret regardless we could finish + * one block group or not. + */ + if (ret >= 0) + ret = 1; + } + if (ret > 0 || ret == -ENOSPC) ret = 0; break; @@ -718,6 +801,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, { u64 used; u64 avail; + u64 total; u64 to_reclaim = space_info->reclaim_size; lockdep_assert_held(&space_info->lock); @@ -732,8 +816,9 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, * space. If that's the case add in our overage so we make sure to put * appropriate pressure on the flushing state machine. */ - if (space_info->total_bytes + avail < used) - to_reclaim += used - (space_info->total_bytes + avail); + total = writable_total_bytes(fs_info, space_info); + if (total + avail < used) + to_reclaim += used - (total + avail); return to_reclaim; } @@ -743,9 +828,12 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, { u64 global_rsv_size = fs_info->global_block_rsv.reserved; u64 ordered, delalloc; - u64 thresh = div_factor_fine(space_info->total_bytes, 90); + u64 total = writable_total_bytes(fs_info, space_info); + u64 thresh; u64 used; + thresh = div_factor_fine(total, 90); + lockdep_assert_held(&space_info->lock); /* If we're just plain full then async reclaim just slows us down. */ @@ -807,8 +895,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, BTRFS_RESERVE_FLUSH_ALL); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_readonly + global_rsv_size; - if (used < space_info->total_bytes) - thresh += space_info->total_bytes - used; + if (used < total) + thresh += total - used; thresh >>= space_info->clamp; used = space_info->bytes_pinned; @@ -1280,7 +1368,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); /* * This is the priority reclaim path, so to_reclaim could be >0 still - * because we may have only satisified the priority tickets and still + * because we may have only satisfied the priority tickets and still * left non priority tickets on the list. We would then have * to_reclaim but ->bytes == 0. */ @@ -1525,7 +1613,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * can_overcommit() to ensure we can overcommit to continue. */ if (!pending_tickets && - ((used + orig_bytes <= space_info->total_bytes) || + ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index c096695598c1..12fd6147f92d 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -19,12 +19,16 @@ struct btrfs_space_info { u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + /* Total bytes in the space, but only accounts active block groups. */ + u64 active_total_bytes; u64 bytes_zone_unusable; /* total bytes that are unusable until resetting the device zone */ u64 max_extent_size; /* This will hold the maximum extent size of the space info if we had an ENOSPC in the allocator. */ + /* Chunk size in bytes */ + u64 chunk_size; /* * Once a block group drops below this threshold (percents) we'll @@ -122,7 +126,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, u64 bytes_readonly, u64 bytes_zone_unusable, - struct btrfs_space_info **space_info); + bool active, struct btrfs_space_info **space_info); +void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index f429256f56db..12455b2b41de 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -12,15 +12,10 @@ static bool check_setget_bounds(const struct extent_buffer *eb, { const unsigned long member_offset = (unsigned long)ptr + off; - if (member_offset > eb->len) { + if (unlikely(member_offset + size > eb->len)) { btrfs_warn(eb->fs_info, - "bad eb member start: ptr 0x%lx start %llu member offset %lu size %d", - (unsigned long)ptr, eb->start, member_offset, size); - return false; - } - if (member_offset + size > eb->len) { - btrfs_warn(eb->fs_info, - "bad eb member end: ptr 0x%lx start %llu member offset %lu size %d", + "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", + (member_offset > eb->len ? "start" : "end"), (unsigned long)ptr, eb->start, member_offset, size); return false; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index a105b291444f..6fc2b77ae5c3 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -123,7 +123,7 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct btrfs_subpage *subpage; /* - * We have cases like a dummy extent buffer page, which is not mappped + * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. */ if (page->mapping) @@ -731,7 +731,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, * It should not have any subpage::writers count. * Can be unlocked by unlock_page(). * This is the most common locked page for __extent_writepage() called - * inside extent_write_cache_pages() or extent_write_full_page(). + * inside extent_write_cache_pages(). * Rarer cases include the @locked_page from extent_write_locked_range(). * * - Page locked by lock_delalloc_pages() diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6627dd7875ee..f89beac3c665 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,6 +48,7 @@ #include "block-group.h" #include "discard.h" #include "qgroup.h" +#include "raid56.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -72,7 +73,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data); #define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) /* - * Characters to print to indicate error conditions or uncommon filesystem sate. + * Characters to print to indicate error conditions or uncommon filesystem state. * RO is not an error. */ static const char fs_state_chars[] = { @@ -1815,6 +1816,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, error = -EBUSY; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, + s->s_id); btrfs_sb(s)->bdev_holder = fs_type; if (!strstr(crc32c_impl(), "generic")) set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); @@ -1931,10 +1934,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_meta_write_workers, - new_pool_size); btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); @@ -2246,12 +2245,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, if (type & BTRFS_BLOCK_GROUP_RAID0) num_stripes = nr_devices; - else if (type & BTRFS_BLOCK_GROUP_RAID1) - num_stripes = 2; - else if (type & BTRFS_BLOCK_GROUP_RAID1C3) - num_stripes = 3; - else if (type & BTRFS_BLOCK_GROUP_RAID1C4) - num_stripes = 4; + else if (type & BTRFS_BLOCK_GROUP_RAID1_MASK) + num_stripes = rattr->ncopies; else if (type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = 4; @@ -2275,17 +2270,13 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN); /* - * In order to avoid overwriting the superblock on the drive, - * btrfs starts at an offset of at least 1MB when doing chunk - * allocation. - * - * This ensures we have at least min_stripe_size free space - * after excluding 1MB. + * Ensure we have at least min_stripe_size on top of the + * reserved space on the device. */ - if (avail_space <= SZ_1M + min_stripe_size) + if (avail_space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size) continue; - avail_space -= SZ_1M; + avail_space -= BTRFS_DEVICE_RANGE_RESERVED; devices_info[i].dev = device; devices_info[i].max_avail = avail_space; @@ -2703,13 +2694,9 @@ static int __init init_btrfs_fs(void) if (err) goto free_delayed_ref; - err = btrfs_end_io_wq_init(); - if (err) - goto free_prelim_ref; - err = btrfs_interface_init(); if (err) - goto free_end_io_wq; + goto free_prelim_ref; btrfs_print_mod_info(); @@ -2725,8 +2712,6 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); -free_end_io_wq: - btrfs_end_io_wq_exit(); free_prelim_ref: btrfs_prelim_ref_exit(); free_delayed_ref: @@ -2764,7 +2749,6 @@ static void __exit exit_btrfs_fs(void) extent_state_cache_exit(); extent_io_exit(); btrfs_interface_exit(); - btrfs_end_io_wq_exit(); unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 92a1fa8e3da6..d5d0717fd09a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -21,6 +21,7 @@ #include "space-info.h" #include "block-group.h" #include "qgroup.h" +#include "misc.h" /* * Structure name Path @@ -61,6 +62,10 @@ struct raid_kobject { .store = _store, \ } +#define BTRFS_ATTR_W(_prefix, _name, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) + #define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) @@ -92,6 +97,7 @@ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); +static struct kobject *get_btrfs_kobj(struct kobject *kobj); static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a) { @@ -270,12 +276,10 @@ static umode_t btrfs_feature_visible(struct kobject *kobj, return mode; } -BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD); -BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); @@ -283,9 +287,10 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); -#ifdef CONFIG_BTRFS_DEBUG -/* Remove once support for zoned allocation is feature complete */ +#ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); +#endif +#ifdef CONFIG_BTRFS_DEBUG /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); #endif @@ -296,17 +301,15 @@ BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); /* * Features which depend on feature bits and may differ between each fs. * - * /sys/fs/btrfs/features - all available features implemeted by this version + * /sys/fs/btrfs/features - all available features implemented by this version * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or * can be changed on a mounted filesystem. */ static struct attribute *btrfs_supported_feature_attrs[] = { - BTRFS_FEAT_ATTR_PTR(mixed_backref), BTRFS_FEAT_ATTR_PTR(default_subvol), BTRFS_FEAT_ATTR_PTR(mixed_groups), BTRFS_FEAT_ATTR_PTR(compress_lzo), BTRFS_FEAT_ATTR_PTR(compress_zstd), - BTRFS_FEAT_ATTR_PTR(big_metadata), BTRFS_FEAT_ATTR_PTR(extended_iref), BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), @@ -314,8 +317,10 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), +#endif +#ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(extent_tree_v2), #endif #ifdef CONFIG_FS_VERITY @@ -709,6 +714,112 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ } \ BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) +static ssize_t btrfs_chunk_size_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + + return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size)); +} + +/* + * Store new chunk size in space info. Can be called on a read-only filesystem. + * + * If the new chunk size value is larger than 10% of free space it is reduced + * to match that limit. Alignment must be to 256M and the system chunk size + * cannot be set. + */ +static ssize_t btrfs_chunk_size_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); + char *retptr; + u64 val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!fs_info->fs_devices) + return -EINVAL; + + if (btrfs_is_zoned(fs_info)) + return -EINVAL; + + /* System block type must not be changed. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) + return -EPERM; + + val = memparse(buf, &retptr); + /* There could be trailing '\n', also catch any typos after the value */ + retptr = skip_spaces(retptr); + if (*retptr != 0 || val == 0) + return -EINVAL; + + val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE); + + /* Limit stripe size to 10% of available space. */ + val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val); + + /* Must be multiple of 256M. */ + val &= ~((u64)SZ_256M - 1); + + /* Must be at least 256M. */ + if (val < SZ_256M) + return -EINVAL; + + btrfs_update_space_info_chunk_size(space_info, val); + + return len; +} + +#ifdef CONFIG_BTRFS_DEBUG +/* + * Request chunk allocation with current chunk size. + */ +static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); + struct btrfs_trans_handle *trans; + bool val; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + if (!val) + return -EINVAL; + + /* + * This is unsafe to be called from sysfs context and may cause + * unexpected problems. + */ + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_force_chunk_alloc(trans, space_info->flags); + btrfs_end_transaction(trans); + + if (ret == 1) + return len; + + return -ENOSPC; +} +BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store); + +#endif + SPACE_INFO_ATTR(flags); SPACE_INFO_ATTR(total_bytes); SPACE_INFO_ATTR(bytes_used); @@ -719,6 +830,7 @@ SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); +BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, struct kobj_attribute *a, @@ -773,6 +885,10 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), + BTRFS_ATTR_PTR(space_info, chunk_size), +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_ATTR_PTR(space_info, force_chunk_alloc), +#endif NULL, }; ATTRIBUTE_GROUPS(space_info); @@ -871,6 +987,48 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); +static ssize_t btrfs_commit_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, + "commits %llu\n" + "last_commit_ms %llu\n" + "max_commit_ms %llu\n" + "total_commit_ms %llu\n", + fs_info->commit_stats.commit_count, + div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC), + div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC), + div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC)); +} + +static ssize_t btrfs_commit_stats_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + unsigned long val; + int ret; + + if (!fs_info) + return -EPERM; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + ret = kstrtoul(buf, 10, &val); + if (ret) + return ret; + if (val) + return -EINVAL; + + WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0); + + return len; +} +BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store); + static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1110,6 +1268,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, generation), BTRFS_ATTR_PTR(, read_policy), BTRFS_ATTR_PTR(, bg_reclaim_threshold), + BTRFS_ATTR_PTR(, commit_stats), NULL, }; @@ -1140,6 +1299,16 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) return to_fs_devs(kobj)->fs_info; } +static struct kobject *get_btrfs_kobj(struct kobject *kobj) +{ + while (kobj) { + if (kobj->ktype == &btrfs_ktype) + return kobj; + kobj = kobj->parent; + } + return NULL; +} + #define NUM_FEATURE_BITS 64 #define BTRFS_FEATURE_NAME_MAX 13 static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; @@ -2106,4 +2275,3 @@ void __cold btrfs_exit_sysfs(void) #endif kset_unregister(btrfs_kset); } - diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 1591bfa55bcc..cc9377cf56a3 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -59,6 +59,7 @@ struct inode *btrfs_new_test_inode(void) return NULL; inode->i_mode = S_IFREG; + inode->i_ino = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; @@ -150,8 +151,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { - unsigned long index; - struct extent_buffer *eb; + struct radix_tree_iter iter; + void **slot; struct btrfs_device *dev, *tmp; if (!fs_info) @@ -163,9 +164,25 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) test_mnt->mnt_sb->s_fs_info = NULL; - xa_for_each(&fs_info->extent_buffers, index, eb) { + spin_lock(&fs_info->buffer_lock); + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { + struct extent_buffer *eb; + + eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); + if (!eb) + continue; + /* Shouldn't happen but that kind of thinking creates CVE's */ + if (radix_tree_exception(eb)) { + if (radix_tree_deref_retry(eb)) + slot = radix_tree_iter_retry(&iter); + continue; + } + slot = radix_tree_iter_resume(slot, &iter); + spin_unlock(&fs_info->buffer_lock); free_extent_buffer_stale(eb); + spin_lock(&fs_info->buffer_lock); } + spin_unlock(&fs_info->buffer_lock); btrfs_mapping_tree_free(&fs_info->mapping_tree); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, @@ -186,7 +203,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) if (!root) return; /* Will be freed by btrfs_free_fs_roots */ - if (WARN_ON(test_bit(BTRFS_ROOT_REGISTERED, &root->state))) + if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; btrfs_global_root_delete(root); btrfs_put_root(root); diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 51a8b075c259..b7d181a08eab 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -47,7 +47,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, nodesize); + path->nodes[0] = eb; if (!eb) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 06c0a958d114..0bec10740ad3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/uuid.h> +#include <linux/timekeeping.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" @@ -23,7 +24,7 @@ #include "space-info.h" #include "zoned.h" -#define BTRFS_ROOT_TRANS_TAG XA_MARK_0 +#define BTRFS_ROOT_TRANS_TAG 0 /* * Transaction states and transitions @@ -437,15 +438,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, */ smp_wmb(); - spin_lock(&fs_info->fs_roots_lock); + spin_lock(&fs_info->fs_roots_radix_lock); if (root->last_trans == trans->transid && !force) { - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } - xa_set_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_lock); + radix_tree_tag_set(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); root->last_trans = trans->transid; /* this is pretty tricky. We don't want to @@ -487,9 +488,11 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, spin_unlock(&cur_trans->dropped_roots_lock); /* Make sure we don't try to update the root at commit time */ - xa_clear_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); } int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, @@ -1402,8 +1405,9 @@ void btrfs_add_dead_root(struct btrfs_root *root) static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root; - unsigned long index; + struct btrfs_root *gang[8]; + int i; + int ret; /* * At this point no one can be using this transaction to modify any tree @@ -1411,46 +1415,57 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) */ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); - spin_lock(&fs_info->fs_roots_lock); - xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) { - int ret; - - /* - * At this point we can neither have tasks logging inodes - * from a root nor trying to commit a log tree. - */ - ASSERT(atomic_read(&root->log_writers) == 0); - ASSERT(atomic_read(&root->log_commit[0]) == 0); - ASSERT(atomic_read(&root->log_commit[1]) == 0); - - xa_clear_mark(&fs_info->fs_roots, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_lock); - - btrfs_free_log(trans, root); - ret = btrfs_update_reloc_root(trans, root); - if (ret) - return ret; - - /* See comments in should_cow_block() */ - clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); - smp_mb__after_atomic(); + spin_lock(&fs_info->fs_roots_radix_lock); + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + struct btrfs_root *root = gang[i]; + int ret2; + + /* + * At this point we can neither have tasks logging inodes + * from a root nor trying to commit a log tree. + */ + ASSERT(atomic_read(&root->log_writers) == 0); + ASSERT(atomic_read(&root->log_commit[0]) == 0); + ASSERT(atomic_read(&root->log_commit[1]) == 0); + + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); + + btrfs_free_log(trans, root); + ret2 = btrfs_update_reloc_root(trans, root); + if (ret2) + return ret2; + + /* see comments in should_cow_block() */ + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_mb__after_atomic(); + + if (root->commit_root != root->node) { + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); + btrfs_set_root_node(&root->root_item, + root->node); + } - if (root->commit_root != root->node) { - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); - btrfs_set_root_node(&root->root_item, root->node); + ret2 = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, + &root->root_item); + if (ret2) + return ret2; + spin_lock(&fs_info->fs_roots_radix_lock); + btrfs_qgroup_free_meta_all_pertrans(root); } - - ret = btrfs_update_root(trans, fs_info->tree_root, - &root->root_key, &root->root_item); - if (ret) - return ret; - spin_lock(&fs_info->fs_roots_lock); - btrfs_qgroup_free_meta_all_pertrans(root); } - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } @@ -1817,8 +1832,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + dentry->d_name.len * 2); - parent_inode->i_mtime = parent_inode->i_ctime = - current_time(parent_inode); + parent_inode->i_mtime = current_time(parent_inode); + parent_inode->i_ctime = parent_inode->i_mtime; ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2084,12 +2099,23 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } +static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval) +{ + fs_info->commit_stats.commit_count++; + fs_info->commit_stats.last_commit_dur = interval; + fs_info->commit_stats.max_commit_dur = + max_t(u64, fs_info->commit_stats.max_commit_dur, interval); + fs_info->commit_stats.total_commit_dur += interval; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; int ret; + ktime_t start_time; + ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); @@ -2214,6 +2240,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) } } + /* + * Get the time spent on the work done by the commit thread and not + * the time spent waiting on a previous commit + */ + start_time = ktime_get_ns(); + extwriter_counter_dec(cur_trans, trans->type); ret = btrfs_start_delalloc_flush(fs_info); @@ -2455,6 +2487,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) trace_btrfs_transaction_commit(fs_info); + interval = ktime_get_ns() - start_time; + btrfs_scrub_continue(fs_info); if (current->journal_info == trans) @@ -2462,6 +2496,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) kmem_cache_free(btrfs_trans_handle_cachep, trans); + update_commit_stats(fs_info, interval); + return ret; unlock_reloc: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 370388fadf96..dcf75a8daa20 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -171,7 +171,7 @@ again: int index = (root->log_transid + 1) % 2; if (btrfs_need_log_full_commit(trans)) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -194,7 +194,7 @@ again: * writing. */ if (zoned && !created) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -2287,7 +2287,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_key location; /* - * Currenly we only log dir index keys. Even if we replay a log created + * Currently we only log dir index keys. Even if we replay a log created * by an older kernel that logged both dir index and dir item keys, all * we need to do is process the dir index keys, we (and our caller) can * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). @@ -3121,7 +3121,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* bail out if we need to do a full commit */ if (btrfs_need_log_full_commit(trans)) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; mutex_unlock(&root->log_mutex); goto out; } @@ -3222,7 +3222,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -3261,7 +3261,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out_wake_log_root; } @@ -5848,7 +5848,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, inode_only == LOG_INODE_ALL && inode->last_unlink_trans >= trans->transid) { btrfs_set_log_full_commit(trans); - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; } @@ -6562,12 +6562,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, bool log_dentries = false; if (btrfs_test_opt(fs_info, NOTREELOG)) { - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto end_no_trans; } if (btrfs_root_refs(&root->root_item) == 0) { - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto end_no_trans; } @@ -6665,7 +6665,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: if (ret < 0) { btrfs_set_log_full_commit(trans); - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; } if (ret) @@ -7029,8 +7029,15 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * anyone from syncing the log until we have updated both inodes * in the log. */ + ret = join_running_log_trans(root); + /* + * At least one of the inodes was logged before, so this should + * not fail, but if it does, it's not serious, just bail out and + * mark the log for a full commit. + */ + if (WARN_ON_ONCE(ret < 0)) + goto out; log_pinned = true; - btrfs_pin_log_trans(root); path = btrfs_alloc_path(); if (!path) { diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 1620f8170629..57ab5f3b8dc7 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -12,6 +12,9 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 +/* We can't use the tree log for whatever reason, force a transaction commit */ +#define BTRFS_LOG_FORCE_COMMIT (1) + struct btrfs_log_ctx { int log_ret; int log_transid; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c20049d1fec..272901514b0c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -182,6 +182,13 @@ const char *btrfs_bg_type_to_raid_name(u64 flags) return btrfs_raid_array[index].raid_name; } +int btrfs_nr_parity_stripes(u64 type) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); + + return btrfs_raid_array[index].nparity; +} + /* * Fill @buf with textual description of @bg_flags, no more than @size_buf * bytes including terminating null byte. @@ -238,7 +245,6 @@ out_overflow:; static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); -static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, @@ -1396,12 +1402,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) { switch (device->fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: - /* - * We don't want to overwrite the superblock on the drive nor - * any area used by the boot loader (grub for example), so we - * make sure to start at an offset of at least 1MB. - */ - return max_t(u64, start, SZ_1M); + return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); case BTRFS_CHUNK_ALLOC_ZONED: /* * We don't care about the starting region like regular @@ -5071,26 +5072,16 @@ static void init_alloc_chunk_ctl_policy_regular( struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { - u64 type = ctl->type; + struct btrfs_space_info *space_info; - if (type & BTRFS_BLOCK_GROUP_DATA) { - ctl->max_stripe_size = SZ_1G; - ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; - } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - /* For larger filesystems, use larger metadata chunks */ - if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) - ctl->max_stripe_size = SZ_1G; - else - ctl->max_stripe_size = SZ_256M; - ctl->max_chunk_size = ctl->max_stripe_size; - } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - ctl->max_stripe_size = SZ_32M; - ctl->max_chunk_size = 2 * ctl->max_stripe_size; - ctl->devs_max = min_t(int, ctl->devs_max, - BTRFS_MAX_DEVS_SYS_CHUNK); - } else { - BUG(); - } + space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); + ASSERT(space_info); + + ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); + ctl->max_stripe_size = ctl->max_chunk_size; + + if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) + ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); /* We don't want a chunk larger than 10% of writable space */ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), @@ -5720,7 +5711,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { struct extent_map *em; struct map_lookup *map; - int ret; + enum btrfs_raid_types index; + int ret = 1; em = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(em)) @@ -5733,10 +5725,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; map = em->map_lookup; - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) - ret = map->num_stripes; - else if (map->type & BTRFS_BLOCK_GROUP_RAID10) - ret = map->sub_stripes; + index = btrfs_bg_flags_to_raid_index(map->type); + + /* Non-RAID56, use their ncopies from btrfs_raid_array. */ + if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) + ret = btrfs_raid_array[index].ncopies; else if (map->type & BTRFS_BLOCK_GROUP_RAID5) ret = 2; else if (map->type & BTRFS_BLOCK_GROUP_RAID6) @@ -5748,8 +5741,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * stripe under reconstruction. */ ret = map->num_stripes; - else - ret = 1; free_extent_map(em); down_read(&fs_info->dev_replace.rwsem); @@ -5768,6 +5759,9 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct map_lookup *map; unsigned long len = fs_info->sectorsize; + if (!btrfs_fs_incompat(fs_info, RAID56)) + return len; + em = btrfs_get_chunk_map(fs_info, logical, len); if (!WARN_ON(IS_ERR(em))) { @@ -5785,6 +5779,9 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) struct map_lookup *map; int ret = 0; + if (!btrfs_fs_incompat(fs_info, RAID56)) + return 0; + em = btrfs_get_chunk_map(fs_info, logical, len); if(!WARN_ON(IS_ERR(em))) { @@ -5917,18 +5914,17 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc) kfree(bioc); } -/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ /* * Please note that, discard won't be sent to target device of device * replace. */ -static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, - u64 logical, u64 *length_ret, - struct btrfs_io_context **bioc_ret) +struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes) { struct extent_map *em; struct map_lookup *map; - struct btrfs_io_context *bioc; + struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; u64 stripe_nr; @@ -5937,29 +5933,26 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, u64 stripe_cnt; u64 stripe_len; u64 stripe_offset; - u64 num_stripes; u32 stripe_index; u32 factor = 0; u32 sub_stripes = 0; u64 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; - int ret = 0; + int ret; int i; - /* Discard always returns a bioc. */ - ASSERT(bioc_ret); - em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) - return PTR_ERR(em); + return ERR_CAST(em); map = em->map_lookup; + /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; - goto out; - } + goto out_free_map; +} offset = logical - em->start; length = min_t(u64, em->start + em->len - logical, length); @@ -5985,7 +5978,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ - num_stripes = 1; + *num_stripes = 1; stripe_index = 0; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { @@ -5995,7 +5988,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, sub_stripes = map->sub_stripes; factor = map->num_stripes / sub_stripes; - num_stripes = min_t(u64, map->num_stripes, + *num_stripes = min_t(u64, map->num_stripes, sub_stripes * stripe_cnt); stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= sub_stripes; @@ -6005,31 +5998,30 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, last_stripe *= sub_stripes; } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { - num_stripes = map->num_stripes; + *num_stripes = map->num_stripes; } else { stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); } - bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0); - if (!bioc) { + stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); + if (!stripes) { ret = -ENOMEM; - goto out; + goto out_free_map; } - for (i = 0; i < num_stripes; i++) { - bioc->stripes[i].physical = + for (i = 0; i < *num_stripes; i++) { + stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - bioc->stripes[i].dev = map->stripes[stripe_index].dev; + stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - bioc->stripes[i].length = stripes_per_dev * - map->stripe_len; + stripes[i].length = stripes_per_dev * map->stripe_len; if (i / sub_stripes < remaining_stripes) - bioc->stripes[i].length += map->stripe_len; + stripes[i].length += map->stripe_len; /* * Special for the first stripe and @@ -6040,17 +6032,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, * off end_off */ if (i < sub_stripes) - bioc->stripes[i].length -= stripe_offset; + stripes[i].length -= stripe_offset; if (stripe_index >= last_stripe && stripe_index <= (last_stripe + sub_stripes - 1)) - bioc->stripes[i].length -= stripe_end_offset; + stripes[i].length -= stripe_end_offset; if (i == sub_stripes - 1) stripe_offset = 0; } else { - bioc->stripes[i].length = length; + stripes[i].length = length; } stripe_index++; @@ -6060,12 +6052,11 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, } } - *bioc_ret = bioc; - bioc->map_type = map->type; - bioc->num_stripes = num_stripes; -out: free_extent_map(em); - return ret; + return stripes; +out_free_map: + free_extent_map(em); + return ERR_PTR(ret); } /* @@ -6208,7 +6199,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->stripes + i; new->physical = old->physical; - new->length = old->length; new->dev = dev_replace->tgtdev; bioc->tgtdev_map[i] = index_where_to_add; index_where_to_add++; @@ -6249,8 +6239,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->stripes + num_stripes; tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bioc->stripes[index_srcdev].length; tgtdev_stripe->dev = dev_replace->tgtdev; bioc->tgtdev_map[index_srcdev] = num_stripes; @@ -6472,6 +6460,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ASSERT(map->stripe_len == BTRFS_STRIPE_LEN); if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, @@ -6479,9 +6468,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; - max_errors = nr_parity_stripes(map); + max_errors = btrfs_chunk_max_errors(map); - *length = map->stripe_len; + /* Return the length to the full stripe end */ + *length = min(logical + *length, + raid56_full_stripe_start + em->start + + data_stripes * stripe_len) - logical; stripe_index = 0; stripe_offset = 0; } else { @@ -6604,10 +6596,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, int mirror_num) { - if (op == BTRFS_MAP_DISCARD) - return __btrfs_map_block_for_discard(fs_info, logical, - length, bioc_ret); - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, mirror_num, 0); } @@ -6620,77 +6608,106 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); } -static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio) +static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc) +{ + if (bioc->orig_bio->bi_opf & REQ_META) + return bioc->fs_info->endio_meta_workers; + return bioc->fs_info->endio_workers; +} + +static void btrfs_end_bio_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = + container_of(work, struct btrfs_bio, end_io_work); + + bio_endio(&bbio->bio); +} + +static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async) { - bio->bi_private = bioc->private; - bio->bi_end_io = bioc->end_io; - bio_endio(bio); + struct bio *orig_bio = bioc->orig_bio; + struct btrfs_bio *bbio = btrfs_bio(orig_bio); + + bbio->mirror_num = bioc->mirror_num; + orig_bio->bi_private = bioc->private; + orig_bio->bi_end_io = bioc->end_io; + + /* + * Only send an error to the higher layers if it is beyond the tolerance + * threshold. + */ + if (atomic_read(&bioc->error) > bioc->max_errors) + orig_bio->bi_status = BLK_STS_IOERR; + else + orig_bio->bi_status = BLK_STS_OK; + + if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) { + INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); + queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work); + } else { + bio_endio(orig_bio); + } btrfs_put_bioc(bioc); } static void btrfs_end_bio(struct bio *bio) { - struct btrfs_io_context *bioc = bio->bi_private; - int is_orig_bio = 0; + struct btrfs_io_stripe *stripe = bio->bi_private; + struct btrfs_io_context *bioc = stripe->bioc; if (bio->bi_status) { atomic_inc(&bioc->error); if (bio->bi_status == BLK_STS_IOERR || bio->bi_status == BLK_STS_TARGET) { - struct btrfs_device *dev = btrfs_bio(bio)->device; - - ASSERT(dev->bdev); if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_dev_stat_inc_and_print(dev, + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_WRITE_ERRS); else if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(dev, + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(dev, + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_FLUSH_ERRS); } } - if (bio == bioc->orig_bio) - is_orig_bio = 1; + if (bio != bioc->orig_bio) + bio_put(bio); btrfs_bio_counter_dec(bioc->fs_info); - - if (atomic_dec_and_test(&bioc->stripes_pending)) { - if (!is_orig_bio) { - bio_put(bio); - bio = bioc->orig_bio; - } - - btrfs_bio(bio)->mirror_num = bioc->mirror_num; - /* only send an error to the higher layers if it is - * beyond the tolerance of the btrfs bio - */ - if (atomic_read(&bioc->error) > bioc->max_errors) { - bio->bi_status = BLK_STS_IOERR; - } else { - /* - * this bio is actually up to date, we didn't - * go over the max number of errors - */ - bio->bi_status = BLK_STS_OK; - } - - btrfs_end_bioc(bioc, bio); - } else if (!is_orig_bio) { - bio_put(bio); - } + if (atomic_dec_and_test(&bioc->stripes_pending)) + btrfs_end_bioc(bioc, true); } -static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, - u64 physical, struct btrfs_device *dev) +static void submit_stripe_bio(struct btrfs_io_context *bioc, + struct bio *orig_bio, int dev_nr, bool clone) { struct btrfs_fs_info *fs_info = bioc->fs_info; + struct btrfs_device *dev = bioc->stripes[dev_nr].dev; + u64 physical = bioc->stripes[dev_nr].physical; + struct bio *bio; + + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || + (btrfs_op(orig_bio) == BTRFS_MAP_WRITE && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { + atomic_inc(&bioc->error); + if (atomic_dec_and_test(&bioc->stripes_pending)) + btrfs_end_bioc(bioc, false); + return; + } + + if (clone) { + bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS, &fs_bio_set); + } else { + bio = orig_bio; + bio_set_dev(bio, dev->bdev); + btrfs_bio(bio)->device = dev; + } - bio->bi_private = bioc; - btrfs_bio(bio)->device = dev; + bioc->stripes[dev_nr].bioc = bioc; + bio->bi_private = &bioc->stripes[dev_nr]; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; /* @@ -6708,8 +6725,8 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, } } btrfs_debug_in_rcu(fs_info, - "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, + "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", + __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, bio->bi_iter.bi_size); @@ -6719,66 +6736,39 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, submit_bio(bio); } -static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) -{ - atomic_inc(&bioc->error); - if (atomic_dec_and_test(&bioc->stripes_pending)) { - /* Should be the original bio. */ - WARN_ON(bio != bioc->orig_bio); - - btrfs_bio(bio)->mirror_num = bioc->mirror_num; - bio->bi_iter.bi_sector = logical >> 9; - if (atomic_read(&bioc->error) > bioc->max_errors) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_status = BLK_STS_OK; - btrfs_end_bioc(bioc, bio); - } -} - -blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num) +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) { - struct btrfs_device *dev; - struct bio *first_bio = bio; u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = 0; - u64 map_length; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; int ret; int dev_nr; int total_devs; struct btrfs_io_context *bioc = NULL; - length = bio->bi_iter.bi_size; - map_length = length; - btrfs_bio_counter_inc_blocked(fs_info); ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, &bioc, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); - return errno_to_blk_status(ret); + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + return; } total_devs = bioc->num_stripes; - bioc->orig_bio = first_bio; - bioc->private = first_bio->bi_private; - bioc->end_io = first_bio->bi_end_io; - atomic_set(&bioc->stripes_pending, bioc->num_stripes); + bioc->orig_bio = bio; + bioc->private = bio->bi_private; + bioc->end_io = bio->bi_end_io; + atomic_set(&bioc->stripes_pending, total_devs); if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { - /* In this case, map_length has been set to the length of - a single stripe; not the whole write */ - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - ret = raid56_parity_write(bio, bioc, map_length); - } else { - ret = raid56_parity_recover(bio, bioc, map_length, - mirror_num, 1); - } - - btrfs_bio_counter_dec(fs_info); - return errno_to_blk_status(ret); + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + raid56_parity_write(bio, bioc); + else + raid56_parity_recover(bio, bioc, mirror_num, true); + return; } if (map_length < length) { @@ -6789,26 +6779,11 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, } for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { - dev = bioc->stripes[dev_nr].dev; - if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, - &dev->dev_state) || - (btrfs_op(first_bio) == BTRFS_MAP_WRITE && - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - bioc_error(bioc, first_bio, logical); - continue; - } - - if (dev_nr < total_devs - 1) { - bio = btrfs_bio_clone(dev->bdev, first_bio); - } else { - bio = first_bio; - bio_set_dev(bio, dev->bdev); - } + const bool should_clone = (dev_nr < total_devs - 1); - submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); + submit_stripe_bio(bioc, bio, dev_nr, should_clone); } btrfs_bio_counter_dec(fs_info); - return BLK_STS_OK; } static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, @@ -6966,11 +6941,12 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, devid, uuid); } -static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) +u64 btrfs_calc_stripe_length(const struct extent_map *em) { - const int data_stripes = calc_data_stripes(type, num_stripes); + const struct map_lookup *map = em->map_lookup; + const int data_stripes = calc_data_stripes(map->type, map->num_stripes); - return div_u64(chunk_len, data_stripes); + return div_u64(em->len, data_stripes); } #if BITS_PER_LONG == 32 @@ -7109,8 +7085,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->type = type; map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); map->verified_stripes = 0; - em->orig_block_len = calc_stripe_length(type, em->len, - map->num_stripes); + em->orig_block_len = btrfs_calc_stripe_length(em); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -7236,7 +7211,8 @@ static int read_one_dev(struct extent_buffer *leaf, u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; - devid = args.devid = btrfs_device_id(leaf, dev_item); + devid = btrfs_device_id(leaf, dev_item); + args.devid = devid; read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), @@ -7865,11 +7841,7 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) { btrfs_dev_stat_inc(dev, index); - btrfs_dev_stat_print_on_error(dev); -} -static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) -{ if (!dev->dev_stats_valid) return; btrfs_err_rl_in_rcu(dev->fs_info, @@ -8011,7 +7983,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } map = em->map_lookup; - stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes); + stripe_len = btrfs_calc_stripe_length(em); if (physical_len != stripe_len) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", @@ -8021,6 +7993,16 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, goto out; } + /* + * Very old mkfs.btrfs (before v4.1) will not respect the reserved + * space. Although kernel can handle it without problem, better to warn + * the users. + */ + if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED) + btrfs_warn(fs_info, + "devid %llu physical %llu len %llu inside the reserved space", + devid, physical_offset, physical_len); + for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->devid == devid && map->stripes[i].physical == physical_offset) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6721002000ee..5639961b3626 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -355,6 +355,13 @@ struct btrfs_fs_devices { / sizeof(struct btrfs_stripe) + 1) /* + * Maximum number of sectors for a single bio to limit the size of the + * checksum array. This matches the number of bio_vecs per bio and thus the + * I/O size for buffered I/O. + */ +#define BTRFS_MAX_BIO_SECTORS (256) + +/* * Additional info to pass along bio. * * Mostly for btrfs specific features like csum and mirror_num. @@ -371,6 +378,9 @@ struct btrfs_bio { u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; struct bvec_iter iter; + /* For read end I/O handling */ + struct work_struct end_io_work; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. @@ -391,10 +401,36 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) } } +/* + * Iterate through a btrfs_bio (@bbio) on a per-sector basis. + * + * bvl - struct bio_vec + * bbio - struct btrfs_bio + * iters - struct bvec_iter + * bio_offset - unsigned int + */ +#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ + for ((iter) = (bbio)->iter, (bio_offset) = 0; \ + (iter).bi_size && \ + (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ + (bio_offset) += fs_info->sectorsize, \ + bio_advance_iter_single(&(bbio)->bio, &(iter), \ + (fs_info)->sectorsize)) + struct btrfs_io_stripe { struct btrfs_device *dev; + union { + /* Block mapping */ + u64 physical; + /* For the endio handler */ + struct btrfs_io_context *bioc; + }; +}; + +struct btrfs_discard_stripe { + struct btrfs_device *dev; u64 physical; - u64 length; /* only used for discard mappings */ + u64 length; }; /* @@ -533,6 +569,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret); +struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom); @@ -541,8 +580,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); -blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num); +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, @@ -601,6 +639,8 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); +u64 btrfs_calc_stripe_length(const struct extent_map *em); +int btrfs_nr_parity_stripes(u64 type); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 767a0c6c9694..b4f44662cda7 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -97,7 +97,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; - char *data_in; + char *data_in = NULL; char *cpage_out; int nr_pages = 0; struct page *in_page = NULL; @@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[0] = out_page; nr_pages = 1; @@ -148,26 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, int i; for (i = 0; i < in_buf_pages; i++) { - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = kmap_local_page(in_page); memcpy(workspace->buf + i * PAGE_SIZE, data_in, PAGE_SIZE); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; } else { - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = kmap_local_page(in_page); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -196,9 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -207,7 +205,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -234,9 +232,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } else if (workspace->strm.avail_out == 0) { /* get another page for the stream end */ - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -245,7 +241,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -264,13 +260,11 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = workspace->strm.total_in; out: *out_pages = nr_pages; - if (out_page) - kunmap(out_page); - - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } + return ret; } @@ -287,7 +281,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long buf_start; struct page **pages_in = cb->compressed_pages; - data_in = kmap(pages_in[page_in_index]); + data_in = kmap_local_page(pages_in[page_in_index]); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -309,7 +303,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { pr_warn("BTRFS: inflateInit failed\n"); - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -336,13 +330,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); page_in_index++; if (page_in_index >= total_pages_in) { data_in = NULL; break; } - data_in = kmap(pages_in[page_in_index]); + data_in = kmap_local_page(pages_in[page_in_index]); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); @@ -355,7 +349,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) done: zlib_inflateEnd(&workspace->strm); if (data_in) - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); if (!ret) zero_fill_bio(cb->orig_bio); return ret; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 11237a913bee..b150b07ba1a7 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -94,9 +94,9 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, * Possible states of log buffer zones * * Empty[0] In use[0] Full[0] - * Empty[1] * x 0 - * In use[1] 0 x 0 - * Full[1] 1 1 C + * Empty[1] * 0 1 + * In use[1] x x 1 + * Full[1] 0 0 C * * Log position: * *: Special case, no superblock is written @@ -415,6 +415,16 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); + /* + * We limit max_zone_append_size also by max_segments * + * PAGE_SIZE. Technically, we can have multiple pages per segment. But, + * since btrfs adds the pages one by one to a bio, and btrfs cannot + * increase the metadata reservation even if it increases the number of + * extents, it is safe to stick with the limit. + */ + zone_info->max_zone_append_size = + min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, + (u64)bdev_max_segments(bdev) << PAGE_SHIFT); if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; @@ -640,6 +650,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) u64 zoned_devices = 0; u64 nr_devices = 0; u64 zone_size = 0; + u64 max_zone_append_size = 0; const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); int ret = 0; @@ -674,6 +685,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) ret = -EINVAL; goto out; } + if (!max_zone_append_size || + (zone_info->max_zone_append_size && + zone_info->max_zone_append_size < max_zone_append_size)) + max_zone_append_size = + zone_info->max_zone_append_size; } nr_devices++; } @@ -723,7 +739,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) } fs_info->zone_size = zone_size; + fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, + fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; + if (fs_info->max_zone_append_size < fs_info->max_extent_size) + fs_info->max_extent_size = fs_info->max_zone_append_size; /* * Check mount options here, because we might change fs_info->zoned @@ -1735,12 +1755,14 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &mapped_length, &bioc); if (ret || !bioc || mapped_length < PAGE_SIZE) { - btrfs_put_bioc(bioc); - return -EIO; + ret = -EIO; + goto out_put_bioc; } - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) - return -EINVAL; + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EINVAL; + goto out_put_bioc; + } nofs_flag = memalloc_nofs_save(); nmirrors = (int)bioc->num_stripes; @@ -1759,7 +1781,8 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, break; } memalloc_nofs_restore(nofs_flag); - +out_put_bioc: + btrfs_put_bioc(bioc); return ret; } @@ -1826,6 +1849,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *space_info = block_group->space_info; struct map_lookup *map; struct btrfs_device *device; u64 physical; @@ -1837,6 +1861,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; + spin_lock(&space_info->lock); spin_lock(&block_group->lock); if (block_group->zone_is_active) { ret = true; @@ -1865,7 +1890,10 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) /* Successfully activated all the zones */ block_group->zone_is_active = 1; + space_info->active_total_bytes += block_group->length; spin_unlock(&block_group->lock); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); /* For the active block group list */ btrfs_get_block_group(block_group); @@ -1878,6 +1906,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); return ret; } @@ -1885,7 +1914,6 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ { struct btrfs_fs_info *fs_info = block_group->fs_info; struct map_lookup *map; - bool need_zone_finish; int ret = 0; int i; @@ -1942,12 +1970,6 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ } } - /* - * The block group is not fully allocated, so not fully written yet. We - * need to send ZONE_FINISH command to free up an active zone. - */ - need_zone_finish = !btrfs_zoned_bg_is_full(block_group); - block_group->zone_is_active = 0; block_group->alloc_offset = block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; @@ -1963,15 +1985,13 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ if (device->zone_info->max_active_zones == 0) continue; - if (need_zone_finish) { - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, - physical >> SECTOR_SHIFT, - device->zone_info->zone_size >> SECTOR_SHIFT, - GFP_NOFS); + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + device->zone_info->zone_size >> SECTOR_SHIFT, + GFP_NOFS); - if (ret) - return ret; - } + if (ret) + return ret; btrfs_dev_clear_active_zone(device, physical); } @@ -1987,6 +2007,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ /* For active_bg_list */ btrfs_put_block_group(block_group); + clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); + wake_up_all(&fs_info->zone_finish_wait); + return 0; } @@ -2023,6 +2046,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) } mutex_unlock(&fs_info->chunk_mutex); + if (!ret) + set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); + return ret; } @@ -2139,3 +2165,123 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) factor = div64_u64(used * 100, total); return factor >= fs_info->bg_reclaim_threshold; } + +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length) +{ + struct btrfs_block_group *block_group; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + /* It should be called on a previous data relocation block group. */ + ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); + + spin_lock(&block_group->lock); + if (!block_group->zoned_data_reloc_ongoing) + goto out; + + /* All relocation extents are written. */ + if (block_group->start + block_group->alloc_offset == logical + length) { + /* Now, release this block group for further allocations. */ + block_group->zoned_data_reloc_ongoing = 0; + } + +out: + spin_unlock(&block_group->lock); + btrfs_put_block_group(block_group); +} + +int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group; + struct btrfs_block_group *min_bg = NULL; + u64 min_avail = U64_MAX; + int ret; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, + active_bg_list) { + u64 avail; + + spin_lock(&block_group->lock); + if (block_group->reserved || + (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { + spin_unlock(&block_group->lock); + continue; + } + + avail = block_group->zone_capacity - block_group->alloc_offset; + if (min_avail > avail) { + if (min_bg) + btrfs_put_block_group(min_bg); + min_bg = block_group; + min_avail = avail; + btrfs_get_block_group(min_bg); + } + spin_unlock(&block_group->lock); + } + spin_unlock(&fs_info->zone_active_bgs_lock); + + if (!min_bg) + return 0; + + ret = btrfs_zone_finish(min_bg); + btrfs_put_block_group(min_bg); + + return ret < 0 ? ret : 1; +} + +int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool do_finish) +{ + struct btrfs_block_group *bg; + int index; + + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + + /* No more block groups to activate */ + if (space_info->active_total_bytes == space_info->total_bytes) + return 0; + + for (;;) { + int ret; + bool need_finish = false; + + down_read(&space_info->groups_sem); + for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) { + list_for_each_entry(bg, &space_info->block_groups[index], + list) { + if (!spin_trylock(&bg->lock)) + continue; + if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) { + spin_unlock(&bg->lock); + continue; + } + spin_unlock(&bg->lock); + + if (btrfs_zone_activate(bg)) { + up_read(&space_info->groups_sem); + return 1; + } + + need_finish = true; + } + } + up_read(&space_info->groups_sem); + + if (!do_finish || !need_finish) + break; + + ret = btrfs_zone_finish_one_bg(fs_info); + if (ret == 0) + break; + if (ret < 0) + return ret; + } + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index bb1a189e11f9..e17462db3a84 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -19,6 +19,7 @@ struct btrfs_zoned_device_info { */ u64 zone_size; u8 zone_size_shift; + u64 max_zone_append_size; u32 nr_zones; unsigned int max_active_zones; atomic_t active_zones_left; @@ -77,6 +78,11 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); +int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); +int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, bool do_finish); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -243,6 +249,23 @@ static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) { return false; } + +static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } + +static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) +{ + return 1; +} + +static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool do_finish) +{ + /* Consider all the block groups are active */ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 0fe31a6f6e68..35a0224d4eb7 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -403,7 +403,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = kmap_local_page(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); @@ -415,7 +415,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -450,9 +450,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -462,7 +460,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -477,13 +475,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; - kunmap(in_page); + kunmap_local(workspace->in_buf.src); put_page(in_page); - start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = kmap_local_page(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -510,9 +507,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -522,7 +517,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -537,13 +532,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = tot_out; out: *out_pages = nr_pages; - /* Cleanup */ - if (in_page) { - kunmap(in_page); + if (workspace->in_buf.src) { + kunmap_local(workspace->in_buf.src); put_page(in_page); } - if (out_page) - kunmap(out_page); return ret; } @@ -567,7 +559,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -603,14 +595,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) break; if (workspace->in_buf.pos == workspace->in_buf.size) { - kunmap(pages_in[page_in_index++]); + kunmap_local(workspace->in_buf.src); + page_in_index++; if (page_in_index >= total_pages_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } @@ -619,7 +612,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) zero_fill_bio(cb->orig_bio); done: if (workspace->in_buf.src) - kunmap(pages_in[page_in_index]); + kunmap_local(workspace->in_buf.src); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 898c7f301b1b..55e762a58eb6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -52,7 +52,7 @@ #include "internal.h" static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); -static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, +static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -282,10 +282,10 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) spin_unlock_irqrestore(&first->b_uptodate_lock, flags); /* - * If none of the buffers had errors and they are all - * uptodate then we can set the page uptodate. + * If all of the buffers are uptodate then we can set the page + * uptodate. */ - if (page_uptodate && !PageError(page)) + if (page_uptodate) SetPageUptodate(page); unlock_page(page); return; @@ -562,7 +562,7 @@ void write_boundary_block(struct block_device *bdev, struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) - ll_rw_block(REQ_OP_WRITE, 0, 1, &bh); + ll_rw_block(REQ_OP_WRITE, 1, &bh); put_bh(bh); } } @@ -1174,7 +1174,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh) } else { get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1342,7 +1342,7 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __getblk(bdev, block, size); if (likely(bh)) { - ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh); + ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh); brelse(bh); } } @@ -1353,7 +1353,7 @@ void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size, { struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); if (likely(bh)) { - ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh); + ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh); brelse(bh); } } @@ -1604,7 +1604,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) { struct inode *bd_inode = bdev->bd_inode; struct address_space *bd_mapping = bd_inode->i_mapping; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); pgoff_t end; int i, count; @@ -1612,24 +1612,24 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) struct buffer_head *head; end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); - pagevec_init(&pvec); - while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { - count = pagevec_count(&pvec); + folio_batch_init(&fbatch); + while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) { + count = folio_batch_count(&fbatch); for (i = 0; i < count; i++) { - struct page *page = pvec.pages[i]; + struct folio *folio = fbatch.folios[i]; - if (!page_has_buffers(page)) + if (!folio_buffers(folio)) continue; /* - * We use page lock instead of bd_mapping->private_lock + * We use folio lock instead of bd_mapping->private_lock * to pin buffers here since we can afford to sleep and * it scales better than a global spinlock lock. */ - lock_page(page); - /* Recheck when the page is locked which pins bhs */ - if (!page_has_buffers(page)) + folio_lock(folio); + /* Recheck when the folio is locked which pins bhs */ + head = folio_buffers(folio); + if (!head) goto unlock_page; - head = page_buffers(page); bh = head; do { if (!buffer_mapped(bh) || (bh->b_blocknr < block)) @@ -1643,9 +1643,9 @@ next: bh = bh->b_this_page; } while (bh != head); unlock_page: - unlock_page(page); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); /* End of range already reached? */ if (index > end || !index) @@ -1716,7 +1716,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_flags = wbc_to_write_flags(wbc); + blk_opf_t write_flags = wbc_to_write_flags(wbc); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1804,7 +1804,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); nr_underway++; } bh = next; @@ -1858,7 +1858,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); nr_underway++; } bh = next; @@ -2033,7 +2033,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); *wait_bh++=bh; } } @@ -2259,6 +2259,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) unsigned int blocksize, bbits; int nr, i; int fully_mapped = 1; + bool page_error = false; VM_BUG_ON_FOLIO(folio_test_large(folio), folio); @@ -2283,8 +2284,10 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (iblock < lblock) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0); - if (err) + if (err) { folio_set_error(folio); + page_error = true; + } } if (!buffer_mapped(bh)) { folio_zero_range(folio, i * blocksize, @@ -2311,7 +2314,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) * All buffers are uptodate - we can set the folio uptodate * as well. But not if get_block() returned an error. */ - if (!folio_test_error(folio)) + if (!page_error) folio_mark_uptodate(folio); folio_unlock(folio); return 0; @@ -2334,7 +2337,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (buffer_uptodate(bh)) end_buffer_async_read(bh, 1); else - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); } return 0; } @@ -2534,330 +2537,6 @@ out_unlock: } EXPORT_SYMBOL(block_page_mkwrite); -/* - * nobh_write_begin()'s prereads are special: the buffer_heads are freed - * immediately, while under the page lock. So it needs a special end_io - * handler which does not touch the bh after unlocking it. - */ -static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) -{ - __end_buffer_read_notouch(bh, uptodate); -} - -/* - * Attach the singly-linked list of buffers created by nobh_write_begin, to - * the page (converting it to circular linked list and taking care of page - * dirty races). - */ -static void attach_nobh_buffers(struct page *page, struct buffer_head *head) -{ - struct buffer_head *bh; - - BUG_ON(!PageLocked(page)); - - spin_lock(&page->mapping->private_lock); - bh = head; - do { - if (PageDirty(page)) - set_buffer_dirty(bh); - if (!bh->b_this_page) - bh->b_this_page = head; - bh = bh->b_this_page; - } while (bh != head); - attach_page_private(page, head); - spin_unlock(&page->mapping->private_lock); -} - -/* - * On entry, the page is fully not uptodate. - * On exit the page is fully uptodate in the areas outside (from,to) - * The filesystem needs to handle block truncation upon failure. - */ -int nobh_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata, - get_block_t *get_block) -{ - struct inode *inode = mapping->host; - const unsigned blkbits = inode->i_blkbits; - const unsigned blocksize = 1 << blkbits; - struct buffer_head *head, *bh; - struct page *page; - pgoff_t index; - unsigned from, to; - unsigned block_in_page; - unsigned block_start, block_end; - sector_t block_in_file; - int nr_reads = 0; - int ret = 0; - int is_mapped_to_disk = 1; - - index = pos >> PAGE_SHIFT; - from = pos & (PAGE_SIZE - 1); - to = from + len; - - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; - *fsdata = NULL; - - if (page_has_buffers(page)) { - ret = __block_write_begin(page, pos, len, get_block); - if (unlikely(ret)) - goto out_release; - return ret; - } - - if (PageMappedToDisk(page)) - return 0; - - /* - * Allocate buffers so that we can keep track of state, and potentially - * attach them to the page if an error occurs. In the common case of - * no error, they will just be freed again without ever being attached - * to the page (which is all OK, because we're under the page lock). - * - * Be careful: the buffer linked list is a NULL terminated one, rather - * than the circular one we're used to. - */ - head = alloc_page_buffers(page, blocksize, false); - if (!head) { - ret = -ENOMEM; - goto out_release; - } - - block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); - - /* - * We loop across all blocks in the page, whether or not they are - * part of the affected region. This is so we can discover if the - * page is fully mapped-to-disk. - */ - for (block_start = 0, block_in_page = 0, bh = head; - block_start < PAGE_SIZE; - block_in_page++, block_start += blocksize, bh = bh->b_this_page) { - int create; - - block_end = block_start + blocksize; - bh->b_state = 0; - create = 1; - if (block_start >= to) - create = 0; - ret = get_block(inode, block_in_file + block_in_page, - bh, create); - if (ret) - goto failed; - if (!buffer_mapped(bh)) - is_mapped_to_disk = 0; - if (buffer_new(bh)) - clean_bdev_bh_alias(bh); - if (PageUptodate(page)) { - set_buffer_uptodate(bh); - continue; - } - if (buffer_new(bh) || !buffer_mapped(bh)) { - zero_user_segments(page, block_start, from, - to, block_end); - continue; - } - if (buffer_uptodate(bh)) - continue; /* reiserfs does this */ - if (block_start < from || block_end > to) { - lock_buffer(bh); - bh->b_end_io = end_buffer_read_nobh; - submit_bh(REQ_OP_READ, 0, bh); - nr_reads++; - } - } - - if (nr_reads) { - /* - * The page is locked, so these buffers are protected from - * any VM or truncate activity. Hence we don't need to care - * for the buffer_head refcounts. - */ - for (bh = head; bh; bh = bh->b_this_page) { - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - ret = -EIO; - } - if (ret) - goto failed; - } - - if (is_mapped_to_disk) - SetPageMappedToDisk(page); - - *fsdata = head; /* to be released by nobh_write_end */ - - return 0; - -failed: - BUG_ON(!ret); - /* - * Error recovery is a bit difficult. We need to zero out blocks that - * were newly allocated, and dirty them to ensure they get written out. - * Buffers need to be attached to the page at this point, otherwise - * the handling of potential IO errors during writeout would be hard - * (could try doing synchronous writeout, but what if that fails too?) - */ - attach_nobh_buffers(page, head); - page_zero_new_buffers(page, from, to); - -out_release: - unlock_page(page); - put_page(page); - *pagep = NULL; - - return ret; -} -EXPORT_SYMBOL(nobh_write_begin); - -int nobh_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - struct buffer_head *head = fsdata; - struct buffer_head *bh; - BUG_ON(fsdata != NULL && page_has_buffers(page)); - - if (unlikely(copied < len) && head) - attach_nobh_buffers(page, head); - if (page_has_buffers(page)) - return generic_write_end(file, mapping, pos, len, - copied, page, fsdata); - - SetPageUptodate(page); - set_page_dirty(page); - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - mark_inode_dirty(inode); - } - - unlock_page(page); - put_page(page); - - while (head) { - bh = head; - head = head->b_this_page; - free_buffer_head(bh); - } - - return copied; -} -EXPORT_SYMBOL(nobh_write_end); - -/* - * nobh_writepage() - based on block_full_write_page() except - * that it tries to operate without attaching bufferheads to - * the page. - */ -int nobh_writepage(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) -{ - struct inode * const inode = page->mapping->host; - loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; - int ret; - - /* Is the page fully inside i_size? */ - if (page->index < end_index) - goto out; - - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE-1); - if (page->index >= end_index+1 || !offset) { - unlock_page(page); - return 0; /* don't care */ - } - - /* - * The page straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." - */ - zero_user_segment(page, offset, PAGE_SIZE); -out: - ret = mpage_writepage(page, get_block, wbc); - if (ret == -EAGAIN) - ret = __block_write_full_page(inode, page, get_block, wbc, - end_buffer_async_write); - return ret; -} -EXPORT_SYMBOL(nobh_writepage); - -int nobh_truncate_page(struct address_space *mapping, - loff_t from, get_block_t *get_block) -{ - pgoff_t index = from >> PAGE_SHIFT; - struct inode *inode = mapping->host; - unsigned blocksize = i_blocksize(inode); - struct folio *folio; - struct buffer_head map_bh; - size_t offset; - sector_t iblock; - int err; - - /* Block boundary? Nothing to do */ - if (!(from & (blocksize - 1))) - return 0; - - folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_CREAT, - mapping_gfp_mask(mapping)); - err = -ENOMEM; - if (!folio) - goto out; - - if (folio_buffers(folio)) - goto has_buffers; - - iblock = from >> inode->i_blkbits; - map_bh.b_size = blocksize; - map_bh.b_state = 0; - err = get_block(inode, iblock, &map_bh, 0); - if (err) - goto unlock; - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(&map_bh)) - goto unlock; - - /* Ok, it's mapped. Make sure it's up-to-date */ - if (!folio_test_uptodate(folio)) { - err = mapping->a_ops->read_folio(NULL, folio); - if (err) { - folio_put(folio); - goto out; - } - folio_lock(folio); - if (!folio_test_uptodate(folio)) { - err = -EIO; - goto unlock; - } - if (folio_buffers(folio)) - goto has_buffers; - } - offset = offset_in_folio(folio, from); - folio_zero_segment(folio, offset, round_up(offset, blocksize)); - folio_mark_dirty(folio); - err = 0; - -unlock: - folio_unlock(folio); - folio_put(folio); -out: - return err; - -has_buffers: - folio_unlock(folio); - folio_put(folio); - return block_truncate_page(mapping, from, get_block); -} -EXPORT_SYMBOL(nobh_truncate_page); - int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) { @@ -2915,7 +2594,7 @@ int block_truncate_page(struct address_space *mapping, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { err = -EIO; - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); wait_on_buffer(bh); /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) @@ -2994,9 +2673,10 @@ static void end_bio_bh_io_sync(struct bio *bio) bio_put(bio); } -static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, +static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, struct writeback_control *wbc) { + const enum req_op op = opf & REQ_OP_MASK; struct bio *bio; BUG_ON(!buffer_locked(bh)); @@ -3012,11 +2692,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, clear_buffer_write_io_error(bh); if (buffer_meta(bh)) - op_flags |= REQ_META; + opf |= REQ_META; if (buffer_prio(bh)) - op_flags |= REQ_PRIO; + opf |= REQ_PRIO; - bio = bio_alloc(bh->b_bdev, 1, op | op_flags, GFP_NOIO); + bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); @@ -3040,22 +2720,21 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, return 0; } -int submit_bh(int op, int op_flags, struct buffer_head *bh) +int submit_bh(blk_opf_t opf, struct buffer_head *bh) { - return submit_bh_wbc(op, op_flags, bh, NULL); + return submit_bh_wbc(opf, bh, NULL); } EXPORT_SYMBOL(submit_bh); /** * ll_rw_block: low-level access to block devices (DEPRECATED) - * @op: whether to %READ or %WRITE - * @op_flags: req_flag_bits + * @opf: block layer request operation and flags. * @nr: number of &struct buffer_heads in the array * @bhs: array of pointers to &struct buffer_head * * ll_rw_block() takes an array of pointers to &struct buffer_heads, and * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE. - * @op_flags contains flags modifying the detailed I/O behavior, most notably + * @opf contains flags modifying the detailed I/O behavior, most notably * %REQ_RAHEAD. * * This function drops any buffer that it cannot get a lock on (with the @@ -3072,8 +2751,9 @@ EXPORT_SYMBOL(submit_bh); * All of the buffers must be for the same device, and must also be a * multiple of the current approved size for the device. */ -void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[]) +void ll_rw_block(const blk_opf_t opf, int nr, struct buffer_head *bhs[]) { + const enum req_op op = opf & REQ_OP_MASK; int i; for (i = 0; i < nr; i++) { @@ -3081,18 +2761,18 @@ void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[]) if (!trylock_buffer(bh)) continue; - if (op == WRITE) { + if (op == REQ_OP_WRITE) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(op, op_flags, bh); + submit_bh(opf, bh); continue; } } else { if (!buffer_uptodate(bh)) { bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(op, op_flags, bh); + submit_bh(opf, bh); continue; } } @@ -3101,7 +2781,7 @@ void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[]) } EXPORT_SYMBOL(ll_rw_block); -void write_dirty_buffer(struct buffer_head *bh, int op_flags) +void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { lock_buffer(bh); if (!test_clear_buffer_dirty(bh)) { @@ -3110,7 +2790,7 @@ void write_dirty_buffer(struct buffer_head *bh, int op_flags) } bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(REQ_OP_WRITE, op_flags, bh); + submit_bh(REQ_OP_WRITE | op_flags, bh); } EXPORT_SYMBOL(write_dirty_buffer); @@ -3119,7 +2799,7 @@ EXPORT_SYMBOL(write_dirty_buffer); * and then start new I/O and then wait upon it. The caller must have a ref on * the buffer_head. */ -int __sync_dirty_buffer(struct buffer_head *bh, int op_flags) +int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { int ret = 0; @@ -3137,7 +2817,7 @@ int __sync_dirty_buffer(struct buffer_head *bh, int op_flags) get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(REQ_OP_WRITE, op_flags, bh); + ret = submit_bh(REQ_OP_WRITE | op_flags, bh); wait_on_buffer(bh); if (!ret && !buffer_uptodate(bh)) ret = -EIO; @@ -3365,7 +3045,7 @@ int bh_submit_read(struct buffer_head *bh) get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return 0; diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index a41ae6efc545..1fee702d5529 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -21,7 +21,8 @@ static int cachefiles_ondemand_fd_release(struct inode *inode, * anon_fd. */ xas_for_each(&xas, req, ULONG_MAX) { - if (req->msg.opcode == CACHEFILES_OP_READ) { + if (req->msg.object_id == object_id && + req->msg.opcode == CACHEFILES_OP_READ) { req->error = -EIO; complete(&req->done); xas_store(&xas, NULL); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6dee88815491..d6e5916138e4 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -63,7 +63,7 @@ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct folio *folio, void **_fsdata); + struct folio **foliop, void **_fsdata); static inline struct ceph_snap_context *page_snap_context(struct page *page) { @@ -1288,18 +1288,19 @@ ceph_find_incompatible(struct page *page) } static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct folio *folio, void **_fsdata) + struct folio **foliop, void **_fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; - snapc = ceph_find_incompatible(folio_page(folio, 0)); + snapc = ceph_find_incompatible(folio_page(*foliop, 0)); if (snapc) { int r; - folio_unlock(folio); - folio_put(folio); + folio_unlock(*foliop); + folio_put(*foliop); + *foliop = NULL; if (IS_ERR(snapc)) return PTR_ERR(snapc); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 38c930384d41..ac8fd5e7f540 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4377,6 +4377,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s) ihold(inode); dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); + ceph_wait_on_async_create(inode); ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); iput(inode); spin_lock(&mdsc->cap_dirty_lock); diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 8c9f2c00be72..e882e912a517 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -5,7 +5,7 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_CIFS) += cifs.o -cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ +cifs-y := trace.o cifsfs.o cifs_debug.o connect.o dir.o file.o \ inode.o link.o misc.o netmisc.o smbencrypt.o transport.o \ cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o unc.o winucase.o \ @@ -31,4 +31,4 @@ cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o cifs-$(CONFIG_CIFS_ROOT) += cifsroot.o -cifs-$(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) += smb1ops.o +cifs-$(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) += smb1ops.o cifssmb.o diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 1dd995efd5b8..11fd85de7217 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -36,7 +36,7 @@ cifs_dump_mem(char *label, void *data, int length) void cifs_dump_detail(void *buf, struct TCP_Server_Info *server) { #ifdef CONFIG_CIFS_DEBUG2 - struct smb_hdr *smb = (struct smb_hdr *)buf; + struct smb_hdr *smb = buf; cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n", smb->Command, smb->Status.CifsError, @@ -55,7 +55,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server) return; cifs_dbg(VFS, "Dump pending requests:\n"); - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n", mid_entry->mid_state, @@ -78,7 +78,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server) mid_entry->resp_buf, 62); } } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); #endif /* CONFIG_CIFS_DEBUG2 */ } @@ -162,11 +162,12 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr); else if (iface->sockaddr.ss_family == AF_INET6) seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr); + if (!iface->is_active) + seq_puts(m, "\t\t[for-cleanup]\n"); } static int cifs_debug_files_proc_show(struct seq_file *m, void *v) { - struct list_head *tmp, *tmp1, *tmp2; struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -182,14 +183,10 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) #endif /* CIFS_DEBUG2 */ spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); - list_for_each(tmp1, &ses->tcon_list) { - tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); - list_for_each(tmp2, &tcon->openFileList) { - cfile = list_entry(tmp2, struct cifsFileInfo, - tlist); + list_for_each_entry(cfile, &tcon->openFileList, tlist) { seq_printf(m, "0x%x 0x%llx 0x%x %d %d %d %pd", tcon->tid, @@ -216,11 +213,11 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) static int cifs_debug_data_proc_show(struct seq_file *m, void *v) { - struct list_head *tmp2, *tmp3; struct mid_q_entry *mid_entry; struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; + struct cifs_server_iface *iface; int c, i, j; seq_puts(m, @@ -378,9 +375,7 @@ skip_rdma: seq_printf(m, "\n\n\tSessions: "); i = 0; - list_for_each(tmp2, &server->smb_ses_list) { - ses = list_entry(tmp2, struct cifs_ses, - smb_ses_list); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { i++; if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || @@ -444,9 +439,7 @@ skip_rdma: else seq_puts(m, "none\n"); - list_for_each(tmp3, &ses->tcon_list) { - tcon = list_entry(tmp3, struct cifs_tcon, - tcon_list); + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { ++j; seq_printf(m, "\n\t%d) ", j); cifs_debug_tcon(m, tcon); @@ -456,11 +449,10 @@ skip_rdma: if (ses->iface_count) seq_printf(m, "\n\n\tServer interfaces: %zu", ses->iface_count); - for (j = 0; j < ses->iface_count; j++) { - struct cifs_server_iface *iface; - - iface = &ses->iface_list[j]; - seq_printf(m, "\n\t%d)", j+1); + j = 0; + list_for_each_entry(iface, &ses->iface_list, + iface_head) { + seq_printf(m, "\n\t%d)", ++j); cifs_dump_iface(m, iface); if (is_ses_using_iface(ses, iface)) seq_puts(m, "\t\t[CONNECTED]\n"); @@ -471,10 +463,8 @@ skip_rdma: seq_printf(m, "\n\t\t[NONE]"); seq_puts(m, "\n\n\tMIDs: "); - spin_lock(&GlobalMid_Lock); - list_for_each(tmp3, &server->pending_mid_q) { - mid_entry = list_entry(tmp3, struct mid_q_entry, - qhead); + spin_lock(&server->mid_lock); + list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { seq_printf(m, "\n\tState: %d com: %d pid:" " %d cbdata: %p mid %llu\n", mid_entry->mid_state, @@ -483,7 +473,7 @@ skip_rdma: mid_entry->callback_data, mid_entry->mid); } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); seq_printf(m, "\n--\n"); } if (c == 0) @@ -502,7 +492,6 @@ static ssize_t cifs_stats_proc_write(struct file *file, { bool bv; int rc; - struct list_head *tmp1, *tmp2, *tmp3; struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -512,8 +501,8 @@ static ssize_t cifs_stats_proc_write(struct file *file, #ifdef CONFIG_CIFS_STATS2 int i; - atomic_set(&totBufAllocCount, 0); - atomic_set(&totSmBufAllocCount, 0); + atomic_set(&total_buf_alloc_count, 0); + atomic_set(&total_small_buf_alloc_count, 0); #endif /* CONFIG_CIFS_STATS2 */ atomic_set(&tcpSesReconnectCount, 0); atomic_set(&tconInfoReconnectCount, 0); @@ -523,9 +512,7 @@ static ssize_t cifs_stats_proc_write(struct file *file, GlobalCurrentXid = 0; spin_unlock(&GlobalMid_Lock); spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp1, &cifs_tcp_ses_list) { - server = list_entry(tmp1, struct TCP_Server_Info, - tcp_ses_list); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { server->max_in_flight = 0; #ifdef CONFIG_CIFS_STATS2 for (i = 0; i < NUMBER_OF_SMB2_COMMANDS; i++) { @@ -536,13 +523,8 @@ static ssize_t cifs_stats_proc_write(struct file *file, server->fastest_cmd[0] = 0; } #endif /* CONFIG_CIFS_STATS2 */ - list_for_each(tmp2, &server->smb_ses_list) { - ses = list_entry(tmp2, struct cifs_ses, - smb_ses_list); - list_for_each(tmp3, &ses->tcon_list) { - tcon = list_entry(tmp3, - struct cifs_tcon, - tcon_list); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { atomic_set(&tcon->num_smbs_sent, 0); spin_lock(&tcon->stat_lock); tcon->bytes_read = 0; @@ -567,7 +549,6 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_CIFS_STATS2 int j; #endif /* STATS2 */ - struct list_head *tmp2, *tmp3; struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -577,17 +558,17 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) seq_printf(m, "Share (unique mount targets): %d\n", tconInfoAllocCount.counter); seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n", - bufAllocCount.counter, + buf_alloc_count.counter, cifs_min_rcv + tcpSesAllocCount.counter); seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n", - smBufAllocCount.counter, cifs_min_small); + small_buf_alloc_count.counter, cifs_min_small); #ifdef CONFIG_CIFS_STATS2 seq_printf(m, "Total Large %d Small %d Allocations\n", - atomic_read(&totBufAllocCount), - atomic_read(&totSmBufAllocCount)); + atomic_read(&total_buf_alloc_count), + atomic_read(&total_small_buf_alloc_count)); #endif /* CONFIG_CIFS_STATS2 */ - seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount)); + seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&mid_count)); seq_printf(m, "\n%d session %d share reconnects\n", tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); @@ -617,13 +598,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) atomic_read(&server->smb2slowcmd[j]), server->hostname, j); #endif /* STATS2 */ - list_for_each(tmp2, &server->smb_ses_list) { - ses = list_entry(tmp2, struct cifs_ses, - smb_ses_list); - list_for_each(tmp3, &ses->tcon_list) { - tcon = list_entry(tmp3, - struct cifs_tcon, - tcon_list); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { i++; seq_printf(m, "\n%d) %s", i, tcon->treeName); if (tcon->need_reconnect) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index bf861fef2f0c..fa480d62f313 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1379,6 +1379,7 @@ chown_chgrp_exit: return rc; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, const struct cifs_fid *cifsfid, u32 *pacllen, u32 __maybe_unused unused) @@ -1512,6 +1513,7 @@ out: cifs_put_tlink(tlink); return rc; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ int diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 663cb9db4908..8f7835ccbca1 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -141,13 +141,13 @@ int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, if ((cifs_pdu == NULL) || (server == NULL)) return -EINVAL; - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || server->tcpStatus == CifsNeedNegotiate) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return rc; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); if (!server->session_estab) { memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8f2e003e0590..8849f0852110 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -68,6 +68,34 @@ bool enable_negotiate_signing; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ unsigned int sign_CIFS_PDUs = 1; + +/* + * Global transaction id (XID) information + */ +unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ +unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ +unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ +spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */ + +/* + * Global counters, updated atomically + */ +atomic_t sesInfoAllocCount; +atomic_t tconInfoAllocCount; +atomic_t tcpSesNextId; +atomic_t tcpSesAllocCount; +atomic_t tcpSesReconnectCount; +atomic_t tconInfoReconnectCount; + +atomic_t mid_count; +atomic_t buf_alloc_count; +atomic_t small_buf_alloc_count; +#ifdef CONFIG_CIFS_STATS2 +atomic_t total_buf_alloc_count; +atomic_t total_small_buf_alloc_count; +#endif/* STATS2 */ +struct list_head cifs_tcp_ses_list; +spinlock_t cifs_tcp_ses_lock; static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; module_param(CIFSMaxBufSize, uint, 0444); @@ -703,14 +731,17 @@ static void cifs_umount_begin(struct super_block *sb) tcon = cifs_sb_master_tcon(cifs_sb); spin_lock(&cifs_tcp_ses_lock); + spin_lock(&tcon->tc_lock); if ((tcon->tc_count > 1) || (tcon->status == TID_EXITING)) { /* we have other mounts to same share or we have already tried to force umount this and woken up all waiting network requests, nothing to do */ + spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); return; } else if (tcon->tc_count == 1) tcon->status = TID_EXITING; + spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ @@ -1537,8 +1568,7 @@ cifs_destroy_request_bufs(void) kmem_cache_destroy(cifs_sm_req_cachep); } -static int -cifs_init_mids(void) +static int init_mids(void) { cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids", sizeof(struct mid_q_entry), 0, @@ -1556,8 +1586,7 @@ cifs_init_mids(void) return 0; } -static void -cifs_destroy_mids(void) +static void destroy_mids(void) { mempool_destroy(cifs_mid_poolp); kmem_cache_destroy(cifs_mid_cachep); @@ -1579,11 +1608,11 @@ init_cifs(void) atomic_set(&tcpSesReconnectCount, 0); atomic_set(&tconInfoReconnectCount, 0); - atomic_set(&bufAllocCount, 0); - atomic_set(&smBufAllocCount, 0); + atomic_set(&buf_alloc_count, 0); + atomic_set(&small_buf_alloc_count, 0); #ifdef CONFIG_CIFS_STATS2 - atomic_set(&totBufAllocCount, 0); - atomic_set(&totSmBufAllocCount, 0); + atomic_set(&total_buf_alloc_count, 0); + atomic_set(&total_small_buf_alloc_count, 0); if (slow_rsp_threshold < 1) cifs_dbg(FYI, "slow_response_threshold msgs disabled\n"); else if (slow_rsp_threshold > 32767) @@ -1591,7 +1620,7 @@ init_cifs(void) "slow response threshold set higher than recommended (0 to 32767)\n"); #endif /* CONFIG_CIFS_STATS2 */ - atomic_set(&midCount, 0); + atomic_set(&mid_count, 0); GlobalCurrentXid = 0; GlobalTotalActiveXid = 0; GlobalMaxActiveXid = 0; @@ -1654,7 +1683,7 @@ init_cifs(void) if (rc) goto out_destroy_deferredclose_wq; - rc = cifs_init_mids(); + rc = init_mids(); if (rc) goto out_destroy_inodecache; @@ -1711,7 +1740,7 @@ out_destroy_request_bufs: #endif cifs_destroy_request_bufs(); out_destroy_mids: - cifs_destroy_mids(); + destroy_mids(); out_destroy_inodecache: cifs_destroy_inodecache(); out_destroy_deferredclose_wq: @@ -1747,7 +1776,7 @@ exit_cifs(void) dfs_cache_destroy(); #endif cifs_destroy_request_bufs(); - cifs_destroy_mids(); + destroy_mids(); cifs_destroy_inodecache(); destroy_workqueue(deferredclose_wq); destroy_workqueue(cifsoplockd_wq); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index b17be47a8e59..81f4c15936d0 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -153,6 +153,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 37 -#define CIFS_VERSION "2.37" +#define SMB3_PRODUCT_BUILD 38 +#define CIFS_VERSION "2.38" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index e7737166e5b8..3070407cafa7 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -80,6 +80,9 @@ #define SMB_DNS_RESOLVE_INTERVAL_MIN 120 #define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600 +/* smb multichannel query server interfaces interval in seconds */ +#define SMB_INTERFACE_POLL_INTERVAL 600 + /* maximum number of PDUs in one compound */ #define MAX_COMPOUND 5 @@ -602,6 +605,7 @@ inc_rfc1001_len(void *buf, int count) struct TCP_Server_Info { struct list_head tcp_ses_list; struct list_head smb_ses_list; + spinlock_t srv_lock; /* protect anything here that is not protected */ __u64 conn_id; /* connection identifier (useful for debugging) */ int srv_count; /* reference counter */ /* 15 character server name + 0x20 16th byte indicating type = srv */ @@ -619,6 +623,7 @@ struct TCP_Server_Info { #endif wait_queue_head_t response_q; wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ + spinlock_t mid_lock; /* protect mid queue and it's entries */ struct list_head pending_mid_q; bool noblocksnd; /* use blocking sendmsg */ bool noautotune; /* do not autotune send buf sizes */ @@ -933,15 +938,67 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) #endif struct cifs_server_iface { + struct list_head iface_head; + struct kref refcount; size_t speed; unsigned int rdma_capable : 1; unsigned int rss_capable : 1; + unsigned int is_active : 1; /* unset if non existent */ struct sockaddr_storage sockaddr; }; +/* release iface when last ref is dropped */ +static inline void +release_iface(struct kref *ref) +{ + struct cifs_server_iface *iface = container_of(ref, + struct cifs_server_iface, + refcount); + list_del_init(&iface->iface_head); + kfree(iface); +} + +/* + * compare two interfaces a and b + * return 0 if everything matches. + * return 1 if a has higher link speed, or rdma capable, or rss capable + * return -1 otherwise. + */ +static inline int +iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b) +{ + int cmp_ret = 0; + + WARN_ON(!a || !b); + if (a->speed == b->speed) { + if (a->rdma_capable == b->rdma_capable) { + if (a->rss_capable == b->rss_capable) { + cmp_ret = memcmp(&a->sockaddr, &b->sockaddr, + sizeof(a->sockaddr)); + if (!cmp_ret) + return 0; + else if (cmp_ret > 0) + return 1; + else + return -1; + } else if (a->rss_capable > b->rss_capable) + return 1; + else + return -1; + } else if (a->rdma_capable > b->rdma_capable) + return 1; + else + return -1; + } else if (a->speed > b->speed) + return 1; + else + return -1; +} + struct cifs_chan { unsigned int in_reconnect : 1; /* if session setup in progress for this channel */ struct TCP_Server_Info *server; + struct cifs_server_iface *iface; /* interface in use */ __u8 signkey[SMB3_SIGN_KEY_SIZE]; }; @@ -953,6 +1010,7 @@ struct cifs_ses { struct list_head rlist; /* reconnect list */ struct list_head tcon_list; struct cifs_tcon *tcon_ipc; + spinlock_t ses_lock; /* protect anything here that is not protected */ struct mutex session_mutex; struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ @@ -993,7 +1051,7 @@ struct cifs_ses { */ spinlock_t iface_lock; /* ========= begin: protected by iface_lock ======== */ - struct cifs_server_iface *iface_list; + struct list_head iface_list; size_t iface_count; unsigned long iface_last_update; /* jiffies */ /* ========= end: protected by iface_lock ======== */ @@ -1114,6 +1172,7 @@ struct cifs_tcon { struct list_head tcon_list; int tc_count; struct list_head rlist; /* reconnect list */ + spinlock_t tc_lock; /* protect anything here that is not protected */ atomic_t num_local_opens; /* num of all opens including disconnected */ atomic_t num_remote_opens; /* num of all network opens on server */ struct list_head openFileList; @@ -1203,6 +1262,7 @@ struct cifs_tcon { #ifdef CONFIG_CIFS_DFS_UPCALL struct list_head ulist; /* cache update list */ #endif + struct delayed_work query_interfaces; /* query interfaces workqueue job */ }; /* @@ -1843,33 +1903,78 @@ require use of the stronger protocol */ */ /**************************************************************************** - * Locking notes. All updates to global variables and lists should be - * protected by spinlocks or semaphores. + * Here are all the locks (spinlock, mutex, semaphore) in cifs.ko, arranged according + * to the locking order. i.e. if two locks are to be held together, the lock that + * appears higher in this list needs to be taken before the other. + * + * If you hold a lock that is lower in this list, and you need to take a higher lock + * (or if you think that one of the functions that you're calling may need to), first + * drop the lock you hold, pick up the higher lock, then the lower one. This will + * ensure that locks are picked up only in one direction in the below table + * (top to bottom). * - * Spinlocks - * --------- - * GlobalMid_Lock protects: - * list operations on pending_mid_q and oplockQ - * updates to XID counters, multiplex id and SMB sequence numbers - * list operations on global DnotifyReqList - * updates to ses->status and TCP_Server_Info->tcpStatus - * updates to server->CurrentMid - * tcp_ses_lock protects: - * list operations on tcp and SMB session lists - * tcon->open_file_lock protects the list of open files hanging off the tcon - * inode->open_file_lock protects the openFileList hanging off the inode - * cfile->file_info_lock protects counters and fields in cifs file struct - * f_owner.lock protects certain per file struct operations - * mapping->page_lock protects certain per page operations + * Also, if you expect a function to be called with a lock held, explicitly document + * this in the comments on top of your function definition. * - * Note that the cifs_tcon.open_file_lock should be taken before - * not after the cifsInodeInfo.open_file_lock + * And also, try to keep the critical sections (lock hold time) to be as minimal as + * possible. Blocking / calling other functions with a lock held always increase + * the risk of a possible deadlock. * - * Semaphores - * ---------- - * cifsInodeInfo->lock_sem protects: - * the list of locks held by the inode + * Following this rule will avoid unnecessary deadlocks, which can get really hard to + * debug. Also, any new lock that you introduce, please add to this list in the correct + * order. * + * Please populate this list whenever you introduce new locks in your changes. Or in + * case I've missed some existing locks. Please ensure that it's added in the list + * based on the locking order expected. + * + * ===================================================================================== + * Lock Protects Initialization fn + * ===================================================================================== + * vol_list_lock + * vol_info->ctx_lock vol_info->ctx + * cifs_sb_info->tlink_tree_lock cifs_sb_info->tlink_tree cifs_setup_cifs_sb + * TCP_Server_Info-> TCP_Server_Info cifs_get_tcp_session + * reconnect_mutex + * TCP_Server_Info->srv_mutex TCP_Server_Info cifs_get_tcp_session + * cifs_ses->session_mutex cifs_ses sesInfoAlloc + * cifs_tcon + * cifs_tcon->open_file_lock cifs_tcon->openFileList tconInfoAlloc + * cifs_tcon->pending_opens + * cifs_tcon->stat_lock cifs_tcon->bytes_read tconInfoAlloc + * cifs_tcon->bytes_written + * cifs_tcp_ses_lock cifs_tcp_ses_list sesInfoAlloc + * GlobalMid_Lock GlobalMaxActiveXid init_cifs + * GlobalCurrentXid + * GlobalTotalActiveXid + * TCP_Server_Info->srv_lock (anything in struct not protected by another lock and can change) + * TCP_Server_Info->mid_lock TCP_Server_Info->pending_mid_q cifs_get_tcp_session + * ->CurrentMid + * (any changes in mid_q_entry fields) + * TCP_Server_Info->req_lock TCP_Server_Info->in_flight cifs_get_tcp_session + * ->credits + * ->echo_credits + * ->oplock_credits + * ->reconnect_instance + * cifs_ses->ses_lock (anything that is not protected by another lock and can change) + * cifs_ses->iface_lock cifs_ses->iface_list sesInfoAlloc + * ->iface_count + * ->iface_last_update + * cifs_ses->chan_lock cifs_ses->chans + * ->chans_need_reconnect + * ->chans_in_reconnect + * cifs_tcon->tc_lock (anything that is not protected by another lock and can change) + * cifsInodeInfo->open_file_lock cifsInodeInfo->openFileList cifs_alloc_inode + * cifsInodeInfo->writers_lock cifsInodeInfo->writers cifsInodeInfo_alloc + * cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once + * ->can_cache_brlcks + * cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc + * cached_fid->fid_mutex cifs_tcon->crfid tconInfoAlloc + * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo + * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo + * ->invalidHandle initiate_cifs_search + * ->oplock_break_cancelled + * cifs_aio_ctx->aio_mutex cifs_aio_ctx cifs_aio_ctx_alloc ****************************************************************************/ #ifdef DECLARE_GLOBALS_HERE @@ -1885,47 +1990,44 @@ require use of the stronger protocol */ * sessions (and from that the tree connections) can be found * by iterating over cifs_tcp_ses_list */ -GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; +extern struct list_head cifs_tcp_ses_list; /* * This lock protects the cifs_tcp_ses_list, the list of smb sessions per * tcp session, and the list of tcon's per smb session. It also protects - * the reference counters for the server, smb session, and tcon. It also - * protects some fields in the TCP_Server_Info struct such as dstaddr. Finally, - * changes to the tcon->tidStatus should be done while holding this lock. + * the reference counters for the server, smb session, and tcon. * generally the locks should be taken in order tcp_ses_lock before * tcon->open_file_lock and that before file->file_info_lock since the * structure order is cifs_socket-->cifs_ses-->cifs_tcon-->cifs_file */ -GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; +extern spinlock_t cifs_tcp_ses_lock; /* * Global transaction id (XID) information */ -GLOBAL_EXTERN unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ -GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ -GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ -GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ - /* on midQ entries */ +extern unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ +extern unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ +extern unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ +extern spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */ + /* * Global counters, updated atomically */ -GLOBAL_EXTERN atomic_t sesInfoAllocCount; -GLOBAL_EXTERN atomic_t tconInfoAllocCount; -GLOBAL_EXTERN atomic_t tcpSesNextId; -GLOBAL_EXTERN atomic_t tcpSesAllocCount; -GLOBAL_EXTERN atomic_t tcpSesReconnectCount; -GLOBAL_EXTERN atomic_t tconInfoReconnectCount; +extern atomic_t sesInfoAllocCount; +extern atomic_t tconInfoAllocCount; +extern atomic_t tcpSesNextId; +extern atomic_t tcpSesAllocCount; +extern atomic_t tcpSesReconnectCount; +extern atomic_t tconInfoReconnectCount; /* Various Debug counters */ -GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */ +extern atomic_t buf_alloc_count; /* current number allocated */ +extern atomic_t small_buf_alloc_count; #ifdef CONFIG_CIFS_STATS2 -GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */ -GLOBAL_EXTERN atomic_t totSmBufAllocCount; +extern atomic_t total_buf_alloc_count; /* total allocated over all time */ +extern atomic_t total_small_buf_alloc_count; extern unsigned int slow_rsp_threshold; /* number of secs before logging */ #endif -GLOBAL_EXTERN atomic_t smBufAllocCount; -GLOBAL_EXTERN atomic_t midCount; /* Misc globals */ extern bool enable_oplocks; /* enable or disable oplocks */ @@ -1942,6 +2044,7 @@ extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ extern unsigned int cifs_min_small; /* min size of small buf pool */ extern unsigned int cifs_max_pending; /* MAX requests at once to server*/ extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */ +extern atomic_t mid_count; void cifs_oplock_break(struct work_struct *work); void cifs_queue_oplock_break(struct cifsFileInfo *cfile); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 3b7366ec03c7..daaadffa2b88 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -78,12 +78,8 @@ extern char *build_wildcard_path_from_dentry(struct dentry *direntry); extern char *cifs_compose_mount_options(const char *sb_mountdata, const char *fullpath, const struct dfs_info3_param *ref, char **devname); -/* extern void renew_parental_timestamps(struct dentry *direntry);*/ -extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, - struct TCP_Server_Info *server); -extern void DeleteMidQEntry(struct mid_q_entry *midEntry); -extern void cifs_delete_mid(struct mid_q_entry *mid); -extern void cifs_mid_q_entry_release(struct mid_q_entry *midEntry); +extern void delete_mid(struct mid_q_entry *mid); +extern void release_mid(struct mid_q_entry *mid); extern void cifs_wake_up_task(struct mid_q_entry *mid); extern int cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid); @@ -521,6 +517,7 @@ extern int generate_smb30signingkey(struct cifs_ses *ses, extern int generate_smb311signingkey(struct cifs_ses *ses, struct TCP_Server_Info *server); +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY extern int CIFSSMBCopy(unsigned int xid, struct cifs_tcon *source_tcon, const char *fromName, @@ -551,6 +548,7 @@ extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nls_codepage, int remap_special_chars); extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); +#endif /* CIFS_ALLOW_INSECURE_LEGACY */ extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); extern bool couldbe_mf_symlink(const struct cifs_fattr *fattr); extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, @@ -636,6 +634,13 @@ cifs_chan_clear_need_reconnect(struct cifs_ses *ses, bool cifs_chan_needs_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server); +bool +cifs_chan_is_iface_active(struct cifs_ses *ses, + struct TCP_Server_Info *server); +int +cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server); +int +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon); void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6371b9eebdad..7aa91e272027 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -29,7 +29,6 @@ #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" -#include "smb2proto.h" #include "fscache.h" #include "smbdirect.h" #ifdef CONFIG_CIFS_DFS_UPCALL @@ -62,52 +61,6 @@ static struct { #define CIFS_NUM_PROT 1 #endif /* CIFS_POSIX */ -/* - * Mark as invalid, all open files on tree connections since they - * were closed when session to server was lost. - */ -void -cifs_mark_open_files_invalid(struct cifs_tcon *tcon) -{ - struct cifsFileInfo *open_file = NULL; - struct list_head *tmp; - struct list_head *tmp1; - - /* only send once per connect */ - spin_lock(&cifs_tcp_ses_lock); - if ((tcon->ses->ses_status != SES_GOOD) || (tcon->status != TID_NEED_RECON)) { - spin_unlock(&cifs_tcp_ses_lock); - return; - } - tcon->status = TID_IN_FILES_INVALIDATE; - spin_unlock(&cifs_tcp_ses_lock); - - /* list all files open on tree connection and mark them invalid */ - spin_lock(&tcon->open_file_lock); - list_for_each_safe(tmp, tmp1, &tcon->openFileList) { - open_file = list_entry(tmp, struct cifsFileInfo, tlist); - open_file->invalidHandle = true; - open_file->oplock_break_cancelled = true; - } - spin_unlock(&tcon->open_file_lock); - - mutex_lock(&tcon->crfid.fid_mutex); - tcon->crfid.is_valid = false; - /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ - close_cached_dir_lease_locked(&tcon->crfid); - memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); - mutex_unlock(&tcon->crfid.fid_mutex); - - spin_lock(&cifs_tcp_ses_lock); - if (tcon->status == TID_IN_FILES_INVALIDATE) - tcon->status = TID_NEED_TCON; - spin_unlock(&cifs_tcp_ses_lock); - - /* - * BB Add call to invalidate_inodes(sb) for all superblocks mounted - * to this tcon. - */ -} /* reconnect the socket, tcon, and smb session if needed */ static int @@ -134,18 +87,18 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) * only tree disconnect, open, and write, (and ulogoff which does not * have tcon) are allowed as we start force umount */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&tcon->tc_lock); if (tcon->status == TID_EXITING) { if (smb_command != SMB_COM_WRITE_ANDX && smb_command != SMB_COM_OPEN_ANDX && smb_command != SMB_COM_TREE_DISCONNECT) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcon->tc_lock); cifs_dbg(FYI, "can not send cmd %d while umounting\n", smb_command); return -ENODEV; } } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcon->tc_lock); retries = server->nr_targets; @@ -165,12 +118,12 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) } /* are we still trying to reconnect? */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus != CifsNeedReconnect) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); break; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); if (retries && --retries) continue; @@ -201,13 +154,13 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) * and the server never sends an answer the socket will be closed * and tcpStatus set to reconnect. */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsNeedReconnect) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); rc = -EHOSTDOWN; goto out; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); /* * need to prevent multiple threads trying to simultaneously @@ -457,52 +410,6 @@ decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr) return 0; } -int -cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) -{ - bool srv_sign_required = server->sec_mode & server->vals->signing_required; - bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled; - bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN; - - /* - * Is signing required by mnt options? If not then check - * global_secflags to see if it is there. - */ - if (!mnt_sign_required) - mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) == - CIFSSEC_MUST_SIGN); - - /* - * If signing is required then it's automatically enabled too, - * otherwise, check to see if the secflags allow it. - */ - mnt_sign_enabled = mnt_sign_required ? mnt_sign_required : - (global_secflags & CIFSSEC_MAY_SIGN); - - /* If server requires signing, does client allow it? */ - if (srv_sign_required) { - if (!mnt_sign_enabled) { - cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!\n"); - return -ENOTSUPP; - } - server->sign = true; - } - - /* If client requires signing, does server allow it? */ - if (mnt_sign_required) { - if (!srv_sign_enabled) { - cifs_dbg(VFS, "Server does not support signing!\n"); - return -ENOTSUPP; - } - server->sign = true; - } - - if (cifs_rdma_enabled(server) && server->sign) - cifs_dbg(VFS, "Signing is enabled, and RDMA read/write will be disabled\n"); - - return 0; -} - static bool should_set_ext_sec_flag(enum securityEnum sectype) { @@ -684,7 +591,7 @@ cifs_echo_callback(struct mid_q_entry *mid) struct TCP_Server_Info *server = mid->callback_data; struct cifs_credits credits = { .value = 1, .instance = 0 }; - DeleteMidQEntry(mid); + release_mid(mid); add_credits(server, &credits, CIFS_ECHO_OP); } @@ -1379,184 +1286,6 @@ openRetry: return rc; } -/* - * Discard any remaining data in the current SMB. To do this, we borrow the - * current bigbuf. - */ -int -cifs_discard_remaining_data(struct TCP_Server_Info *server) -{ - unsigned int rfclen = server->pdu_size; - int remaining = rfclen + server->vals->header_preamble_size - - server->total_read; - - while (remaining > 0) { - int length; - - length = cifs_discard_from_socket(server, - min_t(size_t, remaining, - CIFSMaxBufSize + MAX_HEADER_SIZE(server))); - if (length < 0) - return length; - server->total_read += length; - remaining -= length; - } - - return 0; -} - -static int -__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid, - bool malformed) -{ - int length; - - length = cifs_discard_remaining_data(server); - dequeue_mid(mid, malformed); - mid->resp_buf = server->smallbuf; - server->smallbuf = NULL; - return length; -} - -static int -cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) -{ - struct cifs_readdata *rdata = mid->callback_data; - - return __cifs_readv_discard(server, mid, rdata->result); -} - -int -cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) -{ - int length, len; - unsigned int data_offset, data_len; - struct cifs_readdata *rdata = mid->callback_data; - char *buf = server->smallbuf; - unsigned int buflen = server->pdu_size + - server->vals->header_preamble_size; - bool use_rdma_mr = false; - - cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n", - __func__, mid->mid, rdata->offset, rdata->bytes); - - /* - * read the rest of READ_RSP header (sans Data array), or whatever we - * can if there's not enough data. At this point, we've read down to - * the Mid. - */ - len = min_t(unsigned int, buflen, server->vals->read_rsp_size) - - HEADER_SIZE(server) + 1; - - length = cifs_read_from_socket(server, - buf + HEADER_SIZE(server) - 1, len); - if (length < 0) - return length; - server->total_read += length; - - if (server->ops->is_session_expired && - server->ops->is_session_expired(buf)) { - cifs_reconnect(server, true); - return -1; - } - - if (server->ops->is_status_pending && - server->ops->is_status_pending(buf, server)) { - cifs_discard_remaining_data(server); - return -1; - } - - /* set up first two iov for signature check and to get credits */ - rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = server->vals->header_preamble_size; - rdata->iov[1].iov_base = buf + server->vals->header_preamble_size; - rdata->iov[1].iov_len = - server->total_read - server->vals->header_preamble_size; - cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", - rdata->iov[0].iov_base, rdata->iov[0].iov_len); - cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", - rdata->iov[1].iov_base, rdata->iov[1].iov_len); - - /* Was the SMB read successful? */ - rdata->result = server->ops->map_error(buf, false); - if (rdata->result != 0) { - cifs_dbg(FYI, "%s: server returned error %d\n", - __func__, rdata->result); - /* normal error on read response */ - return __cifs_readv_discard(server, mid, false); - } - - /* Is there enough to get to the rest of the READ_RSP header? */ - if (server->total_read < server->vals->read_rsp_size) { - cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n", - __func__, server->total_read, - server->vals->read_rsp_size); - rdata->result = -EIO; - return cifs_readv_discard(server, mid); - } - - data_offset = server->ops->read_data_offset(buf) + - server->vals->header_preamble_size; - if (data_offset < server->total_read) { - /* - * win2k8 sometimes sends an offset of 0 when the read - * is beyond the EOF. Treat it as if the data starts just after - * the header. - */ - cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n", - __func__, data_offset); - data_offset = server->total_read; - } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) { - /* data_offset is beyond the end of smallbuf */ - cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n", - __func__, data_offset); - rdata->result = -EIO; - return cifs_readv_discard(server, mid); - } - - cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n", - __func__, server->total_read, data_offset); - - len = data_offset - server->total_read; - if (len > 0) { - /* read any junk before data into the rest of smallbuf */ - length = cifs_read_from_socket(server, - buf + server->total_read, len); - if (length < 0) - return length; - server->total_read += length; - } - - /* how much data is in the response? */ -#ifdef CONFIG_CIFS_SMB_DIRECT - use_rdma_mr = rdata->mr; -#endif - data_len = server->ops->read_data_length(buf, use_rdma_mr); - if (!use_rdma_mr && (data_offset + data_len > buflen)) { - /* data_len is corrupt -- discard frame */ - rdata->result = -EIO; - return cifs_readv_discard(server, mid); - } - - length = rdata->read_into_pages(server, rdata, data_len); - if (length < 0) - return length; - - server->total_read += length; - - cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", - server->total_read, buflen, data_len); - - /* discard anything left over */ - if (server->total_read < buflen) - return cifs_readv_discard(server, mid); - - dequeue_mid(mid, false); - mid->resp_buf = server->smallbuf; - server->smallbuf = NULL; - return length; -} - static void cifs_readv_callback(struct mid_q_entry *mid) { @@ -1607,7 +1336,7 @@ cifs_readv_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &rdata->work); - DeleteMidQEntry(mid); + release_mid(mid); add_credits(server, &credits, 0); } @@ -1909,183 +1638,6 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, return rc; } -void -cifs_writedata_release(struct kref *refcount) -{ - struct cifs_writedata *wdata = container_of(refcount, - struct cifs_writedata, refcount); -#ifdef CONFIG_CIFS_SMB_DIRECT - if (wdata->mr) { - smbd_deregister_mr(wdata->mr); - wdata->mr = NULL; - } -#endif - - if (wdata->cfile) - cifsFileInfo_put(wdata->cfile); - - kvfree(wdata->pages); - kfree(wdata); -} - -/* - * Write failed with a retryable error. Resend the write request. It's also - * possible that the page was redirtied so re-clean the page. - */ -static void -cifs_writev_requeue(struct cifs_writedata *wdata) -{ - int i, rc = 0; - struct inode *inode = d_inode(wdata->cfile->dentry); - struct TCP_Server_Info *server; - unsigned int rest_len; - - server = tlink_tcon(wdata->cfile->tlink)->ses->server; - i = 0; - rest_len = wdata->bytes; - do { - struct cifs_writedata *wdata2; - unsigned int j, nr_pages, wsize, tailsz, cur_len; - - wsize = server->ops->wp_retry_size(inode); - if (wsize < rest_len) { - nr_pages = wsize / PAGE_SIZE; - if (!nr_pages) { - rc = -ENOTSUPP; - break; - } - cur_len = nr_pages * PAGE_SIZE; - tailsz = PAGE_SIZE; - } else { - nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE); - cur_len = rest_len; - tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE; - } - - wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete); - if (!wdata2) { - rc = -ENOMEM; - break; - } - - for (j = 0; j < nr_pages; j++) { - wdata2->pages[j] = wdata->pages[i + j]; - lock_page(wdata2->pages[j]); - clear_page_dirty_for_io(wdata2->pages[j]); - } - - wdata2->sync_mode = wdata->sync_mode; - wdata2->nr_pages = nr_pages; - wdata2->offset = page_offset(wdata2->pages[0]); - wdata2->pagesz = PAGE_SIZE; - wdata2->tailsz = tailsz; - wdata2->bytes = cur_len; - - rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, - &wdata2->cfile); - if (!wdata2->cfile) { - cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n", - rc); - if (!is_retryable_error(rc)) - rc = -EBADF; - } else { - wdata2->pid = wdata2->cfile->pid; - rc = server->ops->async_writev(wdata2, - cifs_writedata_release); - } - - for (j = 0; j < nr_pages; j++) { - unlock_page(wdata2->pages[j]); - if (rc != 0 && !is_retryable_error(rc)) { - SetPageError(wdata2->pages[j]); - end_page_writeback(wdata2->pages[j]); - put_page(wdata2->pages[j]); - } - } - - kref_put(&wdata2->refcount, cifs_writedata_release); - if (rc) { - if (is_retryable_error(rc)) - continue; - i += nr_pages; - break; - } - - rest_len -= cur_len; - i += nr_pages; - } while (i < wdata->nr_pages); - - /* cleanup remaining pages from the original wdata */ - for (; i < wdata->nr_pages; i++) { - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - put_page(wdata->pages[i]); - } - - if (rc != 0 && !is_retryable_error(rc)) - mapping_set_error(inode->i_mapping, rc); - kref_put(&wdata->refcount, cifs_writedata_release); -} - -void -cifs_writev_complete(struct work_struct *work) -{ - struct cifs_writedata *wdata = container_of(work, - struct cifs_writedata, work); - struct inode *inode = d_inode(wdata->cfile->dentry); - int i = 0; - - if (wdata->result == 0) { - spin_lock(&inode->i_lock); - cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes); - spin_unlock(&inode->i_lock); - cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink), - wdata->bytes); - } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN) - return cifs_writev_requeue(wdata); - - for (i = 0; i < wdata->nr_pages; i++) { - struct page *page = wdata->pages[i]; - if (wdata->result == -EAGAIN) - __set_page_dirty_nobuffers(page); - else if (wdata->result < 0) - SetPageError(page); - end_page_writeback(page); - cifs_readpage_to_fscache(inode, page); - put_page(page); - } - if (wdata->result != -EAGAIN) - mapping_set_error(inode->i_mapping, wdata->result); - kref_put(&wdata->refcount, cifs_writedata_release); -} - -struct cifs_writedata * -cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) -{ - struct page **pages = - kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (pages) - return cifs_writedata_direct_alloc(pages, complete); - - return NULL; -} - -struct cifs_writedata * -cifs_writedata_direct_alloc(struct page **pages, work_func_t complete) -{ - struct cifs_writedata *wdata; - - wdata = kzalloc(sizeof(*wdata), GFP_NOFS); - if (wdata != NULL) { - wdata->pages = pages; - kref_init(&wdata->refcount); - INIT_LIST_HEAD(&wdata->list); - init_completion(&wdata->done); - INIT_WORK(&wdata->work, complete); - } - return wdata; -} - /* * Check the mid_state and signature on received buffer (if any), and queue the * workqueue completion task. @@ -2132,7 +1684,7 @@ cifs_writev_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &wdata->work); - DeleteMidQEntry(mid); + release_mid(mid); add_credits(tcon->ses->server, &credits, 0); } @@ -3660,7 +3212,6 @@ setACLerrorExit: return rc; } -/* BB fix tabs in this function FIXME BB */ int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask) @@ -3677,7 +3228,7 @@ CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, GetExtAttrRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, - (void **) &pSMBr); + (void **) &pSMBr); if (rc) return rc; @@ -3723,7 +3274,7 @@ GetExtAttrRetry: __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); __u16 count = le16_to_cpu(pSMBr->t2.DataCount); struct file_chattr_info *pfinfo; - /* BB Do we need a cast or hash here ? */ + if (count != 16) { cifs_dbg(FYI, "Invalid size ret in GetExtAttr\n"); rc = -EIO; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 1849e3411487..7f205a9a2de4 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -119,10 +119,10 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) goto requeue_resolve; } - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr, strlen(ipaddr)); - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); kfree(ipaddr); /* rc == 1 means success here */ @@ -145,6 +145,25 @@ requeue_resolve: return rc; } +static void smb2_query_server_interfaces(struct work_struct *work) +{ + int rc; + struct cifs_tcon *tcon = container_of(work, + struct cifs_tcon, + query_interfaces.work); + + /* + * query server network interfaces, in case they change + */ + rc = SMB3_request_interfaces(0, tcon); + if (rc) { + cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", + __func__, rc); + } + + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); +} static void cifs_resolve_server(struct work_struct *work) { @@ -186,17 +205,22 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, /* If server is a channel, select the primary channel */ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&pserver->srv_lock); if (!all_channels) { pserver->tcpStatus = CifsNeedReconnect; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&pserver->srv_lock); return; } + spin_unlock(&pserver->srv_lock); + spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { spin_lock(&ses->chan_lock); - for (i = 0; i < ses->chan_count; i++) + for (i = 0; i < ses->chan_count; i++) { + spin_lock(&ses->chans[i].server->srv_lock); ses->chans[i].server->tcpStatus = CifsNeedReconnect; + spin_unlock(&ses->chans[i].server->srv_lock); + } spin_unlock(&ses->chan_lock); } spin_unlock(&cifs_tcp_ses_lock); @@ -217,7 +241,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { struct TCP_Server_Info *pserver; - struct cifs_ses *ses; + struct cifs_ses *ses, *nses; struct cifs_tcon *tcon; /* @@ -231,7 +255,11 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) { + /* check if iface is still active */ + if (!cifs_chan_is_iface_active(ses, server)) + cifs_chan_update_iface(ses, server); + spin_lock(&ses->chan_lock); if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) goto next_session; @@ -291,7 +319,7 @@ cifs_abort_connection(struct TCP_Server_Info *server) /* mark submitted MIDs for retry and issue callback */ INIT_LIST_HEAD(&retry_list); cifs_dbg(FYI, "%s: moving mids to private list\n", __func__); - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); list_for_each_entry_safe(mid, nmid, &server->pending_mid_q, qhead) { kref_get(&mid->refcount); if (mid->mid_state == MID_REQUEST_SUBMITTED) @@ -299,14 +327,14 @@ cifs_abort_connection(struct TCP_Server_Info *server) list_move(&mid->qhead, &retry_list); mid->mid_flags |= MID_DELETED; } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); cifs_server_unlock(server); cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__); list_for_each_entry_safe(mid, nmid, &retry_list, qhead) { list_del_init(&mid->qhead); mid->callback(mid); - cifs_mid_q_entry_release(mid); + release_mid(mid); } if (cifs_rdma_enabled(server)) { @@ -318,11 +346,11 @@ cifs_abort_connection(struct TCP_Server_Info *server) static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num_targets) { - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); server->nr_targets = num_targets; if (server->tcpStatus == CifsExiting) { /* the demux thread will exit normally next time through the loop */ - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); wake_up(&server->response_q); return false; } @@ -332,7 +360,7 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num server->hostname); server->tcpStatus = CifsNeedReconnect; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return true; } @@ -382,20 +410,20 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, } else { atomic_inc(&tcpSesReconnectCount); set_credits(server, 1); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); cifs_swn_reset_server_dstaddr(server); cifs_server_unlock(server); mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } } while (server->tcpStatus == CifsNeedReconnect); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsNeedNegotiate) mod_delayed_work(cifsiod_wq, &server->echo, 0); - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); wake_up(&server->response_q); return rc; @@ -509,10 +537,10 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) */ atomic_inc(&tcpSesReconnectCount); set_credits(server, 1); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); cifs_swn_reset_server_dstaddr(server); cifs_server_unlock(server); mod_delayed_work(cifsiod_wq, &server->reconnect, 0); @@ -524,11 +552,10 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) dfs_cache_free_tgts(&tl); /* Need to set up echo worker again once connection has been established */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsNeedNegotiate) mod_delayed_work(cifsiod_wq, &server->echo, 0); - - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); wake_up(&server->response_q); return rc; @@ -537,12 +564,12 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { /* If tcp session is not an dfs connection, then reconnect to last target server */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (!server->is_dfs_conn) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return __cifs_reconnect(server, mark_smb_session); } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); mutex_lock(&server->refpath_lock); if (!server->origin_fullpath || !server->leaf_fullpath) { @@ -638,18 +665,18 @@ server_unresponsive(struct TCP_Server_Info *server) * 65s kernel_recvmsg times out, and we see that we haven't gotten * a response in >60s. */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if ((server->tcpStatus == CifsGood || server->tcpStatus == CifsNeedNegotiate) && (!server->ops->can_echo || server->ops->can_echo(server)) && time_after(jiffies, server->lstrp + 3 * server->echo_interval)) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n", (3 * server->echo_interval) / HZ); cifs_reconnect(server, false); return true; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return false; } @@ -694,18 +721,18 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) else length = sock_recvmsg(server->ssocket, smb_msg, 0); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsExiting) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return -ESHUTDOWN; } if (server->tcpStatus == CifsNeedReconnect) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); cifs_reconnect(server, false); return -ECONNABORTED; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); if (length == -ERESTARTSYS || length == -EAGAIN || @@ -817,7 +844,7 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) #ifdef CONFIG_CIFS_STATS2 mid->when_received = jiffies; #endif - spin_lock(&GlobalMid_Lock); + spin_lock(&mid->server->mid_lock); if (!malformed) mid->mid_state = MID_RESPONSE_RECEIVED; else @@ -827,12 +854,12 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) * function has finished processing it is a bug. */ if (mid->mid_flags & MID_DELETED) { - spin_unlock(&GlobalMid_Lock); + spin_unlock(&mid->server->mid_lock); pr_warn_once("trying to dequeue a deleted mid\n"); } else { list_del_init(&mid->qhead); mid->mid_flags |= MID_DELETED; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&mid->server->mid_lock); } } @@ -871,21 +898,68 @@ handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, dequeue_mid(mid, malformed); } +int +cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) +{ + bool srv_sign_required = server->sec_mode & server->vals->signing_required; + bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled; + bool mnt_sign_enabled; + + /* + * Is signing required by mnt options? If not then check + * global_secflags to see if it is there. + */ + if (!mnt_sign_required) + mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) == + CIFSSEC_MUST_SIGN); + + /* + * If signing is required then it's automatically enabled too, + * otherwise, check to see if the secflags allow it. + */ + mnt_sign_enabled = mnt_sign_required ? mnt_sign_required : + (global_secflags & CIFSSEC_MAY_SIGN); + + /* If server requires signing, does client allow it? */ + if (srv_sign_required) { + if (!mnt_sign_enabled) { + cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!\n"); + return -EOPNOTSUPP; + } + server->sign = true; + } + + /* If client requires signing, does server allow it? */ + if (mnt_sign_required) { + if (!srv_sign_enabled) { + cifs_dbg(VFS, "Server does not support signing!\n"); + return -EOPNOTSUPP; + } + server->sign = true; + } + + if (cifs_rdma_enabled(server) && server->sign) + cifs_dbg(VFS, "Signing is enabled, and RDMA read/write will be disabled\n"); + + return 0; +} + + static void clean_demultiplex_info(struct TCP_Server_Info *server) { int length; /* take it off the list, if it's not already */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); list_del_init(&server->tcp_ses_list); - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); cancel_delayed_work_sync(&server->echo); cancel_delayed_work_sync(&server->resolve); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); server->tcpStatus = CifsExiting; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); wake_up_all(&server->response_q); /* check if we have blocked requests that need to free */ @@ -916,7 +990,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) struct list_head *tmp, *tmp2; INIT_LIST_HEAD(&dispose_list); - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); cifs_dbg(FYI, "Clearing mid %llu\n", mid_entry->mid); @@ -925,7 +999,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) list_move(&mid_entry->qhead, &dispose_list); mid_entry->mid_flags |= MID_DELETED; } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); /* now walk dispose list and issue callbacks */ list_for_each_safe(tmp, tmp2, &dispose_list) { @@ -933,7 +1007,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) cifs_dbg(FYI, "Callback mid %llu\n", mid_entry->mid); list_del_init(&mid_entry->qhead); mid_entry->callback(mid_entry); - cifs_mid_q_entry_release(mid_entry); + release_mid(mid_entry); } /* 1/8th of sec is more than enough time for them to exit */ msleep(125); @@ -1007,19 +1081,18 @@ int cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) { char *buf = server->large_buf ? server->bigbuf : server->smallbuf; - int length; + int rc; /* * We know that we received enough to get to the MID as we * checked the pdu_length earlier. Now check to see - * if the rest of the header is OK. We borrow the length - * var for the rest of the loop to avoid a new stack var. + * if the rest of the header is OK. * * 48 bytes is enough to display the header and a little bit * into the payload for debugging purposes. */ - length = server->ops->check_message(buf, server->total_read, server); - if (length != 0) + rc = server->ops->check_message(buf, server->total_read, server); + if (rc) cifs_dump_mem("Bad SMB: ", buf, min_t(unsigned int, server->total_read, 48)); @@ -1034,9 +1107,9 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) return -1; if (!mid) - return length; + return rc; - handle_mid(mid, server, buf, length); + handle_mid(mid, server, buf, rc); return 0; } @@ -1173,7 +1246,7 @@ next_pdu: if (length < 0) { for (i = 0; i < num_mids; i++) if (mids[i]) - cifs_mid_q_entry_release(mids[i]); + release_mid(mids[i]); continue; } @@ -1200,7 +1273,7 @@ next_pdu: if (!mids[i]->multiRsp || mids[i]->multiEnd) mids[i]->callback(mids[i]); - cifs_mid_q_entry_release(mids[i]); + release_mid(mids[i]); } else if (server->ops->is_oplock_break && server->ops->is_oplock_break(bufs[i], server)) { @@ -1208,7 +1281,7 @@ next_pdu: cifs_dbg(FYI, "Received oplock break\n"); } else { cifs_server_dbg(VFS, "No task to wake, unknown frame received! NumMids %d\n", - atomic_read(&midCount)); + atomic_read(&mid_count)); cifs_dump_mem("Received Data is: ", bufs[i], HEADER_SIZE(server)); smb2_add_credits_from_hdr(bufs[i], server); @@ -1379,6 +1452,7 @@ match_security(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) return true; } +/* this function must be called with srv_lock held */ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) { struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr; @@ -1439,6 +1513,7 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + spin_lock(&server->srv_lock); #ifdef CONFIG_CIFS_DFS_UPCALL /* * DFS failover implementation in cifs_reconnect() requires unique tcp sessions for @@ -1446,15 +1521,20 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx) * shares or even links that may connect to same server but having completely * different failover targets. */ - if (server->is_dfs_conn) + if (server->is_dfs_conn) { + spin_unlock(&server->srv_lock); continue; + } #endif /* * Skip ses channels since they're only handled in lower layers * (e.g. cifs_send_recv). */ - if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx)) + if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx)) { + spin_unlock(&server->srv_lock); continue; + } + spin_unlock(&server->srv_lock); ++server->srv_count; spin_unlock(&cifs_tcp_ses_lock); @@ -1502,9 +1582,9 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) else cancel_delayed_work_sync(&server->reconnect); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); server->tcpStatus = CifsExiting; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); cifs_crypto_secmech_release(server); @@ -1563,8 +1643,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, if (primary_server) { spin_lock(&cifs_tcp_ses_lock); ++primary_server->srv_count; - tcp_ses->primary_server = primary_server; spin_unlock(&cifs_tcp_ses_lock); + tcp_ses->primary_server = primary_server; } init_waitqueue_head(&tcp_ses->response_q); init_waitqueue_head(&tcp_ses->request_q); @@ -1580,6 +1660,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, tcp_ses->lstrp = jiffies; tcp_ses->compress_algorithm = cpu_to_le16(ctx->compression); spin_lock_init(&tcp_ses->req_lock); + spin_lock_init(&tcp_ses->srv_lock); + spin_lock_init(&tcp_ses->mid_lock); INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); INIT_LIST_HEAD(&tcp_ses->smb_ses_list); INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); @@ -1653,9 +1735,9 @@ smbd_connected: * to the struct since the kernel thread not created yet * no need to spinlock this update of tcpStatus */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&tcp_ses->srv_lock); tcp_ses->tcpStatus = CifsNeedNegotiate; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcp_ses->srv_lock); if ((ctx->max_credits < 20) || (ctx->max_credits > 60000)) tcp_ses->max_credits = SMB2_MAX_CREDITS_AVAILABLE; @@ -1697,6 +1779,7 @@ out_err: return ERR_PTR(rc); } +/* this function must be called with ses_lock held */ static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx) { if (ctx->sectype != Unspecified && @@ -1832,10 +1915,17 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - if (ses->ses_status == SES_EXITING) + spin_lock(&ses->ses_lock); + if (ses->ses_status == SES_EXITING) { + spin_unlock(&ses->ses_lock); continue; - if (!match_session(ses, ctx)) + } + if (!match_session(ses, ctx)) { + spin_unlock(&ses->ses_lock); continue; + } + spin_unlock(&ses->ses_lock); + ++ses->ses_count; spin_unlock(&cifs_tcp_ses_lock); return ses; @@ -1850,26 +1940,28 @@ void cifs_put_smb_ses(struct cifs_ses *ses) unsigned int chan_count; struct TCP_Server_Info *server = ses->server; - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->ses_lock); if (ses->ses_status == SES_EXITING) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); return; } + spin_unlock(&ses->ses_lock); cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); cifs_dbg(FYI, "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->treeName : "NONE"); + spin_lock(&cifs_tcp_ses_lock); if (--ses->ses_count > 0) { spin_unlock(&cifs_tcp_ses_lock); return; } + spin_unlock(&cifs_tcp_ses_lock); /* ses_count can never go negative */ WARN_ON(ses->ses_count < 0); if (ses->ses_status == SES_GOOD) ses->ses_status = SES_EXITING; - spin_unlock(&cifs_tcp_ses_lock); cifs_free_ipc(ses); @@ -1886,7 +1978,6 @@ void cifs_put_smb_ses(struct cifs_ses *ses) list_del_init(&ses->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); - spin_lock(&ses->chan_lock); chan_count = ses->chan_count; /* close any extra channels */ @@ -1894,13 +1985,14 @@ void cifs_put_smb_ses(struct cifs_ses *ses) int i; for (i = 1; i < chan_count; i++) { - spin_unlock(&ses->chan_lock); + if (ses->chans[i].iface) { + kref_put(&ses->chans[i].iface->refcount, release_iface); + ses->chans[i].iface = NULL; + } cifs_put_tcp_session(ses->chans[i].server, 0); - spin_lock(&ses->chan_lock); ses->chans[i].server = NULL; } } - spin_unlock(&ses->chan_lock); sesInfoFree(ses); cifs_put_tcp_session(server, 0); @@ -2204,6 +2296,7 @@ get_ses_fail: return ERR_PTR(rc); } +/* this function must be called with tc_lock held */ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { if (tcon->status == TID_EXITING) @@ -2226,16 +2319,17 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) static struct cifs_tcon * cifs_find_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) { - struct list_head *tmp; struct cifs_tcon *tcon; spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &ses->tcon_list) { - tcon = list_entry(tmp, struct cifs_tcon, tcon_list); - - if (!match_tcon(tcon, ctx)) + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + spin_lock(&tcon->tc_lock); + if (!match_tcon(tcon, ctx)) { + spin_unlock(&tcon->tc_lock); continue; + } ++tcon->tc_count; + spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); return tcon; } @@ -2270,6 +2364,9 @@ cifs_put_tcon(struct cifs_tcon *tcon) list_del_init(&tcon->tcon_list); spin_unlock(&cifs_tcp_ses_lock); + /* cancel polling of interfaces */ + cancel_delayed_work_sync(&tcon->query_interfaces); + if (tcon->use_witness) { int rc; @@ -2507,6 +2604,12 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) tcon->local_lease = ctx->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); + /* schedule query interfaces poll */ + INIT_DELAYED_WORK(&tcon->query_interfaces, + smb2_query_server_interfaces); + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); + spin_lock(&cifs_tcp_ses_lock); list_add(&tcon->tcon_list, &ses->tcon_list); spin_unlock(&cifs_tcp_ses_lock); @@ -2603,7 +2706,7 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) int cifs_match_super(struct super_block *sb, void *data) { - struct cifs_mnt_data *mnt_data = (struct cifs_mnt_data *)data; + struct cifs_mnt_data *mnt_data = data; struct smb3_fs_context *ctx; struct cifs_sb_info *cifs_sb; struct TCP_Server_Info *tcp_srv; @@ -2626,6 +2729,9 @@ cifs_match_super(struct super_block *sb, void *data) ctx = mnt_data->ctx; + spin_lock(&tcp_srv->srv_lock); + spin_lock(&ses->ses_lock); + spin_lock(&tcon->tc_lock); if (!match_server(tcp_srv, ctx) || !match_session(ses, ctx) || !match_tcon(tcon, ctx) || @@ -2636,6 +2742,10 @@ cifs_match_super(struct super_block *sb, void *data) rc = compare_mount_options(sb, mnt_data); out: + spin_unlock(&tcon->tc_lock); + spin_unlock(&ses->ses_lock); + spin_unlock(&tcp_srv->srv_lock); + spin_unlock(&cifs_tcp_ses_lock); cifs_put_tlink(tlink); return rc; @@ -2913,6 +3023,7 @@ ip_connect(struct TCP_Server_Info *server) return generic_ip_connect(server); } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) { @@ -3018,6 +3129,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, } } } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb) { @@ -3134,6 +3246,7 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx) if (tcon->posix_extensions) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* tell server which Unix caps we support */ if (cap_unix(tcon->ses)) { /* @@ -3141,16 +3254,17 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx) * for just this mount. */ reset_cifs_unix_caps(xid, tcon, cifs_sb, ctx); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&tcon->ses->server->srv_lock); if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) && (le64_to_cpu(tcon->fsUnixInfo.Capability) & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcon->ses->server->srv_lock); rc = -EACCES; goto out; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcon->ses->server->srv_lock); } else +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ tcon->unix_ext = 0; /* server does not support them */ /* do not care if a following call succeed - informational */ @@ -3232,9 +3346,9 @@ static int mount_get_dfs_conns(struct mount_ctx *mnt_ctx) rc = mount_get_conns(mnt_ctx); if (mnt_ctx->server) { cifs_dbg(FYI, "%s: marking tcp session as a dfs connection\n", __func__); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&mnt_ctx->server->srv_lock); mnt_ctx->server->is_dfs_conn = true; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&mnt_ctx->server->srv_lock); } return rc; } @@ -3949,28 +4063,28 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, return -ENOSYS; /* only send once per connect */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (!server->ops->need_neg(server) || server->tcpStatus != CifsNeedNegotiate) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return 0; } server->tcpStatus = CifsInNegotiate; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); rc = server->ops->negotiate(xid, ses, server); if (rc == 0) { - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsInNegotiate) server->tcpStatus = CifsGood; else rc = -EHOSTDOWN; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); } else { - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsInNegotiate) server->tcpStatus = CifsNeedNegotiate; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); } return rc; @@ -3982,14 +4096,20 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct nls_table *nls_info) { int rc = -ENOSYS; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; bool is_binding = false; + spin_lock(&ses->ses_lock); + if (server->dstaddr.ss_family == AF_INET6) + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); + else + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); - spin_lock(&cifs_tcp_ses_lock); if (ses->ses_status != SES_GOOD && ses->ses_status != SES_NEW && ses->ses_status != SES_NEED_RECON) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); return 0; } @@ -3998,7 +4118,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (CIFS_ALL_CHANS_GOOD(ses) || cifs_chan_in_reconnect(ses, server)) { spin_unlock(&ses->chan_lock); - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); return 0; } is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); @@ -4007,7 +4127,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (!is_binding) ses->ses_status = SES_IN_SETUP; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); if (!is_binding) { ses->capabilities = server->capabilities; @@ -4031,22 +4151,22 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (rc) { cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->ses_lock); if (ses->ses_status == SES_IN_SETUP) ses->ses_status = SES_NEED_RECON; spin_lock(&ses->chan_lock); cifs_chan_clear_in_reconnect(ses, server); spin_unlock(&ses->chan_lock); - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); } else { - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->ses_lock); if (ses->ses_status == SES_IN_SETUP) ses->ses_status = SES_GOOD; spin_lock(&ses->chan_lock); cifs_chan_clear_in_reconnect(ses, server); cifs_chan_clear_need_reconnect(ses, server); spin_unlock(&ses->chan_lock); - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); } return rc; @@ -4120,8 +4240,10 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) goto out; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(ses)) reset_cifs_unix_caps(0, tcon, NULL, ctx); +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ out: kfree(ctx->username); @@ -4510,15 +4632,15 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru struct dfs_info3_param ref = {0}; /* only send once per connect */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&tcon->tc_lock); if (tcon->ses->ses_status != SES_GOOD || (tcon->status != TID_NEW && tcon->status != TID_NEED_TCON)) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcon->tc_lock); return 0; } tcon->status = TID_IN_TCON; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcon->tc_lock); tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL); if (!tree) { @@ -4557,15 +4679,15 @@ out: cifs_put_tcp_super(sb); if (rc) { - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&tcon->tc_lock); if (tcon->status == TID_IN_TCON) tcon->status = TID_NEED_TCON; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcon->tc_lock); } else { - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&tcon->tc_lock); if (tcon->status == TID_IN_TCON) tcon->status = TID_GOOD; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcon->tc_lock); tcon->need_reconnect = false; } @@ -4578,28 +4700,28 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru const struct smb_version_operations *ops = tcon->ses->server->ops; /* only send once per connect */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&tcon->tc_lock); if (tcon->ses->ses_status != SES_GOOD || (tcon->status != TID_NEW && tcon->status != TID_NEED_TCON)) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcon->tc_lock); return 0; } tcon->status = TID_IN_TCON; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcon->tc_lock); rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); if (rc) { - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&tcon->tc_lock); if (tcon->status == TID_IN_TCON) tcon->status = TID_NEED_TCON; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcon->tc_lock); } else { - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&tcon->tc_lock); if (tcon->status == TID_IN_TCON) tcon->status = TID_GOOD; - spin_unlock(&cifs_tcp_ses_lock); tcon->need_reconnect = false; + spin_unlock(&tcon->tc_lock); } return rc; diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 34a8f3baed5e..a9b6c3eba6de 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1526,15 +1526,21 @@ static void refresh_mounts(struct cifs_ses **sessions) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { - if (!server->is_dfs_conn) + spin_lock(&server->srv_lock); + if (!server->is_dfs_conn) { + spin_unlock(&server->srv_lock); continue; + } + spin_unlock(&server->srv_lock); list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + spin_lock(&tcon->tc_lock); if (!tcon->ipc && !tcon->need_reconnect) { tcon->tc_count++; list_add_tail(&tcon->ulist, &tcons); } + spin_unlock(&tcon->tc_lock); } } } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index ce9b22aecfba..08f7392716e2 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -193,6 +193,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, return PTR_ERR(full_path); } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { @@ -261,6 +262,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, * rare for path not covered on files) */ } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ desired_access = 0; if (OPEN_FMODE(oflags) & FMODE_READ) @@ -316,6 +318,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, goto out; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* * If Open reported that we actually created a file then we now have to * set the mode if possible. @@ -357,6 +360,9 @@ cifs_create_get_file_info: rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, xid); else { +#else + { +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* TODO: Add support for calling POSIX query info here, but passing in fid */ rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, xid, fid); @@ -377,7 +383,9 @@ cifs_create_get_file_info: } } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY cifs_create_set_dentry: +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ if (rc != 0) { cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n", rc); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index e64cda7a7610..85f2abcb2795 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -26,6 +26,7 @@ #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" +#include "smb2proto.h" #include "cifs_unicode.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" @@ -34,6 +35,53 @@ #include "fs_context.h" #include "cifs_ioctl.h" +/* + * Mark as invalid, all open files on tree connections since they + * were closed when session to server was lost. + */ +void +cifs_mark_open_files_invalid(struct cifs_tcon *tcon) +{ + struct cifsFileInfo *open_file = NULL; + struct list_head *tmp; + struct list_head *tmp1; + + /* only send once per connect */ + spin_lock(&tcon->ses->ses_lock); + if ((tcon->ses->ses_status != SES_GOOD) || (tcon->status != TID_NEED_RECON)) { + spin_unlock(&tcon->ses->ses_lock); + return; + } + tcon->status = TID_IN_FILES_INVALIDATE; + spin_unlock(&tcon->ses->ses_lock); + + /* list all files open on tree connection and mark them invalid */ + spin_lock(&tcon->open_file_lock); + list_for_each_safe(tmp, tmp1, &tcon->openFileList) { + open_file = list_entry(tmp, struct cifsFileInfo, tlist); + open_file->invalidHandle = true; + open_file->oplock_break_cancelled = true; + } + spin_unlock(&tcon->open_file_lock); + + mutex_lock(&tcon->crfid.fid_mutex); + tcon->crfid.is_valid = false; + /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ + close_cached_dir_lease_locked(&tcon->crfid); + memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); + mutex_unlock(&tcon->crfid.fid_mutex); + + spin_lock(&tcon->tc_lock); + if (tcon->status == TID_IN_FILES_INVALIDATE) + tcon->status = TID_NEED_TCON; + spin_unlock(&tcon->tc_lock); + + /* + * BB Add call to invalidate_inodes(sb) for all superblocks mounted + * to this tcon. + */ +} + static inline int cifs_convert_flags(unsigned int flags) { if ((flags & O_ACCMODE) == O_RDONLY) @@ -52,6 +100,7 @@ static inline int cifs_convert_flags(unsigned int flags) FILE_READ_DATA); } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY static u32 cifs_posix_convert_flags(unsigned int flags) { u32 posix_flags = 0; @@ -85,6 +134,7 @@ static u32 cifs_posix_convert_flags(unsigned int flags) return posix_flags; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ static inline int cifs_get_disposition(unsigned int flags) { @@ -100,6 +150,7 @@ static inline int cifs_get_disposition(unsigned int flags) return FILE_OPEN; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY int cifs_posix_open(const char *full_path, struct inode **pinode, struct super_block *sb, int mode, unsigned int f_flags, __u32 *poplock, __u16 *pnetfid, unsigned int xid) @@ -161,6 +212,7 @@ posix_open_ret: kfree(presp_data); return rc; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, @@ -579,6 +631,7 @@ int cifs_open(struct inode *inode, struct file *file) else oplock = 0; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (!tcon->broken_posix_open && tcon->unix_ext && cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { @@ -603,6 +656,7 @@ int cifs_open(struct inode *inode, struct file *file) * or DFS errors. */ } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &fid); @@ -630,6 +684,7 @@ int cifs_open(struct inode *inode, struct file *file) goto out; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) { /* * Time to set mode which we can not set earlier due to @@ -647,6 +702,7 @@ int cifs_open(struct inode *inode, struct file *file) CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid.netfid, cfile->pid); } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ use_cache: fscache_use_cookie(cifs_inode_cookie(file_inode(file)), @@ -664,7 +720,9 @@ out: return rc; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY static int cifs_push_posix_locks(struct cifsFileInfo *cfile); +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* * Try to reacquire byte range locks that were released when session @@ -673,10 +731,12 @@ static int cifs_push_posix_locks(struct cifsFileInfo *cfile); static int cifs_relock_file(struct cifsFileInfo *cfile) { - struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ down_read_nested(&cinode->lock_sem, SINGLE_DEPTH_NESTING); if (cinode->can_cache_brlcks) { @@ -685,11 +745,13 @@ cifs_relock_file(struct cifsFileInfo *cfile) return rc; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) rc = cifs_push_posix_locks(cfile); else +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ rc = tcon->ses->server->ops->push_mand_locks(cfile); up_read(&cinode->lock_sem); @@ -750,6 +812,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) else oplock = 0; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (tcon->unix_ext && cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { @@ -773,6 +836,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) * in the reconnect path it is important to retry hard */ } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ desired_access = cifs_convert_flags(cfile->f_flags); @@ -817,7 +881,9 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) goto reopen_error_exit; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY reopen_success: +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ cfile->invalidHandle = false; mutex_unlock(&cfile->fh_mutex); cinode = CIFS_I(inode); @@ -928,9 +994,7 @@ int cifs_close(struct inode *inode, struct file *file) void cifs_reopen_persistent_handles(struct cifs_tcon *tcon) { - struct cifsFileInfo *open_file; - struct list_head *tmp; - struct list_head *tmp1; + struct cifsFileInfo *open_file, *tmp; struct list_head tmp_list; if (!tcon->use_persistent || !tcon->need_reopen_files) @@ -943,8 +1007,7 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon) /* list all files open on tree connection, reopen resilient handles */ spin_lock(&tcon->open_file_lock); - list_for_each(tmp, &tcon->openFileList) { - open_file = list_entry(tmp, struct cifsFileInfo, tlist); + list_for_each_entry(open_file, &tcon->openFileList, tlist) { if (!open_file->invalidHandle) continue; cifsFileInfo_get(open_file); @@ -952,8 +1015,7 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon) } spin_unlock(&tcon->open_file_lock); - list_for_each_safe(tmp, tmp1, &tmp_list) { - open_file = list_entry(tmp, struct cifsFileInfo, rlist); + list_for_each_entry_safe(open_file, tmp, &tmp_list, rlist) { if (cifs_reopen_file(open_file, false /* do not flush */)) tcon->need_reopen_files = true; list_del_init(&open_file->rlist); @@ -1196,6 +1258,7 @@ try_again: return rc; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* * Check if there is another lock that prevents us to set the lock (posix * style). If such a lock exists, update the flock structure with its @@ -1334,6 +1397,7 @@ hash_lockowner(fl_owner_t owner) { return cifs_lock_secret ^ hash32_ptr((const void *)owner); } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ struct lock_to_push { struct list_head llist; @@ -1344,6 +1408,7 @@ struct lock_to_push { __u8 type; }; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY static int cifs_push_posix_locks(struct cifsFileInfo *cfile) { @@ -1431,14 +1496,17 @@ err_out: } goto out; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ static int cifs_push_locks(struct cifsFileInfo *cfile) { - struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY + struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* we are going to update can_cache_brlcks here - need a write access */ cifs_down_write(&cinode->lock_sem); @@ -1447,11 +1515,13 @@ cifs_push_locks(struct cifsFileInfo *cfile) return rc; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) rc = cifs_push_posix_locks(cfile); else +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ rc = tcon->ses->server->ops->push_mand_locks(cfile); cinode->can_cache_brlcks = false; @@ -1515,6 +1585,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY __u16 netfid = cfile->fid.netfid; if (posix_lck) { @@ -1534,6 +1605,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, posix_lock_type, wait_flag); return rc; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ rc = cifs_lock_test(cfile, flock->fl_start, length, type, flock); if (!rc) @@ -1594,6 +1666,7 @@ cifs_free_llist(struct list_head *llist) } } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY int cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, unsigned int xid) @@ -1706,6 +1779,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, kfree(buf); return rc; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ static int cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, @@ -1719,6 +1793,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, struct TCP_Server_Info *server = tcon->ses->server; struct inode *inode = d_inode(cfile->dentry); +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (posix_lck) { int posix_lock_type; @@ -1740,7 +1815,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, NULL, posix_lock_type, wait_flag); goto out; } - +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ if (lock) { struct cifsLockInfo *lock; @@ -2204,6 +2279,185 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, return -ENOENT; } +void +cifs_writedata_release(struct kref *refcount) +{ + struct cifs_writedata *wdata = container_of(refcount, + struct cifs_writedata, refcount); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (wdata->mr) { + smbd_deregister_mr(wdata->mr); + wdata->mr = NULL; + } +#endif + + if (wdata->cfile) + cifsFileInfo_put(wdata->cfile); + + kvfree(wdata->pages); + kfree(wdata); +} + +/* + * Write failed with a retryable error. Resend the write request. It's also + * possible that the page was redirtied so re-clean the page. + */ +static void +cifs_writev_requeue(struct cifs_writedata *wdata) +{ + int i, rc = 0; + struct inode *inode = d_inode(wdata->cfile->dentry); + struct TCP_Server_Info *server; + unsigned int rest_len; + + server = tlink_tcon(wdata->cfile->tlink)->ses->server; + i = 0; + rest_len = wdata->bytes; + do { + struct cifs_writedata *wdata2; + unsigned int j, nr_pages, wsize, tailsz, cur_len; + + wsize = server->ops->wp_retry_size(inode); + if (wsize < rest_len) { + nr_pages = wsize / PAGE_SIZE; + if (!nr_pages) { + rc = -EOPNOTSUPP; + break; + } + cur_len = nr_pages * PAGE_SIZE; + tailsz = PAGE_SIZE; + } else { + nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE); + cur_len = rest_len; + tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE; + } + + wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete); + if (!wdata2) { + rc = -ENOMEM; + break; + } + + for (j = 0; j < nr_pages; j++) { + wdata2->pages[j] = wdata->pages[i + j]; + lock_page(wdata2->pages[j]); + clear_page_dirty_for_io(wdata2->pages[j]); + } + + wdata2->sync_mode = wdata->sync_mode; + wdata2->nr_pages = nr_pages; + wdata2->offset = page_offset(wdata2->pages[0]); + wdata2->pagesz = PAGE_SIZE; + wdata2->tailsz = tailsz; + wdata2->bytes = cur_len; + + rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, + &wdata2->cfile); + if (!wdata2->cfile) { + cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n", + rc); + if (!is_retryable_error(rc)) + rc = -EBADF; + } else { + wdata2->pid = wdata2->cfile->pid; + rc = server->ops->async_writev(wdata2, + cifs_writedata_release); + } + + for (j = 0; j < nr_pages; j++) { + unlock_page(wdata2->pages[j]); + if (rc != 0 && !is_retryable_error(rc)) { + SetPageError(wdata2->pages[j]); + end_page_writeback(wdata2->pages[j]); + put_page(wdata2->pages[j]); + } + } + + kref_put(&wdata2->refcount, cifs_writedata_release); + if (rc) { + if (is_retryable_error(rc)) + continue; + i += nr_pages; + break; + } + + rest_len -= cur_len; + i += nr_pages; + } while (i < wdata->nr_pages); + + /* cleanup remaining pages from the original wdata */ + for (; i < wdata->nr_pages; i++) { + SetPageError(wdata->pages[i]); + end_page_writeback(wdata->pages[i]); + put_page(wdata->pages[i]); + } + + if (rc != 0 && !is_retryable_error(rc)) + mapping_set_error(inode->i_mapping, rc); + kref_put(&wdata->refcount, cifs_writedata_release); +} + +void +cifs_writev_complete(struct work_struct *work) +{ + struct cifs_writedata *wdata = container_of(work, + struct cifs_writedata, work); + struct inode *inode = d_inode(wdata->cfile->dentry); + int i = 0; + + if (wdata->result == 0) { + spin_lock(&inode->i_lock); + cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes); + spin_unlock(&inode->i_lock); + cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink), + wdata->bytes); + } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN) + return cifs_writev_requeue(wdata); + + for (i = 0; i < wdata->nr_pages; i++) { + struct page *page = wdata->pages[i]; + + if (wdata->result == -EAGAIN) + __set_page_dirty_nobuffers(page); + else if (wdata->result < 0) + SetPageError(page); + end_page_writeback(page); + cifs_readpage_to_fscache(inode, page); + put_page(page); + } + if (wdata->result != -EAGAIN) + mapping_set_error(inode->i_mapping, wdata->result); + kref_put(&wdata->refcount, cifs_writedata_release); +} + +struct cifs_writedata * +cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) +{ + struct page **pages = + kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (pages) + return cifs_writedata_direct_alloc(pages, complete); + + return NULL; +} + +struct cifs_writedata * +cifs_writedata_direct_alloc(struct page **pages, work_func_t complete) +{ + struct cifs_writedata *wdata; + + wdata = kzalloc(sizeof(*wdata), GFP_NOFS); + if (wdata != NULL) { + wdata->pages = pages; + kref_init(&wdata->refcount); + INIT_LIST_HEAD(&wdata->list); + init_completion(&wdata->done); + INIT_WORK(&wdata->work, complete); + } + return wdata; +} + + static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) { struct address_space *mapping = page->mapping; @@ -4459,10 +4713,11 @@ static void cifs_readahead(struct readahead_control *ractl) * TODO: Send a whole batch of pages to be read * by the cache. */ - page = readahead_page(ractl); - last_batch_size = 1 << thp_order(page); + struct folio *folio = readahead_folio(ractl); + + last_batch_size = folio_nr_pages(folio); if (cifs_readpage_from_fscache(ractl->mapping->host, - page) < 0) { + &folio->page) < 0) { /* * TODO: Deal with cache read failure * here, but for the moment, delegate @@ -4470,7 +4725,7 @@ static void cifs_readahead(struct readahead_control *ractl) */ caching = false; } - unlock_page(page); + folio_unlock(folio); next_cached++; cache_nr_pages--; if (cache_nr_pages == 0) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 81da81e18553..eeeaba3dec05 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -339,6 +339,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) fattr->cf_flags = CIFS_FATTR_DFS_REFERRAL; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY static int cifs_get_file_info_unix(struct file *filp) { @@ -432,6 +433,14 @@ int cifs_get_inode_info_unix(struct inode **pinode, cgiiu_exit: return rc; } +#else +int cifs_get_inode_info_unix(struct inode **pinode, + const unsigned char *full_path, + struct super_block *sb, unsigned int xid) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ static int cifs_sfu_type(struct cifs_fattr *fattr, const char *path, @@ -795,6 +804,7 @@ static __u64 simple_hashstr(const char *str) return hash; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /** * cifs_backup_query_path_info - SMB1 fallback code to get ino * @@ -847,6 +857,7 @@ cifs_backup_query_path_info(int xid, *data = (FILE_ALL_INFO *)info.srch_entries_start; return 0; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ static void cifs_set_fattr_ino(int xid, @@ -991,6 +1002,7 @@ cifs_get_inode_info(struct inode **inode, rc = 0; break; case -EACCES: +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* * perm errors, try again with backup flags if possible * @@ -1022,6 +1034,9 @@ cifs_get_inode_info(struct inode **inode, /* nothing we can do, bail out */ goto out; } +#else + goto out; +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ break; default: cifs_dbg(FYI, "%s: unhandled err rc %d\n", __func__, rc); @@ -1037,8 +1052,9 @@ cifs_get_inode_info(struct inode **inode, /* * 4. Tweak fattr based on mount options */ - +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY handle_mnt_opt: +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* query for SFU type info if supported and needed */ if (fattr.cf_cifsattrs & ATTR_SYSTEM && cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { @@ -1223,7 +1239,7 @@ static const struct inode_operations cifs_ipc_inode_ops = { static int cifs_find_inode(struct inode *inode, void *opaque) { - struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + struct cifs_fattr *fattr = opaque; /* don't match inode with different uniqueid */ if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) @@ -1247,7 +1263,7 @@ cifs_find_inode(struct inode *inode, void *opaque) static int cifs_init_inode(struct inode *inode, void *opaque) { - struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + struct cifs_fattr *fattr = opaque; CIFS_I(inode)->uniqueid = fattr->cf_uniqueid; CIFS_I(inode)->createtime = fattr->cf_createtime; @@ -1435,6 +1451,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, return server->ops->set_file_info(inode, full_path, &info_buf, xid); } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* * Open the given file (if it isn't already), set the DELETE_ON_CLOSE bit * and rename it to a random name that hopefully won't conflict with @@ -1565,6 +1582,7 @@ undo_setattr: goto out_close; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* copied from fs/nfs/dir.c with small changes */ static void @@ -1627,6 +1645,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) } cifs_close_deferred_file_under_dentry(tcon, full_path); +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = CIFSPOSIXDelFile(xid, tcon, full_path, @@ -1636,6 +1655,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) if ((rc == 0) || (rc == -ENOENT)) goto psx_del_no_retry; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ retry_std_delete: if (!server->ops->unlink) { @@ -1714,9 +1734,11 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, if (tcon->posix_extensions) rc = smb311_posix_get_inode_info(&inode, full_path, parent->i_sb, xid); +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY else if (tcon->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, parent->i_sb, xid); +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ else rc = cifs_get_inode_info(&inode, full_path, NULL, parent->i_sb, xid, NULL); @@ -1746,6 +1768,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, if (parent->i_mode & S_ISGID) mode |= S_ISGID; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (tcon->unix_ext) { struct cifs_unix_set_info_args args = { .mode = mode, @@ -1768,6 +1791,9 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, cifs_sb->local_nls, cifs_remap(cifs_sb)); } else { +#else + { +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ struct TCP_Server_Info *server = tcon->ses->server; if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && (mode & S_IWUGO) == 0 && server->ops->mkdir_setinfo) @@ -1788,6 +1814,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, return 0; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY static int cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode, const char *full_path, struct cifs_sb_info *cifs_sb, @@ -1850,6 +1877,7 @@ posix_mkdir_get_info: xid); goto posix_mkdir_out; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, struct dentry *direntry, umode_t mode) @@ -1892,6 +1920,7 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, goto mkdir_out; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = cifs_posix_mkdir(inode, direntry, mode, full_path, cifs_sb, @@ -1899,6 +1928,7 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, if (rc != -EOPNOTSUPP) goto mkdir_out; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ if (!server->ops->mkdir) { rc = -ENOSYS; @@ -2015,9 +2045,12 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY struct cifs_fid fid; struct cifs_open_parms oparms; - int oplock, rc; + int oplock; +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ + int rc; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -2043,6 +2076,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (server->vals->protocol_id != 0) goto do_rename_exit; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* open-file renames don't work across directories */ if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; @@ -2064,6 +2098,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, cifs_sb->local_nls, cifs_remap(cifs_sb)); CIFSSMBClose(xid, tcon, fid.netfid); } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ do_rename_exit: if (rc == 0) d_move(from_dentry, to_dentry); @@ -2081,11 +2116,13 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; - FILE_UNIX_BASIC_INFO *info_buf_source = NULL; - FILE_UNIX_BASIC_INFO *info_buf_target; unsigned int xid; int rc, tmprc; int retry_count = 0; + FILE_UNIX_BASIC_INFO *info_buf_source = NULL; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY + FILE_UNIX_BASIC_INFO *info_buf_target; +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ if (flags & ~RENAME_NOREPLACE) return -EINVAL; @@ -2139,6 +2176,7 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, if (flags & RENAME_NOREPLACE) goto cifs_rename_exit; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (rc == -EEXIST && tcon->unix_ext) { /* * Are src and dst hardlinks of same inode? We can only tell @@ -2178,6 +2216,8 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, */ unlink_target: +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ + /* Try unlinking the target dentry if it's not negative */ if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) { if (d_is_dir(target_dentry)) @@ -2337,14 +2377,18 @@ int cifs_revalidate_file_attr(struct file *filp) { int rc = 0; struct dentry *dentry = file_dentry(filp); +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ if (!cifs_dentry_needs_reval(dentry)) return rc; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (tlink_tcon(cfile->tlink)->unix_ext) rc = cifs_get_file_info_unix(filp); else +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ rc = cifs_get_file_info(filp); return rc; @@ -2653,6 +2697,7 @@ set_size_out: return rc; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY static int cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) { @@ -2800,6 +2845,7 @@ out: free_xid(xid); return rc; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ static int cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) @@ -2995,16 +3041,20 @@ cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry, struct iattr *attrs) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); - struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); int rc, retries = 0; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY + struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ if (unlikely(cifs_forced_shutdown(cifs_sb))) return -EIO; do { +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (pTcon->unix_ext) rc = cifs_setattr_unix(direntry, attrs); else +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ rc = cifs_setattr_nounix(direntry, attrs); retries++; } while (is_retryable_error(rc) && retries < 2); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 0359b604bdbc..b6e6e5d6c8dd 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -333,6 +333,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) tcon = tlink_tcon(pSMBFile->tlink); caps = le64_to_cpu(tcon->fsUnixInfo.Capability); #ifdef CONFIG_CIFS_POSIX +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (CIFS_UNIX_EXTATTR_CAP & caps) { __u64 ExtAttrMask = 0; rc = CIFSGetExtAttr(xid, tcon, @@ -345,6 +346,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) if (rc != EOPNOTSUPP) break; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ #endif /* CONFIG_CIFS_POSIX */ rc = 0; if (CIFS_I(inode)->cifsAttrs & ATTR_COMPRESSED) { diff --git a/fs/cifs/link.c b/fs/cifs/link.c index bbdf3281559c..6803cb27eecc 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -286,6 +286,7 @@ out: return rc; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* * SMB 1.0 Protocol specific functions */ @@ -368,6 +369,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, CIFSSMBClose(xid, tcon, fid.netfid); return rc; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* * SMB 2.1/SMB3 Protocol specific functions @@ -532,11 +534,15 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, goto cifs_hl_exit; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (tcon->unix_ext) rc = CIFSUnixCreateHardLink(xid, tcon, from_name, to_name, cifs_sb->local_nls, cifs_remap(cifs_sb)); else { +#else + { +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ server = tcon->ses->server; if (!server->ops->create_hardlink) { rc = -ENOSYS; @@ -704,10 +710,12 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, /* BB what if DFS and this volume is on different share? BB */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY else if (pTcon->unix_ext) rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, cifs_sb->local_nls, cifs_remap(cifs_sb)); +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* else rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName, cifs_sb_target->local_nls); */ diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index c69e1240d730..7a906067db04 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -69,12 +69,14 @@ sesInfoAlloc(void) ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL); if (ret_buf) { atomic_inc(&sesInfoAllocCount); + spin_lock_init(&ret_buf->ses_lock); ret_buf->ses_status = SES_NEW; ++ret_buf->ses_count; INIT_LIST_HEAD(&ret_buf->smb_ses_list); INIT_LIST_HEAD(&ret_buf->tcon_list); mutex_init(&ret_buf->session_mutex); spin_lock_init(&ret_buf->iface_lock); + INIT_LIST_HEAD(&ret_buf->iface_list); spin_lock_init(&ret_buf->chan_lock); } return ret_buf; @@ -83,6 +85,8 @@ sesInfoAlloc(void) void sesInfoFree(struct cifs_ses *buf_to_free) { + struct cifs_server_iface *iface = NULL, *niface = NULL; + if (buf_to_free == NULL) { cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n"); return; @@ -96,7 +100,11 @@ sesInfoFree(struct cifs_ses *buf_to_free) kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); kfree_sensitive(buf_to_free->auth_key.response); - kfree(buf_to_free->iface_list); + spin_lock(&buf_to_free->iface_lock); + list_for_each_entry_safe(iface, niface, &buf_to_free->iface_list, + iface_head) + kref_put(&iface->refcount, release_iface); + spin_unlock(&buf_to_free->iface_lock); kfree_sensitive(buf_to_free); } @@ -119,6 +127,7 @@ tconInfoAlloc(void) atomic_inc(&tconInfoAllocCount); ret_buf->status = TID_NEW; ++ret_buf->tc_count; + spin_lock_init(&ret_buf->tc_lock); INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); spin_lock_init(&ret_buf->open_file_lock); @@ -165,9 +174,9 @@ cifs_buf_get(void) /* clear the first few header bytes */ /* for most paths, more is cleared in header_assemble */ memset(ret_buf, 0, buf_size + 3); - atomic_inc(&bufAllocCount); + atomic_inc(&buf_alloc_count); #ifdef CONFIG_CIFS_STATS2 - atomic_inc(&totBufAllocCount); + atomic_inc(&total_buf_alloc_count); #endif /* CONFIG_CIFS_STATS2 */ return ret_buf; @@ -182,7 +191,7 @@ cifs_buf_release(void *buf_to_free) } mempool_free(buf_to_free, cifs_req_poolp); - atomic_dec(&bufAllocCount); + atomic_dec(&buf_alloc_count); return; } @@ -198,9 +207,9 @@ cifs_small_buf_get(void) ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS); /* No need to clear memory here, cleared in header assemble */ /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ - atomic_inc(&smBufAllocCount); + atomic_inc(&small_buf_alloc_count); #ifdef CONFIG_CIFS_STATS2 - atomic_inc(&totSmBufAllocCount); + atomic_inc(&total_small_buf_alloc_count); #endif /* CONFIG_CIFS_STATS2 */ return ret_buf; @@ -216,7 +225,7 @@ cifs_small_buf_release(void *buf_to_free) } mempool_free(buf_to_free, cifs_sm_req_poolp); - atomic_dec(&smBufAllocCount); + atomic_dec(&small_buf_alloc_count); return; } @@ -393,7 +402,6 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) { struct smb_hdr *buf = (struct smb_hdr *)buffer; struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf; - struct list_head *tmp, *tmp1, *tmp2; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifsInodeInfo *pCifsInode; @@ -460,18 +468,14 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &srv->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); - list_for_each(tmp1, &ses->tcon_list) { - tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + list_for_each_entry(ses, &srv->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->tid != buf->Tid) continue; cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); spin_lock(&tcon->open_file_lock); - list_for_each(tmp2, &tcon->openFileList) { - netfile = list_entry(tmp2, struct cifsFileInfo, - tlist); + list_for_each_entry(netfile, &tcon->openFileList, tlist) { if (pSMB->Fid != netfile->fid.netfid) continue; @@ -756,14 +760,12 @@ void cifs_close_all_deferred_files(struct cifs_tcon *tcon) { struct cifsFileInfo *cfile; - struct list_head *tmp; struct file_list *tmp_list, *tmp_next_list; struct list_head file_head; INIT_LIST_HEAD(&file_head); spin_lock(&tcon->open_file_lock); - list_for_each(tmp, &tcon->openFileList) { - cfile = list_entry(tmp, struct cifsFileInfo, tlist); + list_for_each_entry(cfile, &tcon->openFileList, tlist) { if (delayed_work_pending(&cfile->deferred)) { if (cancel_delayed_work(&cfile->deferred)) { tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); @@ -786,7 +788,6 @@ void cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) { struct cifsFileInfo *cfile; - struct list_head *tmp; struct file_list *tmp_list, *tmp_next_list; struct list_head file_head; void *page; @@ -795,8 +796,7 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) INIT_LIST_HEAD(&file_head); page = alloc_dentry_path(); spin_lock(&tcon->open_file_lock); - list_for_each(tmp, &tcon->openFileList) { - cfile = list_entry(tmp, struct cifsFileInfo, tlist); + list_for_each_entry(cfile, &tcon->openFileList, tlist) { full_path = build_path_from_dentry(cfile->dentry, page); if (strstr(full_path, path)) { if (delayed_work_pending(&cfile->deferred)) { diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 235aa1b395eb..28caae7aed1b 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -911,7 +911,7 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr) unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server) { - struct smb_hdr *ptr = (struct smb_hdr *)buf; + struct smb_hdr *ptr = buf; return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 2 /* size of the bcc field */ + get_bcc(ptr)); } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index d417de354d9d..3af3b05b6c74 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -58,7 +58,7 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) spin_lock(&ses->chan_lock); for (i = 0; i < ses->chan_count; i++) { - if (is_server_using_iface(ses->chans[i].server, iface)) { + if (ses->chans[i].iface == iface) { spin_unlock(&ses->chan_lock); return true; } @@ -146,16 +146,24 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses, return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index); } +bool +cifs_chan_is_iface_active(struct cifs_ses *ses, + struct TCP_Server_Info *server) +{ + unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + + return ses->chans[chan_index].iface && + ses->chans[chan_index].iface->is_active; +} + /* returns number of channels added */ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) { int old_chan_count, new_chan_count; int left; - int i = 0; int rc = 0; int tries = 0; - struct cifs_server_iface *ifaces = NULL; - size_t iface_count; + struct cifs_server_iface *iface = NULL, *niface = NULL; spin_lock(&ses->chan_lock); @@ -185,32 +193,16 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) spin_unlock(&ses->chan_lock); /* - * Make a copy of the iface list at the time and use that - * instead so as to not hold the iface spinlock for opening - * channels - */ - spin_lock(&ses->iface_lock); - iface_count = ses->iface_count; - if (iface_count <= 0) { - spin_unlock(&ses->iface_lock); - cifs_dbg(VFS, "no iface list available to open channels\n"); - return 0; - } - ifaces = kmemdup(ses->iface_list, iface_count*sizeof(*ifaces), - GFP_ATOMIC); - if (!ifaces) { - spin_unlock(&ses->iface_lock); - return 0; - } - spin_unlock(&ses->iface_lock); - - /* * Keep connecting to same, fastest, iface for all channels as * long as its RSS. Try next fastest one if not RSS or channel * creation fails. */ + spin_lock(&ses->iface_lock); + iface = list_first_entry(&ses->iface_list, struct cifs_server_iface, + iface_head); + spin_unlock(&ses->iface_lock); + while (left > 0) { - struct cifs_server_iface *iface; tries++; if (tries > 3*ses->chan_max) { @@ -219,31 +211,128 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) break; } - iface = &ifaces[i]; - if (is_ses_using_iface(ses, iface) && !iface->rss_capable) { - i = (i+1) % iface_count; - continue; + spin_lock(&ses->iface_lock); + if (!ses->iface_count) { + spin_unlock(&ses->iface_lock); + break; } - rc = cifs_ses_add_channel(cifs_sb, ses, iface); - if (rc) { - cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n", - i, rc); - i = (i+1) % iface_count; - continue; + list_for_each_entry_safe_from(iface, niface, &ses->iface_list, + iface_head) { + /* skip ifaces that are unusable */ + if (!iface->is_active || + (is_ses_using_iface(ses, iface) && + !iface->rss_capable)) { + continue; + } + + /* take ref before unlock */ + kref_get(&iface->refcount); + + spin_unlock(&ses->iface_lock); + rc = cifs_ses_add_channel(cifs_sb, ses, iface); + spin_lock(&ses->iface_lock); + + if (rc) { + cifs_dbg(VFS, "failed to open extra channel on iface:%pIS rc=%d\n", + &iface->sockaddr, + rc); + kref_put(&iface->refcount, release_iface); + continue; + } + + cifs_dbg(FYI, "successfully opened new channel on iface:%pIS\n", + &iface->sockaddr); + break; } + spin_unlock(&ses->iface_lock); - cifs_dbg(FYI, "successfully opened new channel on iface#%d\n", - i); left--; new_chan_count++; } - kfree(ifaces); return new_chan_count - old_chan_count; } /* + * update the iface for the channel if necessary. + * will return 0 when iface is updated, 1 if removed, 2 otherwise + * Must be called with chan_lock held. + */ +int +cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) +{ + unsigned int chan_index; + struct cifs_server_iface *iface = NULL; + struct cifs_server_iface *old_iface = NULL; + int rc = 0; + + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + if (!chan_index) { + spin_unlock(&ses->chan_lock); + return 0; + } + + if (ses->chans[chan_index].iface) { + old_iface = ses->chans[chan_index].iface; + if (old_iface->is_active) { + spin_unlock(&ses->chan_lock); + return 1; + } + } + spin_unlock(&ses->chan_lock); + + spin_lock(&ses->iface_lock); + /* then look for a new one */ + list_for_each_entry(iface, &ses->iface_list, iface_head) { + if (!iface->is_active || + (is_ses_using_iface(ses, iface) && + !iface->rss_capable)) { + continue; + } + kref_get(&iface->refcount); + } + + if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) { + rc = 1; + iface = NULL; + cifs_dbg(FYI, "unable to find a suitable iface\n"); + } + + /* now drop the ref to the current iface */ + if (old_iface && iface) { + kref_put(&old_iface->refcount, release_iface); + cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n", + &old_iface->sockaddr, + &iface->sockaddr); + } else if (old_iface) { + kref_put(&old_iface->refcount, release_iface); + cifs_dbg(FYI, "releasing ref to iface: %pIS\n", + &old_iface->sockaddr); + } else { + WARN_ON(!iface); + cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr); + } + spin_unlock(&ses->iface_lock); + + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + ses->chans[chan_index].iface = iface; + + /* No iface is found. if secondary chan, drop connection */ + if (!iface && CIFS_SERVER_IS_CHAN(server)) + ses->chans[chan_index].server = NULL; + + spin_unlock(&ses->chan_lock); + + if (!iface && CIFS_SERVER_IS_CHAN(server)) + cifs_put_tcp_session(server, false); + + return rc; +} + +/* * If server is a channel of ses, return the corresponding enclosing * cifs_chan otherwise return NULL. */ @@ -355,6 +444,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, spin_unlock(&ses->chan_lock); goto out; } + chan->iface = iface; ses->chan_count++; atomic_set(&ses->chan_seq, 0); @@ -384,6 +474,14 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, out: if (rc && chan->server) { + /* + * we should avoid race with these delayed works before we + * remove this channel + */ + cancel_delayed_work_sync(&chan->server->echo); + cancel_delayed_work_sync(&chan->server->resolve); + cancel_delayed_work_sync(&chan->server->reconnect); + spin_lock(&ses->chan_lock); /* we rely on all bits beyond chan_count to be clear */ cifs_chan_clear_need_reconnect(ses, chan->server); @@ -394,14 +492,14 @@ out: */ WARN_ON(ses->chan_count < 1); spin_unlock(&ses->chan_lock); - } - if (rc && chan->server) cifs_put_tcp_session(chan->server, 0); + } return rc; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, struct TCP_Server_Info *server, SESSION_SETUP_ANDX *pSMB) @@ -494,7 +592,6 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, *pbcc_area = bcc_ptr; } - static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, const struct nls_table *nls_cp) { @@ -656,6 +753,7 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft, for it later, but it is not very important */ cifs_dbg(FYI, "ascii: bytes left %d\n", bleft); } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, struct cifs_ses *ses) @@ -1073,6 +1171,7 @@ struct sess_data { struct kvec iov[3]; }; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY static int sess_alloc_buffer(struct sess_data *sess_data, int wct) { @@ -1749,3 +1848,4 @@ out: kfree(sess_data); return rc; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 2e20ee4dab7b..f36b2d2d40ca 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -92,17 +92,17 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer) struct smb_hdr *buf = (struct smb_hdr *)buffer; struct mid_q_entry *mid; - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); list_for_each_entry(mid, &server->pending_mid_q, qhead) { if (compare_mid(mid->mid, buf) && mid->mid_state == MID_REQUEST_SUBMITTED && le16_to_cpu(mid->command) == buf->Command) { kref_get(&mid->refcount); - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); return mid; } } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); return NULL; } @@ -166,7 +166,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server) __u16 last_mid, cur_mid; bool collision, reconnect = false; - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); /* mid is 16 bit only for CIFS/SMB */ cur_mid = (__u16)((server->CurrentMid) & 0xffff); @@ -225,7 +225,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server) } cur_mid++; } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); if (reconnect) { cifs_signal_cifsd_for_reconnect(server, false); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 17813c3d0c6e..818cc4dee0e2 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -132,15 +132,15 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, } int -smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) +smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) { struct smb2_hdr *shdr = (struct smb2_hdr *)buf; struct smb2_pdu *pdu = (struct smb2_pdu *)shdr; - __u64 mid; - __u32 clc_len; /* calculated length */ - int command; - int pdu_size = sizeof(struct smb2_pdu); int hdr_size = sizeof(struct smb2_hdr); + int pdu_size = sizeof(struct smb2_pdu); + int command; + __u32 calc_len; /* calculated length */ + __u64 mid; /* * Add function to do table lookup of StructureSize by command @@ -154,7 +154,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) /* decrypt frame now that it is completely read in */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(iter, &srvr->smb_ses_list, smb_ses_list) { + list_for_each_entry(iter, &server->smb_ses_list, smb_ses_list) { if (iter->Suid == le64_to_cpu(thdr->SessionId)) { ses = iter; break; @@ -221,30 +221,33 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) } } - clc_len = smb2_calc_size(buf, srvr); + calc_len = smb2_calc_size(buf, server); + + /* For SMB2_IOCTL, OutputOffset and OutputLength are optional, so might + * be 0, and not a real miscalculation */ + if (command == SMB2_IOCTL_HE && calc_len == 0) + return 0; - if (shdr->Command == SMB2_NEGOTIATE) - clc_len += get_neg_ctxt_len(shdr, len, clc_len); + if (command == SMB2_NEGOTIATE_HE) + calc_len += get_neg_ctxt_len(shdr, len, calc_len); - if (len != clc_len) { - cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n", - clc_len, len, mid); + if (len != calc_len) { /* create failed on symlink */ if (command == SMB2_CREATE_HE && shdr->Status == STATUS_STOPPED_ON_SYMLINK) return 0; /* Windows 7 server returns 24 bytes more */ - if (clc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE) + if (calc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE) return 0; /* server can return one byte more due to implied bcc[0] */ - if (clc_len == len + 1) + if (calc_len == len + 1) return 0; /* * Some windows servers (win2016) will pad also the final * PDU in a compound to 8 bytes. */ - if (((clc_len + 7) & ~7) == len) + if (((calc_len + 7) & ~7) == len) return 0; /* @@ -253,12 +256,18 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) * SMB2/SMB3 frame length (header + smb2 response specific data) * Some windows servers also pad up to 8 bytes when compounding. */ - if (clc_len < len) + if (calc_len < len) return 0; - pr_warn_once( - "srv rsp too short, len %d not %d. cmd:%d mid:%llu\n", - len, clc_len, command, mid); + /* Only log a message if len was really miscalculated */ + if (unlikely(cifsFYI)) + cifs_dbg(FYI, "Server response too short: calculated " + "length %u doesn't match read length %u (cmd=%d, mid=%llu)\n", + calc_len, len, command, mid); + else + pr_warn("Server response too short: calculated length " + "%u doesn't match read length %u (cmd=%d, mid=%llu)\n", + calc_len, len, command, mid); return 1; } @@ -402,7 +411,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr) unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *srvr) { - struct smb2_pdu *pdu = (struct smb2_pdu *)buf; + struct smb2_pdu *pdu = buf; struct smb2_hdr *shdr = &pdu->hdr; int offset; /* the offset from the beginning of SMB to data area */ int data_length; /* the length of the variable length data area */ diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 8543cafdfd34..c0039dc0715a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -126,13 +126,13 @@ smb2_add_credits(struct TCP_Server_Info *server, optype, scredits, add); } - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsNeedReconnect || server->tcpStatus == CifsExiting) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); switch (rc) { case -1: @@ -218,12 +218,12 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, spin_lock(&server->req_lock); } else { spin_unlock(&server->req_lock); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsExiting) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return -ENOENT; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); spin_lock(&server->req_lock); scredits = server->credits; @@ -319,19 +319,19 @@ smb2_get_next_mid(struct TCP_Server_Info *server) { __u64 mid; /* for SMB2 we need the current value */ - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); mid = server->CurrentMid++; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); return mid; } static void smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val) { - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); if (server->CurrentMid >= val) server->CurrentMid -= val; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); } static struct mid_q_entry * @@ -346,7 +346,7 @@ __smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue) return NULL; } - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); list_for_each_entry(mid, &server->pending_mid_q, qhead) { if ((mid->mid == wire_mid) && (mid->mid_state == MID_REQUEST_SUBMITTED) && @@ -356,11 +356,11 @@ __smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue) list_del_init(&mid->qhead); mid->mid_flags |= MID_DELETED; } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); return mid; } } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); return NULL; } @@ -403,9 +403,9 @@ smb2_negotiate(const unsigned int xid, { int rc; - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); server->CurrentMid = 0; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); rc = SMB2_negotiate(xid, ses, server); /* BB we probably don't need to retry with modern servers */ if (rc == -EAGAIN) @@ -512,73 +512,41 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) static int parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, size_t buf_len, - struct cifs_server_iface **iface_list, - size_t *iface_count) + struct cifs_ses *ses) { struct network_interface_info_ioctl_rsp *p; struct sockaddr_in *addr4; struct sockaddr_in6 *addr6; struct iface_info_ipv4 *p4; struct iface_info_ipv6 *p6; - struct cifs_server_iface *info; + struct cifs_server_iface *info = NULL, *iface = NULL, *niface = NULL; + struct cifs_server_iface tmp_iface; ssize_t bytes_left; size_t next = 0; int nb_iface = 0; - int rc = 0; - - *iface_list = NULL; - *iface_count = 0; - - /* - * Fist pass: count and sanity check - */ + int rc = 0, ret = 0; bytes_left = buf_len; p = buf; - while (bytes_left >= sizeof(*p)) { - nb_iface++; - next = le32_to_cpu(p->Next); - if (!next) { - bytes_left -= sizeof(*p); - break; - } - p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); - bytes_left -= next; - } - - if (!nb_iface) { - cifs_dbg(VFS, "%s: malformed interface info\n", __func__); - rc = -EINVAL; - goto out; - } - - /* Azure rounds the buffer size up 8, to a 16 byte boundary */ - if ((bytes_left > 8) || p->Next) - cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); - + spin_lock(&ses->iface_lock); /* - * Second pass: extract info to internal structure + * Go through iface_list and do kref_put to remove + * any unused ifaces. ifaces in use will be removed + * when the last user calls a kref_put on it */ - - *iface_list = kcalloc(nb_iface, sizeof(**iface_list), GFP_KERNEL); - if (!*iface_list) { - rc = -ENOMEM; - goto out; + list_for_each_entry_safe(iface, niface, &ses->iface_list, + iface_head) { + iface->is_active = 0; + kref_put(&iface->refcount, release_iface); } + spin_unlock(&ses->iface_lock); - info = *iface_list; - bytes_left = buf_len; - p = buf; while (bytes_left >= sizeof(*p)) { - info->speed = le64_to_cpu(p->LinkSpeed); - info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; - info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0; - - cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count); - cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); - cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__, - le32_to_cpu(p->Capability)); + memset(&tmp_iface, 0, sizeof(tmp_iface)); + tmp_iface.speed = le64_to_cpu(p->LinkSpeed); + tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0; + tmp_iface.rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0; switch (p->Family) { /* @@ -587,7 +555,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, * conversion explicit in case either one changes. */ case INTERNETWORK: - addr4 = (struct sockaddr_in *)&info->sockaddr; + addr4 = (struct sockaddr_in *)&tmp_iface.sockaddr; p4 = (struct iface_info_ipv4 *)p->Buffer; addr4->sin_family = AF_INET; memcpy(&addr4->sin_addr, &p4->IPv4Address, 4); @@ -599,7 +567,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, &addr4->sin_addr); break; case INTERNETWORKV6: - addr6 = (struct sockaddr_in6 *)&info->sockaddr; + addr6 = (struct sockaddr_in6 *)&tmp_iface.sockaddr; p6 = (struct iface_info_ipv6 *)p->Buffer; addr6->sin6_family = AF_INET6; memcpy(&addr6->sin6_addr, &p6->IPv6Address, 16); @@ -619,46 +587,96 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, goto next_iface; } - (*iface_count)++; - info++; + /* + * The iface_list is assumed to be sorted by speed. + * Check if the new interface exists in that list. + * NEVER change iface. it could be in use. + * Add a new one instead + */ + spin_lock(&ses->iface_lock); + iface = niface = NULL; + list_for_each_entry_safe(iface, niface, &ses->iface_list, + iface_head) { + ret = iface_cmp(iface, &tmp_iface); + if (!ret) { + /* just get a ref so that it doesn't get picked/freed */ + iface->is_active = 1; + kref_get(&iface->refcount); + spin_unlock(&ses->iface_lock); + goto next_iface; + } else if (ret < 0) { + /* all remaining ifaces are slower */ + kref_get(&iface->refcount); + break; + } + } + spin_unlock(&ses->iface_lock); + + /* no match. insert the entry in the list */ + info = kmalloc(sizeof(struct cifs_server_iface), + GFP_KERNEL); + if (!info) { + rc = -ENOMEM; + goto out; + } + memcpy(info, &tmp_iface, sizeof(tmp_iface)); + + /* add this new entry to the list */ + kref_init(&info->refcount); + info->is_active = 1; + + cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, ses->iface_count); + cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); + cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__, + le32_to_cpu(p->Capability)); + + spin_lock(&ses->iface_lock); + if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) { + list_add_tail(&info->iface_head, &iface->iface_head); + kref_put(&iface->refcount, release_iface); + } else + list_add_tail(&info->iface_head, &ses->iface_list); + spin_unlock(&ses->iface_lock); + + ses->iface_count++; + ses->iface_last_update = jiffies; next_iface: + nb_iface++; next = le32_to_cpu(p->Next); - if (!next) + if (!next) { + bytes_left -= sizeof(*p); break; + } p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); bytes_left -= next; } - if (!*iface_count) { + if (!nb_iface) { + cifs_dbg(VFS, "%s: malformed interface info\n", __func__); rc = -EINVAL; goto out; } -out: - if (rc) { - kfree(*iface_list); - *iface_count = 0; - *iface_list = NULL; - } - return rc; -} + /* Azure rounds the buffer size up 8, to a 16 byte boundary */ + if ((bytes_left > 8) || p->Next) + cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); -static int compare_iface(const void *ia, const void *ib) -{ - const struct cifs_server_iface *a = (struct cifs_server_iface *)ia; - const struct cifs_server_iface *b = (struct cifs_server_iface *)ib; - return a->speed == b->speed ? 0 : (a->speed > b->speed ? -1 : 1); + if (!ses->iface_count) { + rc = -EINVAL; + goto out; + } + +out: + return rc; } -static int +int SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) { int rc; unsigned int ret_data_len = 0; struct network_interface_info_ioctl_rsp *out_buf = NULL; - struct cifs_server_iface *iface_list; - size_t iface_count; struct cifs_ses *ses = tcon->ses; rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, @@ -674,21 +692,10 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) goto out; } - rc = parse_server_interfaces(out_buf, ret_data_len, - &iface_list, &iface_count); + rc = parse_server_interfaces(out_buf, ret_data_len, ses); if (rc) goto out; - /* sort interfaces from fastest to slowest */ - sort(iface_list, iface_count, sizeof(*iface_list), compare_iface, NULL); - - spin_lock(&ses->iface_lock); - kfree(ses->iface_list); - ses->iface_list = iface_list; - ses->iface_count = iface_count; - ses->iface_last_update = jiffies; - spin_unlock(&ses->iface_lock); - out: kfree(out_buf); return rc; @@ -1138,9 +1145,7 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size, size_t name_len, value_len, user_name_len; while (src_size > 0) { - name = &src->ea_data[0]; name_len = (size_t)src->ea_name_length; - value = &src->ea_data[src->ea_name_length + 1]; value_len = (size_t)le16_to_cpu(src->ea_value_length); if (name_len == 0) @@ -1152,6 +1157,9 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size, goto out; } + name = &src->ea_data[0]; + value = &src->ea_data[src->ea_name_length + 1]; + if (ea_name) { if (ea_name_len == name_len && memcmp(ea_name, name, name_len) == 0) { @@ -2567,7 +2575,6 @@ static void smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) { struct smb2_hdr *shdr = (struct smb2_hdr *)buf; - struct list_head *tmp, *tmp1; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -2575,12 +2582,12 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) return; spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); - list_for_each(tmp1, &ses->tcon_list) { - tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) { + spin_lock(&tcon->tc_lock); tcon->need_reconnect = true; + spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); pr_warn_once("Server share %s deleted.\n", tcon->treeName); @@ -4556,9 +4563,11 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { if (ses->Suid == ses_id) { + spin_lock(&ses->ses_lock); ses_enc_key = enc ? ses->smb3encryptionkey : ses->smb3decryptionkey; memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); + spin_unlock(&ses->ses_lock); spin_unlock(&cifs_tcp_ses_lock); return 0; } @@ -5073,23 +5082,24 @@ static void smb2_decrypt_offload(struct work_struct *work) mid->callback(mid); } else { - spin_lock(&cifs_tcp_ses_lock); - spin_lock(&GlobalMid_Lock); + spin_lock(&dw->server->srv_lock); if (dw->server->tcpStatus == CifsNeedReconnect) { + spin_lock(&dw->server->mid_lock); mid->mid_state = MID_RETRY_NEEDED; - spin_unlock(&GlobalMid_Lock); - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&dw->server->mid_lock); + spin_unlock(&dw->server->srv_lock); mid->callback(mid); } else { + spin_lock(&dw->server->mid_lock); mid->mid_state = MID_REQUEST_SUBMITTED; mid->mid_flags &= ~(MID_DELETED); list_add_tail(&mid->qhead, &dw->server->pending_mid_q); - spin_unlock(&GlobalMid_Lock); - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&dw->server->mid_lock); + spin_unlock(&dw->server->srv_lock); } } - cifs_mid_q_entry_release(mid); + release_mid(mid); } free_pages: diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b515140bad8d..590a1d4ac140 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -162,7 +162,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL) return 0; - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&tcon->tc_lock); if (tcon->status == TID_EXITING) { /* * only tree disconnect, open, and write, @@ -172,13 +172,13 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, if ((smb2_command != SMB2_WRITE) && (smb2_command != SMB2_CREATE) && (smb2_command != SMB2_TREE_DISCONNECT)) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcon->tc_lock); cifs_dbg(FYI, "can not send cmd %d while umounting\n", smb2_command); return -ENODEV; } } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&tcon->tc_lock); if ((!tcon->ses) || (tcon->ses->ses_status == SES_EXITING) || (!tcon->ses->server) || !server) return -EIO; @@ -217,12 +217,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, } /* are we still trying to reconnect? */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus != CifsNeedReconnect) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); break; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); if (retries && --retries) continue; @@ -256,13 +256,13 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, * and the server never sends an answer the socket will be closed * and tcpStatus set to reconnect. */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsNeedReconnect) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); rc = -EHOSTDOWN; goto out; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); /* * need to prevent multiple threads trying to simultaneously @@ -354,7 +354,7 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf, unsigned int *total_len) { - struct smb2_pdu *spdu = (struct smb2_pdu *)buf; + struct smb2_pdu *spdu = buf; /* lookup word count ie StructureSize from table */ __u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_command)]; @@ -543,6 +543,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, struct TCP_Server_Info *server, unsigned int *total_len) { char *pneg_ctxt; + char *hostname = NULL; unsigned int ctxt_len, neg_context_count; if (*total_len > 200) { @@ -570,16 +571,25 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, *total_len += ctxt_len; pneg_ctxt += ctxt_len; - ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt, - server->hostname); - *total_len += ctxt_len; - pneg_ctxt += ctxt_len; + /* + * secondary channels don't have the hostname field populated + * use the hostname field in the primary channel instead + */ + hostname = CIFS_SERVER_IS_CHAN(server) ? + server->primary_server->hostname : server->hostname; + if (hostname && (hostname[0] != 0)) { + ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt, + hostname); + *total_len += ctxt_len; + pneg_ctxt += ctxt_len; + neg_context_count = 3; + } else + neg_context_count = 2; build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt); *total_len += sizeof(struct smb2_posix_neg_context); pneg_ctxt += sizeof(struct smb2_posix_neg_context); - - neg_context_count = 4; + neg_context_count++; if (server->compress_algorithm) { build_compression_ctxt((struct smb2_compression_capabilities_context *) @@ -3766,7 +3776,7 @@ smb2_echo_callback(struct mid_q_entry *mid) credits.instance = server->reconnect_instance; } - DeleteMidQEntry(mid); + release_mid(mid); add_credits(server, &credits, CIFS_ECHO_OP); } @@ -3901,15 +3911,15 @@ SMB2_echo(struct TCP_Server_Info *server) cifs_dbg(FYI, "In echo request for conn_id %lld\n", server->conn_id); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->ops->need_neg && server->ops->need_neg(server)) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); /* No need to send echo on newly established connections */ mod_delayed_work(cifsiod_wq, &server->reconnect, 0); return rc; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); rc = smb2_plain_req_init(SMB2_ECHO, NULL, server, (void **)&req, &total_len); @@ -4191,7 +4201,7 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->offset, rdata->got_bytes); queue_work(cifsiod_wq, &rdata->work); - DeleteMidQEntry(mid); + release_mid(mid); add_credits(server, &credits, 0); } @@ -4430,7 +4440,7 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->offset, wdata->bytes); queue_work(cifsiod_wq, &wdata->work); - DeleteMidQEntry(mid); + release_mid(mid); add_credits(server, &credits, 0); } diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 55e79f6ee78d..1a5fc3314dbf 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -640,13 +640,13 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) if (!is_signed) return 0; - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->ops->need_neg && server->ops->need_neg(server)) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return 0; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); if (!is_binding && !server->session_estab) { strncpy(shdr->Signature, "BSRSPYL", 8); return 0; @@ -750,7 +750,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *shdr, temp->callback = cifs_wake_up_task; temp->callback_data = current; - atomic_inc(&midCount); + atomic_inc(&mid_count); temp->mid_state = MID_REQUEST_ALLOCATED; trace_smb3_cmd_enter(le32_to_cpu(shdr->Id.SyncId.TreeId), le64_to_cpu(shdr->SessionId), @@ -762,28 +762,30 @@ static int smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server, struct smb2_hdr *shdr, struct mid_q_entry **mid) { - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsExiting) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return -ENOENT; } if (server->tcpStatus == CifsNeedReconnect) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); return -EAGAIN; } if (server->tcpStatus == CifsNeedNegotiate && shdr->Command != SMB2_NEGOTIATE) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return -EAGAIN; } + spin_unlock(&server->srv_lock); + spin_lock(&ses->ses_lock); if (ses->ses_status == SES_NEW) { if ((shdr->Command != SMB2_SESSION_SETUP) && (shdr->Command != SMB2_NEGOTIATE)) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); return -EAGAIN; } /* else ok - we are setting up session */ @@ -791,19 +793,19 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server, if (ses->ses_status == SES_EXITING) { if (shdr->Command != SMB2_LOGOFF) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); return -EAGAIN; } /* else ok - we are shutting down the session */ } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); *mid = smb2_mid_entry_alloc(shdr, server); if (*mid == NULL) return -ENOMEM; - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); list_add_tail(&(*mid)->qhead, &server->pending_mid_q); - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); return 0; } @@ -854,7 +856,7 @@ smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server, rc = smb2_sign_rqst(rqst, server); if (rc) { revert_current_mid_from_hdr(server, shdr); - cifs_delete_mid(mid); + delete_mid(mid); return ERR_PTR(rc); } @@ -869,13 +871,13 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) (struct smb2_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsNeedNegotiate && shdr->Command != SMB2_NEGOTIATE) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return ERR_PTR(-EAGAIN); } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); smb2_seq_num_into_buf(server, shdr); @@ -888,7 +890,7 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) rc = smb2_sign_rqst(rqst, server); if (rc) { revert_current_mid_from_hdr(server, shdr); - DeleteMidQEntry(mid); + release_mid(mid); return ERR_PTR(rc); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index bfc9bd55870a..de7aeced7e16 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -21,6 +21,7 @@ #include <asm/processor.h> #include <linux/mempool.h> #include <linux/sched/signal.h> +#include <linux/task_io_accounting_ops.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -37,13 +38,13 @@ cifs_wake_up_task(struct mid_q_entry *mid) wake_up_process(mid->callback_data); } -struct mid_q_entry * -AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) +static struct mid_q_entry * +alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) { struct mid_q_entry *temp; if (server == NULL) { - cifs_dbg(VFS, "Null TCP session in AllocMidQEntry\n"); + cifs_dbg(VFS, "%s: null TCP session\n", __func__); return NULL; } @@ -68,12 +69,12 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) temp->callback = cifs_wake_up_task; temp->callback_data = current; - atomic_inc(&midCount); + atomic_inc(&mid_count); temp->mid_state = MID_REQUEST_ALLOCATED; return temp; } -static void _cifs_mid_q_entry_release(struct kref *refcount) +static void __release_mid(struct kref *refcount) { struct mid_q_entry *midEntry = container_of(refcount, struct mid_q_entry, refcount); @@ -91,7 +92,7 @@ static void _cifs_mid_q_entry_release(struct kref *refcount) server->ops->handle_cancelled_mid(midEntry, server); midEntry->mid_state = MID_FREE; - atomic_dec(&midCount); + atomic_dec(&mid_count); if (midEntry->large_buf) cifs_buf_release(midEntry->resp_buf); else @@ -152,29 +153,26 @@ static void _cifs_mid_q_entry_release(struct kref *refcount) mempool_free(midEntry, cifs_mid_poolp); } -void cifs_mid_q_entry_release(struct mid_q_entry *midEntry) +void release_mid(struct mid_q_entry *mid) { - spin_lock(&GlobalMid_Lock); - kref_put(&midEntry->refcount, _cifs_mid_q_entry_release); - spin_unlock(&GlobalMid_Lock); -} + struct TCP_Server_Info *server = mid->server; -void DeleteMidQEntry(struct mid_q_entry *midEntry) -{ - cifs_mid_q_entry_release(midEntry); + spin_lock(&server->mid_lock); + kref_put(&mid->refcount, __release_mid); + spin_unlock(&server->mid_lock); } void -cifs_delete_mid(struct mid_q_entry *mid) +delete_mid(struct mid_q_entry *mid) { - spin_lock(&GlobalMid_Lock); + spin_lock(&mid->server->mid_lock); if (!(mid->mid_flags & MID_DELETED)) { list_del_init(&mid->qhead); mid->mid_flags |= MID_DELETED; } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&mid->server->mid_lock); - DeleteMidQEntry(mid); + release_mid(mid); } /* @@ -577,12 +575,12 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, } else { spin_unlock(&server->req_lock); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsExiting) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return -ENOENT; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); /* * For normal commands, reserve the last MAX_COMPOUND @@ -725,11 +723,11 @@ cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, struct mid_q_entry **ppmidQ) { - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->ses_lock); if (ses->ses_status == SES_NEW) { if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && (in_buf->Command != SMB_COM_NEGOTIATE)) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); return -EAGAIN; } /* else ok - we are setting up session */ @@ -738,19 +736,19 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, if (ses->ses_status == SES_EXITING) { /* check if SMB session is bad because we are setting it up */ if (in_buf->Command != SMB_COM_LOGOFF_ANDX) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); return -EAGAIN; } /* else ok - we are shutting down session */ } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); - *ppmidQ = AllocMidQEntry(in_buf, ses->server); + *ppmidQ = alloc_mid(in_buf, ses->server); if (*ppmidQ == NULL) return -ENOMEM; - spin_lock(&GlobalMid_Lock); + spin_lock(&ses->server->mid_lock); list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q); - spin_unlock(&GlobalMid_Lock); + spin_unlock(&ses->server->mid_lock); return 0; } @@ -782,13 +780,13 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) if (server->sign) hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; - mid = AllocMidQEntry(hdr, server); + mid = alloc_mid(hdr, server); if (mid == NULL) return ERR_PTR(-ENOMEM); rc = cifs_sign_rqst(rqst, server, &mid->sequence_number); if (rc) { - DeleteMidQEntry(mid); + release_mid(mid); return ERR_PTR(rc); } @@ -849,9 +847,9 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, mid->mid_state = MID_REQUEST_SUBMITTED; /* put it on the pending_mid_q */ - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); list_add_tail(&mid->qhead, &server->pending_mid_q); - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); /* * Need to store the time in mid before calling I/O. For call_async, @@ -865,7 +863,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, if (rc < 0) { revert_current_mid(server, mid->credits); server->sequence_number -= 2; - cifs_delete_mid(mid); + delete_mid(mid); } cifs_server_unlock(server); @@ -912,10 +910,10 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) cifs_dbg(FYI, "%s: cmd=%d mid=%llu state=%d\n", __func__, le16_to_cpu(mid->command), mid->mid, mid->mid_state); - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); return rc; case MID_RETRY_NEEDED: rc = -EAGAIN; @@ -935,9 +933,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) __func__, mid->mid, mid->mid_state); rc = -EIO; } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); - DeleteMidQEntry(mid); + release_mid(mid); return rc; } @@ -997,7 +995,7 @@ cifs_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *ignored, return ERR_PTR(rc); rc = cifs_sign_rqst(rqst, ses->server, &mid->sequence_number); if (rc) { - cifs_delete_mid(mid); + delete_mid(mid); return ERR_PTR(rc); } return mid; @@ -1026,7 +1024,7 @@ static void cifs_cancelled_callback(struct mid_q_entry *mid) { cifs_compound_callback(mid); - DeleteMidQEntry(mid); + release_mid(mid); } /* @@ -1078,12 +1076,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsExiting) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return -ENOENT; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); /* * Wait for all the requests to become available. @@ -1130,7 +1128,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, if (IS_ERR(midQ[i])) { revert_current_mid(server, i); for (j = 0; j < i; j++) - cifs_delete_mid(midQ[j]); + delete_mid(midQ[j]); cifs_server_unlock(server); /* Update # of requests on wire to server */ @@ -1186,17 +1184,17 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->ses_lock); if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); cifs_server_lock(server); smb311_update_preauth_hash(ses, server, rqst[0].rq_iov, rqst[0].rq_nvec); cifs_server_unlock(server); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->ses_lock); } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); for (i = 0; i < num_rqst; i++) { rc = wait_for_response(server, midQ[i]); @@ -1208,14 +1206,14 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, cifs_server_dbg(FYI, "Cancelling wait for mid %llu cmd: %d\n", midQ[i]->mid, le16_to_cpu(midQ[i]->command)); send_cancel(server, &rqst[i], midQ[i]); - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); midQ[i]->mid_flags |= MID_WAIT_CANCELLED; if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { midQ[i]->callback = cifs_cancelled_callback; cancelled_mid[i] = true; credits[i].value = 0; } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); } } @@ -1250,7 +1248,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, rc = server->ops->check_receive(midQ[i], server, flags & CIFS_LOG_ERROR); - /* mark it so buf will not be freed by cifs_delete_mid */ + /* mark it so buf will not be freed by delete_mid */ if ((flags & CIFS_NO_RSP_BUF) == 0) midQ[i]->resp_buf = NULL; @@ -1259,19 +1257,19 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->ses_lock); if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { struct kvec iov = { .iov_base = resp_iov[0].iov_base, .iov_len = resp_iov[0].iov_len }; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); cifs_server_lock(server); smb311_update_preauth_hash(ses, server, &iov, 1); cifs_server_unlock(server); - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->ses_lock); } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&ses->ses_lock); out: /* @@ -1282,7 +1280,7 @@ out: */ for (i = 0; i < num_rqst; i++) { if (!cancelled_mid[i]) - cifs_delete_mid(midQ[i]); + delete_mid(midQ[i]); } return rc; @@ -1360,12 +1358,12 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsExiting) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return -ENOENT; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); /* Ensure that we do not send more than 50 overlapping requests to the same server. We may make this configurable later or @@ -1419,15 +1417,15 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = wait_for_response(server, midQ); if (rc != 0) { send_cancel(server, &rqst, midQ); - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); if (midQ->mid_state == MID_REQUEST_SUBMITTED) { /* no longer considered to be "in-flight" */ - midQ->callback = DeleteMidQEntry; - spin_unlock(&GlobalMid_Lock); + midQ->callback = release_mid; + spin_unlock(&server->mid_lock); add_credits(server, &credits, 0); return rc; } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); } rc = cifs_sync_mid_result(midQ, server); @@ -1447,7 +1445,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); rc = cifs_check_receive(midQ, server, 0); out: - cifs_delete_mid(midQ); + delete_mid(midQ); add_credits(server, &credits, 0); return rc; @@ -1505,12 +1503,12 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, return -EIO; } - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if (server->tcpStatus == CifsExiting) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); return -ENOENT; } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); /* Ensure that we do not send more than 50 overlapping requests to the same server. We may make this configurable later or @@ -1540,7 +1538,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number); if (rc) { - cifs_delete_mid(midQ); + delete_mid(midQ); cifs_server_unlock(server); return rc; } @@ -1557,7 +1555,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, cifs_server_unlock(server); if (rc < 0) { - cifs_delete_mid(midQ); + delete_mid(midQ); return rc; } @@ -1568,19 +1566,19 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, (server->tcpStatus != CifsNew))); /* Were we interrupted by a signal ? */ - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); if ((rc == -ERESTARTSYS) && (midQ->mid_state == MID_REQUEST_SUBMITTED) && ((server->tcpStatus == CifsGood) || (server->tcpStatus == CifsNew))) { - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); if (in_buf->Command == SMB_COM_TRANSACTION2) { /* POSIX lock. We send a NT_CANCEL SMB to cause the blocking lock to return. */ rc = send_cancel(server, &rqst, midQ); if (rc) { - cifs_delete_mid(midQ); + delete_mid(midQ); return rc; } } else { @@ -1592,7 +1590,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, /* If we get -ENOLCK back the lock may have already been removed. Don't exit in this case. */ if (rc && rc != -ENOLCK) { - cifs_delete_mid(midQ); + delete_mid(midQ); return rc; } } @@ -1600,21 +1598,21 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, rc = wait_for_response(server, midQ); if (rc) { send_cancel(server, &rqst, midQ); - spin_lock(&GlobalMid_Lock); + spin_lock(&server->mid_lock); if (midQ->mid_state == MID_REQUEST_SUBMITTED) { /* no longer considered to be "in-flight" */ - midQ->callback = DeleteMidQEntry; - spin_unlock(&GlobalMid_Lock); + midQ->callback = release_mid; + spin_unlock(&server->mid_lock); return rc; } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&server->mid_lock); } /* We got the response - restart system call. */ rstart = 1; - spin_lock(&cifs_tcp_ses_lock); + spin_lock(&server->srv_lock); } - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&server->srv_lock); rc = cifs_sync_mid_result(midQ, server); if (rc != 0) @@ -1631,8 +1629,186 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); rc = cifs_check_receive(midQ, server, 0); out: - cifs_delete_mid(midQ); + delete_mid(midQ); if (rstart && rc == -EACCES) return -ERESTARTSYS; return rc; } + +/* + * Discard any remaining data in the current SMB. To do this, we borrow the + * current bigbuf. + */ +int +cifs_discard_remaining_data(struct TCP_Server_Info *server) +{ + unsigned int rfclen = server->pdu_size; + int remaining = rfclen + server->vals->header_preamble_size - + server->total_read; + + while (remaining > 0) { + int length; + + length = cifs_discard_from_socket(server, + min_t(size_t, remaining, + CIFSMaxBufSize + MAX_HEADER_SIZE(server))); + if (length < 0) + return length; + server->total_read += length; + remaining -= length; + } + + return 0; +} + +static int +__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid, + bool malformed) +{ + int length; + + length = cifs_discard_remaining_data(server); + dequeue_mid(mid, malformed); + mid->resp_buf = server->smallbuf; + server->smallbuf = NULL; + return length; +} + +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + + return __cifs_readv_discard(server, mid, rdata->result); +} + +int +cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length, len; + unsigned int data_offset, data_len; + struct cifs_readdata *rdata = mid->callback_data; + char *buf = server->smallbuf; + unsigned int buflen = server->pdu_size + + server->vals->header_preamble_size; + bool use_rdma_mr = false; + + cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n", + __func__, mid->mid, rdata->offset, rdata->bytes); + + /* + * read the rest of READ_RSP header (sans Data array), or whatever we + * can if there's not enough data. At this point, we've read down to + * the Mid. + */ + len = min_t(unsigned int, buflen, server->vals->read_rsp_size) - + HEADER_SIZE(server) + 1; + + length = cifs_read_from_socket(server, + buf + HEADER_SIZE(server) - 1, len); + if (length < 0) + return length; + server->total_read += length; + + if (server->ops->is_session_expired && + server->ops->is_session_expired(buf)) { + cifs_reconnect(server, true); + return -1; + } + + if (server->ops->is_status_pending && + server->ops->is_status_pending(buf, server)) { + cifs_discard_remaining_data(server); + return -1; + } + + /* set up first two iov for signature check and to get credits */ + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = server->vals->header_preamble_size; + rdata->iov[1].iov_base = buf + server->vals->header_preamble_size; + rdata->iov[1].iov_len = + server->total_read - server->vals->header_preamble_size; + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", + rdata->iov[1].iov_base, rdata->iov[1].iov_len); + + /* Was the SMB read successful? */ + rdata->result = server->ops->map_error(buf, false); + if (rdata->result != 0) { + cifs_dbg(FYI, "%s: server returned error %d\n", + __func__, rdata->result); + /* normal error on read response */ + return __cifs_readv_discard(server, mid, false); + } + + /* Is there enough to get to the rest of the READ_RSP header? */ + if (server->total_read < server->vals->read_rsp_size) { + cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n", + __func__, server->total_read, + server->vals->read_rsp_size); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + data_offset = server->ops->read_data_offset(buf) + + server->vals->header_preamble_size; + if (data_offset < server->total_read) { + /* + * win2k8 sometimes sends an offset of 0 when the read + * is beyond the EOF. Treat it as if the data starts just after + * the header. + */ + cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n", + __func__, data_offset); + data_offset = server->total_read; + } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) { + /* data_offset is beyond the end of smallbuf */ + cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n", + __func__, data_offset); + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n", + __func__, server->total_read, data_offset); + + len = data_offset - server->total_read; + if (len > 0) { + /* read any junk before data into the rest of smallbuf */ + length = cifs_read_from_socket(server, + buf + server->total_read, len); + if (length < 0) + return length; + server->total_read += length; + } + + /* how much data is in the response? */ +#ifdef CONFIG_CIFS_SMB_DIRECT + use_rdma_mr = rdata->mr; +#endif + data_len = server->ops->read_data_length(buf, use_rdma_mr); + if (!use_rdma_mr && (data_offset + data_len > buflen)) { + /* data_len is corrupt -- discard frame */ + rdata->result = -EIO; + return cifs_readv_discard(server, mid); + } + + length = rdata->read_into_pages(server, rdata, data_len); + if (length < 0) + return length; + + server->total_read += length; + + cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", + server->total_read, buflen, data_len); + + /* discard anything left over */ + if (server->total_read < buflen) + return cifs_readv_discard(server, mid); + + dequeue_mid(mid, false); + mid->resp_buf = server->smallbuf; + server->smallbuf = NULL; + return length; +} diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 9d486fbbfbbd..998fa51f9b68 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -201,6 +201,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, break; } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY case XATTR_ACL_ACCESS: #ifdef CONFIG_CIFS_POSIX if (!value) @@ -224,6 +225,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, cifs_remap(cifs_sb)); #endif /* CONFIG_CIFS_POSIX */ break; +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ } out: @@ -364,7 +366,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, } break; } - +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY case XATTR_ACL_ACCESS: #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & SB_POSIXACL) @@ -384,6 +386,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, cifs_remap(cifs_sb)); #endif /* CONFIG_CIFS_POSIX */ break; +#endif /* ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ } /* We could add an additional check for streams ie diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index 8adf81042498..ccdbec388091 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -22,25 +22,24 @@ static int coda_symlink_filler(struct file *file, struct folio *folio) { - struct page *page = &folio->page; struct inode *inode = folio->mapping->host; int error; struct coda_inode_info *cii; unsigned int len = PAGE_SIZE; - char *p = page_address(page); + char *p = folio_address(folio); cii = ITOC(inode); error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len); if (error) goto fail; - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; fail: - SetPageError(page); - unlock_page(page); + folio_set_error(folio); + folio_unlock(folio); return error; } diff --git a/fs/coredump.c b/fs/coredump.c index ebc43f960b64..9f4aae202109 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -816,9 +816,9 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) { static char zeroes[PAGE_SIZE]; struct file *file = cprm->file; - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + if (file->f_mode & FMODE_LSEEK) { if (dump_interrupted() || - file->f_op->llseek(file, nr, SEEK_CUR) < 0) + vfs_llseek(file, nr, SEEK_CUR) < 0) return 0; cprm->pos += nr; return 1; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 7ae59a6afc5c..61ccf7722fc3 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -183,6 +183,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, unsigned int len) { struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct file_ra_state ra; struct page *pages[BLKS_PER_BUF]; unsigned i, blocknr, buffer; unsigned long devsize; @@ -212,6 +213,9 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT; /* Ok, read in BLKS_PER_BUF pages completely first. */ + file_ra_state_init(&ra, mapping); + page_cache_sync_readahead(mapping, &ra, NULL, blocknr, BLKS_PER_BUF); + for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = NULL; @@ -224,19 +228,6 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, pages[i] = page; } - for (i = 0; i < BLKS_PER_BUF; i++) { - struct page *page = pages[i]; - - if (page) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - /* asynchronous error */ - put_page(page); - pages[i] = NULL; - } - } - } - buffer = next_buffer; next_buffer = NEXT_BUFFER(buffer); buffer_blocknr[buffer] = blocknr; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 6b4c8094cc7b..f5be777d8279 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -31,7 +31,7 @@ #define FSCRYPT_CONTEXT_V2 2 /* Keep this in sync with include/uapi/linux/fscrypt.h */ -#define FSCRYPT_MODE_MAX FSCRYPT_MODE_ADIANTUM +#define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2 struct fscrypt_context_v1 { u8 version; /* FSCRYPT_CONTEXT_V1 */ diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index c35711896bd4..fbc71abdabe3 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -53,6 +53,13 @@ struct fscrypt_mode fscrypt_modes[] = { .ivsize = 32, .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM, }, + [FSCRYPT_MODE_AES_256_HCTR2] = { + .friendly_name = "AES-256-HCTR2", + .cipher_str = "hctr2(aes)", + .keysize = 32, + .security_strength = 32, + .ivsize = 32, + }, }; static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 5f858cee1e3b..8a054e6d1e68 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -61,7 +61,7 @@ fscrypt_get_dummy_policy(struct super_block *sb) return sb->s_cop->get_dummy_policy(sb); } -static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) +static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && filenames_mode == FSCRYPT_MODE_AES_256_CTS) @@ -78,6 +78,14 @@ static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) return false; } +static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode) +{ + if (contents_mode == FSCRYPT_MODE_AES_256_XTS && + filenames_mode == FSCRYPT_MODE_AES_256_HCTR2) + return true; + return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode); +} + static bool supported_direct_key_modes(const struct inode *inode, u32 contents_mode, u32 filenames_mode) { @@ -151,7 +159,7 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, const struct inode *inode) { - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + if (!fscrypt_valid_enc_modes_v1(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, "Unsupported encryption modes (contents %d, filenames %d)", @@ -187,7 +195,7 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, { int count = 0; - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + if (!fscrypt_valid_enc_modes_v2(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, "Unsupported encryption modes (contents %d, filenames %d)", @@ -334,13 +334,35 @@ static unsigned long dax_end_pfn(void *entry) for (pfn = dax_to_pfn(entry); \ pfn < dax_end_pfn(entry); pfn++) +static inline bool dax_mapping_is_cow(struct address_space *mapping) +{ + return (unsigned long)mapping == PAGE_MAPPING_DAX_COW; +} + /* - * TODO: for reflink+dax we need a way to associate a single page with - * multiple address_space instances at different linear_page_index() - * offsets. + * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount. + */ +static inline void dax_mapping_set_cow(struct page *page) +{ + if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) { + /* + * Reset the index if the page was already mapped + * regularly before. + */ + if (page->mapping) + page->index = 1; + page->mapping = (void *)PAGE_MAPPING_DAX_COW; + } + page->index++; +} + +/* + * When it is called in dax_insert_entry(), the cow flag will indicate that + * whether this entry is shared by multiple files. If so, set the page->mapping + * FS_DAX_MAPPING_COW, and use page->index as refcount. */ static void dax_associate_entry(void *entry, struct address_space *mapping, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, bool cow) { unsigned long size = dax_entry_size(entry), pfn, index; int i = 0; @@ -352,9 +374,13 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); - WARN_ON_ONCE(page->mapping); - page->mapping = mapping; - page->index = index + i++; + if (cow) { + dax_mapping_set_cow(page); + } else { + WARN_ON_ONCE(page->mapping); + page->mapping = mapping; + page->index = index + i++; + } } } @@ -370,7 +396,12 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, struct page *page = pfn_to_page(pfn); WARN_ON_ONCE(trunc && page_ref_count(page) > 1); - WARN_ON_ONCE(page->mapping && page->mapping != mapping); + if (dax_mapping_is_cow(page->mapping)) { + /* keep the CoW flag if this page is still shared */ + if (page->index-- > 0) + continue; + } else + WARN_ON_ONCE(page->mapping && page->mapping != mapping); page->mapping = NULL; page->index = 0; } @@ -456,6 +487,69 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie) } /* + * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping + * @mapping: the file's mapping whose entry we want to lock + * @index: the offset within this file + * @page: output the dax page corresponding to this dax entry + * + * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry + * could not be locked. + */ +dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index, + struct page **page) +{ + XA_STATE(xas, NULL, 0); + void *entry; + + rcu_read_lock(); + for (;;) { + entry = NULL; + if (!dax_mapping(mapping)) + break; + + xas.xa = &mapping->i_pages; + xas_lock_irq(&xas); + xas_set(&xas, index); + entry = xas_load(&xas); + if (dax_is_locked(entry)) { + rcu_read_unlock(); + wait_entry_unlocked(&xas, entry); + rcu_read_lock(); + continue; + } + if (!entry || + dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + /* + * Because we are looking for entry from file's mapping + * and index, so the entry may not be inserted for now, + * or even a zero/empty entry. We don't think this is + * an error case. So, return a special value and do + * not output @page. + */ + entry = (void *)~0UL; + } else { + *page = pfn_to_page(dax_to_pfn(entry)); + dax_lock_entry(&xas, entry); + } + xas_unlock_irq(&xas); + break; + } + rcu_read_unlock(); + return (dax_entry_t)entry; +} + +void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index, + dax_entry_t cookie) +{ + XA_STATE(xas, &mapping->i_pages, index); + + if (cookie == ~0UL) + return; + + dax_unlock_entry(&xas, (void *)cookie); +} + +/* * Find page cache entry at given index. If it is a DAX entry, return it * with the entry locked. If the page cache doesn't contain an entry at * that index, add a locked empty entry. @@ -736,22 +830,42 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter } /* + * MAP_SYNC on a dax mapping guarantees dirty metadata is + * flushed on write-faults (non-cow), but not read-faults. + */ +static bool dax_fault_is_synchronous(const struct iomap_iter *iter, + struct vm_area_struct *vma) +{ + return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) && + (iter->iomap.flags & IOMAP_F_DIRTY); +} + +static bool dax_fault_is_cow(const struct iomap_iter *iter) +{ + return (iter->flags & IOMAP_WRITE) && + (iter->iomap.flags & IOMAP_F_SHARED); +} + +/* * By this point grab_mapping_entry() has ensured that we have a locked entry * of the appropriate size so we don't have to worry about downgrading PMDs to * PTEs. If we happen to be trying to insert a PTE and there is a PMD * already in the tree, we will skip the insertion and just dirty the PMD as * appropriate. */ -static void *dax_insert_entry(struct xa_state *xas, - struct address_space *mapping, struct vm_fault *vmf, - void *entry, pfn_t pfn, unsigned long flags, bool dirty) +static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, + const struct iomap_iter *iter, void *entry, pfn_t pfn, + unsigned long flags) { + struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *new_entry = dax_make_entry(pfn, flags); + bool dirty = !dax_fault_is_synchronous(iter, vmf->vma); + bool cow = dax_fault_is_cow(iter); if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) { + if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) @@ -763,11 +877,12 @@ static void *dax_insert_entry(struct xa_state *xas, xas_reset(xas); xas_lock_irq(xas); - if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { void *old; dax_disassociate_entry(entry, mapping, false); - dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); + dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, + cow); /* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or @@ -787,6 +902,9 @@ static void *dax_insert_entry(struct xa_state *xas, if (dirty) xas_set_mark(xas, PAGECACHE_TAG_DIRTY); + if (cow) + xas_set_mark(xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irq(xas); return entry; } @@ -931,20 +1049,22 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, - pfn_t *pfnp) +static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos, + size_t size, void **kaddr, pfn_t *pfnp) { pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); - int id, rc; + int id, rc = 0; long length; id = dax_read_lock(); length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), - DAX_ACCESS, NULL, pfnp); + DAX_ACCESS, kaddr, pfnp); if (length < 0) { rc = length; goto out; } + if (!pfnp) + goto out_check_addr; rc = -EINVAL; if (PFN_PHYS(length) < size) goto out; @@ -954,11 +1074,71 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, if (length > 1 && !pfn_t_devmap(*pfnp)) goto out; rc = 0; + +out_check_addr: + if (!kaddr) + goto out; + if (!*kaddr) + rc = -EFAULT; out: dax_read_unlock(id); return rc; } +/** + * dax_iomap_cow_copy - Copy the data from source to destination before write + * @pos: address to do copy from. + * @length: size of copy operation. + * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE) + * @srcmap: iomap srcmap + * @daddr: destination address to copy to. + * + * This can be called from two places. Either during DAX write fault (page + * aligned), to copy the length size data to daddr. Or, while doing normal DAX + * write operation, dax_iomap_actor() might call this to do the copy of either + * start or end unaligned address. In the latter case the rest of the copy of + * aligned ranges is taken care by dax_iomap_actor() itself. + */ +static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, + const struct iomap *srcmap, void *daddr) +{ + loff_t head_off = pos & (align_size - 1); + size_t size = ALIGN(head_off + length, align_size); + loff_t end = pos + length; + loff_t pg_end = round_up(end, align_size); + bool copy_all = head_off == 0 && end == pg_end; + void *saddr = 0; + int ret = 0; + + ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); + if (ret) + return ret; + + if (copy_all) { + ret = copy_mc_to_kernel(daddr, saddr, length); + return ret ? -EIO : 0; + } + + /* Copy the head part of the range */ + if (head_off) { + ret = copy_mc_to_kernel(daddr, saddr, head_off); + if (ret) + return -EIO; + } + + /* Copy the tail part of the range */ + if (end < pg_end) { + loff_t tail_off = head_off + length; + loff_t tail_len = pg_end - end; + + ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off, + tail_len); + if (ret) + return -EIO; + } + return 0; +} + /* * The user has performed a load from a hole in the file. Allocating a new * page in the file would cause excessive storage usage for workloads with @@ -966,17 +1146,15 @@ out: * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead. */ -static vm_fault_t dax_load_hole(struct xa_state *xas, - struct address_space *mapping, void **entry, - struct vm_fault *vmf) +static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf, + const struct iomap_iter *iter, void **entry) { - struct inode *inode = mapping->host; + struct inode *inode = iter->inode; unsigned long vaddr = vmf->address; pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); vm_fault_t ret; - *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, - DAX_ZERO_PAGE, false); + *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE); ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); trace_dax_load_hole(inode, vmf, ret); @@ -985,7 +1163,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, #ifdef CONFIG_FS_DAX_PMD static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, - const struct iomap *iomap, void **entry) + const struct iomap_iter *iter, void **entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; @@ -1003,8 +1181,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, goto fallback; pfn = page_to_pfn_t(zero_page); - *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, - DAX_PMD | DAX_ZERO_PAGE, false); + *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, + DAX_PMD | DAX_ZERO_PAGE); if (arch_needs_pgtable_deposit()) { pgtable = pte_alloc_one(vma->vm_mm); @@ -1037,23 +1215,34 @@ fallback: } #else static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, - const struct iomap *iomap, void **entry) + const struct iomap_iter *iter, void **entry) { return VM_FAULT_FALLBACK; } #endif /* CONFIG_FS_DAX_PMD */ -static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff, - unsigned int offset, size_t size) +static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) { + const struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + unsigned offset = offset_in_page(pos); + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); void *kaddr; long ret; - ret = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); - if (ret > 0) { - memset(kaddr + offset, 0, size); - dax_flush(dax_dev, kaddr + offset, size); - } + ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, + NULL); + if (ret < 0) + return ret; + memset(kaddr + offset, 0, size); + if (srcmap->addr != iomap->addr) { + ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap, + kaddr); + if (ret < 0) + return ret; + dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE); + } else + dax_flush(iomap->dax_dev, kaddr + offset, size); return ret; } @@ -1080,7 +1269,7 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE) rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else - rc = dax_memzero(iomap->dax_dev, pgoff, offset, size); + rc = dax_memzero(iter, pos, size); dax_read_unlock(id); if (rc < 0) @@ -1088,10 +1277,10 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) pos += size; length -= size; written += size; - if (did_zero) - *did_zero = true; } while (length > 0); + if (did_zero) + *did_zero = true; return written; } @@ -1129,15 +1318,17 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; + const struct iomap *srcmap = &iomi->srcmap; loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; struct dax_device *dax_dev = iomap->dax_dev; loff_t end = pos + length, done = 0; + bool write = iov_iter_rw(iter) == WRITE; ssize_t ret = 0; size_t xfer; int id; - if (iov_iter_rw(iter) == READ) { + if (!write) { end = min(end, i_size_read(iomi->inode)); if (pos >= end) return 0; @@ -1146,7 +1337,12 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, return iov_iter_zero(min(length, end - pos), iter); } - if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) + /* + * In DAX mode, enforce either pure overwrites of written extents, or + * writes to unwritten extents as part of a copy-on-write operation. + */ + if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED && + !(iomap->flags & IOMAP_F_SHARED))) return -EIO; /* @@ -1188,6 +1384,14 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, break; } + if (write && + srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { + ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap, + kaddr); + if (ret) + break; + } + map_len = PFN_PHYS(map_len); kaddr += offset; map_len -= offset; @@ -1197,7 +1401,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (recovery) xfer = dax_recovery_write(dax_dev, pgoff, kaddr, map_len, iter); - else if (iov_iter_rw(iter) == WRITE) + else if (write) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else @@ -1268,17 +1472,6 @@ static vm_fault_t dax_fault_return(int error) } /* - * MAP_SYNC on a dax mapping guarantees dirty metadata is - * flushed on write-faults (non-cow), but not read-faults. - */ -static bool dax_fault_is_synchronous(unsigned long flags, - struct vm_area_struct *vma, const struct iomap *iomap) -{ - return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) - && (iomap->flags & IOMAP_F_DIRTY); -} - -/* * When handling a synchronous page fault and the inode need a fsync, we can * insert the PTE/PMD into page tables only after that fsync happened. Skip * insertion for now and return the pfn so that caller can insert it after the @@ -1335,15 +1528,15 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, const struct iomap_iter *iter, pfn_t *pfnp, struct xa_state *xas, void **entry, bool pmd) { - struct address_space *mapping = vmf->vma->vm_file->f_mapping; const struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = &iter->srcmap; size_t size = pmd ? PMD_SIZE : PAGE_SIZE; loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; - bool write = vmf->flags & FAULT_FLAG_WRITE; - bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap); + bool write = iter->flags & IOMAP_WRITE; unsigned long entry_flags = pmd ? DAX_PMD : 0; int err = 0; pfn_t pfn; + void *kaddr; if (!pmd && vmf->cow_page) return dax_fault_cow_page(vmf, iter); @@ -1352,23 +1545,29 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, if (!write && (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) { if (!pmd) - return dax_load_hole(xas, mapping, entry, vmf); - return dax_pmd_load_hole(xas, vmf, iomap, entry); + return dax_load_hole(xas, vmf, iter, entry); + return dax_pmd_load_hole(xas, vmf, iter, entry); } - if (iomap->type != IOMAP_MAPPED) { + if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) { WARN_ON_ONCE(1); return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS; } - err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn); + err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn); if (err) return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err); - *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags, - write && !sync); + *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags); + + if (write && + srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { + err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr); + if (err) + return dax_fault_return(err); + } - if (sync) + if (dax_fault_is_synchronous(iter, vmf->vma)) return dax_fault_synchronous_pfnp(pfnp, pfn); /* insert PMD pfn */ @@ -1674,3 +1873,85 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, return dax_insert_pfn_mkwrite(vmf, pfn, order); } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); + +static loff_t dax_range_compare_iter(struct iomap_iter *it_src, + struct iomap_iter *it_dest, u64 len, bool *same) +{ + const struct iomap *smap = &it_src->iomap; + const struct iomap *dmap = &it_dest->iomap; + loff_t pos1 = it_src->pos, pos2 = it_dest->pos; + void *saddr, *daddr; + int id, ret; + + len = min(len, min(smap->length, dmap->length)); + + if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { + *same = true; + return len; + } + + if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { + *same = false; + return 0; + } + + id = dax_read_lock(); + ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE), + &saddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE), + &daddr, NULL); + if (ret < 0) + goto out_unlock; + + *same = !memcmp(saddr, daddr, len); + if (!*same) + len = 0; + dax_read_unlock(id); + return len; + +out_unlock: + dax_read_unlock(id); + return -EIO; +} + +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dst, loff_t dstoff, loff_t len, bool *same, + const struct iomap_ops *ops) +{ + struct iomap_iter src_iter = { + .inode = src, + .pos = srcoff, + .len = len, + .flags = IOMAP_DAX, + }; + struct iomap_iter dst_iter = { + .inode = dst, + .pos = dstoff, + .len = len, + .flags = IOMAP_DAX, + }; + int ret; + + while ((ret = iomap_iter(&src_iter, ops)) > 0) { + while ((ret = iomap_iter(&dst_iter, ops)) > 0) { + dst_iter.processed = dax_range_compare_iter(&src_iter, + &dst_iter, len, same); + } + if (ret <= 0) + src_iter.processed = ret; + } + return ret; +} + +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *ops) +{ + return __generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, ops); +} +EXPORT_SYMBOL_GPL(dax_remap_file_range_prep); diff --git a/fs/dcache.c b/fs/dcache.c index 93f4f5ee07bf..ea5cdec24ea7 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2240,6 +2240,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, } res = d_splice_alias(inode, found); if (res) { + d_lookup_done(found); dput(found); return res; } @@ -2563,7 +2564,15 @@ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { - + /* + * The caller holds a spinlock (dentry::d_lock). On !PREEMPT_RT + * kernels spin_lock() implicitly disables preemption, but not on + * PREEMPT_RT. So for RT it has to be done explicitly to protect + * the sequence count write side critical section against a reader + * or another writer preempting, which would result in a live lock. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) @@ -2572,9 +2581,13 @@ static inline unsigned start_dir_add(struct inode *dir) } } -static inline void end_dir_add(struct inode *dir, unsigned n) +static inline void end_dir_add(struct inode *dir, unsigned int n, + wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + wake_up_all(d_wait); } static void d_wait_lookup(struct dentry *dentry) @@ -2701,32 +2714,50 @@ mismatch: } EXPORT_SYMBOL(d_alloc_parallel); -void __d_lookup_done(struct dentry *dentry) +/* + * - Unhash the dentry + * - Retrieve and clear the waitqueue head in dentry + * - Return the waitqueue head + */ +static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) { - struct hlist_bl_head *b = in_lookup_hash(dentry->d_parent, - dentry->d_name.hash); + wait_queue_head_t *d_wait; + struct hlist_bl_head *b; + + lockdep_assert_held(&dentry->d_lock); + + b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash); hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); - wake_up_all(dentry->d_wait); + d_wait = dentry->d_wait; dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_LIST_HEAD(&dentry->d_lru); + return d_wait; +} + +void __d_lookup_unhash_wake(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + wake_up_all(__d_lookup_unhash(dentry)); + spin_unlock(&dentry->d_lock); } -EXPORT_SYMBOL(__d_lookup_done); +EXPORT_SYMBOL(__d_lookup_unhash_wake); /* inode->i_lock held if inode is non-NULL */ static inline void __d_add(struct dentry *dentry, struct inode *inode) { + wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; spin_lock(&dentry->d_lock); if (unlikely(d_in_lookup(dentry))) { dir = dentry->d_parent->d_inode; n = start_dir_add(dir); - __d_lookup_done(dentry); + d_wait = __d_lookup_unhash(dentry); } if (inode) { unsigned add_flags = d_flags_for_inode(inode); @@ -2738,7 +2769,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode) } __d_rehash(dentry); if (dir) - end_dir_add(dir, n); + end_dir_add(dir, n, d_wait); spin_unlock(&dentry->d_lock); if (inode) spin_unlock(&inode->i_lock); @@ -2885,6 +2916,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { struct dentry *old_parent, *p; + wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; @@ -2915,7 +2947,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, if (unlikely(d_in_lookup(target))) { dir = target->d_parent->d_inode; n = start_dir_add(dir); - __d_lookup_done(target); + d_wait = __d_lookup_unhash(target); } write_seqcount_begin(&dentry->d_seq); @@ -2951,7 +2983,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, write_seqcount_end(&dentry->d_seq); if (dir) - end_dir_add(dir, n); + end_dir_add(dir, n, d_wait); if (dentry->d_parent != old_parent) spin_unlock(&dentry->d_parent->d_lock); diff --git a/fs/direct-io.c b/fs/direct-io.c index 840752006f60..df5e2d048799 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -117,8 +117,7 @@ struct dio_submit { /* dio_state communicated between submission path and end_io */ struct dio { int flags; /* doesn't change */ - int op; - int op_flags; + blk_opf_t opf; /* request operation type and flags */ struct gendisk *bio_disk; struct inode *inode; loff_t i_size; /* i_size when submitted */ @@ -167,12 +166,13 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio) */ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; ssize_t ret; ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES, &sdio->from); - if (ret < 0 && sdio->blocks_available && (dio->op == REQ_OP_WRITE)) { + if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) { struct page *page = ZERO_PAGE(0); /* * A memory fault, but the filesystem has some outstanding @@ -234,6 +234,7 @@ static inline struct page *dio_get_page(struct dio *dio, */ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; loff_t offset = dio->iocb->ki_pos; ssize_t transferred = 0; int err; @@ -251,7 +252,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) transferred = dio->result; /* Check for short read case */ - if ((dio->op == REQ_OP_READ) && + if (dio_op == REQ_OP_READ && ((offset + transferred) > dio->i_size)) transferred = dio->i_size - offset; /* ignore EFAULT if some IO has been done */ @@ -286,7 +287,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) * zeros from unwritten extents. */ if (flags & DIO_COMPLETE_INVALIDATE && - ret > 0 && dio->op == REQ_OP_WRITE && + ret > 0 && dio_op == REQ_OP_WRITE && dio->inode->i_mapping->nrpages) { err = invalidate_inode_pages2_range(dio->inode->i_mapping, offset >> PAGE_SHIFT, @@ -305,7 +306,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) */ dio->iocb->ki_pos += transferred; - if (ret > 0 && dio->op == REQ_OP_WRITE) + if (ret > 0 && dio_op == REQ_OP_WRITE) ret = generic_write_sync(dio->iocb, ret); dio->iocb->ki_complete(dio->iocb, ret); } @@ -329,6 +330,7 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); static void dio_bio_end_aio(struct bio *bio) { struct dio *dio = bio->bi_private; + const enum req_op dio_op = dio->opf & REQ_OP_MASK; unsigned long remaining; unsigned long flags; bool defer_completion = false; @@ -353,7 +355,7 @@ static void dio_bio_end_aio(struct bio *bio) */ if (dio->result) defer_completion = dio->defer_completion || - (dio->op == REQ_OP_WRITE && + (dio_op == REQ_OP_WRITE && dio->inode->i_mapping->nrpages); if (defer_completion) { INIT_WORK(&dio->complete_work, dio_aio_complete_work); @@ -396,7 +398,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, * bio_alloc() is guaranteed to return a bio when allowed to sleep and * we request a valid number of vectors. */ - bio = bio_alloc(bdev, nr_vecs, dio->op | dio->op_flags, GFP_KERNEL); + bio = bio_alloc(bdev, nr_vecs, dio->opf, GFP_KERNEL); bio->bi_iter.bi_sector = first_sector; if (dio->is_async) bio->bi_end_io = dio_bio_end_aio; @@ -415,6 +417,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, */ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; struct bio *bio = sdio->bio; unsigned long flags; @@ -426,7 +429,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) dio->refcount++; spin_unlock_irqrestore(&dio->bio_lock, flags); - if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) + if (dio->is_async && dio_op == REQ_OP_READ && dio->should_dirty) bio_set_pages_dirty(bio); dio->bio_disk = bio->bi_bdev->bd_disk; @@ -492,7 +495,8 @@ static struct bio *dio_await_one(struct dio *dio) static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) { blk_status_t err = bio->bi_status; - bool should_dirty = dio->op == REQ_OP_READ && dio->should_dirty; + const enum req_op dio_op = dio->opf & REQ_OP_MASK; + bool should_dirty = dio_op == REQ_OP_READ && dio->should_dirty; if (err) { if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT)) @@ -619,6 +623,7 @@ static int dio_set_defer_completion(struct dio *dio) static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; int ret; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ @@ -653,7 +658,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, * which may decide to handle it or also return an unmapped * buffer head. */ - create = dio->op == REQ_OP_WRITE; + create = dio_op == REQ_OP_WRITE; if (dio->flags & DIO_SKIP_HOLES) { i_size = i_size_read(dio->inode); if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits) @@ -801,10 +806,11 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, unsigned offset, unsigned len, sector_t blocknr, struct buffer_head *map_bh) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; int ret = 0; int boundary = sdio->boundary; /* dio_send_cur_page may clear it */ - if (dio->op == REQ_OP_WRITE) { + if (dio_op == REQ_OP_WRITE) { /* * Read accounting is performed in submit_bio() */ @@ -917,6 +923,7 @@ static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; const unsigned blkbits = sdio->blkbits; const unsigned i_blkbits = blkbits + sdio->blkfactor; int ret = 0; @@ -992,7 +999,7 @@ do_holes: loff_t i_size_aligned; /* AKPM: eargh, -ENOTBLK is a hack */ - if (dio->op == REQ_OP_WRITE) { + if (dio_op == REQ_OP_WRITE) { put_page(page); return -ENOTBLK; } @@ -1196,12 +1203,11 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->inode = inode; if (iov_iter_rw(iter) == WRITE) { - dio->op = REQ_OP_WRITE; - dio->op_flags = REQ_SYNC | REQ_IDLE; + dio->opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; if (iocb->ki_flags & IOCB_NOWAIT) - dio->op_flags |= REQ_NOWAIT; + dio->opf |= REQ_NOWAIT; } else { - dio->op = REQ_OP_READ; + dio->opf = REQ_OP_READ; } /* @@ -1210,7 +1216,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, */ if (dio->is_async && iov_iter_rw(iter) == WRITE) { retval = 0; - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) retval = dio_set_defer_completion(dio); else if (!dio->inode->i_sb->s_dio_done_wq) { /* diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index ee92634196a8..1105ce3c80cb 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -9,6 +9,15 @@ menuconfig DLM A general purpose distributed lock manager for kernel or userspace applications. +config DLM_DEPRECATED_API + bool "DLM deprecated API" + depends on DLM + help + Enables deprecated DLM timeout features that will be removed in + later Linux kernel releases. + + If you are unsure, say N. + config DLM_DEBUG bool "DLM debugging" depends on DLM diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile index 3545fdafc6fb..71dab733cf9a 100644 --- a/fs/dlm/Makefile +++ b/fs/dlm/Makefile @@ -9,7 +9,6 @@ dlm-y := ast.o \ member.o \ memory.o \ midcomms.o \ - netlink.o \ lowcomms.o \ plock.o \ rcom.o \ @@ -18,5 +17,6 @@ dlm-y := ast.o \ requestqueue.o \ user.o \ util.o +dlm-$(CONFIG_DLM_DEPRECATED_API) += netlink.o dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index bfac462dd3e8..19ef136f9e4f 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -255,13 +255,13 @@ void dlm_callback_work(struct work_struct *work) if (callbacks[i].flags & DLM_CB_SKIP) { continue; } else if (callbacks[i].flags & DLM_CB_BAST) { - bastfn(lkb->lkb_astparam, callbacks[i].mode); trace_dlm_bast(ls, lkb, callbacks[i].mode); + bastfn(lkb->lkb_astparam, callbacks[i].mode); } else if (callbacks[i].flags & DLM_CB_CAST) { lkb->lkb_lksb->sb_status = callbacks[i].sb_status; lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; + trace_dlm_ast(ls, lkb); castfn(lkb->lkb_astparam); - trace_dlm_ast(ls, lkb, lkb->lkb_lksb); } } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 42eee2783756..ac8b62106ce0 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -75,8 +75,9 @@ struct dlm_cluster { unsigned int cl_log_info; unsigned int cl_protocol; unsigned int cl_mark; +#ifdef CONFIG_DLM_DEPRECATED_API unsigned int cl_timewarn_cs; - unsigned int cl_waitwarn_us; +#endif unsigned int cl_new_rsb_count; unsigned int cl_recover_callbacks; char cl_cluster_name[DLM_LOCKSPACE_LEN]; @@ -102,8 +103,9 @@ enum { CLUSTER_ATTR_LOG_INFO, CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_MARK, +#ifdef CONFIG_DLM_DEPRECATED_API CLUSTER_ATTR_TIMEWARN_CS, - CLUSTER_ATTR_WAITWARN_US, +#endif CLUSTER_ATTR_NEW_RSB_COUNT, CLUSTER_ATTR_RECOVER_CALLBACKS, CLUSTER_ATTR_CLUSTER_NAME, @@ -224,8 +226,9 @@ CLUSTER_ATTR(log_debug, NULL); CLUSTER_ATTR(log_info, NULL); CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running); CLUSTER_ATTR(mark, NULL); +#ifdef CONFIG_DLM_DEPRECATED_API CLUSTER_ATTR(timewarn_cs, dlm_check_zero); -CLUSTER_ATTR(waitwarn_us, NULL); +#endif CLUSTER_ATTR(new_rsb_count, NULL); CLUSTER_ATTR(recover_callbacks, NULL); @@ -240,8 +243,9 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, [CLUSTER_ATTR_MARK] = &cluster_attr_mark, +#ifdef CONFIG_DLM_DEPRECATED_API [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs, - [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us, +#endif [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count, [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks, [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name, @@ -432,8 +436,9 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_log_debug = dlm_config.ci_log_debug; cl->cl_log_info = dlm_config.ci_log_info; cl->cl_protocol = dlm_config.ci_protocol; +#ifdef CONFIG_DLM_DEPRECATED_API cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; - cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; +#endif cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks; memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name, @@ -954,8 +959,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_LOG_INFO 1 #define DEFAULT_PROTOCOL DLM_PROTO_TCP #define DEFAULT_MARK 0 +#ifdef CONFIG_DLM_DEPRECATED_API #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ -#define DEFAULT_WAITWARN_US 0 +#endif #define DEFAULT_NEW_RSB_COUNT 128 #define DEFAULT_RECOVER_CALLBACKS 0 #define DEFAULT_CLUSTER_NAME "" @@ -971,8 +977,9 @@ struct dlm_config_info dlm_config = { .ci_log_info = DEFAULT_LOG_INFO, .ci_protocol = DEFAULT_PROTOCOL, .ci_mark = DEFAULT_MARK, +#ifdef CONFIG_DLM_DEPRECATED_API .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, - .ci_waitwarn_us = DEFAULT_WAITWARN_US, +#endif .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, .ci_cluster_name = DEFAULT_CLUSTER_NAME diff --git a/fs/dlm/config.h b/fs/dlm/config.h index df92b0a07fc6..55c5f2c13ebd 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -37,8 +37,9 @@ struct dlm_config_info { int ci_log_info; int ci_protocol; int ci_mark; +#ifdef CONFIG_DLM_DEPRECATED_API int ci_timewarn_cs; - int ci_waitwarn_us; +#endif int ci_new_rsb_count; int ci_recover_callbacks; char ci_cluster_name[DLM_LOCKSPACE_LEN]; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 776c3ed519f0..8aca8085d24e 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -145,7 +145,9 @@ struct dlm_args { void (*bastfn) (void *astparam, int mode); int mode; struct dlm_lksb *lksb; +#ifdef CONFIG_DLM_DEPRECATED_API unsigned long timeout; +#endif }; @@ -203,10 +205,20 @@ struct dlm_args { #define DLM_IFL_OVERLAP_UNLOCK 0x00080000 #define DLM_IFL_OVERLAP_CANCEL 0x00100000 #define DLM_IFL_ENDOFLIFE 0x00200000 +#ifdef CONFIG_DLM_DEPRECATED_API #define DLM_IFL_WATCH_TIMEWARN 0x00400000 #define DLM_IFL_TIMEOUT_CANCEL 0x00800000 +#endif #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 #define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ +/* least significant 2 bytes are message changed, they are full transmitted + * but at receive side only the 2 bytes LSB will be set. + * + * Even wireshark dlm dissector does only evaluate the lower bytes and note + * that they may not be used on transceiver side, we assume the higher bytes + * are for internal use or reserved so long they are not parsed on receiver + * side. + */ #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 @@ -249,10 +261,12 @@ struct dlm_lkb { struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */ struct list_head lkb_wait_reply; /* waiting for remote reply */ struct list_head lkb_ownqueue; /* list of locks for a process */ - struct list_head lkb_time_list; ktime_t lkb_timestamp; - ktime_t lkb_wait_time; + +#ifdef CONFIG_DLM_DEPRECATED_API + struct list_head lkb_time_list; unsigned long lkb_timeout_cs; +#endif struct mutex lkb_cb_mutex; struct work_struct lkb_cb_work; @@ -568,8 +582,10 @@ struct dlm_ls { struct mutex ls_orphans_mutex; struct list_head ls_orphans; +#ifdef CONFIG_DLM_DEPRECATED_API struct mutex ls_timeout_mutex; struct list_head ls_timeout; +#endif spinlock_t ls_new_rsb_spin; int ls_new_rsb_count; @@ -606,8 +622,8 @@ struct dlm_ls { wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; - struct completion ls_members_done; - int ls_members_result; + struct completion ls_recovery_done; + int ls_recovery_result; struct miscdevice ls_device; @@ -688,7 +704,9 @@ struct dlm_ls { #define LSFL_RCOM_READY 5 #define LSFL_RCOM_WAIT 6 #define LSFL_UEVENT_WAIT 7 +#ifdef CONFIG_DLM_DEPRECATED_API #define LSFL_TIMEWARN 8 +#endif #define LSFL_CB_DELAY 9 #define LSFL_NODIR 10 @@ -741,9 +759,15 @@ static inline int dlm_no_directory(struct dlm_ls *ls) return test_bit(LSFL_NODIR, &ls->ls_flags); } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_netlink_init(void); void dlm_netlink_exit(void); void dlm_timeout_warn(struct dlm_lkb *lkb); +#else +static inline int dlm_netlink_init(void) { return 0; } +static inline void dlm_netlink_exit(void) { }; +static inline void dlm_timeout_warn(struct dlm_lkb *lkb) { }; +#endif int dlm_plock_init(void); void dlm_plock_exit(void); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 226822f49d30..dac7eb75dba9 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -296,12 +296,14 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb);); +#ifdef CONFIG_DLM_DEPRECATED_API /* if the operation was a cancel, then return -DLM_ECANCEL, if a timeout caused the cancel then return -ETIMEDOUT */ if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) { lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL; rv = -ETIMEDOUT; } +#endif if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) { lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL; @@ -1210,7 +1212,9 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, kref_init(&lkb->lkb_ref); INIT_LIST_HEAD(&lkb->lkb_ownqueue); INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); +#ifdef CONFIG_DLM_DEPRECATED_API INIT_LIST_HEAD(&lkb->lkb_time_list); +#endif INIT_LIST_HEAD(&lkb->lkb_cb_list); mutex_init(&lkb->lkb_cb_mutex); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); @@ -1306,6 +1310,13 @@ static inline void hold_lkb(struct dlm_lkb *lkb) kref_get(&lkb->lkb_ref); } +static void unhold_lkb_assert(struct kref *kref) +{ + struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref); + + DLM_ASSERT(false, dlm_print_lkb(lkb);); +} + /* This is called when we need to remove a reference and are certain it's not the last ref. e.g. del_lkb is always called between a find_lkb/put_lkb and is always the inverse of a previous add_lkb. @@ -1313,9 +1324,7 @@ static inline void hold_lkb(struct dlm_lkb *lkb) static inline void unhold_lkb(struct dlm_lkb *lkb) { - int rv; - rv = kref_put(&lkb->lkb_ref, kill_lkb); - DLM_ASSERT(!rv, dlm_print_lkb(lkb);); + kref_put(&lkb->lkb_ref, unhold_lkb_assert); } static void lkb_add_ordered(struct list_head *new, struct list_head *head, @@ -1402,75 +1411,6 @@ static int msg_reply_type(int mstype) return -1; } -static int nodeid_warned(int nodeid, int num_nodes, int *warned) -{ - int i; - - for (i = 0; i < num_nodes; i++) { - if (!warned[i]) { - warned[i] = nodeid; - return 0; - } - if (warned[i] == nodeid) - return 1; - } - return 0; -} - -void dlm_scan_waiters(struct dlm_ls *ls) -{ - struct dlm_lkb *lkb; - s64 us; - s64 debug_maxus = 0; - u32 debug_scanned = 0; - u32 debug_expired = 0; - int num_nodes = 0; - int *warned = NULL; - - if (!dlm_config.ci_waitwarn_us) - return; - - mutex_lock(&ls->ls_waiters_mutex); - - list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { - if (!lkb->lkb_wait_time) - continue; - - debug_scanned++; - - us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time)); - - if (us < dlm_config.ci_waitwarn_us) - continue; - - lkb->lkb_wait_time = 0; - - debug_expired++; - if (us > debug_maxus) - debug_maxus = us; - - if (!num_nodes) { - num_nodes = ls->ls_num_nodes; - warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL); - } - if (!warned) - continue; - if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned)) - continue; - - log_error(ls, "waitwarn %x %lld %d us check connection to " - "node %d", lkb->lkb_id, (long long)us, - dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid); - } - mutex_unlock(&ls->ls_waiters_mutex); - kfree(warned); - - if (debug_expired) - log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us", - debug_scanned, debug_expired, - dlm_config.ci_waitwarn_us, (long long)debug_maxus); -} - /* add/remove lkb from global waiters list of lkb's waiting for a reply from a remote node */ @@ -1514,7 +1454,6 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) lkb->lkb_wait_count++; lkb->lkb_wait_type = mstype; - lkb->lkb_wait_time = ktime_get(); lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */ hold_lkb(lkb); list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); @@ -1842,6 +1781,7 @@ void dlm_scan_rsbs(struct dlm_ls *ls) } } +#ifdef CONFIG_DLM_DEPRECATED_API static void add_timeout(struct dlm_lkb *lkb) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; @@ -1962,17 +1902,11 @@ void dlm_adjust_timeouts(struct dlm_ls *ls) list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); mutex_unlock(&ls->ls_timeout_mutex); - - if (!dlm_config.ci_waitwarn_us) - return; - - mutex_lock(&ls->ls_waiters_mutex); - list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { - if (ktime_to_us(lkb->lkb_wait_time)) - lkb->lkb_wait_time = ktime_get(); - } - mutex_unlock(&ls->ls_waiters_mutex); } +#else +static void add_timeout(struct dlm_lkb *lkb) { } +static void del_timeout(struct dlm_lkb *lkb) { } +#endif /* lkb is master or local copy */ @@ -2837,12 +2771,20 @@ static void confirm_master(struct dlm_rsb *r, int error) } } +#ifdef CONFIG_DLM_DEPRECATED_API static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, int namelen, unsigned long timeout_cs, void (*ast) (void *astparam), void *astparam, void (*bast) (void *astparam, int mode), struct dlm_args *args) +#else +static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, + int namelen, void (*ast)(void *astparam), + void *astparam, + void (*bast)(void *astparam, int mode), + struct dlm_args *args) +#endif { int rv = -EINVAL; @@ -2895,7 +2837,9 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, args->astfn = ast; args->astparam = astparam; args->bastfn = bast; +#ifdef CONFIG_DLM_DEPRECATED_API args->timeout = timeout_cs; +#endif args->mode = mode; args->lksb = lksb; rv = 0; @@ -2951,7 +2895,9 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_lksb = args->lksb; lkb->lkb_lvbptr = args->lksb->sb_lvbptr; lkb->lkb_ownpid = (int) current->pid; +#ifdef CONFIG_DLM_DEPRECATED_API lkb->lkb_timeout_cs = args->timeout; +#endif rv = 0; out: if (rv) @@ -3472,10 +3418,15 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error) goto out; - trace_dlm_lock_start(ls, lkb, mode, flags); + trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags); +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, lksb, flags, namelen, 0, ast, astarg, bast, &args); +#else + error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast, + &args); +#endif if (error) goto out_put; @@ -3487,7 +3438,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error == -EINPROGRESS) error = 0; out_put: - trace_dlm_lock_end(ls, lkb, mode, flags, error); + trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error); if (convert || error) __put_lkb(ls, lkb); @@ -5839,9 +5790,14 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) return 0; } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen, unsigned long timeout_cs) +#else +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, + int mode, uint32_t flags, void *name, unsigned int namelen) +#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5864,8 +5820,13 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out; } } +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, fake_astfn, ua, fake_bastfn, &args); +#else + error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua, + fake_bastfn, &args); +#endif if (error) { kfree(ua->lksb.sb_lvbptr); ua->lksb.sb_lvbptr = NULL; @@ -5904,9 +5865,14 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, return error; } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs) +#else +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in) +#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5941,8 +5907,13 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, ua->bastaddr = ua_tmp->bastaddr; ua->user_lksb = ua_tmp->user_lksb; +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, fake_astfn, ua, fake_bastfn, &args); +#else + error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua, + fake_bastfn, &args); +#endif if (error) goto out_put; @@ -5966,7 +5937,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, void *name, unsigned int namelen, - unsigned long timeout_cs, uint32_t *lkid) + uint32_t *lkid) { struct dlm_lkb *lkb = NULL, *iter; struct dlm_user_args *ua; diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 252a5898f908..a7b6474f009d 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -24,9 +24,15 @@ int dlm_put_lkb(struct dlm_lkb *lkb); void dlm_scan_rsbs(struct dlm_ls *ls); int dlm_lock_recovery_try(struct dlm_ls *ls); void dlm_unlock_recovery(struct dlm_ls *ls); -void dlm_scan_waiters(struct dlm_ls *ls); + +#ifdef CONFIG_DLM_DEPRECATED_API void dlm_scan_timeout(struct dlm_ls *ls); void dlm_adjust_timeouts(struct dlm_ls *ls); +#else +static inline void dlm_scan_timeout(struct dlm_ls *ls) { } +static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { } +#endif + int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, unsigned int flags, int *r_nodeid, int *result); @@ -41,15 +47,22 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls); int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen, unsigned long timeout_cs); int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs); +#else +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, + uint32_t flags, void *name, unsigned int namelen); +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in); +#endif int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, void *name, unsigned int namelen, - unsigned long timeout_cs, uint32_t *lkid); + uint32_t *lkid); int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, uint32_t flags, uint32_t lkid, char *lvb_in); int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 19ed41a5da93..3972f4d86c75 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -275,7 +275,6 @@ static int dlm_scand(void *data) ls->ls_scan_time = jiffies; dlm_scan_rsbs(ls); dlm_scan_timeout(ls); - dlm_scan_waiters(ls); dlm_unlock_recovery(ls); } else { ls->ls_scan_time += HZ; @@ -490,13 +489,28 @@ static int new_lockspace(const char *name, const char *cluster, ls->ls_ops_arg = ops_arg; } - if (flags & DLM_LSFL_TIMEWARN) +#ifdef CONFIG_DLM_DEPRECATED_API + if (flags & DLM_LSFL_TIMEWARN) { + pr_warn_once("===============================================================\n" + "WARNING: the dlm DLM_LSFL_TIMEWARN flag is being deprecated and\n" + " will be removed in v6.2!\n" + " Inclusive DLM_LSFL_TIMEWARN define in UAPI header!\n" + "===============================================================\n"); + set_bit(LSFL_TIMEWARN, &ls->ls_flags); + } /* ls_exflags are forced to match among nodes, and we don't - need to require all nodes to have some flags set */ + * need to require all nodes to have some flags set + */ ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); +#else + /* ls_exflags are forced to match among nodes, and we don't + * need to require all nodes to have some flags set + */ + ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); +#endif size = READ_ONCE(dlm_config.ci_rsbtbl_size); ls->ls_rsbtbl_size = size; @@ -527,8 +541,10 @@ static int new_lockspace(const char *name, const char *cluster, mutex_init(&ls->ls_waiters_mutex); INIT_LIST_HEAD(&ls->ls_orphans); mutex_init(&ls->ls_orphans_mutex); +#ifdef CONFIG_DLM_DEPRECATED_API INIT_LIST_HEAD(&ls->ls_timeout); mutex_init(&ls->ls_timeout_mutex); +#endif INIT_LIST_HEAD(&ls->ls_new_rsb); spin_lock_init(&ls->ls_new_rsb_spin); @@ -548,8 +564,8 @@ static int new_lockspace(const char *name, const char *cluster, init_waitqueue_head(&ls->ls_uevent_wait); ls->ls_uevent_result = 0; - init_completion(&ls->ls_members_done); - ls->ls_members_result = -1; + init_completion(&ls->ls_recovery_done); + ls->ls_recovery_result = -1; mutex_init(&ls->ls_cb_mutex); INIT_LIST_HEAD(&ls->ls_cb_delay); @@ -645,8 +661,9 @@ static int new_lockspace(const char *name, const char *cluster, if (error) goto out_recoverd; - wait_for_completion(&ls->ls_members_done); - error = ls->ls_members_result; + /* wait until recovery is successful or failed */ + wait_for_completion(&ls->ls_recovery_done); + error = ls->ls_recovery_result; if (error) goto out_members; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 19e82f08c0e0..a4e84e8d94c8 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -529,7 +529,7 @@ static void lowcomms_write_space(struct sock *sk) return; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { - log_print("successful connected to node %d", con->nodeid); + log_print("connected to node %d", con->nodeid); queue_work(send_workqueue, &con->swork); return; } @@ -1931,7 +1931,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock, return ret; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) - log_print("successful connected to node %d", con->nodeid); + log_print("connected to node %d", con->nodeid); return 0; } diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 98084e0cfccf..2af2ccfe43a9 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -534,7 +534,11 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) int i, error, neg = 0, low = -1; /* previously removed members that we've not finished removing need to - count as a negative change so the "neg" recovery steps will happen */ + * count as a negative change so the "neg" recovery steps will happen + * + * This functionality must report all member changes to lsops or + * midcomms layer and must never return before. + */ list_for_each_entry(memb, &ls->ls_nodes_gone, list) { log_rinfo(ls, "prev removed member %d", memb->nodeid); @@ -583,19 +587,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) *neg_out = neg; error = ping_members(ls); - /* error -EINTR means that a new recovery action is triggered. - * We ignore this recovery action and let run the new one which might - * have new member configuration. - */ - if (error == -EINTR) - error = 0; - - /* new_lockspace() may be waiting to know if the config - * is good or bad - */ - ls->ls_members_result = error; - complete(&ls->ls_members_done); - log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; } @@ -675,7 +666,16 @@ int dlm_ls_stop(struct dlm_ls *ls) if (!ls->ls_recover_begin) ls->ls_recover_begin = jiffies; - dlm_lsop_recover_prep(ls); + /* call recover_prep ops only once and not multiple times + * for each possible dlm_ls_stop() when recovery is already + * stopped. + * + * If we successful was able to clear LSFL_RUNNING bit and + * it was set we know it is the first dlm_ls_stop() call. + */ + if (new) + dlm_lsop_recover_prep(ls); + return 0; } diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 0993eebf2060..737f185aad8d 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -29,6 +29,8 @@ struct plock_async_data { struct plock_op { struct list_head list; int done; + /* if lock op got interrupted while waiting dlm_controld reply */ + bool sigint; struct dlm_plock_info info; /* if set indicates async handling */ struct plock_async_data *data; @@ -79,8 +81,7 @@ static void send_op(struct plock_op *op) abandoned waiter. So, we have to insert the unlock-close when the lock call is interrupted. */ -static void do_unlock_close(struct dlm_ls *ls, u64 number, - struct file *file, struct file_lock *fl) +static void do_unlock_close(const struct dlm_plock_info *info) { struct plock_op *op; @@ -89,15 +90,12 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number, return; op->info.optype = DLM_PLOCK_OP_UNLOCK; - op->info.pid = fl->fl_pid; - op->info.fsid = ls->ls_global_id; - op->info.number = number; + op->info.pid = info->pid; + op->info.fsid = info->fsid; + op->info.number = info->number; op->info.start = 0; op->info.end = OFFSET_MAX; - if (fl->fl_lmops && fl->fl_lmops->lm_grant) - op->info.owner = (__u64) fl->fl_pid; - else - op->info.owner = (__u64)(long) fl->fl_owner; + op->info.owner = info->owner; op->info.flags |= DLM_PLOCK_FL_CLOSE; send_op(op); @@ -161,16 +159,24 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, rv = wait_event_interruptible(recv_wq, (op->done != 0)); if (rv == -ERESTARTSYS) { spin_lock(&ops_lock); - list_del(&op->list); + /* recheck under ops_lock if we got a done != 0, + * if so this interrupt case should be ignored + */ + if (op->done != 0) { + spin_unlock(&ops_lock); + goto do_lock_wait; + } + + op->sigint = true; spin_unlock(&ops_lock); - log_print("%s: wait interrupted %x %llx, op removed", + log_debug(ls, "%s: wait interrupted %x %llx pid %d", __func__, ls->ls_global_id, - (unsigned long long)number); - dlm_release_plock_op(op); - do_unlock_close(ls, number, file, fl); + (unsigned long long)number, op->info.pid); goto out; } +do_lock_wait: + WARN_ON(!list_empty(&op->list)); rv = op->info.rv; @@ -378,7 +384,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, spin_lock(&ops_lock); if (!list_empty(&send_list)) { - op = list_entry(send_list.next, struct plock_op, list); + op = list_first_entry(&send_list, struct plock_op, list); if (op->info.flags & DLM_PLOCK_FL_CLOSE) list_del(&op->list); else @@ -425,6 +431,19 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, if (iter->info.fsid == info.fsid && iter->info.number == info.number && iter->info.owner == info.owner) { + if (iter->sigint) { + list_del(&iter->list); + spin_unlock(&ops_lock); + + pr_debug("%s: sigint cleanup %x %llx pid %d", + __func__, iter->info.fsid, + (unsigned long long)iter->info.number, + iter->info.pid); + do_unlock_close(&iter->info); + memcpy(&iter->info, &info, sizeof(info)); + dlm_release_plock_op(iter); + return count; + } list_del_init(&iter->list); memcpy(&iter->info, &info, sizeof(info)); if (iter->data) @@ -443,7 +462,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, else wake_up(&recv_wq); } else - log_print("%s: no op %x %llx - may got interrupted?", __func__, + log_print("%s: no op %x %llx", __func__, info.fsid, (unsigned long long)info.number); return count; } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index a55dfce705dd..e15eb511b04b 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -70,6 +70,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) /* * Add or remove nodes from the lockspace's ls_nodes list. + * + * Due to the fact that we must report all membership changes to lsops + * or midcomms layer, it is not permitted to abort ls_recover() until + * this is done. */ error = dlm_recover_members(ls, rv, &neg); @@ -239,14 +243,12 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); - dlm_lsop_recover_done(ls); return 0; fail: dlm_release_root_list(ls); - log_rinfo(ls, "dlm_recover %llu error %d", - (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); + return error; } @@ -257,6 +259,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) static void do_ls_recovery(struct dlm_ls *ls) { struct dlm_recover *rv = NULL; + int error; spin_lock(&ls->ls_recover_lock); rv = ls->ls_recover_args; @@ -266,7 +269,31 @@ static void do_ls_recovery(struct dlm_ls *ls) spin_unlock(&ls->ls_recover_lock); if (rv) { - ls_recover(ls, rv); + error = ls_recover(ls, rv); + switch (error) { + case 0: + ls->ls_recovery_result = 0; + complete(&ls->ls_recovery_done); + + dlm_lsop_recover_done(ls); + break; + case -EINTR: + /* if recovery was interrupted -EINTR we wait for the next + * ls_recover() iteration until it hopefully succeeds. + */ + log_rinfo(ls, "%s %llu interrupted and should be queued to run again", + __func__, (unsigned long long)rv->seq); + break; + default: + log_rinfo(ls, "%s %llu error %d", __func__, + (unsigned long long)rv->seq, error); + + /* let new_lockspace() get aware of critical error */ + ls->ls_recovery_result = error; + complete(&ls->ls_recovery_done); + break; + } + kfree(rv->nodes); kfree(rv); } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1060b24f18d4..99e8f0744513 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -250,6 +250,14 @@ static int device_user_lock(struct dlm_user_proc *proc, goto out; } +#ifdef CONFIG_DLM_DEPRECATED_API + if (params->timeout) + pr_warn_once("========================================================\n" + "WARNING: the lkb timeout feature is being deprecated and\n" + " will be removed in v6.2!\n" + "========================================================\n"); +#endif + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); if (!ua) goto out; @@ -262,23 +270,34 @@ static int device_user_lock(struct dlm_user_proc *proc, ua->xid = params->xid; if (params->flags & DLM_LKF_CONVERT) { +#ifdef CONFIG_DLM_DEPRECATED_API error = dlm_user_convert(ls, ua, params->mode, params->flags, params->lkid, params->lvb, (unsigned long) params->timeout); +#else + error = dlm_user_convert(ls, ua, + params->mode, params->flags, + params->lkid, params->lvb); +#endif } else if (params->flags & DLM_LKF_ORPHAN) { error = dlm_user_adopt_orphan(ls, ua, params->mode, params->flags, params->name, params->namelen, - (unsigned long) params->timeout, &lkid); if (!error) error = lkid; } else { +#ifdef CONFIG_DLM_DEPRECATED_API error = dlm_user_request(ls, ua, params->mode, params->flags, params->name, params->namelen, (unsigned long) params->timeout); +#else + error = dlm_user_request(ls, ua, + params->mode, params->flags, + params->name, params->namelen); +#endif if (!error) error = ua->lksb.sb_lkid; } diff --git a/fs/efivarfs/Makefile b/fs/efivarfs/Makefile index 0b1c5e63eb71..7bfc2f9754a8 100644 --- a/fs/efivarfs/Makefile +++ b/fs/efivarfs/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_EFIVAR_FS) += efivarfs.o -efivarfs-objs := inode.o file.o super.o +efivarfs-objs := inode.o file.o super.o vars.o diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index 30ae44cb7453..8ebf3a6a8aa2 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -7,6 +7,46 @@ #define EFIVAR_FS_INTERNAL_H #include <linux/list.h> +#include <linux/efi.h> + +struct efi_variable { + efi_char16_t VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)]; + efi_guid_t VendorGuid; + unsigned long DataSize; + __u8 Data[1024]; + efi_status_t Status; + __u32 Attributes; +} __attribute__((packed)); + +struct efivar_entry { + struct efi_variable var; + struct list_head list; + struct kobject kobj; +}; + +int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), + void *data, bool duplicates, struct list_head *head); + +int efivar_entry_add(struct efivar_entry *entry, struct list_head *head); +void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head); +void efivar_entry_remove(struct efivar_entry *entry); +int efivar_entry_delete(struct efivar_entry *entry); + +int efivar_entry_size(struct efivar_entry *entry, unsigned long *size); +int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes, + unsigned long *size, void *data); +int efivar_entry_get(struct efivar_entry *entry, u32 *attributes, + unsigned long *size, void *data); +int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, + unsigned long *size, void *data, bool *set); + +int efivar_entry_iter(int (*func)(struct efivar_entry *, void *), + struct list_head *head, void *data); + +bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data, + unsigned long data_size); +bool efivar_variable_is_removable(efi_guid_t vendor, const char *name, + size_t len); extern const struct file_operations efivarfs_file_operations; extern const struct inode_operations efivarfs_dir_inode_operations; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 15880a68faad..6780fc81cc11 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -155,10 +155,8 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, goto fail_inode; } - efivar_entry_size(entry, &size); - err = efivar_entry_add(entry, &efivarfs_list); - if (err) - goto fail_inode; + __efivar_entry_get(entry, NULL, &size, NULL); + __efivar_entry_add(entry, &efivarfs_list); /* copied by the above to local storage in the dentry. */ kfree(name); @@ -182,10 +180,7 @@ fail: static int efivarfs_destroy(struct efivar_entry *entry, void *data) { - int err = efivar_entry_remove(entry); - - if (err) - return err; + efivar_entry_remove(entry); kfree(entry); return 0; } @@ -221,7 +216,7 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list); if (err) - __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); + efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL); return err; } @@ -246,7 +241,7 @@ static void efivarfs_kill_sb(struct super_block *sb) kill_litter_super(sb); /* Remove all entries and destroy */ - __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); + efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL); } static struct file_system_type efivarfs_type = { diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c new file mode 100644 index 000000000000..a0ef63cfcecb --- /dev/null +++ b/fs/efivarfs/vars.c @@ -0,0 +1,738 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Originally from efivars.c + * + * Copyright (C) 2001,2003,2004 Dell <Matt_Domsch@dell.com> + * Copyright (C) 2004 Intel Corporation <matthew.e.tolentino@intel.com> + */ + +#include <linux/capability.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/smp.h> +#include <linux/efi.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/ucs2_string.h> + +#include "internal.h" + +MODULE_IMPORT_NS(EFIVAR); + +static bool +validate_device_path(efi_char16_t *var_name, int match, u8 *buffer, + unsigned long len) +{ + struct efi_generic_dev_path *node; + int offset = 0; + + node = (struct efi_generic_dev_path *)buffer; + + if (len < sizeof(*node)) + return false; + + while (offset <= len - sizeof(*node) && + node->length >= sizeof(*node) && + node->length <= len - offset) { + offset += node->length; + + if ((node->type == EFI_DEV_END_PATH || + node->type == EFI_DEV_END_PATH2) && + node->sub_type == EFI_DEV_END_ENTIRE) + return true; + + node = (struct efi_generic_dev_path *)(buffer + offset); + } + + /* + * If we're here then either node->length pointed past the end + * of the buffer or we reached the end of the buffer without + * finding a device path end node. + */ + return false; +} + +static bool +validate_boot_order(efi_char16_t *var_name, int match, u8 *buffer, + unsigned long len) +{ + /* An array of 16-bit integers */ + if ((len % 2) != 0) + return false; + + return true; +} + +static bool +validate_load_option(efi_char16_t *var_name, int match, u8 *buffer, + unsigned long len) +{ + u16 filepathlength; + int i, desclength = 0, namelen; + + namelen = ucs2_strnlen(var_name, EFI_VAR_NAME_LEN); + + /* Either "Boot" or "Driver" followed by four digits of hex */ + for (i = match; i < match+4; i++) { + if (var_name[i] > 127 || + hex_to_bin(var_name[i] & 0xff) < 0) + return true; + } + + /* Reject it if there's 4 digits of hex and then further content */ + if (namelen > match + 4) + return false; + + /* A valid entry must be at least 8 bytes */ + if (len < 8) + return false; + + filepathlength = buffer[4] | buffer[5] << 8; + + /* + * There's no stored length for the description, so it has to be + * found by hand + */ + desclength = ucs2_strsize((efi_char16_t *)(buffer + 6), len - 6) + 2; + + /* Each boot entry must have a descriptor */ + if (!desclength) + return false; + + /* + * If the sum of the length of the description, the claimed filepath + * length and the original header are greater than the length of the + * variable, it's malformed + */ + if ((desclength + filepathlength + 6) > len) + return false; + + /* + * And, finally, check the filepath + */ + return validate_device_path(var_name, match, buffer + desclength + 6, + filepathlength); +} + +static bool +validate_uint16(efi_char16_t *var_name, int match, u8 *buffer, + unsigned long len) +{ + /* A single 16-bit integer */ + if (len != 2) + return false; + + return true; +} + +static bool +validate_ascii_string(efi_char16_t *var_name, int match, u8 *buffer, + unsigned long len) +{ + int i; + + for (i = 0; i < len; i++) { + if (buffer[i] > 127) + return false; + + if (buffer[i] == 0) + return true; + } + + return false; +} + +struct variable_validate { + efi_guid_t vendor; + char *name; + bool (*validate)(efi_char16_t *var_name, int match, u8 *data, + unsigned long len); +}; + +/* + * This is the list of variables we need to validate, as well as the + * whitelist for what we think is safe not to default to immutable. + * + * If it has a validate() method that's not NULL, it'll go into the + * validation routine. If not, it is assumed valid, but still used for + * whitelisting. + * + * Note that it's sorted by {vendor,name}, but globbed names must come after + * any other name with the same prefix. + */ +static const struct variable_validate variable_validate[] = { + { EFI_GLOBAL_VARIABLE_GUID, "BootNext", validate_uint16 }, + { EFI_GLOBAL_VARIABLE_GUID, "BootOrder", validate_boot_order }, + { EFI_GLOBAL_VARIABLE_GUID, "Boot*", validate_load_option }, + { EFI_GLOBAL_VARIABLE_GUID, "DriverOrder", validate_boot_order }, + { EFI_GLOBAL_VARIABLE_GUID, "Driver*", validate_load_option }, + { EFI_GLOBAL_VARIABLE_GUID, "ConIn", validate_device_path }, + { EFI_GLOBAL_VARIABLE_GUID, "ConInDev", validate_device_path }, + { EFI_GLOBAL_VARIABLE_GUID, "ConOut", validate_device_path }, + { EFI_GLOBAL_VARIABLE_GUID, "ConOutDev", validate_device_path }, + { EFI_GLOBAL_VARIABLE_GUID, "ErrOut", validate_device_path }, + { EFI_GLOBAL_VARIABLE_GUID, "ErrOutDev", validate_device_path }, + { EFI_GLOBAL_VARIABLE_GUID, "Lang", validate_ascii_string }, + { EFI_GLOBAL_VARIABLE_GUID, "OsIndications", NULL }, + { EFI_GLOBAL_VARIABLE_GUID, "PlatformLang", validate_ascii_string }, + { EFI_GLOBAL_VARIABLE_GUID, "Timeout", validate_uint16 }, + { LINUX_EFI_CRASH_GUID, "*", NULL }, + { NULL_GUID, "", NULL }, +}; + +/* + * Check if @var_name matches the pattern given in @match_name. + * + * @var_name: an array of @len non-NUL characters. + * @match_name: a NUL-terminated pattern string, optionally ending in "*". A + * final "*" character matches any trailing characters @var_name, + * including the case when there are none left in @var_name. + * @match: on output, the number of non-wildcard characters in @match_name + * that @var_name matches, regardless of the return value. + * @return: whether @var_name fully matches @match_name. + */ +static bool +variable_matches(const char *var_name, size_t len, const char *match_name, + int *match) +{ + for (*match = 0; ; (*match)++) { + char c = match_name[*match]; + + switch (c) { + case '*': + /* Wildcard in @match_name means we've matched. */ + return true; + + case '\0': + /* @match_name has ended. Has @var_name too? */ + return (*match == len); + + default: + /* + * We've reached a non-wildcard char in @match_name. + * Continue only if there's an identical character in + * @var_name. + */ + if (*match < len && c == var_name[*match]) + continue; + return false; + } + } +} + +bool +efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data, + unsigned long data_size) +{ + int i; + unsigned long utf8_size; + u8 *utf8_name; + + utf8_size = ucs2_utf8size(var_name); + utf8_name = kmalloc(utf8_size + 1, GFP_KERNEL); + if (!utf8_name) + return false; + + ucs2_as_utf8(utf8_name, var_name, utf8_size); + utf8_name[utf8_size] = '\0'; + + for (i = 0; variable_validate[i].name[0] != '\0'; i++) { + const char *name = variable_validate[i].name; + int match = 0; + + if (efi_guidcmp(vendor, variable_validate[i].vendor)) + continue; + + if (variable_matches(utf8_name, utf8_size+1, name, &match)) { + if (variable_validate[i].validate == NULL) + break; + kfree(utf8_name); + return variable_validate[i].validate(var_name, match, + data, data_size); + } + } + kfree(utf8_name); + return true; +} + +bool +efivar_variable_is_removable(efi_guid_t vendor, const char *var_name, + size_t len) +{ + int i; + bool found = false; + int match = 0; + + /* + * Check if our variable is in the validated variables list + */ + for (i = 0; variable_validate[i].name[0] != '\0'; i++) { + if (efi_guidcmp(variable_validate[i].vendor, vendor)) + continue; + + if (variable_matches(var_name, len, + variable_validate[i].name, &match)) { + found = true; + break; + } + } + + /* + * If it's in our list, it is removable. + */ + return found; +} + +static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor, + struct list_head *head) +{ + struct efivar_entry *entry, *n; + unsigned long strsize1, strsize2; + bool found = false; + + strsize1 = ucs2_strsize(variable_name, 1024); + list_for_each_entry_safe(entry, n, head, list) { + strsize2 = ucs2_strsize(entry->var.VariableName, 1024); + if (strsize1 == strsize2 && + !memcmp(variable_name, &(entry->var.VariableName), + strsize2) && + !efi_guidcmp(entry->var.VendorGuid, + *vendor)) { + found = true; + break; + } + } + return found; +} + +/* + * Returns the size of variable_name, in bytes, including the + * terminating NULL character, or variable_name_size if no NULL + * character is found among the first variable_name_size bytes. + */ +static unsigned long var_name_strnsize(efi_char16_t *variable_name, + unsigned long variable_name_size) +{ + unsigned long len; + efi_char16_t c; + + /* + * The variable name is, by definition, a NULL-terminated + * string, so make absolutely sure that variable_name_size is + * the value we expect it to be. If not, return the real size. + */ + for (len = 2; len <= variable_name_size; len += sizeof(c)) { + c = variable_name[(len / sizeof(c)) - 1]; + if (!c) + break; + } + + return min(len, variable_name_size); +} + +/* + * Print a warning when duplicate EFI variables are encountered and + * disable the sysfs workqueue since the firmware is buggy. + */ +static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, + unsigned long len16) +{ + size_t i, len8 = len16 / sizeof(efi_char16_t); + char *str8; + + str8 = kzalloc(len8, GFP_KERNEL); + if (!str8) + return; + + for (i = 0; i < len8; i++) + str8[i] = str16[i]; + + printk(KERN_WARNING "efivars: duplicate variable: %s-%pUl\n", + str8, vendor_guid); + kfree(str8); +} + +/** + * efivar_init - build the initial list of EFI variables + * @func: callback function to invoke for every variable + * @data: function-specific data to pass to @func + * @duplicates: error if we encounter duplicates on @head? + * @head: initialised head of variable list + * + * Get every EFI variable from the firmware and invoke @func. @func + * should call efivar_entry_add() to build the list of variables. + * + * Returns 0 on success, or a kernel error code on failure. + */ +int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), + void *data, bool duplicates, struct list_head *head) +{ + unsigned long variable_name_size = 1024; + efi_char16_t *variable_name; + efi_status_t status; + efi_guid_t vendor_guid; + int err = 0; + + variable_name = kzalloc(variable_name_size, GFP_KERNEL); + if (!variable_name) { + printk(KERN_ERR "efivars: Memory allocation failed.\n"); + return -ENOMEM; + } + + err = efivar_lock(); + if (err) + goto free; + + /* + * Per EFI spec, the maximum storage allocated for both + * the variable name and variable data is 1024 bytes. + */ + + do { + variable_name_size = 1024; + + status = efivar_get_next_variable(&variable_name_size, + variable_name, + &vendor_guid); + switch (status) { + case EFI_SUCCESS: + variable_name_size = var_name_strnsize(variable_name, + variable_name_size); + + /* + * Some firmware implementations return the + * same variable name on multiple calls to + * get_next_variable(). Terminate the loop + * immediately as there is no guarantee that + * we'll ever see a different variable name, + * and may end up looping here forever. + */ + if (duplicates && + variable_is_present(variable_name, &vendor_guid, + head)) { + dup_variable_bug(variable_name, &vendor_guid, + variable_name_size); + status = EFI_NOT_FOUND; + } else { + err = func(variable_name, vendor_guid, + variable_name_size, data); + if (err) + status = EFI_NOT_FOUND; + } + break; + case EFI_UNSUPPORTED: + err = -EOPNOTSUPP; + status = EFI_NOT_FOUND; + break; + case EFI_NOT_FOUND: + break; + default: + printk(KERN_WARNING "efivars: get_next_variable: status=%lx\n", + status); + status = EFI_NOT_FOUND; + break; + } + + } while (status != EFI_NOT_FOUND); + + efivar_unlock(); +free: + kfree(variable_name); + + return err; +} + +/** + * efivar_entry_add - add entry to variable list + * @entry: entry to add to list + * @head: list head + * + * Returns 0 on success, or a kernel error code on failure. + */ +int efivar_entry_add(struct efivar_entry *entry, struct list_head *head) +{ + int err; + + err = efivar_lock(); + if (err) + return err; + list_add(&entry->list, head); + efivar_unlock(); + + return 0; +} + +/** + * __efivar_entry_add - add entry to variable list + * @entry: entry to add to list + * @head: list head + */ +void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head) +{ + list_add(&entry->list, head); +} + +/** + * efivar_entry_remove - remove entry from variable list + * @entry: entry to remove from list + * + * Returns 0 on success, or a kernel error code on failure. + */ +void efivar_entry_remove(struct efivar_entry *entry) +{ + list_del(&entry->list); +} + +/* + * efivar_entry_list_del_unlock - remove entry from variable list + * @entry: entry to remove + * + * Remove @entry from the variable list and release the list lock. + * + * NOTE: slightly weird locking semantics here - we expect to be + * called with the efivars lock already held, and we release it before + * returning. This is because this function is usually called after + * set_variable() while the lock is still held. + */ +static void efivar_entry_list_del_unlock(struct efivar_entry *entry) +{ + list_del(&entry->list); + efivar_unlock(); +} + +/** + * efivar_entry_delete - delete variable and remove entry from list + * @entry: entry containing variable to delete + * + * Delete the variable from the firmware and remove @entry from the + * variable list. It is the caller's responsibility to free @entry + * once we return. + * + * Returns 0 on success, -EINTR if we can't grab the semaphore, + * converted EFI status code if set_variable() fails. + */ +int efivar_entry_delete(struct efivar_entry *entry) +{ + efi_status_t status; + int err; + + err = efivar_lock(); + if (err) + return err; + + status = efivar_set_variable_locked(entry->var.VariableName, + &entry->var.VendorGuid, + 0, 0, NULL, false); + if (!(status == EFI_SUCCESS || status == EFI_NOT_FOUND)) { + efivar_unlock(); + return efi_status_to_err(status); + } + + efivar_entry_list_del_unlock(entry); + return 0; +} + +/** + * efivar_entry_size - obtain the size of a variable + * @entry: entry for this variable + * @size: location to store the variable's size + */ +int efivar_entry_size(struct efivar_entry *entry, unsigned long *size) +{ + efi_status_t status; + int err; + + *size = 0; + + err = efivar_lock(); + if (err) + return err; + + status = efivar_get_variable(entry->var.VariableName, + &entry->var.VendorGuid, NULL, size, NULL); + efivar_unlock(); + + if (status != EFI_BUFFER_TOO_SMALL) + return efi_status_to_err(status); + + return 0; +} + +/** + * __efivar_entry_get - call get_variable() + * @entry: read data for this variable + * @attributes: variable attributes + * @size: size of @data buffer + * @data: buffer to store variable data + * + * The caller MUST call efivar_entry_iter_begin() and + * efivar_entry_iter_end() before and after the invocation of this + * function, respectively. + */ +int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes, + unsigned long *size, void *data) +{ + efi_status_t status; + + status = efivar_get_variable(entry->var.VariableName, + &entry->var.VendorGuid, + attributes, size, data); + + return efi_status_to_err(status); +} + +/** + * efivar_entry_get - call get_variable() + * @entry: read data for this variable + * @attributes: variable attributes + * @size: size of @data buffer + * @data: buffer to store variable data + */ +int efivar_entry_get(struct efivar_entry *entry, u32 *attributes, + unsigned long *size, void *data) +{ + int err; + + err = efivar_lock(); + if (err) + return err; + err = __efivar_entry_get(entry, attributes, size, data); + efivar_unlock(); + + return 0; +} + +/** + * efivar_entry_set_get_size - call set_variable() and get new size (atomic) + * @entry: entry containing variable to set and get + * @attributes: attributes of variable to be written + * @size: size of data buffer + * @data: buffer containing data to write + * @set: did the set_variable() call succeed? + * + * This is a pretty special (complex) function. See efivarfs_file_write(). + * + * Atomically call set_variable() for @entry and if the call is + * successful, return the new size of the variable from get_variable() + * in @size. The success of set_variable() is indicated by @set. + * + * Returns 0 on success, -EINVAL if the variable data is invalid, + * -ENOSPC if the firmware does not have enough available space, or a + * converted EFI status code if either of set_variable() or + * get_variable() fail. + * + * If the EFI variable does not exist when calling set_variable() + * (EFI_NOT_FOUND), @entry is removed from the variable list. + */ +int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, + unsigned long *size, void *data, bool *set) +{ + efi_char16_t *name = entry->var.VariableName; + efi_guid_t *vendor = &entry->var.VendorGuid; + efi_status_t status; + int err; + + *set = false; + + if (efivar_validate(*vendor, name, data, *size) == false) + return -EINVAL; + + /* + * The lock here protects the get_variable call, the conditional + * set_variable call, and removal of the variable from the efivars + * list (in the case of an authenticated delete). + */ + err = efivar_lock(); + if (err) + return err; + + /* + * Ensure that the available space hasn't shrunk below the safe level + */ + status = check_var_size(attributes, *size + ucs2_strsize(name, 1024)); + if (status != EFI_SUCCESS) { + if (status != EFI_UNSUPPORTED) { + err = efi_status_to_err(status); + goto out; + } + + if (*size > 65536) { + err = -ENOSPC; + goto out; + } + } + + status = efivar_set_variable_locked(name, vendor, attributes, *size, + data, false); + if (status != EFI_SUCCESS) { + err = efi_status_to_err(status); + goto out; + } + + *set = true; + + /* + * Writing to the variable may have caused a change in size (which + * could either be an append or an overwrite), or the variable to be + * deleted. Perform a GetVariable() so we can tell what actually + * happened. + */ + *size = 0; + status = efivar_get_variable(entry->var.VariableName, + &entry->var.VendorGuid, + NULL, size, NULL); + + if (status == EFI_NOT_FOUND) + efivar_entry_list_del_unlock(entry); + else + efivar_unlock(); + + if (status && status != EFI_BUFFER_TOO_SMALL) + return efi_status_to_err(status); + + return 0; + +out: + efivar_unlock(); + return err; + +} + +/** + * efivar_entry_iter - iterate over variable list + * @func: callback function + * @head: head of variable list + * @data: function-specific data to pass to callback + * + * Iterate over the list of EFI variables and call @func with every + * entry on the list. It is safe for @func to remove entries in the + * list via efivar_entry_delete() while iterating. + * + * Some notes for the callback function: + * - a non-zero return value indicates an error and terminates the loop + * - @func is called from atomic context + */ +int efivar_entry_iter(int (*func)(struct efivar_entry *, void *), + struct list_head *head, void *data) +{ + struct efivar_entry *entry, *n; + int err = 0; + + err = efivar_lock(); + if (err) + return err; + + list_for_each_entry_safe(entry, n, head, list) { + err = func(entry, data); + if (err) + break; + } + efivar_unlock(); + + return err; +} diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 19e6c56a9f47..26fa170090b8 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -17,7 +17,7 @@ struct z_erofs_decompress_req { /* indicate the algorithm will be used for decompression */ unsigned int alg; - bool inplace_io, partial_decoding; + bool inplace_io, partial_decoding, fillgaps; }; struct z_erofs_decompressor { diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fbb037ba326e..fe8ac0e163f7 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -366,42 +366,33 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) return iomap_bmap(mapping, block, &erofs_iomap_ops); } -static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to) +static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); - loff_t align = iocb->ki_pos | iov_iter_count(to) | - iov_iter_alignment(to); - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int blksize_mask; - - if (bdev) - blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1; - else - blksize_mask = (1 << inode->i_blkbits) - 1; - if (align & blksize_mask) - return -EINVAL; - return 0; -} - -static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ /* no need taking (shared) inode lock since it's a ro filesystem */ if (!iov_iter_count(to)) return 0; #ifdef CONFIG_FS_DAX - if (IS_DAX(iocb->ki_filp->f_mapping->host)) + if (IS_DAX(inode)) return dax_iomap_rw(iocb, to, &erofs_iomap_ops); #endif if (iocb->ki_flags & IOCB_DIRECT) { - int err = erofs_prepare_dio(iocb, to); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int blksize_mask; + + if (bdev) + blksize_mask = bdev_logical_block_size(bdev) - 1; + else + blksize_mask = (1 << inode->i_blkbits) - 1; + + if ((iocb->ki_pos | iov_iter_count(to) | + iov_iter_alignment(to)) & blksize_mask) + return -EINVAL; - if (!err) - return iomap_dio_rw(iocb, to, &erofs_iomap_ops, - NULL, 0, NULL, 0); - if (err < 0) - return err; + return iomap_dio_rw(iocb, to, &erofs_iomap_ops, + NULL, 0, NULL, 0); } return filemap_read(iocb, to, 0); } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 6dca1900c733..2d55569f96ac 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -83,7 +83,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, j = 0; /* 'valid' bounced can only be tested after a complete round */ - if (test_bit(j, bounced)) { + if (!rq->fillgaps && test_bit(j, bounced)) { DBG_BUGON(i < lz4_max_distance_pages); DBG_BUGON(top >= lz4_max_distance_pages); availables[top++] = rq->out[i - lz4_max_distance_pages]; @@ -91,14 +91,18 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, if (page) { __clear_bit(j, bounced); - if (kaddr) { - if (kaddr + PAGE_SIZE == page_address(page)) + if (!PageHighMem(page)) { + if (!i) { + kaddr = page_address(page); + continue; + } + if (kaddr && + kaddr + PAGE_SIZE == page_address(page)) { kaddr += PAGE_SIZE; - else - kaddr = NULL; - } else if (!i) { - kaddr = page_address(page); + continue; + } } + kaddr = NULL; continue; } kaddr = NULL; diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 05a3063cf2bc..5e59b3f523eb 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -143,6 +143,7 @@ again: DBG_BUGON(z_erofs_lzma_head); z_erofs_lzma_head = head; spin_unlock(&z_erofs_lzma_lock); + wake_up_all(&z_erofs_lzma_wq); z_erofs_lzma_max_dictsize = dict_size; mutex_unlock(&lzma_resize_mutex); diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 18e59821c597..ecf28f66b97d 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -22,10 +22,9 @@ static void debug_one_dentry(unsigned char d_type, const char *de_name, } static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, - void *dentry_blk, unsigned int *ofs, + void *dentry_blk, struct erofs_dirent *de, unsigned int nameoff, unsigned int maxsize) { - struct erofs_dirent *de = dentry_blk + *ofs; const struct erofs_dirent *end = dentry_blk + nameoff; while (de < end) { @@ -59,9 +58,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, /* stopped by some reason */ return 1; ++de; - *ofs += sizeof(struct erofs_dirent); + ctx->pos += sizeof(struct erofs_dirent); } - *ofs = maxsize; return 0; } @@ -90,33 +88,33 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) nameoff = le16_to_cpu(de->nameoff); if (nameoff < sizeof(struct erofs_dirent) || - nameoff >= PAGE_SIZE) { + nameoff >= EROFS_BLKSIZ) { erofs_err(dir->i_sb, "invalid de[0].nameoff %u @ nid %llu", nameoff, EROFS_I(dir)->nid); err = -EFSCORRUPTED; - goto skip_this; + break; } maxsize = min_t(unsigned int, - dirsize - ctx->pos + ofs, PAGE_SIZE); + dirsize - ctx->pos + ofs, EROFS_BLKSIZ); /* search dirents at the arbitrary position */ if (initial) { initial = false; ofs = roundup(ofs, sizeof(struct erofs_dirent)); + ctx->pos = blknr_to_addr(i) + ofs; if (ofs >= nameoff) goto skip_this; } - err = erofs_fill_dentries(dir, ctx, de, &ofs, + err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, nameoff, maxsize); -skip_this: - ctx->pos = blknr_to_addr(i) + ofs; - if (err) break; +skip_this: + ctx->pos = blknr_to_addr(i) + maxsize; ++i; ofs = 0; } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 95addc5c9d34..3173debeaa5a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -255,7 +255,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, if (IS_ERR(bdev)) return PTR_ERR(bdev); dif->bdev = bdev; - dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off); + dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off, + NULL, NULL); } dif->blocks = le32_to_cpu(dis->blocks); @@ -720,7 +721,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) } sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, - &sbi->dax_part_off); + &sbi->dax_part_off, + NULL, NULL); } err = erofs_read_superblock(sb); @@ -812,7 +814,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data) { struct erofs_device_info *dif = ptr; - fs_put_dax(dif->dax_dev); + fs_put_dax(dif->dax_dev, NULL); if (dif->bdev) blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); erofs_fscache_unregister_cookie(&dif->fscache); @@ -886,7 +888,7 @@ static void erofs_kill_sb(struct super_block *sb) return; erofs_free_dev_context(sbi->devs); - fs_put_dax(sbi->dax_dev); + fs_put_dax(sbi->dax_dev, NULL); erofs_fscache_unregister_cookie(&sbi->s_fscache); erofs_fscache_unregister_fs(sb); kfree(sbi->opt.fsid); diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index ec9a1d780dc1..46627cb69abe 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -282,7 +282,7 @@ static struct shrinker erofs_shrinker_info = { int __init erofs_init_shrinker(void) { - return register_shrinker(&erofs_shrinker_info); + return register_shrinker(&erofs_shrinker_info, "erofs-shrinker"); } void erofs_exit_shrinker(void) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 724bb57075f6..5792ca9e0d5e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2022 Alibaba Cloud */ #include "zdata.h" #include "compress.h" @@ -26,6 +27,82 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) }; +struct z_erofs_bvec_iter { + struct page *bvpage; + struct z_erofs_bvset *bvset; + unsigned int nr, cur; +}; + +static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) +{ + if (iter->bvpage) + kunmap_local(iter->bvset); + return iter->bvpage; +} + +static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) +{ + unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; + /* have to access nextpage in advance, otherwise it will be unmapped */ + struct page *nextpage = iter->bvset->nextpage; + struct page *oldpage; + + DBG_BUGON(!nextpage); + oldpage = z_erofs_bvec_iter_end(iter); + iter->bvpage = nextpage; + iter->bvset = kmap_local_page(nextpage); + iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); + iter->cur = 0; + return oldpage; +} + +static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvset_inline *bvset, + unsigned int bootstrap_nr, + unsigned int cur) +{ + *iter = (struct z_erofs_bvec_iter) { + .nr = bootstrap_nr, + .bvset = (struct z_erofs_bvset *)bvset, + }; + + while (cur > iter->nr) { + cur -= iter->nr; + z_erofs_bvset_flip(iter); + } + iter->cur = cur; +} + +static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **candidate_bvpage) +{ + if (iter->cur == iter->nr) { + if (!*candidate_bvpage) + return -EAGAIN; + + DBG_BUGON(iter->bvset->nextpage); + iter->bvset->nextpage = *candidate_bvpage; + z_erofs_bvset_flip(iter); + + iter->bvset->nextpage = NULL; + *candidate_bvpage = NULL; + } + iter->bvset->bvec[iter->cur++] = *bvec; + return 0; +} + +static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **old_bvpage) +{ + if (iter->cur == iter->nr) + *old_bvpage = z_erofs_bvset_flip(iter); + else + *old_bvpage = NULL; + *bvec = iter->bvset->bvec[iter->cur++]; +} + static void z_erofs_destroy_pcluster_pool(void) { int i; @@ -46,7 +123,7 @@ static int z_erofs_create_pcluster_pool(void) for (pcs = pcluster_pool; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { - size = struct_size(a, compressed_pages, pcs->maxpages); + size = struct_size(a, compressed_bvecs, pcs->maxpages); sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); pcs->slab = kmem_cache_create(pcs->name, size, 0, @@ -150,30 +227,29 @@ int __init z_erofs_init_zip_subsystem(void) return err; } -enum z_erofs_collectmode { - COLLECT_SECONDARY, - COLLECT_PRIMARY, +enum z_erofs_pclustermode { + Z_EROFS_PCLUSTER_INFLIGHT, /* - * The current collection was the tail of an exist chain, in addition - * that the previous processed chained collections are all decided to + * The current pclusters was the tail of an exist chain, in addition + * that the previous processed chained pclusters are all decided to * be hooked up to it. - * A new chain will be created for the remaining collections which are - * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED, - * the next collection cannot reuse the whole page safely in - * the following scenario: + * A new chain will be created for the remaining pclusters which are + * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED, + * the next pcluster cannot reuse the whole page safely for inplace I/O + * in the following scenario: * ________________________________________________________________ * | tail (partial) page | head (partial) page | - * | (belongs to the next cl) | (belongs to the current cl) | - * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| + * | (belongs to the next pcl) | (belongs to the current pcl) | + * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________| */ - COLLECT_PRIMARY_HOOKED, + Z_EROFS_PCLUSTER_HOOKED, /* - * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it + * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it * could be dispatched into bypass queue later due to uptodated managed * pages. All related online pages cannot be reused for inplace I/O (or - * pagevec) since it can be directly decoded without I/O submission. + * bvpage) since it can be directly decoded without I/O submission. */ - COLLECT_PRIMARY_FOLLOWED_NOINPLACE, + Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* * The current collection has been linked with the owned chain, and * could also be linked with the remaining collections, which means @@ -184,39 +260,36 @@ enum z_erofs_collectmode { * ________________________________________________________________ * | tail (partial) page | head (partial) page | * | (of the current cl) | (of the previous collection) | - * | PRIMARY_FOLLOWED or | | - * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| + * | PCLUSTER_FOLLOWED or | | + * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________| * * [ (*) the above page can be used as inplace I/O. ] */ - COLLECT_PRIMARY_FOLLOWED, + Z_EROFS_PCLUSTER_FOLLOWED, }; struct z_erofs_decompress_frontend { struct inode *const inode; struct erofs_map_blocks map; + struct z_erofs_bvec_iter biter; - struct z_erofs_pagevec_ctor vector; - + struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *tailpcl; - /* a pointer used to pick up inplace I/O pages */ - struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; - - enum z_erofs_collectmode mode; + enum z_erofs_pclustermode mode; bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; + + /* a pointer used to pick up inplace I/O pages */ + unsigned int icur; }; #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true } - -static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; -static DEFINE_MUTEX(z_pagemap_global_lock); + .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, enum z_erofs_cache_alloctype type, @@ -231,24 +304,21 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - struct page **pages; - pgoff_t index; + unsigned int i; - if (fe->mode < COLLECT_PRIMARY_FOLLOWED) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - pages = pcl->compressed_pages; - index = pcl->obj.index; - for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) { + for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; compressed_page_t t; struct page *newpage = NULL; /* the compressed page was loaded before */ - if (READ_ONCE(*pages)) + if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - page = find_get_page(mc, index); + page = find_get_page(mc, pcl->obj.index + i); if (page) { t = tag_compressed_page_justfound(page); @@ -269,7 +339,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, } } - if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, + tagptr_cast_ptr(t))) continue; if (page) @@ -283,7 +354,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, * managed cache since it can be moved to the bypass queue instead. */ if (standalone) - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* called by erofs_shrinker to get rid of all compressed_pages */ @@ -300,7 +371,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, * therefore no need to worry about available decompression users. */ for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page = pcl->compressed_pages[i]; + struct page *page = pcl->compressed_bvecs[i].page; if (!page) continue; @@ -313,7 +384,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, continue; /* barrier is implied in the following 'unlock_page' */ - WRITE_ONCE(pcl->compressed_pages[i], NULL); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); detach_page_private(page); unlock_page(page); } @@ -323,56 +394,59 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, int erofs_try_to_free_cached_page(struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret = 0; /* 0 - busy */ + int ret, i; - if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { - unsigned int i; + if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) + return 0; - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_pages[i] == page) { - WRITE_ONCE(pcl->compressed_pages[i], NULL); - ret = 1; - break; - } + ret = 0; + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + for (i = 0; i < pcl->pclusterpages; ++i) { + if (pcl->compressed_bvecs[i].page == page) { + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); + ret = 1; + break; } - erofs_workgroup_unfreeze(&pcl->obj, 1); - - if (ret) - detach_page_private(page); } + erofs_workgroup_unfreeze(&pcl->obj, 1); + if (ret) + detach_page_private(page); return ret; } -/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, - struct page *page) + struct z_erofs_bvec *bvec) { struct z_erofs_pcluster *const pcl = fe->pcl; - while (fe->icpage_ptr > pcl->compressed_pages) - if (!cmpxchg(--fe->icpage_ptr, NULL, page)) + while (fe->icur > 0) { + if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, + NULL, bvec->page)) { + pcl->compressed_bvecs[fe->icur] = *bvec; return true; + } + } return false; } /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, - struct page *page, enum z_erofs_page_type type, - bool pvec_safereuse) + struct z_erofs_bvec *bvec, bool exclusive) { int ret; - /* give priority for inplaceio */ - if (fe->mode >= COLLECT_PRIMARY && - type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && - z_erofs_try_inplace_io(fe, page)) - return 0; - - ret = z_erofs_pagevec_enqueue(&fe->vector, page, type, - pvec_safereuse); - fe->pcl->vcnt += (unsigned int)ret; - return ret ? 0 : -EAGAIN; + if (exclusive) { + /* give priority for inplaceio to use file pages first */ + if (z_erofs_try_inplace_io(fe, bvec)) + return 0; + /* otherwise, check if it can be used as a bvpage */ + if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && + !fe->candidate_bvpage) + fe->candidate_bvpage = bvec->page; + } + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + fe->pcl->vcnt += (ret >= 0); + return ret; } static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) @@ -385,7 +459,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) *owned_head) == Z_EROFS_PCLUSTER_NIL) { *owned_head = &pcl->next; /* so we can attach this pcluster to our submission chain. */ - f->mode = COLLECT_PRIMARY_FOLLOWED; + f->mode = Z_EROFS_PCLUSTER_FOLLOWED; return; } @@ -393,66 +467,21 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) * type 2, link to the end of an existing open chain, be careful * that its submission is controlled by the original attached chain. */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + if (*owned_head != &pcl->next && pcl != f->tailpcl && + cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, *owned_head) == Z_EROFS_PCLUSTER_TAIL) { *owned_head = Z_EROFS_PCLUSTER_TAIL; - f->mode = COLLECT_PRIMARY_HOOKED; + f->mode = Z_EROFS_PCLUSTER_HOOKED; f->tailpcl = NULL; return; } /* type 3, it belongs to a chain, but it isn't the end of the chain */ - f->mode = COLLECT_PRIMARY; + f->mode = Z_EROFS_PCLUSTER_INFLIGHT; } -static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) -{ - struct z_erofs_pcluster *pcl = fe->pcl; - unsigned int length; - - /* to avoid unexpected loop formed by corrupted images */ - if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - length = READ_ONCE(pcl->length); - if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { - if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - } else { - unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; - - if (map->m_flags & EROFS_MAP_FULL_MAPPED) - llen |= Z_EROFS_PCLUSTER_FULL_LENGTH; - - while (llen > length && - length != cmpxchg_relaxed(&pcl->length, length, llen)) { - cpu_relax(); - length = READ_ONCE(pcl->length); - } - } - mutex_lock(&pcl->lock); - /* used to check tail merging loop due to corrupted images */ - if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) - fe->tailpcl = pcl; - - z_erofs_try_to_claim_pcluster(fe); - return 0; -} - -static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { + struct erofs_map_blocks *map = &fe->map; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; @@ -471,14 +500,13 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, atomic_set(&pcl->obj.refcount, 1); pcl->algorithmformat = map->m_algorithmformat; - pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | - (map->m_flags & EROFS_MAP_FULL_MAPPED ? - Z_EROFS_PCLUSTER_FULL_LENGTH : 0); + pcl->length = 0; + pcl->partial = true; /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; pcl->pageofs_out = map->m_la & ~PAGE_MASK; - fe->mode = COLLECT_PRIMARY_FOLLOWED; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* * lock all primary followed works before visible to others @@ -494,7 +522,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, } else { pcl->obj.index = map->m_pa >> PAGE_SHIFT; - grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); + grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { err = PTR_ERR(grp); goto err_out; @@ -520,11 +548,10 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) { - struct erofs_workgroup *grp; + struct erofs_map_blocks *map = &fe->map; + struct erofs_workgroup *grp = NULL; int ret; DBG_BUGON(fe->pcl); @@ -533,38 +560,35 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (map->m_flags & EROFS_MAP_META) { - if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - goto tailpacking; + if (!(map->m_flags & EROFS_MAP_META)) { + grp = erofs_find_workgroup(fe->inode->i_sb, + map->m_pa >> PAGE_SHIFT); + } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { + DBG_BUGON(1); + return -EFSCORRUPTED; } - grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); if (grp) { fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); + ret = -EEXIST; } else { -tailpacking: - ret = z_erofs_register_pcluster(fe, inode, map); - if (!ret) - goto out; - if (ret != -EEXIST) - return ret; + ret = z_erofs_register_pcluster(fe); } - ret = z_erofs_lookup_pcluster(fe, inode, map); - if (ret) { - erofs_workgroup_put(&fe->pcl->obj); + if (ret == -EEXIST) { + mutex_lock(&fe->pcl->lock); + /* used to check tail merging loop due to corrupted images */ + if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) + fe->tailpcl = fe->pcl; + + z_erofs_try_to_claim_pcluster(fe); + } else if (ret) { return ret; } - -out: - z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, - fe->pcl->pagevec, fe->pcl->vcnt); + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, + Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ - fe->icpage_ptr = fe->pcl->compressed_pages + - z_erofs_pclusterpages(fe->pcl); + fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -593,14 +617,19 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) if (!pcl) return false; - z_erofs_pagevec_ctor_exit(&fe->vector, false); + z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); + if (fe->candidate_bvpage) { + DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + fe->candidate_bvpage = NULL; + } + /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. */ - if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; @@ -628,11 +657,10 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); - bool tight = true; + bool tight = true, exclusive; enum z_erofs_cache_alloctype cache_strategy; - enum z_erofs_page_type page_type; - unsigned int cur, end, spiltted, index; + unsigned int cur, end, spiltted; int err = 0; /* register locked file pages as online pages in pack */ @@ -653,7 +681,7 @@ repeat: map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) - goto err_out; + goto out; } else { if (fe->pcl) goto hitted; @@ -663,9 +691,9 @@ repeat: if (!(map->m_flags & EROFS_MAP_MAPPED)) goto hitted; - err = z_erofs_collector_begin(fe, inode, map); + err = z_erofs_collector_begin(fe); if (err) - goto err_out; + goto out; if (z_erofs_is_inline_pcluster(fe->pcl)) { void *mp; @@ -676,11 +704,12 @@ repeat: err = PTR_ERR(mp); erofs_err(inode->i_sb, "failed to get inline page, err %d", err); - goto err_out; + goto out; } get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page); - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, + fe->map.buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, @@ -696,10 +725,10 @@ hitted: * Ensure the current partial page belongs to this submit chain rather * than other concurrent submit chains or the noio(bypass) chain since * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or pagevec (should be processed in strict order.) + * for inplace I/O or bvpage (should be processed in a strict order.) */ - tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED && - fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED && + fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { @@ -707,60 +736,59 @@ hitted: goto next_part; } - /* let's derive page type */ - page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : - (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); - + exclusive = (!cur && (!spiltted || tight)); if (cur) - tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); retry: - err = z_erofs_attach_page(fe, page, page_type, - fe->mode >= COLLECT_PRIMARY_FOLLOWED); - /* should allocate an additional short-lived page for pagevec */ - if (err == -EAGAIN) { - struct page *const newpage = - alloc_page(GFP_NOFS | __GFP_NOFAIL); - - set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE); - err = z_erofs_attach_page(fe, newpage, - Z_EROFS_PAGE_TYPE_EXCLUSIVE, true); - if (!err) - goto retry; + err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { + .page = page, + .offset = offset - map->m_la, + .end = end, + }), exclusive); + /* should allocate an additional short-lived page for bvset */ + if (err == -EAGAIN && !fe->candidate_bvpage) { + fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); + set_page_private(fe->candidate_bvpage, + Z_EROFS_SHORTLIVED_PAGE); + goto retry; } - if (err) - goto err_out; - - index = page->index - (map->m_la >> PAGE_SHIFT); - - z_erofs_onlinepage_fixup(page, index, true); + if (err) { + DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); + goto out; + } + z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ ++spiltted; - /* also update nr_pages */ - fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1); + if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) + fe->pcl->multibases = true; + + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + fe->pcl->length == map->m_llen) + fe->pcl->partial = false; + if (fe->pcl->length < offset + end - map->m_la) { + fe->pcl->length = offset + end - map->m_la; + fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; + } next_part: - /* can be used for verification */ + /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; + map->m_flags &= ~EROFS_MAP_FULL_MAPPED; end = cur; if (end > 0) goto repeat; out: + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", __func__, page, spiltted, map->m_llen); return err; - - /* if some error occurred while processing this page */ -err_out: - SetPageError(page); - goto out; } static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, @@ -783,97 +811,137 @@ static bool z_erofs_page_is_invalidated(struct page *page) return !page->mapping && !z_erofs_is_shortlived_page(page); } -static int z_erofs_decompress_pcluster(struct super_block *sb, - struct z_erofs_pcluster *pcl, - struct page **pagepool) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - struct z_erofs_pagevec_ctor ctor; - unsigned int i, inputsize, outputsize, llen, nr_pages; - struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; - struct page **pages, **compressed_pages, *page; +struct z_erofs_decompress_backend { + struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; + struct super_block *sb; + struct z_erofs_pcluster *pcl; - enum z_erofs_page_type page_type; - bool overlapped, partial; - int err; + /* pages with the longest decompressed length for deduplication */ + struct page **decompressed_pages; + /* pages to keep the compressed data */ + struct page **compressed_pages; - might_sleep(); - DBG_BUGON(!READ_ONCE(pcl->nr_pages)); + struct list_head decompressed_secondary_bvecs; + struct page **pagepool; + unsigned int onstack_used, nr_pages; +}; - mutex_lock(&pcl->lock); - nr_pages = pcl->nr_pages; +struct z_erofs_bvec_item { + struct z_erofs_bvec bvec; + struct list_head list; +}; - if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { - pages = pages_onstack; - } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES && - mutex_trylock(&z_pagemap_global_lock)) { - pages = z_pagemap_global; - } else { - gfp_t gfp_flags = GFP_KERNEL; +static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, + struct z_erofs_bvec *bvec) +{ + struct z_erofs_bvec_item *item; - if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES) - gfp_flags |= __GFP_NOFAIL; + if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { + unsigned int pgnr; + struct page *oldpage; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), - gfp_flags); + pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + oldpage = be->decompressed_pages[pgnr]; + be->decompressed_pages[pgnr] = bvec->page; - /* fallback to global pagemap for the lowmem scenario */ - if (!pages) { - mutex_lock(&z_pagemap_global_lock); - pages = z_pagemap_global; - } + if (!oldpage) + return; } - for (i = 0; i < nr_pages; ++i) - pages[i] = NULL; - - err = 0; - z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - pcl->pagevec, 0); - - for (i = 0; i < pcl->vcnt; ++i) { - unsigned int pagenr; + /* (cold path) one pcluster is requested multiple times */ + item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); + item->bvec = *bvec; + list_add(&item->list, &be->decompressed_secondary_bvecs); +} - page = z_erofs_pagevec_dequeue(&ctor, &page_type); +static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, + int err) +{ + unsigned int off0 = be->pcl->pageofs_out; + struct list_head *p, *n; + + list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) { + struct z_erofs_bvec_item *bvi; + unsigned int end, cur; + void *dst, *src; + + bvi = container_of(p, struct z_erofs_bvec_item, list); + cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0; + end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset, + bvi->bvec.end); + dst = kmap_local_page(bvi->bvec.page); + while (cur < end) { + unsigned int pgnr, scur, len; + + pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + + scur = bvi->bvec.offset + cur - + ((pgnr << PAGE_SHIFT) - off0); + len = min_t(unsigned int, end - cur, PAGE_SIZE - scur); + if (!be->decompressed_pages[pgnr]) { + err = -EFSCORRUPTED; + cur += len; + continue; + } + src = kmap_local_page(be->decompressed_pages[pgnr]); + memcpy(dst + cur, src + scur, len); + kunmap_local(src); + cur += len; + } + kunmap_local(dst); + if (err) + z_erofs_page_mark_eio(bvi->bvec.page); + z_erofs_onlinepage_endio(bvi->bvec.page); + list_del(p); + kfree(bvi); + } +} - /* all pages in pagevec ought to be valid */ - DBG_BUGON(!page); - DBG_BUGON(z_erofs_page_is_invalidated(page)); +static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) +{ + struct z_erofs_pcluster *pcl = be->pcl; + struct z_erofs_bvec_iter biter; + struct page *old_bvpage; + int i; - if (z_erofs_put_shortlivedpage(pagepool, page)) - continue; + z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); + for (i = 0; i < pcl->vcnt; ++i) { + struct z_erofs_bvec bvec; - if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) - pagenr = 0; - else - pagenr = z_erofs_onlinepage_index(page); + z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); - DBG_BUGON(pagenr >= nr_pages); + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); - /* - * currently EROFS doesn't support multiref(dedup), - * so here erroring out one multiref page. - */ - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EFSCORRUPTED; - } - pages[pagenr] = page; + DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); + z_erofs_do_decompressed_bvec(be, &bvec); } - z_erofs_pagevec_ctor_exit(&ctor, true); - overlapped = false; - compressed_pages = pcl->compressed_pages; + old_bvpage = z_erofs_bvec_iter_end(&biter); + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); +} +static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, + bool *overlapped) +{ + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + int i, err = 0; + + *overlapped = false; for (i = 0; i < pclusterpages; ++i) { - unsigned int pagenr; + struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; + struct page *page = bvec->page; - page = compressed_pages[i]; - /* all compressed pages ought to be valid */ - DBG_BUGON(!page); + /* compressed pages ought to be present before decompressing */ + if (!page) { + DBG_BUGON(1); + continue; + } + be->compressed_pages[i] = page; if (z_erofs_is_inline_pcluster(pcl)) { if (!PageUptodate(page)) @@ -883,109 +951,129 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { - if (erofs_page_is_managed(sbi, page)) { + if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { if (!PageUptodate(page)) err = -EIO; continue; } + z_erofs_do_decompressed_bvec(be, bvec); + *overlapped = true; + } + } - /* - * only if non-head page can be selected - * for inplace decompression - */ - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= nr_pages); - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EFSCORRUPTED; - } - pages[pagenr] = page; + if (err) + return err; + return 0; +} - overlapped = true; - } +static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, + int err) +{ + struct erofs_sb_info *const sbi = EROFS_SB(be->sb); + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + unsigned int i, inputsize; + int err2; + struct page *page; + bool overlapped; - /* PG_error needs checking for all non-managed pages */ - if (PageError(page)) { - DBG_BUGON(PageUptodate(page)); - err = -EIO; - } + mutex_lock(&pcl->lock); + be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; + + /* allocate (de)compressed page arrays if cannot be kept on stack */ + be->decompressed_pages = NULL; + be->compressed_pages = NULL; + be->onstack_used = 0; + if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) { + be->decompressed_pages = be->onstack_pages; + be->onstack_used = be->nr_pages; + memset(be->decompressed_pages, 0, + sizeof(struct page *) * be->nr_pages); } + if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) + be->compressed_pages = be->onstack_pages + be->onstack_used; + + if (!be->decompressed_pages) + be->decompressed_pages = + kvcalloc(be->nr_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + if (!be->compressed_pages) + be->compressed_pages = + kvcalloc(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + + z_erofs_parse_out_bvecs(be); + err2 = z_erofs_parse_in_bvecs(be, &overlapped); + if (err2) + err = err2; if (err) goto out; - llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; - if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) { - outputsize = llen; - partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); - } else { - outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out; - partial = true; - } - if (z_erofs_is_inline_pcluster(pcl)) inputsize = pcl->tailpacking_size; else inputsize = pclusterpages * PAGE_SIZE; err = z_erofs_decompress(&(struct z_erofs_decompress_req) { - .sb = sb, - .in = compressed_pages, - .out = pages, + .sb = be->sb, + .in = be->compressed_pages, + .out = be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, .inputsize = inputsize, - .outputsize = outputsize, + .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, - .partial_decoding = partial - }, pagepool); + .partial_decoding = pcl->partial, + .fillgaps = pcl->multibases, + }, be->pagepool); out: /* must handle all compressed pages before actual file pages */ if (z_erofs_is_inline_pcluster(pcl)) { - page = compressed_pages[0]; - WRITE_ONCE(compressed_pages[0], NULL); + page = pcl->compressed_bvecs[0].page; + WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = compressed_pages[i]; + page = pcl->compressed_bvecs[i].page; if (erofs_page_is_managed(sbi, page)) continue; /* recycle all individual short-lived pages */ - (void)z_erofs_put_shortlivedpage(pagepool, page); - WRITE_ONCE(compressed_pages[i], NULL); + (void)z_erofs_put_shortlivedpage(be->pagepool, page); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } + if (be->compressed_pages < be->onstack_pages || + be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) + kvfree(be->compressed_pages); + z_erofs_fill_other_copies(be, err); - for (i = 0; i < nr_pages; ++i) { - page = pages[i]; + for (i = 0; i < be->nr_pages; ++i) { + page = be->decompressed_pages[i]; if (!page) continue; DBG_BUGON(z_erofs_page_is_invalidated(page)); /* recycle all individual short-lived pages */ - if (z_erofs_put_shortlivedpage(pagepool, page)) + if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; - - if (err < 0) - SetPageError(page); - + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); } - if (pages == z_pagemap_global) - mutex_unlock(&z_pagemap_global_lock); - else if (pages != pages_onstack) - kvfree(pages); + if (be->decompressed_pages != be->onstack_pages) + kvfree(be->decompressed_pages); - pcl->nr_pages = 0; + pcl->length = 0; + pcl->partial = true; + pcl->multibases = false; + pcl->bvset.nextpage = NULL; pcl->vcnt = 0; /* pcluster lock MUST be taken before the following line */ @@ -997,22 +1085,25 @@ out: static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, struct page **pagepool) { + struct z_erofs_decompress_backend be = { + .sb = io->sb, + .pagepool = pagepool, + .decompressed_secondary_bvecs = + LIST_HEAD_INIT(be.decompressed_secondary_bvecs), + }; z_erofs_next_pcluster_t owned = io->head; while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { - struct z_erofs_pcluster *pcl; - - /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ + /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); - - /* no possible that 'owned' equals NULL */ + /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); - pcl = container_of(owned, struct z_erofs_pcluster, next); - owned = READ_ONCE(pcl->next); + be.pcl = container_of(owned, struct z_erofs_pcluster, next); + owned = READ_ONCE(be.pcl->next); - z_erofs_decompress_pcluster(io->sb, pcl, pagepool); - erofs_workgroup_put(&pcl->obj); + z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); + erofs_workgroup_put(&be.pcl->obj); } } @@ -1038,7 +1129,6 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); - return; } @@ -1071,7 +1161,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, int justfound; repeat: - page = READ_ONCE(pcl->compressed_pages[nr]); + page = READ_ONCE(pcl->compressed_bvecs[nr].page); oldpage = page; if (!page) @@ -1087,7 +1177,7 @@ repeat: * otherwise, it will go inplace I/O path instead. */ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); tocache = true; goto out_tocache; @@ -1113,14 +1203,13 @@ repeat: /* the page is still in manage cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - ClearPageError(page); if (!PagePrivate(page)) { /* * impossible to be !PagePrivate(page) for * the current restriction as well if - * the page is already in compressed_pages[]. + * the page is already in compressed_bvecs[]. */ DBG_BUGON(!justfound); @@ -1149,7 +1238,8 @@ repeat: put_page(page); out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { + if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, + oldpage, page)) { erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; @@ -1186,6 +1276,7 @@ fg_out: q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); + q->eio = false; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; @@ -1246,26 +1337,25 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (err) - SetPageError(page); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); unlock_page(page); } } + if (err) + q->eio = true; z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); bio_put(bio); } -static void z_erofs_submit_queue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) { - struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct super_block *sb = f->inode->i_sb; + struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; @@ -1317,7 +1407,7 @@ static void z_erofs_submit_queue(struct super_block *sb, struct page *page; page = pickup_page_for_submission(pcl, i++, pagepool, - MNGD_MAPPING(sbi)); + mc); if (!page) continue; @@ -1369,15 +1459,14 @@ submit_bio_retry: z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); } -static void z_erofs_runqueue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, struct page **pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1475,7 +1564,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, 0)); if (err) @@ -1524,7 +1613,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); (void)z_erofs_collector_end(&f); - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, nr_pages)); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 58053bb5066f..e7f04c4fbb81 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -7,13 +7,10 @@ #define __EROFS_FS_ZDATA_H #include "internal.h" -#include "zpvec.h" +#include "tagptr.h" #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) -#define Z_EROFS_NR_INLINE_PAGEVECS 3 - -#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 -#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 +#define Z_EROFS_INLINE_BVECS 2 /* * let's leave a type here in case of introducing @@ -21,6 +18,21 @@ */ typedef void *z_erofs_next_pcluster_t; +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); + /* * Structure fields follow one of the following exclusion rules. * @@ -38,24 +50,21 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ z_erofs_next_pcluster_t next; - /* A: lower limit of decompressed length and if full length or not */ + /* L: the maximum decompression size of this round */ unsigned int length; + /* L: total number of bvecs */ + unsigned int vcnt; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; /* I: page offset of inline compressed data */ unsigned short pageofs_in; - /* L: maximum relative page index in pagevec[] */ - unsigned short nr_pages; - - /* L: total number of pages in pagevec[] */ - unsigned int vcnt; - union { - /* L: inline a certain number of pagevecs for bootstrap */ - erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; /* I: can be used to free the pcluster by RCU. */ struct rcu_head rcu; @@ -72,8 +81,14 @@ struct z_erofs_pcluster { /* I: compression algorithm format */ unsigned char algorithmformat; - /* A: compressed pages (can be cached or inplaced pages) */ - struct page *compressed_pages[]; + /* L: whether partial decompression or not */ + bool partial; + + /* L: indicate several pageofs_outs or not */ + bool multibases; + + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; }; /* let's avoid the valid 32-bit kernel addresses */ @@ -94,6 +109,8 @@ struct z_erofs_decompressqueue { struct completion done; struct work_struct work; } u; + + bool eio; }; static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) @@ -108,38 +125,17 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) return pcl->pclusterpages; } -#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 -#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) -#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) - /* - * waiters (aka. ongoing_packs): # to unlock the page - * sub-index: 0 - for partial page, >= 1 full page sub-index + * bit 31: I/O error occurred on this page + * bit 0 - 30: remaining parts to complete this page */ -typedef atomic_t z_erofs_onlinepage_t; - -/* type punning */ -union z_erofs_onlinepage_converter { - z_erofs_onlinepage_t *o; - unsigned long *v; -}; - -static inline unsigned int z_erofs_onlinepage_index(struct page *page) -{ - union z_erofs_onlinepage_converter u; - - DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; -} +#define Z_EROFS_PAGE_EIO (1 << 31) static inline void z_erofs_onlinepage_init(struct page *page) { union { - z_erofs_onlinepage_t o; + atomic_t o; unsigned long v; - /* keep from being unlocked in advance */ } u = { .o = ATOMIC_INIT(1) }; set_page_private(page, u.v); @@ -147,49 +143,36 @@ static inline void z_erofs_onlinepage_init(struct page *page) SetPagePrivate(page); } -static inline void z_erofs_onlinepage_fixup(struct page *page, - uintptr_t index, bool down) +static inline void z_erofs_onlinepage_split(struct page *page) { - union z_erofs_onlinepage_converter u = { .v = &page_private(page) }; - int orig, orig_index, val; - -repeat: - orig = atomic_read(u.o); - orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; - if (orig_index) { - if (!index) - return; + atomic_inc((atomic_t *)&page->private); +} - DBG_BUGON(orig_index != index); - } +static inline void z_erofs_page_mark_eio(struct page *page) +{ + int orig; - val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | - ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); - if (atomic_cmpxchg(u.o, orig, val) != orig) - goto repeat; + do { + orig = atomic_read((atomic_t *)&page->private); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, + orig | Z_EROFS_PAGE_EIO) != orig); } static inline void z_erofs_onlinepage_endio(struct page *page) { - union z_erofs_onlinepage_converter u; unsigned int v; DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - v = atomic_dec_return(u.o); - if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + v = atomic_dec_return((atomic_t *)&page->private); + if (!(v & ~Z_EROFS_PAGE_EIO)) { set_page_private(page, 0); ClearPagePrivate(page); - if (!PageError(page)) + if (!(v & Z_EROFS_PAGE_EIO)) SetPageUptodate(page); unlock_page(page); } - erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o)); } -#define Z_EROFS_VMAP_ONSTACK_PAGES \ - min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) -#define Z_EROFS_VMAP_GLOBAL_PAGES 2048 +#define Z_EROFS_ONSTACK_PAGES 32 #endif diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h deleted file mode 100644 index b05464f4a808..000000000000 --- a/fs/erofs/zpvec.h +++ /dev/null @@ -1,159 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2018 HUAWEI, Inc. - * https://www.huawei.com/ - */ -#ifndef __EROFS_FS_ZPVEC_H -#define __EROFS_FS_ZPVEC_H - -#include "tagptr.h" - -/* page type in pagevec for decompress subsystem */ -enum z_erofs_page_type { - /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ - Z_EROFS_PAGE_TYPE_EXCLUSIVE, - - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, - - Z_EROFS_VLE_PAGE_TYPE_HEAD, - Z_EROFS_VLE_PAGE_TYPE_MAX -}; - -extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") - __bad_page_type_exclusive(void); - -/* pagevec tagged pointer */ -typedef tagptr2_t erofs_vtptr_t; - -/* pagevec collector */ -struct z_erofs_pagevec_ctor { - struct page *curr, *next; - erofs_vtptr_t *pages; - - unsigned int nr, index; -}; - -static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - if (!ctor->curr) - return; - - if (atomic) - kunmap_atomic(ctor->pages); - else - kunmap(ctor->curr); -} - -static inline struct page * -z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr) -{ - unsigned int index; - - /* keep away from occupied pages */ - if (ctor->next) - return ctor->next; - - for (index = 0; index < nr; ++index) { - const erofs_vtptr_t t = ctor->pages[index]; - const unsigned int tags = tagptr_unfold_tags(t); - - if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) - return tagptr_unfold_ptr(t); - } - DBG_BUGON(nr >= ctor->nr); - return NULL; -} - -static inline void -z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); - - z_erofs_pagevec_ctor_exit(ctor, atomic); - - ctor->curr = next; - ctor->next = NULL; - ctor->pages = atomic ? - kmap_atomic(ctor->curr) : kmap(ctor->curr); - - ctor->nr = PAGE_SIZE / sizeof(struct page *); - ctor->index = 0; -} - -static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr, - erofs_vtptr_t *pages, - unsigned int i) -{ - ctor->nr = nr; - ctor->curr = ctor->next = NULL; - ctor->pages = pages; - - if (i >= nr) { - i -= nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - while (i > ctor->nr) { - i -= ctor->nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - } - } - ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); - ctor->index = i; -} - -static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, - struct page *page, - enum z_erofs_page_type type, - bool pvec_safereuse) -{ - if (!ctor->next) { - /* some pages cannot be reused as pvec safely without I/O */ - if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse) - type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED; - - if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE && - ctor->index + 1 == ctor->nr) - return false; - } - - if (ctor->index >= ctor->nr) - z_erofs_pagevec_ctor_pagedown(ctor, false); - - /* exclusive page type must be 0 */ - if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) - __bad_page_type_exclusive(); - - /* should remind that collector->next never equal to 1, 2 */ - if (type == (uintptr_t)ctor->next) { - ctor->next = page; - } - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); - return true; -} - -static inline struct page * -z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, - enum z_erofs_page_type *type) -{ - erofs_vtptr_t t; - - if (ctor->index >= ctor->nr) { - DBG_BUGON(!ctor->next); - z_erofs_pagevec_ctor_pagedown(ctor, true); - } - - t = ctor->pages[ctor->index]; - - *type = tagptr_unfold_tags(t); - - /* should remind that collector->next never equal to 1, 2 */ - if (*type == (uintptr_t)ctor->next) - ctor->next = tagptr_unfold_ptr(t); - - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0); - return tagptr_unfold_ptr(t); -} -#endif diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e2daa940ebce..8b56b94e2f56 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1747,6 +1747,21 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) return to; } +/* + * autoremove_wake_function, but remove even on failure to wake up, because we + * know that default_wake_function/ttwu will only fail if the thread is already + * woken, and in that case the ep_poll loop will remove the entry anyways, not + * try to reuse it. + */ +static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, + unsigned int mode, int sync, void *key) +{ + int ret = default_wake_function(wq_entry, mode, sync, key); + + list_del_init(&wq_entry->entry); + return ret; +} + /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. @@ -1828,8 +1843,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, * normal wakeup path no need to call __remove_wait_queue() * explicitly, thus ep->lock is not taken, which halts the * event delivery. + * + * In fact, we now use an even more aggressive function that + * unconditionally removes, because we don't reuse the wait + * entry between loop iterations. This lets us also avoid the + * performance issue if a process is killed, causing all of its + * threads to wake up without being removed normally. */ init_wait(&wait); + wait.func = ep_autoremove_wake_function; write_lock_irq(&ep->lock); /* diff --git a/fs/exec.c b/fs/exec.c index 0989fb8472a1..5fd73915c62c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -65,6 +65,7 @@ #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> +#include <linux/time_namespace.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -630,7 +631,6 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) unsigned int bytes_to_copy = min_t(unsigned int, len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; - char *kaddr; pos -= bytes_to_copy; arg -= bytes_to_copy; @@ -639,11 +639,8 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) page = get_arg_page(bprm, pos, 1); if (!page) return -E2BIG; - kaddr = kmap_atomic(page); flush_arg_page(bprm, pos & PAGE_MASK, page); - memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy); - flush_dcache_page(page); - kunmap_atomic(kaddr); + memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy); put_arg_page(page); } @@ -982,10 +979,12 @@ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; + bool vfork; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; + vfork = !!tsk->vfork_done; old_mm = current->mm; exec_mm_release(tsk, old_mm); if (old_mm) @@ -1030,6 +1029,10 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm->vmacache_seqnum = 0; vmacache_flush(tsk); task_unlock(tsk); + + if (vfork) + timens_on_fork(tsk->nsproxy, tsk); + if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); @@ -1149,7 +1152,7 @@ static int de_thread(struct task_struct *tsk) /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees - * the tracer wont't block again waiting for this thread. + * the tracer won't block again waiting for this thread. */ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); @@ -1301,7 +1304,7 @@ int begin_new_exec(struct linux_binprm * bprm) bprm->mm = NULL; #ifdef CONFIG_POSIX_TIMERS - exit_itimers(me->signal); + exit_itimers(me); flush_itimer_signals(); #endif diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 4a7a2308eb72..a8f8eee4937c 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -27,9 +27,9 @@ enum exfat_error_mode { * exfat nls lossy flag */ enum { - NLS_NAME_NO_LOSSY, /* no lossy */ - NLS_NAME_LOSSY, /* just detected incorrect filename(s) */ - NLS_NAME_OVERLEN, /* the length is over than its limit */ + NLS_NAME_NO_LOSSY = 0, /* no lossy */ + NLS_NAME_LOSSY = 1 << 0, /* just detected incorrect filename(s) */ + NLS_NAME_OVERLEN = 1 << 1, /* the length is over than its limit */ }; #define EXFAT_HASH_BITS 8 @@ -483,6 +483,7 @@ struct inode *exfat_build_inode(struct super_block *sb, void exfat_hash_inode(struct inode *inode, loff_t i_pos); void exfat_unhash_inode(struct inode *inode); struct inode *exfat_iget(struct super_block *sb, loff_t i_pos); +int __exfat_write_inode(struct inode *inode, int sync); int exfat_write_inode(struct inode *inode, struct writeback_control *wbc); void exfat_evict_inode(struct inode *inode); int exfat_block_truncate_page(struct inode *inode, loff_t from); @@ -508,14 +509,16 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) #define exfat_fs_error_ratelimit(sb, fmt, args...) \ __exfat_fs_error(sb, __ratelimit(&EXFAT_SB(sb)->ratelimit), \ fmt, ## args) -void exfat_msg(struct super_block *sb, const char *lv, const char *fmt, ...) - __printf(3, 4) __cold; + +/* expand to pr_*() with prefix */ #define exfat_err(sb, fmt, ...) \ - exfat_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__) + pr_err("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__) #define exfat_warn(sb, fmt, ...) \ - exfat_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__) + pr_warn("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__) #define exfat_info(sb, fmt, ...) \ - exfat_msg(sb, KERN_INFO, fmt, ##__VA_ARGS__) + pr_info("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__) +#define exfat_debug(sb, fmt, ...) \ + pr_debug("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__) void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 tz, __le16 time, __le16 date, u8 time_cs); diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 9de6a6b844c9..ee0b7cf51157 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -331,7 +331,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, /* find new cluster */ if (hint_clu == EXFAT_EOF_CLUSTER) { if (sbi->clu_srch_ptr < EXFAT_FIRST_CLUSTER) { - exfat_err(sb, "sbi->clu_srch_ptr is invalid (%u)\n", + exfat_err(sb, "sbi->clu_srch_ptr is invalid (%u)", sbi->clu_srch_ptr); sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER; } diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 20d4e47f57ab..4e0793f35e8f 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -101,7 +101,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); - int evict = (ei->dir.dir == DIR_DELETED) ? 1 : 0; /* check if the given file ID is opened */ if (ei->type != TYPE_FILE && ei->type != TYPE_DIR) @@ -149,50 +148,19 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) if (ei->type == TYPE_FILE) ei->attr |= ATTR_ARCHIVE; - /* update the directory entry */ - if (!evict) { - struct timespec64 ts; - struct exfat_dentry *ep, *ep2; - struct exfat_entry_set_cache *es; - int err; - - es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, - ES_ALL_ENTRIES); - if (!es) - return -EIO; - ep = exfat_get_dentry_cached(es, 0); - ep2 = exfat_get_dentry_cached(es, 1); - - ts = current_time(inode); - exfat_set_entry_time(sbi, &ts, - &ep->dentry.file.modify_tz, - &ep->dentry.file.modify_time, - &ep->dentry.file.modify_date, - &ep->dentry.file.modify_time_cs); - ep->dentry.file.attr = cpu_to_le16(ei->attr); - - /* File size should be zero if there is no cluster allocated */ - if (ei->start_clu == EXFAT_EOF_CLUSTER) { - ep2->dentry.stream.valid_size = 0; - ep2->dentry.stream.size = 0; - } else { - ep2->dentry.stream.valid_size = cpu_to_le64(new_size); - ep2->dentry.stream.size = ep2->dentry.stream.valid_size; - } - - if (new_size == 0) { - /* Any directory can not be truncated to zero */ - WARN_ON(ei->type != TYPE_FILE); - - ep2->dentry.stream.flags = ALLOC_FAT_CHAIN; - ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; - } - - exfat_update_dir_chksum_with_entry_set(es); - err = exfat_free_dentry_set(es, inode_needs_sync(inode)); - if (err) - return err; - } + /* + * update the directory entry + * + * If the directory entry is updated by mark_inode_dirty(), the + * directory entry will be written after a writeback cycle of + * updating the bitmap/FAT, which may result in clusters being + * freed but referenced by the directory entry in the event of a + * sudden power failure. + * __exfat_write_inode() is called for directory entry, bitmap + * and FAT to be written in a same writeback. + */ + if (__exfat_write_inode(inode, inode_needs_sync(inode))) + return -EIO; /* cut off from the FAT chain */ if (ei->flags == ALLOC_FAT_CHAIN && last_clu != EXFAT_FREE_CLUSTER && @@ -243,12 +211,6 @@ void exfat_truncate(struct inode *inode, loff_t size) if (err) goto write_size; - inode->i_ctime = inode->i_mtime = current_time(inode); - if (IS_DIRSYNC(inode)) - exfat_sync_inode(inode); - else - mark_inode_dirty(inode); - inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> inode->i_blkbits; write_size: @@ -330,6 +292,12 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, attr->ia_valid &= ~ATTR_MODE; } + if (attr->ia_valid & ATTR_SIZE) + inode->i_mtime = inode->i_ctime = current_time(inode); + + setattr_copy(&init_user_ns, inode, attr); + exfat_truncate_atime(&inode->i_atime); + if (attr->ia_valid & ATTR_SIZE) { error = exfat_block_truncate_page(inode, attr->ia_size); if (error) @@ -337,13 +305,15 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, down_write(&EXFAT_I(inode)->truncate_lock); truncate_setsize(inode, attr->ia_size); + + /* + * __exfat_write_inode() is called from exfat_truncate(), inode + * is already written by it, so mark_inode_dirty() is unneeded. + */ exfat_truncate(inode, attr->ia_size); up_write(&EXFAT_I(inode)->truncate_lock); - } - - setattr_copy(&init_user_ns, inode, attr); - exfat_truncate_atime(&inode->i_atime); - mark_inode_dirty(inode); + } else + mark_inode_dirty(inode); out: return error; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 0133d385d8e8..a795437b86d0 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -17,7 +17,7 @@ #include "exfat_raw.h" #include "exfat_fs.h" -static int __exfat_write_inode(struct inode *inode, int sync) +int __exfat_write_inode(struct inode *inode, int sync) { unsigned long long on_disk_size; struct exfat_dentry *ep, *ep2; @@ -75,6 +75,13 @@ static int __exfat_write_inode(struct inode *inode, int sync) ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size); ep2->dentry.stream.size = ep2->dentry.stream.valid_size; + if (on_disk_size) { + ep2->dentry.stream.flags = ei->flags; + ep2->dentry.stream.start_clu = cpu_to_le32(ei->start_clu); + } else { + ep2->dentry.stream.flags = ALLOC_FAT_CHAIN; + ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; + } exfat_update_dir_chksum_with_entry_set(es); return exfat_free_dentry_set(es, sync); @@ -105,7 +112,7 @@ void exfat_sync_inode(struct inode *inode) static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, unsigned int *clu, int create) { - int ret, modified = false; + int ret; unsigned int last_clu; struct exfat_chain new_clu; struct super_block *sb = inode->i_sb; @@ -196,7 +203,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, if (new_clu.flags == ALLOC_FAT_CHAIN) ei->flags = ALLOC_FAT_CHAIN; ei->start_clu = new_clu.dir; - modified = true; } else { if (new_clu.flags != ei->flags) { /* no-fat-chain bit is disabled, @@ -206,7 +212,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters); ei->flags = ALLOC_FAT_CHAIN; - modified = true; } if (new_clu.flags == ALLOC_FAT_CHAIN) if (exfat_ent_set(sb, last_clu, new_clu.dir)) @@ -216,33 +221,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, num_clusters += num_to_be_allocated; *clu = new_clu.dir; - if (ei->dir.dir != DIR_DELETED && modified) { - struct exfat_dentry *ep; - struct exfat_entry_set_cache *es; - int err; - - es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, - ES_ALL_ENTRIES); - if (!es) - return -EIO; - /* get stream entry */ - ep = exfat_get_dentry_cached(es, 1); - - /* update directory entry */ - ep->dentry.stream.flags = ei->flags; - ep->dentry.stream.start_clu = - cpu_to_le32(ei->start_clu); - ep->dentry.stream.valid_size = - cpu_to_le64(i_size_read(inode)); - ep->dentry.stream.size = - ep->dentry.stream.valid_size; - - exfat_update_dir_chksum_with_entry_set(es); - err = exfat_free_dentry_set(es, inode_needs_sync(inode)); - if (err) - return err; - } /* end of if != DIR_DELETED */ - inode->i_blocks += num_to_be_allocated << sbi->sect_per_clus_bits; @@ -384,6 +362,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) if (to > i_size_read(inode)) { truncate_pagecache(inode, i_size_read(inode)); + inode->i_mtime = inode->i_ctime = current_time(inode); exfat_truncate(inode, EXFAT_I(inode)->i_size_aligned); } } diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c index 9380e0188b55..2e1a1a6b1021 100644 --- a/fs/exfat/misc.c +++ b/fs/exfat/misc.c @@ -46,23 +46,6 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) } } -/* - * exfat_msg() - print preformated EXFAT specific messages. - * All logs except what uses exfat_fs_error() should be written by exfat_msg() - */ -void exfat_msg(struct super_block *sb, const char *level, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - /* level means KERN_ pacility level */ - printk("%sexFAT-fs (%s): %pV\n", level, sb->s_id, &vaf); - va_end(args); -} - #define SECS_PER_MIN (60) #define TIMEZONE_SEC(x) ((x) * 15 * SECS_PER_MIN) diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 76acc3721951..b617bebc3d0f 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -318,7 +318,6 @@ static int exfat_find_empty_entry(struct inode *inode, unsigned int ret, last_clu; loff_t size = 0; struct exfat_chain clu; - struct exfat_dentry *ep = NULL; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); @@ -383,25 +382,6 @@ static int exfat_find_empty_entry(struct inode *inode, p_dir->size++; size = EXFAT_CLU_TO_B(p_dir->size, sbi); - /* update the directory entry */ - if (p_dir->dir != sbi->root_dir) { - struct buffer_head *bh; - - ep = exfat_get_dentry(sb, - &(ei->dir), ei->entry + 1, &bh); - if (!ep) - return -EIO; - - ep->dentry.stream.valid_size = cpu_to_le64(size); - ep->dentry.stream.size = ep->dentry.stream.valid_size; - ep->dentry.stream.flags = p_dir->flags; - exfat_update_bh(bh, IS_DIRSYNC(inode)); - brelse(bh); - if (exfat_update_dir_chksum(inode, &(ei->dir), - ei->entry)) - return -EIO; - } - /* directory inode should be updated in here */ i_size_write(inode, size); ei->i_size_ondisk += sbi->cluster_size; @@ -462,7 +442,7 @@ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path, return namelen; /* return error value */ if ((lossy && !lookup) || !namelen) - return -EINVAL; + return (lossy & NLS_NAME_OVERLEN) ? -ENAMETOOLONG : -EINVAL; exfat_chain_set(p_dir, ei->start_clu, EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); @@ -1198,7 +1178,9 @@ static int __exfat_rename(struct inode *old_parent_inode, return -ENOENT; } - exfat_chain_dup(&olddir, &ei->dir); + exfat_chain_set(&olddir, EXFAT_I(old_parent_inode)->start_clu, + EXFAT_B_TO_CLU_ROUND_UP(i_size_read(old_parent_inode), sbi), + EXFAT_I(old_parent_inode)->flags); dentry = ei->entry; ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh); diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index ef115e673406..705710f93e2d 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -509,7 +509,7 @@ static int exfat_utf8_to_utf16(struct super_block *sb, } if (unilen > MAX_NAME_LENGTH) { - exfat_err(sb, "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d", + exfat_debug(sb, "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d", __func__, len, unilen, MAX_NAME_LENGTH); return -ENAMETOOLONG; } @@ -671,7 +671,7 @@ static int exfat_load_upcase_table(struct super_block *sb, bh = sb_bread(sb, sector); if (!bh) { - exfat_err(sb, "failed to read sector(0x%llx)\n", + exfat_err(sb, "failed to read sector(0x%llx)", (unsigned long long)sector); ret = -EIO; goto free_table; diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 6a4dfe9f31ee..35f0305cd493 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -464,7 +464,7 @@ static int exfat_read_boot_sector(struct super_block *sb) */ if (p_boot->sect_size_bits < EXFAT_MIN_SECT_SIZE_BITS || p_boot->sect_size_bits > EXFAT_MAX_SECT_SIZE_BITS) { - exfat_err(sb, "bogus sector size bits : %u\n", + exfat_err(sb, "bogus sector size bits : %u", p_boot->sect_size_bits); return -EINVAL; } @@ -473,7 +473,7 @@ static int exfat_read_boot_sector(struct super_block *sb) * sect_per_clus_bits could be at least 0 and at most 25 - sect_size_bits. */ if (p_boot->sect_per_clus_bits > EXFAT_MAX_SECT_PER_CLUS_BITS(p_boot)) { - exfat_err(sb, "bogus sectors bits per cluster : %u\n", + exfat_err(sb, "bogus sectors bits per cluster : %u", p_boot->sect_per_clus_bits); return -EINVAL; } diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 43de293cef56..8f597753ac12 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -200,19 +200,19 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n, int quiet, void **page_addr) { struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) { - *page_addr = kmap_local_page(page); - if (unlikely(!PageChecked(page))) { - if (PageError(page) || !ext2_check_page(page, quiet, - *page_addr)) - goto fail; - } + struct folio *folio = read_mapping_folio(mapping, n, NULL); + + if (IS_ERR(folio)) + return &folio->page; + *page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1)); + if (unlikely(!folio_test_checked(folio))) { + if (!ext2_check_page(&folio->page, quiet, *page_addr)) + goto fail; } - return page; + return &folio->page; fail: - ext2_put_page(page, *page_addr); + ext2_put_page(&folio->page, *page_addr); return ERR_PTR(-EIO); } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index d4f306aa5ace..28de11a22e5f 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -795,7 +795,6 @@ extern const struct file_operations ext2_file_operations; /* inode.c */ extern void ext2_set_file_ops(struct inode *inode); extern const struct address_space_operations ext2_aops; -extern const struct address_space_operations ext2_nobh_aops; extern const struct iomap_ops ext2_iomap_ops; /* namei.c */ diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e6b932219803..918ab2f9e4c0 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -908,25 +908,6 @@ static int ext2_write_end(struct file *file, struct address_space *mapping, return ret; } -static int -ext2_nobh_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) -{ - int ret; - - ret = nobh_write_begin(mapping, pos, len, pagep, fsdata, - ext2_get_block); - if (ret < 0) - ext2_write_failed(mapping, pos + len); - return ret; -} - -static int ext2_nobh_writepage(struct page *page, - struct writeback_control *wbc) -{ - return nobh_writepage(page, ext2_get_block, wbc); -} - static sector_t ext2_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,ext2_get_block); @@ -973,26 +954,11 @@ const struct address_space_operations ext2_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; -const struct address_space_operations ext2_nobh_aops = { - .dirty_folio = block_dirty_folio, - .invalidate_folio = block_invalidate_folio, - .read_folio = ext2_read_folio, - .readahead = ext2_readahead, - .writepage = ext2_nobh_writepage, - .write_begin = ext2_nobh_write_begin, - .write_end = nobh_write_end, - .bmap = ext2_bmap, - .direct_IO = ext2_direct_IO, - .writepages = ext2_writepages, - .migratepage = buffer_migrate_page, - .error_remove_page = generic_error_remove_page, -}; - static const struct address_space_operations ext2_dax_aops = { .writepages = ext2_dax_writepages, .direct_IO = noop_direct_IO, @@ -1298,13 +1264,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); - if (IS_DAX(inode)) { + if (IS_DAX(inode)) error = dax_zero_range(inode, newsize, PAGE_ALIGN(newsize) - newsize, NULL, &ext2_iomap_ops); - } else if (test_opt(inode->i_sb, NOBH)) - error = nobh_truncate_page(inode->i_mapping, - newsize, ext2_get_block); else error = block_truncate_page(inode->i_mapping, newsize, ext2_get_block); @@ -1396,8 +1359,6 @@ void ext2_set_file_ops(struct inode *inode) inode->i_fop = &ext2_file_operations; if (IS_DAX(inode)) inode->i_mapping->a_ops = &ext2_dax_aops; - else if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; else inode->i_mapping->a_ops = &ext2_aops; } @@ -1497,10 +1458,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext2_dir_inode_operations; inode->i_fop = &ext2_dir_operations; - if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; - else - inode->i_mapping->a_ops = &ext2_aops; + inode->i_mapping->a_ops = &ext2_aops; } else if (S_ISLNK(inode->i_mode)) { if (ext2_inode_is_fast_symlink(inode)) { inode->i_link = (char *)ei->i_data; @@ -1510,10 +1468,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) } else { inode->i_op = &ext2_symlink_inode_operations; inode_nohighmem(inode); - if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; - else - inode->i_mapping->a_ops = &ext2_aops; + inode->i_mapping->a_ops = &ext2_aops; } } else { inode->i_op = &ext2_special_inode_operations; @@ -1679,14 +1634,14 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; - if (is_quota_modification(inode, iattr)) { + if (is_quota_modification(mnt_userns, inode, iattr)) { error = dquot_initialize(inode); if (error) return error; } - if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || - (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { - error = dquot_transfer(inode, iattr); + if (i_uid_needs_update(mnt_userns, iattr, inode) || + i_gid_needs_update(mnt_userns, iattr, inode)) { + error = dquot_transfer(mnt_userns, inode, iattr); if (error) return error; } diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 5f6b7560eb3f..5fd9a22d2b70 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -178,10 +178,7 @@ static int ext2_symlink (struct user_namespace * mnt_userns, struct inode * dir, /* slow symlink */ inode->i_op = &ext2_symlink_inode_operations; inode_nohighmem(inode); - if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; - else - inode->i_mapping->a_ops = &ext2_aops; + inode->i_mapping->a_ops = &ext2_aops; err = page_symlink(inode, symname, l); if (err) goto out_fail; @@ -247,10 +244,7 @@ static int ext2_mkdir(struct user_namespace * mnt_userns, inode->i_op = &ext2_dir_inode_operations; inode->i_fop = &ext2_dir_operations; - if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; - else - inode->i_mapping->a_ops = &ext2_aops; + inode->i_mapping->a_ops = &ext2_aops; inode_inc_link_count(inode); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f6a19f6d9f6d..252c742379cf 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -171,7 +171,7 @@ static void ext2_put_super (struct super_block * sb) brelse (sbi->s_sbh); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } @@ -296,9 +296,6 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noacl"); #endif - if (test_opt(sb, NOBH)) - seq_puts(seq, ",nobh"); - if (test_opt(sb, USRQUOTA)) seq_puts(seq, ",usrquota"); @@ -551,7 +548,8 @@ static int parse_options(char *options, struct super_block *sb, clear_opt (opts->s_mount_opt, OLDALLOC); break; case Opt_nobh: - set_opt (opts->s_mount_opt, NOBH); + ext2_msg(sb, KERN_INFO, + "nobh option not supported"); break; #ifdef CONFIG_EXT2_FS_XATTR case Opt_user_xattr: @@ -835,7 +833,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; - sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, + NULL, NULL); spin_lock_init(&sbi->s_lock); ret = -EINVAL; @@ -1059,9 +1058,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_frags_per_group); goto failed_mount; } - if (sbi->s_inodes_per_group > sb->s_blocksize * 8) { + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext2_msg(sb, KERN_ERR, - "error: #inodes per group too big: %lu", + "error: invalid #inodes per group: %lu", sbi->s_inodes_per_group); goto failed_mount; } @@ -1071,6 +1071,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - le32_to_cpu(es->s_first_data_block) - 1) / EXT2_BLOCKS_PER_GROUP(sb)) + 1; + if ((u64)sbi->s_groups_count * sbi->s_inodes_per_group != + le32_to_cpu(es->s_inodes_count)) { + ext2_msg(sb, KERN_ERR, "error: invalid #inodes: %u vs computed %llu", + le32_to_cpu(es->s_inodes_count), + (u64)sbi->s_groups_count * sbi->s_inodes_per_group); + goto failed_mount; + } db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc_array(db_count, @@ -1204,7 +1211,7 @@ failed_mount_group_desc: failed_mount: brelse(bh); failed_sbi: - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); kfree(sbi); @@ -1490,8 +1497,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, len = i_size-off; toread = len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; + tocopy = min_t(size_t, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; tmp_bh.b_size = sb->s_blocksize; @@ -1529,8 +1535,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, struct buffer_head *bh; while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; + tocopy = min_t(size_t, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; tmp_bh.b_size = sb->s_blocksize; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 841fa6d9d744..641abfa4b718 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -517,36 +517,36 @@ bad_block: /* Here we know that we can set the new attribute. */ if (header) { - /* assert(header == HDR(bh)); */ + int offset; + lock_buffer(bh); if (header->h_refcount == cpu_to_le32(1)) { __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *oe; - ea_bdebug(bh, "modifying in-place"); + oe = mb_cache_entry_delete_or_get(EA_BLOCK_CACHE(inode), + hash, bh->b_blocknr); + if (!oe) { + ea_bdebug(bh, "modifying in-place"); + goto update_block; + } /* - * This must happen under buffer lock for - * ext2_xattr_set2() to reliably detect modified block + * Someone is trying to reuse the block, leave it alone */ - mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash, - bh->b_blocknr); - - /* keep the buffer locked while modifying it. */ - } else { - int offset; - - unlock_buffer(bh); - ea_bdebug(bh, "cloning"); - header = kmemdup(HDR(bh), bh->b_size, GFP_KERNEL); - error = -ENOMEM; - if (header == NULL) - goto cleanup; - header->h_refcount = cpu_to_le32(1); - - offset = (char *)here - bh->b_data; - here = ENTRY((char *)header + offset); - offset = (char *)last - bh->b_data; - last = ENTRY((char *)header + offset); + mb_cache_entry_put(EA_BLOCK_CACHE(inode), oe); } + unlock_buffer(bh); + ea_bdebug(bh, "cloning"); + header = kmemdup(HDR(bh), bh->b_size, GFP_KERNEL); + error = -ENOMEM; + if (header == NULL) + goto cleanup; + header->h_refcount = cpu_to_le32(1); + + offset = (char *)here - bh->b_data; + here = ENTRY((char *)header + offset); + offset = (char *)last - bh->b_data; + last = ENTRY((char *)header + offset); } else { /* Allocate a buffer where we construct the new block. */ header = kzalloc(sb->s_blocksize, GFP_KERNEL); @@ -559,6 +559,7 @@ bad_block: last = here = ENTRY(header+1); } +update_block: /* Iff we are modifying the block in-place, bh is locked here. */ if (not_found) { @@ -651,6 +652,55 @@ cleanup: return error; } +static void ext2_xattr_release_block(struct inode *inode, + struct buffer_head *bh) +{ + struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); + +retry_ref: + lock_buffer(bh); + if (HDR(bh)->h_refcount == cpu_to_le32(1)) { + __u32 hash = le32_to_cpu(HDR(bh)->h_hash); + struct mb_cache_entry *oe; + + /* + * This must happen under buffer lock to properly + * serialize with ext2_xattr_set() reusing the block. + */ + oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, + bh->b_blocknr); + if (oe) { + /* + * Someone is trying to reuse the block. Wait + * and retry. + */ + unlock_buffer(bh); + mb_cache_entry_wait_unused(oe); + mb_cache_entry_put(ea_block_cache, oe); + goto retry_ref; + } + + /* Free the old block. */ + ea_bdebug(bh, "freeing"); + ext2_free_blocks(inode, bh->b_blocknr, 1); + /* We let our caller release bh, so we + * need to duplicate the buffer before. */ + get_bh(bh); + bforget(bh); + unlock_buffer(bh); + } else { + /* Decrement the refcount only. */ + le32_add_cpu(&HDR(bh)->h_refcount, -1); + dquot_free_block(inode, 1); + mark_buffer_dirty(bh); + unlock_buffer(bh); + ea_bdebug(bh, "refcount now=%d", + le32_to_cpu(HDR(bh)->h_refcount)); + if (IS_SYNC(inode)) + sync_dirty_buffer(bh); + } +} + /* * Second half of ext2_xattr_set(): Update the file system. */ @@ -747,34 +797,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, * If there was an old block and we are no longer using it, * release the old block. */ - lock_buffer(old_bh); - if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { - __u32 hash = le32_to_cpu(HDR(old_bh)->h_hash); - - /* - * This must happen under buffer lock for - * ext2_xattr_set2() to reliably detect freed block - */ - mb_cache_entry_delete(ea_block_cache, hash, - old_bh->b_blocknr); - /* Free the old block. */ - ea_bdebug(old_bh, "freeing"); - ext2_free_blocks(inode, old_bh->b_blocknr, 1); - mark_inode_dirty(inode); - /* We let our caller release old_bh, so we - * need to duplicate the buffer before. */ - get_bh(old_bh); - bforget(old_bh); - } else { - /* Decrement the refcount only. */ - le32_add_cpu(&HDR(old_bh)->h_refcount, -1); - dquot_free_block_nodirty(inode, 1); - mark_inode_dirty(inode); - mark_buffer_dirty(old_bh); - ea_bdebug(old_bh, "refcount now=%d", - le32_to_cpu(HDR(old_bh)->h_refcount)); - } - unlock_buffer(old_bh); + ext2_xattr_release_block(inode, old_bh); } cleanup: @@ -828,30 +851,7 @@ ext2_xattr_delete_inode(struct inode *inode) EXT2_I(inode)->i_file_acl); goto cleanup; } - lock_buffer(bh); - if (HDR(bh)->h_refcount == cpu_to_le32(1)) { - __u32 hash = le32_to_cpu(HDR(bh)->h_hash); - - /* - * This must happen under buffer lock for ext2_xattr_set2() to - * reliably detect freed block - */ - mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash, - bh->b_blocknr); - ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); - get_bh(bh); - bforget(bh); - unlock_buffer(bh); - } else { - le32_add_cpu(&HDR(bh)->h_refcount, -1); - ea_bdebug(bh, "refcount now=%d", - le32_to_cpu(HDR(bh)->h_refcount)); - unlock_buffer(bh); - mark_buffer_dirty(bh); - if (IS_SYNC(inode)) - sync_dirty_buffer(bh); - dquot_free_block_nodirty(inode, 1); - } + ext2_xattr_release_block(inode, bh); EXT2_I(inode)->i_file_acl = 0; cleanup: @@ -943,7 +943,7 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -again: + ce = mb_cache_entry_find_first(ea_block_cache, hash); while (ce) { struct buffer_head *bh; @@ -955,22 +955,8 @@ again: inode->i_ino, (unsigned long) ce->e_value); } else { lock_buffer(bh); - /* - * We have to be careful about races with freeing or - * rehashing of xattr block. Once we hold buffer lock - * xattr block's state is stable so we can check - * whether the block got freed / rehashed or not. - * Since we unhash mbcache entry under buffer lock when - * freeing / rehashing xattr block, checking whether - * entry is still hashed is reliable. - */ - if (hlist_bl_unhashed(&ce->e_hash_list)) { - mb_cache_entry_put(ea_block_cache, ce); - unlock_buffer(bh); - brelse(bh); - goto again; - } else if (le32_to_cpu(HDR(bh)->h_refcount) > - EXT2_XATTR_REFCOUNT_MAX) { + if (le32_to_cpu(HDR(bh)->h_refcount) > + EXT2_XATTR_REFCOUNT_MAX) { ea_idebug(inode, "block %ld refcount %d>%d", (unsigned long) ce->e_value, le32_to_cpu(HDR(bh)->h_refcount), diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 78ee3ef795ae..8ff4b9192a9f 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -666,7 +666,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * it's possible we've just missed a transaction commit here, * so ignore the returned status */ - jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id); (void) jbd2_journal_force_commit_nested(sbi->s_journal); return 1; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 75b8d81b2469..9bca5565547b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -724,6 +724,8 @@ enum { #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) #define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) +#define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid) +#define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid) #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) @@ -753,6 +755,15 @@ enum { EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \ EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) +/* + * Structure for EXT4_IOC_GETFSUUID/EXT4_IOC_SETFSUUID + */ +struct fsuuid { + __u32 fsu_len; + __u32 fsu_flags; + __u8 fsu_uuid[]; +}; + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation @@ -3016,7 +3027,7 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); -int ext4_update_overhead(struct super_block *sb); +int ext4_update_overhead(struct super_block *sb, bool force); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); @@ -3058,14 +3069,14 @@ extern unsigned int ext4_list_backups(struct super_block *sb, /* super.c */ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, - sector_t block, int op_flags); + sector_t block, blk_opf_t op_flags); extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block); -extern void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags, +extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io); -extern int ext4_read_bh(struct buffer_head *bh, int op_flags, +extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io); -extern int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait); +extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); @@ -3583,6 +3594,7 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, extern int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int *has_inline, __u64 start, __u64 len); +extern void *ext4_read_inline_link(struct inode *inode); struct iomap; extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); @@ -3799,7 +3811,7 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; extern int ext4_resize_begin(struct super_block *sb); -extern void ext4_resize_end(struct super_block *sb); +extern int ext4_resize_end(struct super_block *sb, bool update_backups); static inline void ext4_set_io_unwritten_flag(struct inode *inode, struct ext4_io_end *io_end) diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 3477a16d08ae..8e1fb18f465e 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -267,8 +267,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, trace_ext4_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); - jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " - "data mode %x\n", + ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n", bh, is_metadata, inode->i_mode, test_opt(inode->i_sb, DATA_FLAGS)); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 9a3a8996aacf..23167efda95e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1654,7 +1654,8 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) sbi->s_es_shrinker.scan_objects = ext4_es_scan; sbi->s_es_shrinker.count_objects = ext4_es_count; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; - err = register_shrinker(&sbi->s_es_shrinker); + err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s", + sbi->s_sb->s_id); if (err) goto err4; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 795a60ad1897..2af962cbb835 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -658,7 +658,7 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { - int write_flags = REQ_SYNC; + blk_opf_t write_flags = REQ_SYNC; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; /* Add REQ_FUA | REQ_PREFLUSH only its tail */ @@ -668,7 +668,7 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) set_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = ext4_end_buffer_io_sync; - submit_bh(REQ_OP_WRITE, write_flags, bh); + submit_bh(REQ_OP_WRITE | write_flags, bh); EXT4_SB(sb)->s_fc_bh = NULL; } @@ -917,8 +917,8 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) mutex_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; - jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n", - __func__, cur_lblk_off, new_blk_size, inode->i_ino); + ext4_debug("will try writing %d to %d for inode %ld\n", + cur_lblk_off, new_blk_size, inode->i_ino); while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; @@ -1168,7 +1168,7 @@ static void ext4_fc_update_stats(struct super_block *sb, int status, { struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; - jbd_debug(1, "Fast commit ended with status = %d for tid %u", + ext4_debug("Fast commit ended with status = %d for tid %u", status, commit_tid); if (status == EXT4_FC_STATUS_OK) { stats->fc_num_commits++; @@ -1375,14 +1375,14 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode %d not found", darg.ino); + ext4_debug("Inode %d not found", darg.ino); return 0; } old_parent = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(old_parent)) { - jbd_debug(1, "Dir with inode %d not found", darg.parent_ino); + ext4_debug("Dir with inode %d not found", darg.parent_ino); iput(inode); return 0; } @@ -1407,21 +1407,21 @@ static int ext4_fc_replay_link_internal(struct super_block *sb, dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { - jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino); + ext4_debug("Dir with inode %d not found.", darg->parent_ino); dir = NULL; goto out; } dentry_dir = d_obtain_alias(dir); if (IS_ERR(dentry_dir)) { - jbd_debug(1, "Failed to obtain dentry"); + ext4_debug("Failed to obtain dentry"); dentry_dir = NULL; goto out; } dentry_inode = d_alloc(dentry_dir, &qstr_dname); if (!dentry_inode) { - jbd_debug(1, "Inode dentry not created."); + ext4_debug("Inode dentry not created."); ret = -ENOMEM; goto out; } @@ -1434,7 +1434,7 @@ static int ext4_fc_replay_link_internal(struct super_block *sb, * could complete. */ if (ret && ret != -EEXIST) { - jbd_debug(1, "Failed to link\n"); + ext4_debug("Failed to link\n"); goto out; } @@ -1468,7 +1468,7 @@ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl, inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode not found."); + ext4_debug("Inode not found."); return 0; } @@ -1576,7 +1576,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, /* Given that we just wrote the inode on disk, this SHOULD succeed. */ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode not found."); + ext4_debug("Inode not found."); return -EFSCORRUPTED; } @@ -1630,7 +1630,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "inode %d not found.", darg.ino); + ext4_debug("inode %d not found.", darg.ino); inode = NULL; ret = -EINVAL; goto out; @@ -1643,7 +1643,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, */ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { - jbd_debug(1, "Dir %d not found.", darg.ino); + ext4_debug("Dir %d not found.", darg.ino); goto out; } ret = ext4_init_new_dir(NULL, dir, inode); @@ -1727,7 +1727,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode not found."); + ext4_debug("Inode not found."); return 0; } @@ -1741,7 +1741,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, cur = start; remaining = len; - jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", + ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", start, start_pblk, len, ext4_ext_is_unwritten(ex), inode->i_ino); @@ -1802,7 +1802,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, } /* Range is mapped and needs a state change */ - jbd_debug(1, "Converting from %ld to %d %lld", + ext4_debug("Converting from %ld to %d %lld", map.m_flags & EXT4_MAP_UNWRITTEN, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, @@ -1845,7 +1845,7 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino)); + ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); return 0; } @@ -1853,7 +1853,7 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, if (ret) goto out; - jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", + ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_len)); while (remaining > 0) { @@ -1902,7 +1902,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) inode = ext4_iget(sb, state->fc_modified_inodes[i], EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode %d not found.", + ext4_debug("Inode %d not found.", state->fc_modified_inodes[i]); continue; } @@ -2031,7 +2031,7 @@ static int ext4_fc_replay_scan(journal_t *journal, for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { memcpy(&tl, cur, sizeof(tl)); val = cur + sizeof(tl); - jbd_debug(3, "Scan phase, tag:%s, blk %lld\n", + ext4_debug("Scan phase, tag:%s, blk %lld\n", tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr); switch (le16_to_cpu(tl.fc_tag)) { case EXT4_FC_TAG_ADD_RANGE: @@ -2126,7 +2126,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, sbi->s_mount_state |= EXT4_FC_REPLAY; } if (!sbi->s_fc_replay_state.fc_replay_num_tags) { - jbd_debug(1, "Replay stops\n"); + ext4_debug("Replay stops\n"); ext4_fc_set_bitmaps_and_counters(sb); return 0; } @@ -2150,7 +2150,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, ext4_fc_set_bitmaps_and_counters(sb); break; } - jbd_debug(3, "Replay phase, tag:%s\n", + ext4_debug("Replay phase, tag:%s\n", tag2str(le16_to_cpu(tl.fc_tag))); state->fc_replay_num_tags--; switch (le16_to_cpu(tl.fc_tag)) { diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 07a8c75b65ed..860fc5119009 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -460,7 +460,7 @@ static int ext4_splice_branch(handle_t *handle, * the new i_size. But that is not done here - it is done in * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. */ - jbd_debug(5, "splicing indirect only\n"); + ext4_debug("splicing indirect only\n"); BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); if (err) @@ -472,7 +472,7 @@ static int ext4_splice_branch(handle_t *handle, err = ext4_mark_inode_dirty(handle, ar->inode); if (unlikely(err)) goto err_out; - jbd_debug(5, "splicing direct\n"); + ext4_debug("splicing direct\n"); } return err; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index cff52ff6549d..a4fbe825694b 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -6,6 +6,7 @@ #include <linux/iomap.h> #include <linux/fiemap.h> +#include <linux/namei.h> #include <linux/iversion.h> #include <linux/sched/mm.h> @@ -35,6 +36,9 @@ static int get_max_inline_xattr_value_size(struct inode *inode, struct ext4_inode *raw_inode; int free, min_offs; + if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) + return 0; + min_offs = EXT4_SB(inode->i_sb)->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE - EXT4_I(inode)->i_extra_isize - @@ -1588,6 +1592,35 @@ out: return ret; } +void *ext4_read_inline_link(struct inode *inode) +{ + struct ext4_iloc iloc; + int ret, inline_size; + void *link; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ERR_PTR(ret); + + ret = -ENOMEM; + inline_size = ext4_get_inline_size(inode); + link = kmalloc(inline_size + 1, GFP_NOFS); + if (!link) + goto out; + + ret = ext4_read_inline_data(inode, link, inline_size, &iloc); + if (ret < 0) { + kfree(link); + goto out; + } + nd_terminate_link(link, inode->i_size, ret); +out: + if (ret < 0) + link = ERR_PTR(ret); + brelse(iloc.bh); + return link; +} + struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 84c0eb55071d..601214453c3a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -177,6 +177,8 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); + if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) + ext4_evict_ea_inode(inode); if (inode->i_nlink) { /* * When journalling data dirty buffers are tracked only in the @@ -1554,9 +1556,9 @@ struct mpage_da_data { static void mpage_release_unused_pages(struct mpage_da_data *mpd, bool invalidate) { - int nr_pages, i; + unsigned nr, i; pgoff_t index, end; - struct pagevec pvec; + struct folio_batch fbatch; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; @@ -1571,18 +1573,28 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, ext4_lblk_t start, last; start = index << (PAGE_SHIFT - inode->i_blkbits); last = end << (PAGE_SHIFT - inode->i_blkbits); + + /* + * avoid racing with extent status tree scans made by + * ext4_insert_delayed_block() + */ + down_write(&EXT4_I(inode)->i_data_sem); ext4_es_remove_extent(inode, start, last - start + 1); + up_write(&EXT4_I(inode)->i_data_sem); } - pagevec_init(&pvec); + folio_batch_init(&fbatch); while (index <= end) { - nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); - if (nr_pages == 0) + nr = filemap_get_folios(mapping, &index, end, &fbatch); + if (nr == 0) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct folio *folio = page_folio(page); + for (i = 0; i < nr; i++) { + struct folio *folio = fbatch.folios[i]; + if (folio->index < mpd->first_page) + continue; + if (folio->index + folio_nr_pages(folio) - 1 > end) + continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (invalidate) { @@ -1594,7 +1606,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, } folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); } } @@ -2311,8 +2323,8 @@ out: */ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) { - struct pagevec pvec; - int nr_pages, i; + struct folio_batch fbatch; + unsigned nr, i; struct inode *inode = mpd->inode; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; @@ -2326,14 +2338,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) lblk = start << bpp_bits; pblock = mpd->map.m_pblk; - pagevec_init(&pvec); + folio_batch_init(&fbatch); while (start <= end) { - nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, - &start, end); - if (nr_pages == 0) + nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch); + if (nr == 0) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr; i++) { + struct page *page = &fbatch.folios[i]->page; err = mpage_process_page(mpd, page, &lblk, &pblock, &map_bh); @@ -2349,14 +2360,14 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (err < 0) goto out; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); } /* Extent fully mapped and matches with page boundary. We are done. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; out: - pagevec_release(&pvec); + folio_batch_release(&fbatch); return err; } @@ -3140,13 +3151,15 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; journal_t *journal; + sector_t ret = 0; int err; + inode_lock_shared(inode); /* * We can get here for an inline file via the FIBMAP ioctl */ if (ext4_has_inline_data(inode)) - return 0; + goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && test_opt(inode->i_sb, DELALLOC)) { @@ -3185,10 +3198,14 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) jbd2_journal_unlock_updates(journal); if (err) - return 0; + goto out; } - return iomap_bmap(mapping, block, &ext4_iomap_ops); + ret = iomap_bmap(mapping, block, &ext4_iomap_ops); + +out: + inode_unlock_shared(inode); + return ret; } static int ext4_read_folio(struct file *file, struct folio *folio) @@ -3631,7 +3648,7 @@ static const struct address_space_operations ext4_aops = { .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, @@ -3666,7 +3683,7 @@ static const struct address_space_operations ext4_da_aops = { .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, @@ -4685,8 +4702,7 @@ static inline int ext4_iget_extra_inode(struct inode *inode, __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; - if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <= - EXT4_INODE_SIZE(inode->i_sb) && + if (EXT4_INODE_HAS_XATTR_SPACE(inode) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); return ext4_find_inline_data_nolock(inode); @@ -5213,7 +5229,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { - jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + ext4_debug("called recursively, non-PF_MEMALLOC!\n"); dump_stack(); return -EIO; } @@ -5350,14 +5366,14 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } - if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || - (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { + if (i_uid_needs_update(mnt_userns, attr, inode) || + i_gid_needs_update(mnt_userns, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, @@ -5374,7 +5390,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); - error = dquot_transfer(inode, attr); + error = dquot_transfer(mnt_userns, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { @@ -5383,10 +5399,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Update corresponding info in inode so that everything is in * one transaction */ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index cb01c1da0f9d..3cf3ec4b1c21 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -20,6 +20,7 @@ #include <linux/delay.h> #include <linux/iversion.h> #include <linux/fileattr.h> +#include <linux/uuid.h> #include "ext4_jbd2.h" #include "ext4.h" #include <linux/fsmap.h> @@ -41,6 +42,15 @@ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX); } +/* + * Superblock modification callback function for changing file system + * UUID. + */ +static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg) +{ + memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE); +} + static int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, ext4_update_sb_callback func, @@ -944,7 +954,9 @@ static long ext4_ioctl_group_add(struct file *file, test_opt(sb, INIT_INODE_TABLE)) err = ext4_register_li_request(sb, input->group); group_add_out: - ext4_resize_end(sb); + err2 = ext4_resize_end(sb, false); + if (err == 0) + err = err2; return err; } @@ -1131,6 +1143,73 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label return 0; } +static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi, + struct fsuuid __user *ufsuuid) +{ + struct fsuuid fsuuid; + __u8 uuid[UUID_SIZE]; + + if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid))) + return -EFAULT; + + if (fsuuid.fsu_len == 0) { + fsuuid.fsu_len = UUID_SIZE; + if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid.fsu_len))) + return -EFAULT; + return -EINVAL; + } + + if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) + return -EINVAL; + + lock_buffer(sbi->s_sbh); + memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE); + unlock_buffer(sbi->s_sbh); + + if (copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) + return -EFAULT; + return 0; +} + +static int ext4_ioctl_setuuid(struct file *filp, + const struct fsuuid __user *ufsuuid) +{ + int ret = 0; + struct super_block *sb = file_inode(filp)->i_sb; + struct fsuuid fsuuid; + __u8 uuid[UUID_SIZE]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * If any checksums (group descriptors or metadata) are being used + * then the checksum seed feature is required to change the UUID. + */ + if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb)) + && !ext4_has_feature_csum_seed(sb)) + || ext4_has_feature_stable_inodes(sb)) + return -EOPNOTSUPP; + + if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid))) + return -EFAULT; + + if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) + return -EINVAL; + + if (copy_from_user(uuid, &ufsuuid->fsu_uuid[0], UUID_SIZE)) + return -EFAULT; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setuuid, &uuid); + mnt_drop_write_file(filp); + + return ret; +} + static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1223,7 +1302,9 @@ setversion_out: err = err2; mnt_drop_write_file(filp); group_extend_out: - ext4_resize_end(sb); + err2 = ext4_resize_end(sb, false); + if (err == 0) + err = err2; return err; } @@ -1371,7 +1452,9 @@ mext_out: err = ext4_register_li_request(sb, o_group); resizefs_out: - ext4_resize_end(sb); + err2 = ext4_resize_end(sb, true); + if (err == 0) + err = err2; return err; } @@ -1509,6 +1592,10 @@ resizefs_out: return ext4_ioctl_setlabel(filp, (const void __user *)arg); + case EXT4_IOC_GETFSUUID: + return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg); + case EXT4_IOC_SETFSUUID: + return ext4_ioctl_setuuid(filp, (const void __user *)arg); default: return -ENOTTY; } @@ -1586,6 +1673,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_CHECKPOINT: case FS_IOC_GETFSLABEL: case FS_IOC_SETFSLABEL: + case EXT4_IOC_GETFSUUID: + case EXT4_IOC_SETFSUUID: break; default: return -ENOIOCTLCMD; @@ -1599,13 +1688,15 @@ static void set_overhead(struct ext4_super_block *es, const void *arg) es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); } -int ext4_update_overhead(struct super_block *sb) +int ext4_update_overhead(struct super_block *sb, bool force) { struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sb_rdonly(sb) || sbi->s_overhead == 0 || - sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters)) + if (sb_rdonly(sb)) + return 0; + if (!force && + (sbi->s_overhead == 0 || + sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters))) return 0; - return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 9e06334771a3..bd8f8b5c3d30 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1933,6 +1933,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) unsigned ret = 0; int len0 = len; void *buddy; + bool split = false; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); @@ -1957,12 +1958,16 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain buddy itself */ while (len) { - ord = mb_find_order_for_block(e4b, start); + if (!split) + ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! */ mlen = 1 << ord; - buddy = mb_find_buddy(e4b, ord, &max); + if (!split) + buddy = mb_find_buddy(e4b, ord, &max); + else + split = false; BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; @@ -1989,6 +1994,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) mb_clear_bit(cur + 1, buddy); e4b->bd_info->bb_counters[ord]++; e4b->bd_info->bb_counters[ord]++; + split = true; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); @@ -5928,6 +5934,15 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, sbi = EXT4_SB(sb); + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_inode_block_valid(inode, block, count)) { + ext4_error(sb, "Freeing blocks in system zone - " + "Block = %llu, count = %lu", block, count); + /* err = 0. ext4_std_error should be a no op */ + goto error_return; + } + flags |= EXT4_FREE_BLOCKS_VALIDATED; + do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -5944,6 +5959,8 @@ do_more: overflow = EXT4_C2B(sbi, bit) + count - EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; + /* The range changed so it's no longer validated */ + flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } count_clusters = EXT4_NUM_B2C(sbi, count); bitmap_bh = ext4_read_block_bitmap(sb, block_group); @@ -5958,7 +5975,8 @@ do_more: goto error_return; } - if (!ext4_inode_block_valid(inode, block, count)) { + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ @@ -6081,6 +6099,8 @@ do_more: block += count; count = overflow; put_bh(bitmap_bh); + /* The range changed so it's no longer validated */ + flags &= ~EXT4_FREE_BLOCKS_VALIDATED; goto do_more; } error_return: @@ -6127,6 +6147,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, "block = %llu, count = %lu", block, count); return; } + flags |= EXT4_FREE_BLOCKS_VALIDATED; ext4_debug("freeing block %llu\n", block); trace_ext4_free_blocks(inode, block, count, flags); @@ -6158,6 +6179,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, block -= overflow; count += overflow; } + /* The range changed so it's no longer validated */ + flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { @@ -6168,6 +6191,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, return; } else count += sbi->s_cluster_ratio - overflow; + /* The range changed so it's no longer validated */ + flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 42f590518b4c..54e7d3c95fd7 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -417,7 +417,7 @@ int ext4_ext_migrate(struct inode *inode) struct inode *tmp_inode = NULL; struct migrate_struct lb; unsigned long max_entries; - __u32 goal; + __u32 goal, tmp_csum_seed; uid_t owner[2]; /* @@ -465,6 +465,7 @@ int ext4_ext_migrate(struct inode *inode) * the migration. */ ei = EXT4_I(inode); + tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed; EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed; i_size_write(tmp_inode, i_size_read(inode)); /* @@ -575,6 +576,7 @@ err_out: * the inode is not visible to user space. */ tmp_inode->i_blocks = 0; + EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed; /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 79d05e464c43..9af68a7ecdcf 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) @@ -150,8 +150,6 @@ static int kmmpd(void *data) mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, EXT4_MMP_MIN_CHECK_INTERVAL); mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); - BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE); - bdevname(bh->b_bdev, mmp->mmp_bdevname); memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); @@ -372,13 +370,16 @@ skip: EXT4_SB(sb)->s_mmp_bh = bh; + BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE); + snprintf(mmp->mmp_bdevname, sizeof(mmp->mmp_bdevname), + "%pg", bh->b_bdev); + /* * Start a kernel thread to update the MMP block periodically. */ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s", (int)sizeof(mmp->mmp_bdevname), - bdevname(bh->b_bdev, - mmp->mmp_bdevname)); + mmp->mmp_bdevname); if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { EXT4_SB(sb)->s_mmp_tsk = NULL; ext4_warning(sb, "Unable to create kmmpd thread for %s.", diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index db4ba99d1ceb..3a31b662f661 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -54,6 +54,7 @@ static struct buffer_head *ext4_append(handle_t *handle, struct inode *inode, ext4_lblk_t *block) { + struct ext4_map_blocks map; struct buffer_head *bh; int err; @@ -63,6 +64,21 @@ static struct buffer_head *ext4_append(handle_t *handle, return ERR_PTR(-ENOSPC); *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + map.m_lblk = *block; + map.m_len = 1; + + /* + * We're appending new directory block. Make sure the block is not + * allocated yet, otherwise we will end up corrupting the + * directory. + */ + err = ext4_map_blocks(NULL, inode, &map, 0); + if (err < 0) + return ERR_PTR(err); + if (err) { + EXT4_ERROR_INODE(inode, "Logical block already allocated"); + return ERR_PTR(-EFSCORRUPTED); + } bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); if (IS_ERR(bh)) @@ -110,6 +126,13 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, struct ext4_dir_entry *dirent; int is_dx_block = 0; + if (block >= inode->i_size) { + ext4_error_inode(inode, func, line, block, + "Attempting to read directory block (%u) that is past i_size (%llu)", + block, inode->i_size); + return ERR_PTR(-EFSCORRUPTED); + } + if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO)) bh = ERR_PTR(-EIO); else @@ -3067,11 +3090,8 @@ bool ext4_empty_dir(struct inode *inode) de = (struct ext4_dir_entry_2 *) (bh->b_data + (offset & (sb->s_blocksize - 1))); if (ext4_check_dir_entry(inode, NULL, de, bh, - bh->b_data, bh->b_size, offset)) { - offset = (offset | (sb->s_blocksize - 1)) + 1; - continue; - } - if (le32_to_cpu(de->inode)) { + bh->b_data, bh->b_size, offset) || + le32_to_cpu(de->inode)) { brelse(bh); return false; } diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 7de0612eb42d..69a9cf9137a6 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -181,8 +181,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) } else brelse(iloc.bh); - jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); - jbd_debug(4, "orphan inode %lu will point to %d\n", + ext4_debug("superblock will point to %lu\n", inode->i_ino); + ext4_debug("orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out: ext4_std_error(sb, err); @@ -251,7 +251,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) } mutex_lock(&sbi->s_orphan_lock); - jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + ext4_debug("remove inode %lu from orphan list\n", inode->i_ino); prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); @@ -267,7 +267,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) ino_next = NEXT_ORPHAN(inode); if (prev == &sbi->s_orphan) { - jbd_debug(4, "superblock will point to %u\n", ino_next); + ext4_debug("superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, sbi->s_sbh, EXT4_JTR_NONE); @@ -286,7 +286,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; - jbd_debug(4, "orphan inode %lu will point to %u\n", + ext4_debug("orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) { @@ -332,8 +332,8 @@ static void ext4_process_orphan(struct inode *inode, ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %lu to %lld bytes", __func__, inode->i_ino, inode->i_size); - jbd_debug(2, "truncating inode %lu to %lld bytes\n", - inode->i_ino, inode->i_size); + ext4_debug("truncating inode %lu to %lld bytes\n", + inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ret = ext4_truncate(inode); @@ -353,8 +353,8 @@ static void ext4_process_orphan(struct inode *inode, ext4_msg(sb, KERN_DEBUG, "%s: deleting unreferenced inode %lu", __func__, inode->i_ino); - jbd_debug(2, "deleting unreferenced inode %lu\n", - inode->i_ino); + ext4_debug("deleting unreferenced inode %lu\n", + inode->i_ino); (*nr_orphans)++; } iput(inode); /* The delete magic happens here! */ @@ -391,7 +391,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!es->s_last_orphan && !oi->of_blocks) { - jbd_debug(4, "no orphan inodes to clean up\n"); + ext4_debug("no orphan inodes to clean up\n"); return; } @@ -415,7 +415,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) "clearing orphan list.\n"); es->s_last_orphan = 0; } - jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + ext4_debug("Skipping orphan recovery on fs with errors.\n"); return; } @@ -459,7 +459,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) * so, skip the rest. */ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + ext4_debug("Skipping orphan recovery on fs with errors.\n"); es->s_last_orphan = 0; break; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 8b70a4701293..fea2a68d067b 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -97,10 +97,13 @@ int ext4_resize_begin(struct super_block *sb) return ret; } -void ext4_resize_end(struct super_block *sb) +int ext4_resize_end(struct super_block *sb, bool update_backups) { clear_bit_unlock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags); smp_mb__after_atomic(); + if (update_backups) + return ext4_update_overhead(sb, true); + return 0; } static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, @@ -1380,6 +1383,17 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, return err; } +static void ext4_add_overhead(struct super_block *sb, + const ext4_fsblk_t overhead) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + sbi->s_overhead += overhead; + es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); + smp_wmb(); +} + /* * ext4_update_super() updates the super block so that the newly added * groups can be seen by the filesystem. @@ -1481,9 +1495,18 @@ static void ext4_update_super(struct super_block *sb, } /* - * Update the fs overhead information + * Update the fs overhead information. + * + * For bigalloc, if the superblock already has a properly calculated + * overhead, update it with a value based on numbers already computed + * above for the newly allocated capacity. */ - ext4_calculate_overhead(sb); + if (ext4_has_feature_bigalloc(sb) && (sbi->s_overhead != 0)) + ext4_add_overhead(sb, + EXT4_NUM_B2C(sbi, blocks_count - free_blocks)); + else + ext4_calculate_overhead(sb); + es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: added group %u:" @@ -1988,6 +2011,16 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) } brelse(bh); + /* + * For bigalloc, trim the requested size to the nearest cluster + * boundary to avoid creating an unusable filesystem. We do this + * silently, instead of returning an error, to avoid breaking + * callers that blindly resize the filesystem to the full size of + * the underlying block device. + */ + if (ext4_has_feature_bigalloc(sb)) + n_blocks_count &= ~((1 << EXT4_CLUSTER_BITS(sb)) - 1); + retry: o_blocks_count = ext4_blocks_count(es); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 845f2f8aee5f..9a66abcca1a8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -159,7 +159,7 @@ MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) -static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags, +static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { /* @@ -171,10 +171,10 @@ static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags, bh->b_end_io = end_io ? end_io : end_buffer_read_sync; get_bh(bh); - submit_bh(REQ_OP_READ, op_flags, bh); + submit_bh(REQ_OP_READ | op_flags, bh); } -void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags, +void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { BUG_ON(!buffer_locked(bh)); @@ -186,7 +186,7 @@ void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags, __ext4_read_bh(bh, op_flags, end_io); } -int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io) +int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { BUG_ON(!buffer_locked(bh)); @@ -203,7 +203,7 @@ int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io) return -EIO; } -int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait) +int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { if (trylock_buffer(bh)) { if (wait) @@ -227,8 +227,8 @@ int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait) * return. */ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, - sector_t block, int op_flags, - gfp_t gfp) + sector_t block, + blk_opf_t op_flags, gfp_t gfp) { struct buffer_head *bh; int ret; @@ -248,7 +248,7 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, } struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, - int op_flags) + blk_opf_t op_flags) { return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE); } @@ -1307,7 +1307,7 @@ static void ext4_put_super(struct super_block *sb) if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); @@ -3011,6 +3011,15 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, } else if (test_opt2(sb, DAX_INODE)) { SEQ_OPTS_PUTS("dax=inode"); } + + if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD && + !test_opt2(sb, MB_OPTIMIZE_SCAN)) { + SEQ_OPTS_PUTS("mb_optimize_scan=0"); + } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD && + test_opt2(sb, MB_OPTIMIZE_SCAN)) { + SEQ_OPTS_PUTS("mb_optimize_scan=1"); + } + ext4_show_quota_options(seq, sb); return 0; } @@ -4272,7 +4281,7 @@ static void ext4_free_sbi(struct ext4_sb_info *sbi) return; kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } @@ -4284,7 +4293,8 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) if (!sbi) return NULL; - sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, + NULL, NULL); sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); @@ -4296,7 +4306,7 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) sbi->s_sb = sb; return sbi; err_out: - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); return NULL; } @@ -5523,7 +5533,7 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) "Quota mode: %s.", descr, ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ - ext4_update_overhead(sb); + ext4_update_overhead(sb, false); return 0; free_sbi: @@ -5585,7 +5595,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, return NULL; } - jbd_debug(2, "Journal inode found at %p: %lld bytes\n", + ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); @@ -5939,8 +5949,8 @@ static int ext4_commit_super(struct super_block *sb) /* Clear potential dirty bit if it was journalled update */ clear_buffer_dirty(sbh); sbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, - REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); + submit_bh(REQ_OP_WRITE | REQ_SYNC | + (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); wait_on_buffer(sbh); if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index d281f5bcc526..3d3ed3c38f56 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -74,6 +74,21 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { struct buffer_head *bh; + char *inline_link; + + /* + * Create a new inlined symlink is not supported, just provide a + * method to read the leftovers. + */ + if (ext4_has_inline_data(inode)) { + if (!dentry) + return ERR_PTR(-ECHILD); + + inline_link = ext4_read_inline_link(inode); + if (!IS_ERR(inline_link)) + set_delayed_call(callback, kfree_link, inline_link); + return inline_link; + } if (!dentry) { bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 564e28a1aa94..533216e80fa2 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -436,6 +436,21 @@ error: return err; } +/* Remove entry from mbcache when EA inode is getting evicted */ +void ext4_evict_ea_inode(struct inode *inode) +{ + struct mb_cache_entry *oe; + + if (!EA_INODE_CACHE(inode)) + return; + /* Wait for entry to get unused so that we can remove it */ + while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode), + ext4_xattr_inode_get_hash(inode), inode->i_ino))) { + mb_cache_entry_wait_unused(oe); + mb_cache_entry_put(EA_INODE_CACHE(inode), oe); + } +} + static int ext4_xattr_inode_verify_hashes(struct inode *ea_inode, struct ext4_xattr_entry *entry, void *buffer, @@ -976,10 +991,8 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { - struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode); struct ext4_iloc iloc; s64 ref_count; - u32 hash; int ret; inode_lock(ea_inode); @@ -1002,14 +1015,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, set_nlink(ea_inode, 1); ext4_orphan_del(handle, ea_inode); - - if (ea_inode_cache) { - hash = ext4_xattr_inode_get_hash(ea_inode); - mb_cache_entry_create(ea_inode_cache, - GFP_NOFS, hash, - ea_inode->i_ino, - true /* reusable */); - } } } else { WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", @@ -1022,12 +1027,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, clear_nlink(ea_inode); ext4_orphan_add(handle, ea_inode); - - if (ea_inode_cache) { - hash = ext4_xattr_inode_get_hash(ea_inode); - mb_cache_entry_delete(ea_inode_cache, hash, - ea_inode->i_ino); - } } } @@ -1237,6 +1236,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (error) goto out; +retry_ref: lock_buffer(bh); hash = le32_to_cpu(BHDR(bh)->h_hash); ref = le32_to_cpu(BHDR(bh)->h_refcount); @@ -1246,9 +1246,18 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ - if (ea_block_cache) - mb_cache_entry_delete(ea_block_cache, hash, - bh->b_blocknr); + if (ea_block_cache) { + struct mb_cache_entry *oe; + + oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, + bh->b_blocknr); + if (oe) { + unlock_buffer(bh); + mb_cache_entry_wait_unused(oe); + mb_cache_entry_put(ea_block_cache, oe); + goto retry_ref; + } + } get_bh(bh); unlock_buffer(bh); @@ -1858,6 +1867,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, #define header(x) ((struct ext4_xattr_header *)(x)) if (s->base) { + int offset = (char *)s->here - bs->bh->b_data; + BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, sb, bs->bh, EXT4_JTR_NONE); @@ -1873,9 +1884,20 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * ext4_xattr_block_set() to reliably detect modified * block */ - if (ea_block_cache) - mb_cache_entry_delete(ea_block_cache, hash, - bs->bh->b_blocknr); + if (ea_block_cache) { + struct mb_cache_entry *oe; + + oe = mb_cache_entry_delete_or_get(ea_block_cache, + hash, bs->bh->b_blocknr); + if (oe) { + /* + * Xattr block is getting reused. Leave + * it alone. + */ + mb_cache_entry_put(ea_block_cache, oe); + goto clone_block; + } + } ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); @@ -1890,49 +1912,47 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (error) goto cleanup; goto inserted; - } else { - int offset = (char *)s->here - bs->bh->b_data; + } +clone_block: + unlock_buffer(bs->bh); + ea_bdebug(bs->bh, "cloning"); + s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); + s->end = s->base + bs->bh->b_size; - unlock_buffer(bs->bh); - ea_bdebug(bs->bh, "cloning"); - s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); - error = -ENOMEM; - if (s->base == NULL) + /* + * If existing entry points to an xattr inode, we need + * to prevent ext4_xattr_set_entry() from decrementing + * ref count on it because the reference belongs to the + * original block. In this case, make the entry look + * like it has an empty value. + */ + if (!s->not_found && s->here->e_value_inum) { + ea_ino = le32_to_cpu(s->here->e_value_inum); + error = ext4_xattr_inode_iget(inode, ea_ino, + le32_to_cpu(s->here->e_hash), + &tmp_inode); + if (error) goto cleanup; - s->first = ENTRY(header(s->base)+1); - header(s->base)->h_refcount = cpu_to_le32(1); - s->here = ENTRY(s->base + offset); - s->end = s->base + bs->bh->b_size; - - /* - * If existing entry points to an xattr inode, we need - * to prevent ext4_xattr_set_entry() from decrementing - * ref count on it because the reference belongs to the - * original block. In this case, make the entry look - * like it has an empty value. - */ - if (!s->not_found && s->here->e_value_inum) { - ea_ino = le32_to_cpu(s->here->e_value_inum); - error = ext4_xattr_inode_iget(inode, ea_ino, - le32_to_cpu(s->here->e_hash), - &tmp_inode); - if (error) - goto cleanup; - - if (!ext4_test_inode_state(tmp_inode, - EXT4_STATE_LUSTRE_EA_INODE)) { - /* - * Defer quota free call for previous - * inode until success is guaranteed. - */ - old_ea_inode_quota = le32_to_cpu( - s->here->e_value_size); - } - iput(tmp_inode); - s->here->e_value_inum = 0; - s->here->e_value_size = 0; + if (!ext4_test_inode_state(tmp_inode, + EXT4_STATE_LUSTRE_EA_INODE)) { + /* + * Defer quota free call for previous + * inode until success is guaranteed. + */ + old_ea_inode_quota = le32_to_cpu( + s->here->e_value_size); } + iput(tmp_inode); + + s->here->e_value_inum = 0; + s->here->e_value_size = 0; } } else { /* Allocate a buffer where we construct the new block. */ @@ -1999,18 +2019,13 @@ inserted: lock_buffer(new_bh); /* * We have to be careful about races with - * freeing, rehashing or adding references to - * xattr block. Once we hold buffer lock xattr - * block's state is stable so we can check - * whether the block got freed / rehashed or - * not. Since we unhash mbcache entry under - * buffer lock when freeing / rehashing xattr - * block, checking whether entry is still - * hashed is reliable. Same rules hold for - * e_reusable handling. + * adding references to xattr block. Once we + * hold buffer lock xattr block's state is + * stable so we can check the additional + * reference fits. */ - if (hlist_bl_unhashed(&ce->e_hash_list) || - !ce->e_reusable) { + ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; + if (ref > EXT4_XATTR_REFCOUNT_MAX) { /* * Undo everything and check mbcache * again. @@ -2025,9 +2040,8 @@ inserted: new_bh = NULL; goto inserted; } - ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; BHDR(new_bh)->h_refcount = cpu_to_le32(ref); - if (ref >= EXT4_XATTR_REFCOUNT_MAX) + if (ref == EXT4_XATTR_REFCOUNT_MAX) ce->e_reusable = 0; ea_bdebug(new_bh, "reusing; refcount now=%d", ref); @@ -2175,8 +2189,9 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_inode *raw_inode; int error; - if (EXT4_I(inode)->i_extra_isize == 0) + if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return 0; + raw_inode = ext4_raw_inode(&is->iloc); header = IHDR(inode, raw_inode); is->s.base = is->s.first = IFIRST(header); @@ -2204,8 +2219,9 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_search *s = &is->s; int error; - if (EXT4_I(inode)->i_extra_isize == 0) + if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return -ENOSPC; + error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); if (error) return error; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 77efb9a627ad..824faf0b15a8 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -84,7 +84,7 @@ struct ext4_xattr_entry { /* * The minimum size of EA value when you start storing it in an external inode * size of block - size of header - size of 1 entry - 4 null bytes -*/ + */ #define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \ ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4) @@ -95,6 +95,19 @@ struct ext4_xattr_entry { #define EXT4_ZERO_XATTR_VALUE ((void *)-1) +/* + * If we want to add an xattr to the inode, we should make sure that + * i_extra_isize is not 0 and that the inode size is not less than + * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad. + * EXT4_GOOD_OLD_INODE_SIZE extra_isize header entry pad data + * |--------------------------|------------|------|---------|---|-------| + */ +#define EXT4_INODE_HAS_XATTR_SPACE(inode) \ + ((EXT4_I(inode)->i_extra_isize != 0) && \ + (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize + \ + sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <= \ + EXT4_INODE_SIZE((inode)->i_sb))) + struct ext4_xattr_info { const char *name; const void *value; @@ -178,6 +191,7 @@ extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); +extern void ext4_evict_ea_inode(struct inode *inode); extern const struct xattr_handler *ext4_xattr_handlers[]; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6d8b2bf14de0..8259e0fa97e1 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -463,9 +463,7 @@ const struct address_space_operations f2fs_meta_aops = { .dirty_folio = f2fs_dirty_meta_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, -#ifdef CONFIG_MIGRATION - .migratepage = f2fs_migrate_page, -#endif + .migrate_folio = filemap_migrate_folio, }; static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 87bf06e15efe..70e97075e535 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1906,45 +1906,40 @@ bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { struct address_space *mapping = sbi->compress_inode->i_mapping; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = 0; pgoff_t end = MAX_BLKADDR(sbi); if (!mapping->nrpages) return; - pagevec_init(&pvec); + folio_batch_init(&fbatch); do { - unsigned int nr_pages; - int i; + unsigned int nr, i; - nr_pages = pagevec_lookup_range(&pvec, mapping, - &index, end - 1); - if (!nr_pages) + nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); + if (!nr) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - if (page->index > end) - break; + for (i = 0; i < nr; i++) { + struct folio *folio = fbatch.folios[i]; - lock_page(page); - if (page->mapping != mapping) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); continue; } - if (ino != get_page_private_data(page)) { - unlock_page(page); + if (ino != get_page_private_data(&folio->page)) { + folio_unlock(folio); continue; } - generic_error_remove_page(mapping, page); - unlock_page(page); + generic_error_remove_page(mapping, &folio->page); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } while (index < end); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0b159c555069..aa3ccddfa037 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -403,11 +403,11 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } -static unsigned int f2fs_io_flags(struct f2fs_io_info *fio) +static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; unsigned int fua_flag, meta_flag, io_flag; - unsigned int op_flags = 0; + blk_opf_t op_flags = 0; if (fio->op != REQ_OP_WRITE) return 0; @@ -1015,7 +1015,7 @@ out: } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages, unsigned op_flag, + unsigned nr_pages, blk_opf_t op_flag, pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1063,7 +1063,8 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, - block_t blkaddr, int op_flags, bool for_write) + block_t blkaddr, blk_opf_t op_flags, + bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -1197,7 +1198,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - int op_flags, bool for_write) + blk_opf_t op_flags, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -3769,42 +3770,6 @@ out: return blknr; } -#ifdef CONFIG_MIGRATION -#include <linux/migrate.h> - -int f2fs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) -{ - int rc, extra_count = 0; - - BUG_ON(PageWriteback(page)); - - rc = migrate_page_move_mapping(mapping, newpage, - page, extra_count); - if (rc != MIGRATEPAGE_SUCCESS) - return rc; - - /* guarantee to start from no stale private field */ - set_page_private(newpage, 0); - if (PagePrivate(page)) { - set_page_private(newpage, page_private(page)); - SetPagePrivate(newpage); - get_page(newpage); - - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); - } - - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); - - return MIGRATEPAGE_SUCCESS; -} -#endif - #ifdef CONFIG_SWAP static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, unsigned int blkcnt) @@ -4036,15 +4001,13 @@ const struct address_space_operations f2fs_dblock_aops = { .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, .dirty_folio = f2fs_dirty_data_folio, + .migrate_folio = filemap_migrate_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, .direct_IO = noop_direct_IO, .bmap = f2fs_bmap, .swap_activate = f2fs_swap_activate, .swap_deactivate = f2fs_swap_deactivate, -#ifdef CONFIG_MIGRATION - .migratepage = f2fs_migrate_page, -#endif }; void f2fs_clear_page_cache_dirty_tag(struct page *page) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index eddfd35eadb6..3c7cdb70fe2e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1188,8 +1188,8 @@ struct f2fs_io_info { nid_t ino; /* inode number */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ enum temp_type temp; /* contains HOT/WARM/COLD */ - int op; /* contains REQ_OP_ */ - int op_flags; /* req_flag_bits */ + enum req_op op; /* contains REQ_OP_ */ + blk_opf_t op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ @@ -3778,7 +3778,7 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - int op_flags, bool for_write); + blk_opf_t op_flags, bool for_write); struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write); @@ -3801,10 +3801,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, void f2fs_write_failed(struct inode *inode, loff_t to); void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length); bool f2fs_release_folio(struct folio *folio, gfp_t wait); -#ifdef CONFIG_MIGRATION -int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode); -#endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); void f2fs_clear_page_cache_dirty_tag(struct page *page); int f2fs_init_post_read_processing(void); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 24f5b02c78e7..ce4905a073b3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -861,10 +861,8 @@ static void __setattr_copy(struct user_namespace *mnt_userns, { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -917,17 +915,15 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (err) return err; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { err = f2fs_dquot_initialize(inode); if (err) return err; } - if ((attr->ia_valid & ATTR_UID && - !uid_eq(attr->ia_uid, inode->i_uid)) || - (attr->ia_valid & ATTR_GID && - !gid_eq(attr->ia_gid, inode->i_gid))) { + if (i_uid_needs_update(mnt_userns, attr, inode) || + i_gid_needs_update(mnt_userns, attr, inode)) { f2fs_lock_op(F2FS_I_SB(inode)); - err = dquot_transfer(inode, attr); + err = dquot_transfer(mnt_userns, inode, attr); if (err) { set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); @@ -938,10 +934,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * update uid/gid under lock_op(), so that dquot and inode can * be updated atomically. */ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); f2fs_unlock_op(F2FS_I_SB(inode)); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d1928952d371..e06a0c478b39 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1331,7 +1331,7 @@ fail: * 0: f2fs_put_page(page, 0) * LOCKED_PAGE or error: f2fs_put_page(page, 1) */ -static int read_node_page(struct page *page, int op_flags) +static int read_node_page(struct page *page, blk_opf_t op_flags) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; @@ -2165,9 +2165,7 @@ const struct address_space_operations f2fs_node_aops = { .dirty_folio = f2fs_dirty_node_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, -#ifdef CONFIG_MIGRATION - .migratepage = f2fs_migrate_page, -#endif + .migrate_folio = filemap_migrate_folio, }; static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 3cb7f8a43b4d..dcd0a1e35095 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -255,18 +255,18 @@ static int recover_quota_data(struct inode *inode, struct page *page) memset(&attr, 0, sizeof(attr)); - attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid); - attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid); + attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid)); + attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid)); - if (!uid_eq(attr.ia_uid, inode->i_uid)) + if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode))) attr.ia_valid |= ATTR_UID; - if (!gid_eq(attr.ia_gid, inode->i_gid)) + if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode))) attr.ia_valid |= ATTR_GID; if (!attr.ia_valid) return 0; - err = dquot_transfer(inode, &attr); + err = dquot_transfer(&init_user_ns, inode, &attr); if (err) set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); return err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 72ce00ebcede..0de21f82d7bc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1089,7 +1089,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); - int flag = dpolicy->sync ? REQ_SYNC : 0; + blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0; block_t lstart, start, len, total_len; int err = 0; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1a0578a3f4e4..2451623c05a7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4607,7 +4607,7 @@ static int __init init_f2fs_fs(void) err = f2fs_init_sysfs(); if (err) goto free_garbage_collection_cache; - err = register_shrinker(&f2fs_shrinker_info); + err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker"); if (err) goto free_sysfs; err = register_filesystem(&f2fs_fs_type); diff --git a/fs/fat/file.c b/fs/fat/file.c index 3dae3ed60f3a..3e4eb3467cb4 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -90,7 +90,8 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) * out the RO attribute for checking by the security * module, just because it maps to a file mode. */ - err = security_inode_setattr(file->f_path.dentry, &ia); + err = security_inode_setattr(file_mnt_user_ns(file), + file->f_path.dentry, &ia); if (err) goto out_unlock_inode; @@ -516,9 +517,11 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } if (((attr->ia_valid & ATTR_UID) && - (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) || + (!uid_eq(from_vfsuid(mnt_userns, i_user_ns(inode), attr->ia_vfsuid), + sbi->options.fs_uid))) || ((attr->ia_valid & ATTR_GID) && - (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) || + (!gid_eq(from_vfsgid(mnt_userns, i_user_ns(inode), attr->ia_vfsgid), + sbi->options.fs_gid))) || ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~FAT_VALID_MODE))) error = -EPERM; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index c573314806cf..21620054e1c4 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -889,22 +889,57 @@ out: return err; } -static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, - struct dentry *old_dentry, struct inode *new_dir, - struct dentry *new_dentry, unsigned int flags) +static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh, + struct msdos_dir_entry **de) +{ + if (S_ISDIR(inode->i_mode)) { + if (fat_get_dotdot_entry(inode, bh, de)) + return -EIO; + } + return 0; +} + +static int vfat_sync_ipos(struct inode *dir, struct inode *inode) +{ + if (IS_DIRSYNC(dir)) + return fat_sync_inode(inode); + mark_inode_dirty(inode); + return 0; +} + +static int vfat_update_dotdot_de(struct inode *dir, struct inode *inode, + struct buffer_head *dotdot_bh, + struct msdos_dir_entry *dotdot_de) +{ + fat_set_start(dotdot_de, MSDOS_I(dir)->i_logstart); + mark_buffer_dirty_inode(dotdot_bh, inode); + if (IS_DIRSYNC(dir)) + return sync_dirty_buffer(dotdot_bh); + return 0; +} + +static void vfat_update_dir_metadata(struct inode *dir, struct timespec64 *ts) +{ + inode_inc_iversion(dir); + fat_truncate_time(dir, ts, S_CTIME | S_MTIME); + if (IS_DIRSYNC(dir)) + (void)fat_sync_inode(dir); + else + mark_inode_dirty(dir); +} + +static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) { struct buffer_head *dotdot_bh; - struct msdos_dir_entry *dotdot_de; + struct msdos_dir_entry *dotdot_de = NULL; struct inode *old_inode, *new_inode; struct fat_slot_info old_sinfo, sinfo; struct timespec64 ts; loff_t new_i_pos; - int err, is_dir, update_dotdot, corrupt = 0; + int err, is_dir, corrupt = 0; struct super_block *sb = old_dir->i_sb; - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; old_inode = d_inode(old_dentry); new_inode = d_inode(new_dentry); @@ -913,15 +948,13 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, if (err) goto out; - is_dir = S_ISDIR(old_inode->i_mode); - update_dotdot = (is_dir && old_dir != new_dir); - if (update_dotdot) { - if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) { - err = -EIO; + if (old_dir != new_dir) { + err = vfat_get_dotdot_de(old_inode, &dotdot_bh, &dotdot_de); + if (err) goto out; - } } + is_dir = S_ISDIR(old_inode->i_mode); ts = current_time(old_dir); if (new_inode) { if (is_dir) { @@ -942,21 +975,15 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, fat_detach(old_inode); fat_attach(old_inode, new_i_pos); - if (IS_DIRSYNC(new_dir)) { - err = fat_sync_inode(old_inode); - if (err) - goto error_inode; - } else - mark_inode_dirty(old_inode); + err = vfat_sync_ipos(new_dir, old_inode); + if (err) + goto error_inode; - if (update_dotdot) { - fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart); - mark_buffer_dirty_inode(dotdot_bh, old_inode); - if (IS_DIRSYNC(new_dir)) { - err = sync_dirty_buffer(dotdot_bh); - if (err) - goto error_dotdot; - } + if (dotdot_de) { + err = vfat_update_dotdot_de(new_dir, old_inode, dotdot_bh, + dotdot_de); + if (err) + goto error_dotdot; drop_nlink(old_dir); if (!new_inode) inc_nlink(new_dir); @@ -966,12 +993,7 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, old_sinfo.bh = NULL; if (err) goto error_dotdot; - inode_inc_iversion(old_dir); - fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME); - if (IS_DIRSYNC(old_dir)) - (void)fat_sync_inode(old_dir); - else - mark_inode_dirty(old_dir); + vfat_update_dir_metadata(old_dir, &ts); if (new_inode) { drop_nlink(new_inode); @@ -991,10 +1013,9 @@ error_dotdot: /* data cluster is shared, serious corruption */ corrupt = 1; - if (update_dotdot) { - fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart); - mark_buffer_dirty_inode(dotdot_bh, old_inode); - corrupt |= sync_dirty_buffer(dotdot_bh); + if (dotdot_de) { + corrupt |= vfat_update_dotdot_de(old_dir, old_inode, dotdot_bh, + dotdot_de); } error_inode: fat_detach(old_inode); @@ -1021,13 +1042,145 @@ error_inode: goto out; } +static void vfat_exchange_ipos(struct inode *old_inode, struct inode *new_inode, + loff_t old_i_pos, loff_t new_i_pos) +{ + fat_detach(old_inode); + fat_detach(new_inode); + fat_attach(old_inode, new_i_pos); + fat_attach(new_inode, old_i_pos); +} + +static void vfat_move_nlink(struct inode *src, struct inode *dst) +{ + drop_nlink(src); + inc_nlink(dst); +} + +static int vfat_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct buffer_head *old_dotdot_bh = NULL, *new_dotdot_bh = NULL; + struct msdos_dir_entry *old_dotdot_de = NULL, *new_dotdot_de = NULL; + struct inode *old_inode, *new_inode; + struct timespec64 ts = current_time(old_dir); + loff_t old_i_pos, new_i_pos; + int err, corrupt = 0; + struct super_block *sb = old_dir->i_sb; + + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); + + /* Acquire super block lock for the operation to be atomic */ + mutex_lock(&MSDOS_SB(sb)->s_lock); + + /* if directories are not the same, get ".." info to update */ + if (old_dir != new_dir) { + err = vfat_get_dotdot_de(old_inode, &old_dotdot_bh, + &old_dotdot_de); + if (err) + goto out; + + err = vfat_get_dotdot_de(new_inode, &new_dotdot_bh, + &new_dotdot_de); + if (err) + goto out; + } + + old_i_pos = MSDOS_I(old_inode)->i_pos; + new_i_pos = MSDOS_I(new_inode)->i_pos; + + vfat_exchange_ipos(old_inode, new_inode, old_i_pos, new_i_pos); + + err = vfat_sync_ipos(old_dir, new_inode); + if (err) + goto error_exchange; + err = vfat_sync_ipos(new_dir, old_inode); + if (err) + goto error_exchange; + + /* update ".." directory entry info */ + if (old_dotdot_de) { + err = vfat_update_dotdot_de(new_dir, old_inode, old_dotdot_bh, + old_dotdot_de); + if (err) + goto error_old_dotdot; + } + if (new_dotdot_de) { + err = vfat_update_dotdot_de(old_dir, new_inode, new_dotdot_bh, + new_dotdot_de); + if (err) + goto error_new_dotdot; + } + + /* if cross directory and only one is a directory, adjust nlink */ + if (!old_dotdot_de != !new_dotdot_de) { + if (old_dotdot_de) + vfat_move_nlink(old_dir, new_dir); + else + vfat_move_nlink(new_dir, old_dir); + } + + vfat_update_dir_metadata(old_dir, &ts); + /* if directories are not the same, update new_dir as well */ + if (old_dir != new_dir) + vfat_update_dir_metadata(new_dir, &ts); + +out: + brelse(old_dotdot_bh); + brelse(new_dotdot_bh); + mutex_unlock(&MSDOS_SB(sb)->s_lock); + + return err; + +error_new_dotdot: + if (new_dotdot_de) { + corrupt |= vfat_update_dotdot_de(new_dir, new_inode, + new_dotdot_bh, new_dotdot_de); + } + +error_old_dotdot: + if (old_dotdot_de) { + corrupt |= vfat_update_dotdot_de(old_dir, old_inode, + old_dotdot_bh, old_dotdot_de); + } + +error_exchange: + vfat_exchange_ipos(old_inode, new_inode, new_i_pos, old_i_pos); + corrupt |= vfat_sync_ipos(new_dir, new_inode); + corrupt |= vfat_sync_ipos(old_dir, old_inode); + + if (corrupt < 0) { + fat_fs_error(new_dir->i_sb, + "%s: Filesystem corrupted (i_pos %lld, %lld)", + __func__, old_i_pos, new_i_pos); + } + goto out; +} + +static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) +{ + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) { + return vfat_rename_exchange(old_dir, old_dentry, + new_dir, new_dentry); + } + + /* VFS already handled RENAME_NOREPLACE, handle it as a normal rename */ + return vfat_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static const struct inode_operations vfat_dir_inode_operations = { .create = vfat_create, .lookup = vfat_lookup, .unlink = vfat_unlink, .mkdir = vfat_mkdir, .rmdir = vfat_rmdir, - .rename = vfat_rename, + .rename = vfat_rename2, .setattr = fat_setattr, .getattr = fat_getattr, .update_time = fat_update_time, diff --git a/fs/fcntl.c b/fs/fcntl.c index 34a3faa4886d..146c9ab0cd4b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -78,6 +78,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) } spin_lock(&filp->f_lock); filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); + filp->f_iocb_flags = iocb_flags(filp); spin_unlock(&filp->f_lock); out: diff --git a/fs/file_table.c b/fs/file_table.c index 5424e3a8df5f..99c6796c9f28 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -45,7 +45,7 @@ static struct percpu_counter nr_files __cacheline_aligned_in_smp; static void file_free_rcu(struct rcu_head *head) { - struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + struct file *f = container_of(head, struct file, f_rcuhead); put_cred(f->f_cred); kmem_cache_free(filp_cachep, f); @@ -56,7 +56,7 @@ static inline void file_free(struct file *f) security_file_free(f); if (!(f->f_mode & FMODE_NOACCOUNT)) percpu_counter_dec(&nr_files); - call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); + call_rcu(&f->f_rcuhead, file_free_rcu); } /* @@ -142,7 +142,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred) f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { - file_free_rcu(&f->f_u.fu_rcuhead); + file_free_rcu(&f->f_rcuhead); return ERR_PTR(error); } @@ -235,12 +235,15 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); file->f_sb_err = file_sample_sb_err(file); + if (fop->llseek) + file->f_mode |= FMODE_LSEEK; if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; if ((file->f_mode & FMODE_WRITE) && likely(fop->write || fop->write_iter)) file->f_mode |= FMODE_CAN_WRITE; + file->f_iocb_flags = iocb_flags(file); file->f_mode |= FMODE_OPENED; file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) @@ -341,13 +344,13 @@ static void delayed_fput(struct work_struct *unused) struct llist_node *node = llist_del_all(&delayed_fput_list); struct file *f, *t; - llist_for_each_entry_safe(f, t, node, f_u.fu_llist) + llist_for_each_entry_safe(f, t, node, f_llist) __fput(f); } static void ____fput(struct callback_head *work) { - __fput(container_of(work, struct file, f_u.fu_rcuhead)); + __fput(container_of(work, struct file, f_rcuhead)); } /* @@ -374,8 +377,8 @@ void fput(struct file *file) struct task_struct *task = current; if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { - init_task_work(&file->f_u.fu_rcuhead, ____fput); - if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)) + init_task_work(&file->f_rcuhead, ____fput); + if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME)) return; /* * After this task has run exit_task_work(), @@ -384,7 +387,7 @@ void fput(struct file *file) */ } - if (llist_add(&file->f_u.fu_llist, &delayed_fput_list)) + if (llist_add(&file->f_llist, &delayed_fput_list)) schedule_delayed_work(&delayed_fput_work, 1); } } diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index c2ef9f0debbd..9b49ec36e667 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -13,16 +13,6 @@ #include "vxfs_extern.h" #include "vxfs_inode.h" - -static int vxfs_immed_read_folio(struct file *, struct folio *); - -/* - * Address space operations for immed files and directories. - */ -const struct address_space_operations vxfs_immed_aops = { - .read_folio = vxfs_immed_read_folio, -}; - /** * vxfs_immed_read_folio - read part of an immed inode into pagecache * @file: file context (unused) @@ -30,7 +20,7 @@ const struct address_space_operations vxfs_immed_aops = { * * Description: * vxfs_immed_read_folio reads a part of the immed area of the - * file that hosts @pp into the pagecache. + * file that hosts @folio into the pagecache. * * Returns: * Zero on success, else a negative error code. @@ -38,21 +28,26 @@ const struct address_space_operations vxfs_immed_aops = { * Locking status: * @folio is locked and will be unlocked. */ -static int -vxfs_immed_read_folio(struct file *fp, struct folio *folio) +static int vxfs_immed_read_folio(struct file *fp, struct folio *folio) { - struct page *pp = &folio->page; - struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); - u_int64_t offset = (u_int64_t)pp->index << PAGE_SHIFT; - caddr_t kaddr; + struct vxfs_inode_info *vip = VXFS_INO(folio->mapping->host); + void *src = vip->vii_immed.vi_immed + folio_pos(folio); + unsigned long i; - kaddr = kmap(pp); - memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_SIZE); - kunmap(pp); - - flush_dcache_page(pp); - SetPageUptodate(pp); - unlock_page(pp); + for (i = 0; i < folio_nr_pages(folio); i++) { + memcpy_to_page(folio_page(folio, i), 0, src, PAGE_SIZE); + src += PAGE_SIZE; + } + + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; } + +/* + * Address space operations for immed files and directories. + */ +const struct address_space_operations vxfs_immed_aops = { + .read_folio = vxfs_immed_read_folio, +}; diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c index 0e633d2bfc7d..c99282df7761 100644 --- a/fs/freevxfs/vxfs_subr.c +++ b/fs/freevxfs/vxfs_subr.c @@ -51,15 +51,9 @@ vxfs_get_page(struct address_space *mapping, u_long n) kmap(pp); /** if (!PageChecked(pp)) **/ /** vxfs_check_page(pp); **/ - if (PageError(pp)) - goto fail; } return (pp); - -fail: - vxfs_put_page(pp); - return ERR_PTR(-EIO); } /** diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 9d3cf0111709..74920826d8f6 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -372,17 +372,22 @@ nomem: return NULL; } +static inline bool fscache_cookie_is_dropped(struct fscache_cookie *cookie) +{ + return READ_ONCE(cookie->state) == FSCACHE_COOKIE_STATE_DROPPED; +} + static void fscache_wait_on_collision(struct fscache_cookie *candidate, struct fscache_cookie *wait_for) { enum fscache_cookie_state *statep = &wait_for->state; - wait_var_event_timeout(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED, + wait_var_event_timeout(statep, fscache_cookie_is_dropped(wait_for), 20 * HZ); - if (READ_ONCE(*statep) != FSCACHE_COOKIE_STATE_DROPPED) { + if (!fscache_cookie_is_dropped(wait_for)) { pr_notice("Potential collision c=%08x old: c=%08x", candidate->debug_id, wait_for->debug_id); - wait_var_event(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED); + wait_var_event(statep, fscache_cookie_is_dropped(wait_for)); } } @@ -517,7 +522,14 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie) } fscache_see_cookie(cookie, fscache_cookie_see_active); - fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE); + spin_lock(&cookie->lock); + if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags)) + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_INVALIDATING); + else + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE); + spin_unlock(&cookie->lock); + wake_up_cookie_state(cookie); trace = fscache_access_lookup_cookie_end; out: @@ -752,6 +764,9 @@ again_locked: spin_lock(&cookie->lock); } + if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags)) + fscache_end_cookie_access(cookie, fscache_access_invalidate_cookie_end); + switch (state) { case FSCACHE_COOKIE_STATE_RELINQUISHING: fscache_see_cookie(cookie, fscache_cookie_see_relinquish); @@ -1048,6 +1063,9 @@ void __fscache_invalidate(struct fscache_cookie *cookie, return; case FSCACHE_COOKIE_STATE_LOOKING_UP: + __fscache_begin_cookie_access(cookie, fscache_access_invalidate_cookie); + set_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags); + fallthrough; case FSCACHE_COOKIE_STATE_CREATING: spin_unlock(&cookie->lock); _leave(" [look %x]", cookie->inval_counter); diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c index f2aa7dbad766..a058e0136bfe 100644 --- a/fs/fscache/volume.c +++ b/fs/fscache/volume.c @@ -143,7 +143,7 @@ static void fscache_wait_on_volume_collision(struct fscache_volume *candidate, { wait_var_event_timeout(&candidate->flags, !fscache_is_acquire_pending(candidate), 20 * HZ); - if (!fscache_is_acquire_pending(candidate)) { + if (fscache_is_acquire_pending(candidate)) { pr_notice("Potential volume collision new=%08x old=%08x", candidate->debug_id, collidee_debug_id); fscache_stat(&fscache_n_volumes_collision); @@ -182,7 +182,7 @@ static bool fscache_hash_volume(struct fscache_volume *candidate) hlist_bl_add_head(&candidate->hash_link, h); hlist_bl_unlock(h); - if (test_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &candidate->flags)) + if (fscache_is_acquire_pending(candidate)) fscache_wait_on_volume_collision(candidate, collidee_debug_id); return true; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 7cede9a3bc96..247ef4f76761 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -258,7 +258,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) struct dentry *parent; char name[32]; - if (!fuse_control_sb) + if (!fuse_control_sb || fc->no_control) return 0; parent = fuse_control_sb->s_root; @@ -296,7 +296,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) { int i; - if (!fuse_control_sb) + if (!fuse_control_sb || fc->no_control) return; for (i = fc->ctl_ndents - 1; i >= 0; i--) { diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 10eb50cbf398..e23e802a8013 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -138,9 +138,9 @@ static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd) WARN_ON(fcd->nr_free_ranges <= 0); fcd->nr_free_ranges--; } + __kick_dmap_free_worker(fcd, 0); spin_unlock(&fcd->lock); - kick_dmap_free_worker(fcd, 0); return dmap; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 74303d6e987b..b585b04e815e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -11,6 +11,7 @@ #include <linux/pagemap.h> #include <linux/file.h> #include <linux/fs_context.h> +#include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> @@ -21,6 +22,11 @@ #include <linux/types.h> #include <linux/kernel.h> +static bool __read_mostly allow_sys_admin_access; +module_param(allow_sys_admin_access, bool, 0644); +MODULE_PARM_DESC(allow_sys_admin_access, + "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check"); + static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); @@ -537,6 +543,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_file *ff; void *security_ctx = NULL; u32 security_ctxlen; + bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); @@ -561,7 +568,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.mode = mode; inarg.umask = current_umask(); - if (fm->fc->handle_killpriv_v2 && (flags & O_TRUNC) && + if (fm->fc->handle_killpriv_v2 && trunc && !(flags & O_EXCL) && !capable(CAP_FSETID)) { inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } @@ -623,6 +630,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, } else { file->private_data = ff; fuse_finish_open(inode, file); + if (fm->fc->atomic_o_trunc && trunc) + truncate_pagecache(inode, 0); + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); } return err; @@ -1224,6 +1235,9 @@ int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; + if (allow_sys_admin_access && capable(CAP_SYS_ADMIN)) + return 1; + if (fc->allow_other) return current_in_userns(fc->user_ns); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 05caa2b9272e..7154b9555f39 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -210,13 +210,9 @@ void fuse_finish_open(struct inode *inode, struct file *file) fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); spin_unlock(&fi->lock); - truncate_pagecache(inode, 0); file_update_time(file); fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) { - invalidate_inode_pages2(inode->i_mapping); } - if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); } @@ -239,30 +235,38 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (err) return err; - if (is_wb_truncate || dax_truncate) { + if (is_wb_truncate || dax_truncate) inode_lock(inode); - fuse_set_nowrite(inode); - } if (dax_truncate) { filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, 0); if (err) - goto out; + goto out_inode_unlock; } + if (is_wb_truncate || dax_truncate) + fuse_set_nowrite(inode); + err = fuse_do_open(fm, get_node_id(inode), file, isdir); if (!err) fuse_finish_open(inode, file); -out: + if (is_wb_truncate || dax_truncate) + fuse_release_nowrite(inode); + if (!err) { + struct fuse_file *ff = file->private_data; + + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) + truncate_pagecache(inode, 0); + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + } if (dax_truncate) filemap_invalidate_unlock(inode->i_mapping); - - if (is_wb_truncate | dax_truncate) { - fuse_release_nowrite(inode); +out_inode_unlock: + if (is_wb_truncate || dax_truncate) inode_unlock(inode); - } return err; } @@ -338,6 +342,15 @@ static int fuse_open(struct inode *inode, struct file *file) static int fuse_release(struct inode *inode, struct file *file) { + struct fuse_conn *fc = get_fuse_conn(inode); + + /* + * Dirty pages might remain despite write_inode_now() call from + * fuse_flush() due to writes racing with the close. + */ + if (fc->writeback_cache) + write_inode_now(inode, 1); + fuse_release_common(file, false); /* return value is ignored by VFS */ @@ -1042,7 +1055,7 @@ static unsigned int fuse_write_flags(struct kiocb *iocb) { unsigned int flags = iocb->ki_filp->f_flags; - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) flags |= O_DSYNC; if (iocb->ki_flags & IOCB_SYNC) flags |= O_SYNC; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8c0665c5dff8..6b3beda16c1b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -180,6 +180,12 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_uid = make_kuid(fc->user_ns, attr->uid); inode->i_gid = make_kgid(fc->user_ns, attr->gid); inode->i_blocks = attr->blocks; + + /* Sanitize nsecs */ + attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1); + attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1); + attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1); + inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; /* mtime from server may be stale due to local buffered write */ @@ -476,8 +482,14 @@ static void fuse_umount_begin(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); - if (!fc->no_force_umount) - fuse_abort_conn(fc); + if (fc->no_force_umount) + return; + + fuse_abort_conn(fc); + + // Only retire block-device-based superblocks. + if (sb->s_bdev != NULL) + retire_super(sb); } static void fuse_send_destroy(struct fuse_mount *fm) diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 33cde4bbccdc..61d8afcb10a3 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -9,6 +9,17 @@ #include <linux/compat.h> #include <linux/fileattr.h> +static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args) +{ + ssize_t ret = fuse_simple_request(fm, args); + + /* Translate ENOSYS, which shouldn't be returned from fs */ + if (ret == -ENOSYS) + ret = -ENOTTY; + + return ret; +} + /* * CUSE servers compiled on 32bit broke on 64bit kernels because the * ABI was defined to be 'struct iovec' which is different on 32bit @@ -259,7 +270,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, ap.args.out_pages = true; ap.args.out_argvar = true; - transferred = fuse_simple_request(fm, &ap.args); + transferred = fuse_send_ioctl(fm, &ap.args); err = transferred; if (transferred < 0) goto out; @@ -393,7 +404,7 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, args.out_args[1].size = inarg.out_size; args.out_args[1].value = ptr; - err = fuse_simple_request(fm, &args); + err = fuse_send_ioctl(fm, &args); if (!err) { if (outarg.result < 0) err = outarg.result; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 8db53fa67359..4d8d4f16c727 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -741,8 +741,7 @@ out: } /* Free virtqueues (device must already be reset) */ -static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, - struct virtio_fs *fs) +static void virtio_fs_cleanup_vqs(struct virtio_device *vdev) { vdev->config->del_vqs(vdev); } @@ -757,7 +756,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, { struct virtio_fs *fs = dax_get_private(dax_dev); phys_addr_t offset = PFN_PHYS(pgoff); - size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff; + size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff; if (kaddr) *kaddr = fs->window_kaddr + offset; @@ -895,7 +894,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) out_vqs: virtio_reset_device(vdev); - virtio_fs_cleanup_vqs(vdev, fs); + virtio_fs_cleanup_vqs(vdev); kfree(fs->vqs); out: @@ -927,7 +926,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); virtio_reset_device(vdev); - virtio_fs_cleanup_vqs(vdev, fs); + virtio_fs_cleanup_vqs(vdev); vdev->priv = NULL; /* Put device reference on virtio_fs object */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 106e90a36583..57ff883d432c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -774,7 +774,7 @@ static const struct address_space_operations gfs2_aops = { .invalidate_folio = iomap_invalidate_folio, .bmap = gfs2_bmap, .direct_IO = noop_direct_IO, - .migratepage = iomap_migrate_page, + .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index b6697333bb2b..3bdb2c668a71 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -310,9 +310,8 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end) if (trylock_buffer(rabh)) { if (!buffer_uptodate(rabh)) { rabh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, - REQ_RAHEAD | REQ_META | REQ_PRIO, - rabh); + submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META | + REQ_PRIO, rabh); continue; } unlock_buffer(rabh); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 42b7dfffb5e7..54a6d17b8c25 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1508,9 +1508,8 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, continue; } bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, - REQ_RAHEAD | REQ_META | REQ_PRIO, - bh); + submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META | + REQ_PRIO, bh); continue; } brelse(bh); @@ -2017,7 +2016,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, l_blocks++; } - gfs2_rlist_alloc(&rlist); + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE); for (x = 0; x < rlist.rl_rgrps; x++) { struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 2cceb193dcd8..d8f1239344c1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1066,8 +1066,7 @@ out_unlock: gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); - if (statfs_gh) - kfree(statfs_gh); + kfree(statfs_gh); from->count = orig_count - written; return written ? written : ret; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c992d53013d3..41b6c89e4bf7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -405,10 +405,13 @@ static void do_error(struct gfs2_glock *gl, const int ret) /** * demote_incompat_holders - demote incompatible demoteable holders * @gl: the glock we want to promote - * @new_gh: the new holder to be promoted + * @current_gh: the newly promoted holder + * + * We're passing the newly promoted holder in @current_gh, but actually, any of + * the strong holders would do. */ static void demote_incompat_holders(struct gfs2_glock *gl, - struct gfs2_holder *new_gh) + struct gfs2_holder *current_gh) { struct gfs2_holder *gh, *tmp; @@ -424,8 +427,10 @@ static void demote_incompat_holders(struct gfs2_glock *gl, */ if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) return; + if (gh == current_gh) + continue; if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) && - !may_grant(gl, new_gh, gh)) { + !may_grant(gl, current_gh, gh)) { /* * We should not recurse into do_promote because * __gfs2_glock_dq only calls handle_callback, @@ -478,8 +483,7 @@ find_first_strong_holder(struct gfs2_glock *gl) * gfs2_instantiate - Call the glops instantiate function * @gh: The glock holder * - * Returns: 0 if instantiate was successful, 2 if type specific operation is - * underway, or error. + * Returns: 0 if instantiate was successful, or error. */ int gfs2_instantiate(struct gfs2_holder *gh) { @@ -489,7 +493,7 @@ int gfs2_instantiate(struct gfs2_holder *gh) again: if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags)) - return 0; + goto done; /* * Since we unlock the lockref lock, we set a flag to indicate @@ -508,78 +512,55 @@ again: goto again; } - ret = glops->go_instantiate(gh); + ret = glops->go_instantiate(gl); if (!ret) clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); clear_and_wake_up_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); - return ret; + if (ret) + return ret; + +done: + if (glops->go_held) + return glops->go_held(gh); + return 0; } /** * do_promote - promote as many requests as possible on the current queue * @gl: The glock * - * Returns: 1 if there is a blocked holder at the head of the list, or 2 - * if a type specific operation is underway. + * Returns: 1 if there is a blocked holder at the head of the list */ static int do_promote(struct gfs2_glock *gl) -__releases(&gl->gl_lockref.lock) -__acquires(&gl->gl_lockref.lock) { - struct gfs2_holder *gh, *tmp, *first_gh; + struct gfs2_holder *gh, *current_gh; bool incompat_holders_demoted = false; - bool lock_released; - int ret; -restart: - first_gh = find_first_strong_holder(gl); - list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { - lock_released = false; + current_gh = find_first_strong_holder(gl); + list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; - if (!may_grant(gl, first_gh, gh)) { + if (!may_grant(gl, current_gh, gh)) { /* - * If we get here, it means we may not grant this holder for - * some reason. If this holder is the head of the list, it - * means we have a blocked holder at the head, so return 1. + * If we get here, it means we may not grant this + * holder for some reason. If this holder is at the + * head of the list, it means we have a blocked holder + * at the head, so return 1. */ if (list_is_first(&gh->gh_list, &gl->gl_holders)) return 1; do_error(gl, 0); break; } - if (!incompat_holders_demoted) { - demote_incompat_holders(gl, first_gh); - incompat_holders_demoted = true; - first_gh = gh; - } - if (test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags) && - !(gh->gh_flags & GL_SKIP) && gl->gl_ops->go_instantiate) { - lock_released = true; - spin_unlock(&gl->gl_lockref.lock); - ret = gfs2_instantiate(gh); - spin_lock(&gl->gl_lockref.lock); - if (ret) { - if (ret == 1) - return 2; - gh->gh_error = ret; - list_del_init(&gh->gh_list); - trace_gfs2_glock_queue(gh, 0); - gfs2_holder_wake(gh); - goto restart; - } - } set_bit(HIF_HOLDER, &gh->gh_iflags); trace_gfs2_promote(gh); gfs2_holder_wake(gh); - /* - * If we released the gl_lockref.lock the holders list may have - * changed. For that reason, we start again at the start of - * the holders queue. - */ - if (lock_released) - goto restart; + if (!incompat_holders_demoted) { + current_gh = gh; + demote_incompat_holders(gl, current_gh); + incompat_holders_demoted = true; + } } return 0; } @@ -657,7 +638,6 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_holder *gh; unsigned state = ret & LM_OUT_ST_MASK; - int rv; spin_lock(&gl->gl_lockref.lock); trace_gfs2_glock_state_change(gl, state); @@ -715,6 +695,8 @@ retry: gfs2_demote_wake(gl); if (state != LM_ST_UNLOCKED) { if (glops->go_xmote_bh) { + int rv; + spin_unlock(&gl->gl_lockref.lock); rv = glops->go_xmote_bh(gl); spin_lock(&gl->gl_lockref.lock); @@ -723,13 +705,10 @@ retry: goto out; } } - rv = do_promote(gl); - if (rv == 2) - goto out_locked; + do_promote(gl); } out: clear_bit(GLF_LOCK, &gl->gl_flags); -out_locked: spin_unlock(&gl->gl_lockref.lock); } @@ -886,7 +865,6 @@ __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { struct gfs2_holder *gh = NULL; - int ret; if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) return; @@ -905,18 +883,14 @@ __acquires(&gl->gl_lockref.lock) } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); - ret = do_promote(gl); - if (ret == 0) + if (do_promote(gl) == 0) goto out_unlock; - if (ret == 2) - goto out; gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ } do_xmote(gl, gh, gl->gl_target); -out: return; out_sched: @@ -1314,6 +1288,25 @@ static void gfs2_glock_update_hold_time(struct gfs2_glock *gl, } /** + * gfs2_glock_holder_ready - holder is ready and its error code can be collected + * @gh: the glock holder + * + * Called when a glock holder no longer needs to be waited for because it is + * now either held (HIF_HOLDER set; gh_error == 0), or acquiring the lock has + * failed (gh_error != 0). + */ + +int gfs2_glock_holder_ready(struct gfs2_holder *gh) +{ + if (gh->gh_error || (gh->gh_flags & GL_SKIP)) + return gh->gh_error; + gh->gh_error = gfs2_instantiate(gh); + if (gh->gh_error) + gfs2_glock_dq(gh); + return gh->gh_error; +} + +/** * gfs2_glock_wait - wait on a glock acquisition * @gh: the glock holder * @@ -1327,7 +1320,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh) might_sleep(); wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); gfs2_glock_update_hold_time(gh->gh_gl, start_time); - return gh->gh_error; + return gfs2_glock_holder_ready(gh); } static int glocks_pending(unsigned int num_gh, struct gfs2_holder *ghs) @@ -1355,7 +1348,6 @@ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs) struct gfs2_sbd *sdp = ghs[0].gh_gl->gl_name.ln_sbd; int i, ret = 0, timeout = 0; unsigned long start_time = jiffies; - bool keep_waiting; might_sleep(); /* @@ -1365,53 +1357,33 @@ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs) for (i = 0; i < num_gh; i++) timeout += ghs[i].gh_gl->gl_hold_time << 1; -wait_for_dlm: if (!wait_event_timeout(sdp->sd_async_glock_wait, - !glocks_pending(num_gh, ghs), timeout)) + !glocks_pending(num_gh, ghs), timeout)) { ret = -ESTALE; /* request timed out. */ + goto out; + } - /* - * If dlm granted all our requests, we need to adjust the glock - * minimum hold time values according to how long we waited. - * - * If our request timed out, we need to repeatedly release any held - * glocks we acquired thus far to allow dlm to acquire the remaining - * glocks without deadlocking. We cannot currently cancel outstanding - * glock acquisitions. - * - * The HIF_WAIT bit tells us which requests still need a response from - * dlm. - * - * If dlm sent us any errors, we return the first error we find. - */ - keep_waiting = false; for (i = 0; i < num_gh; i++) { - /* Skip holders we have already dequeued below. */ - if (!gfs2_holder_queued(&ghs[i])) - continue; - /* Skip holders with a pending DLM response. */ - if (test_bit(HIF_WAIT, &ghs[i].gh_iflags)) { - keep_waiting = true; - continue; - } + struct gfs2_holder *gh = &ghs[i]; + int ret2; - if (test_bit(HIF_HOLDER, &ghs[i].gh_iflags)) { - if (ret == -ESTALE) - gfs2_glock_dq(&ghs[i]); - else - gfs2_glock_update_hold_time(ghs[i].gh_gl, - start_time); + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) { + gfs2_glock_update_hold_time(gh->gh_gl, + start_time); } + ret2 = gfs2_glock_holder_ready(gh); if (!ret) - ret = ghs[i].gh_error; + ret = ret2; } - if (keep_waiting) - goto wait_for_dlm; +out: + if (ret) { + for (i = 0; i < num_gh; i++) { + struct gfs2_holder *gh = &ghs[i]; - /* - * At this point, we've either acquired all locks or released them all. - */ + gfs2_glock_dq(gh); + } + } return ret; } @@ -1490,10 +1462,10 @@ __acquires(&gl->gl_lockref.lock) if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { if (test_bit(GLF_LOCK, &gl->gl_flags)) { - struct gfs2_holder *first_gh; + struct gfs2_holder *current_gh; - first_gh = find_first_strong_holder(gl); - try_futile = !may_grant(gl, first_gh, gh); + current_gh = find_first_strong_holder(gl); + try_futile = !may_grant(gl, current_gh, gh); } if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) goto fail; @@ -1779,7 +1751,7 @@ static int glock_compare(const void *arg_a, const void *arg_b) } /** - * nq_m_sync - synchonously acquire more than one glock in deadlock free order + * nq_m_sync - synchronously acquire more than one glock in deadlock free order * @num_gh: the number of structures * @ghs: an array of struct gfs2_holder structures * @p: placeholder for the holder structure to pass back @@ -1800,8 +1772,6 @@ static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs, sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL); for (x = 0; x < num_gh; x++) { - p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); - error = gfs2_glock_nq(p[x]); if (error) { while (x--) @@ -1818,7 +1788,6 @@ static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs, * @num_gh: the number of structures * @ghs: an array of struct gfs2_holder structures * - * * Returns: 0 on success (all glocks acquired), * errno on failure (no glocks acquired) */ @@ -1833,7 +1802,6 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) case 0: return 0; case 1: - ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); return gfs2_glock_nq(ghs); default: if (num_gh <= 4) @@ -2245,20 +2213,6 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) glock_hash_walk(dump_glock_func, sdp); } -void gfs2_glock_finish_truncate(struct gfs2_inode *ip) -{ - struct gfs2_glock *gl = ip->i_gl; - int ret; - - ret = gfs2_truncatei_resume(ip); - gfs2_glock_assert_withdraw(gl, ret == 0); - - spin_lock(&gl->gl_lockref.lock); - clear_bit(GLF_LOCK, &gl->gl_flags); - run_queue(gl, 1); - spin_unlock(&gl->gl_lockref.lock); -} - static const char *state2str(unsigned state) { switch(state) { @@ -2533,7 +2487,7 @@ int __init gfs2_glock_init(void) return -ENOMEM; } - ret = register_shrinker(&glock_shrinker); + ret = register_shrinker(&glock_shrinker, "gfs2-glock"); if (ret) { destroy_workqueue(gfs2_delete_workqueue); destroy_workqueue(glock_workqueue); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index c0ae9100a0bc..5aed8b500cf5 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -213,6 +213,7 @@ extern void gfs2_holder_uninit(struct gfs2_holder *gh); extern int gfs2_glock_nq(struct gfs2_holder *gh); extern int gfs2_glock_poll(struct gfs2_holder *gh); extern int gfs2_instantiate(struct gfs2_holder *gh); +extern int gfs2_glock_holder_ready(struct gfs2_holder *gh); extern int gfs2_glock_wait(struct gfs2_holder *gh); extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq(struct gfs2_holder *gh); @@ -273,7 +274,6 @@ extern void gfs2_cancel_delete_work(struct gfs2_glock *gl); extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl); extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp); extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); -extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip); extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); extern void gfs2_glock_free(struct gfs2_glock *gl); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 392800f082a6..49210a2e7ce7 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -485,35 +485,33 @@ int gfs2_inode_refresh(struct gfs2_inode *ip) * Returns: errno */ -static int inode_go_instantiate(struct gfs2_holder *gh) +static int inode_go_instantiate(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = gl->gl_object; + + if (!ip) /* no inode to populate - read it in later */ + return 0; + + return gfs2_inode_refresh(ip); +} + +static int inode_go_held(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_inode *ip = gl->gl_object; int error = 0; if (!ip) /* no inode to populate - read it in later */ - goto out; - - error = gfs2_inode_refresh(ip); - if (error) - goto out; + return 0; if (gh->gh_state != LM_ST_DEFERRED) inode_dio_wait(&ip->i_inode); if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && (gl->gl_state == LM_ST_EXCLUSIVE) && - (gh->gh_state == LM_ST_EXCLUSIVE)) { - spin_lock(&sdp->sd_trunc_lock); - if (list_empty(&ip->i_trunc_list)) - list_add(&ip->i_trunc_list, &sdp->sd_trunc_list); - spin_unlock(&sdp->sd_trunc_lock); - wake_up(&sdp->sd_quota_wait); - error = 1; - } + (gh->gh_state == LM_ST_EXCLUSIVE)) + error = gfs2_truncatei_resume(ip); -out: return error; } @@ -737,6 +735,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_inval = inode_go_inval, .go_demote_ok = inode_go_demote_ok, .go_instantiate = inode_go_instantiate, + .go_held = inode_go_held, .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, .go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 8c00fb389ae5..d09d9892cd05 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -219,7 +219,8 @@ struct gfs2_glock_operations { int (*go_xmote_bh)(struct gfs2_glock *gl); void (*go_inval) (struct gfs2_glock *gl, int flags); int (*go_demote_ok) (const struct gfs2_glock *gl); - int (*go_instantiate) (struct gfs2_holder *gh); + int (*go_instantiate) (struct gfs2_glock *gl); + int (*go_held)(struct gfs2_holder *gh); void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl, const char *fs_id_buf); void (*go_callback)(struct gfs2_glock *gl, bool remote); @@ -396,7 +397,6 @@ struct gfs2_inode { atomic_t i_sizehint; /* hint of the write size */ struct rw_semaphore i_rw_mutex; struct list_head i_ordered; - struct list_head i_trunc_list; __be64 *i_hash_cache; u32 i_entries; u32 i_diskflags; @@ -784,8 +784,6 @@ struct gfs2_sbd { struct mutex sd_quota_mutex; struct mutex sd_quota_sync_mutex; wait_queue_head_t sd_quota_wait; - struct list_head sd_trunc_list; - spinlock_t sd_trunc_lock; unsigned int sd_quota_slots; unsigned long *sd_quota_bitmap; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 2559a79cf14b..6ce369b096d4 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1058,7 +1058,7 @@ restart: /* * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC) - * to accomodate the largest slot number. (NB dlm slot numbers start at 1, + * to accommodate the largest slot number. (NB dlm slot numbers start at 1, * gfs2 jids start at 0, so jid = slot - 1) */ diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f0ee3ff6f9a8..eec4159b08aa 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -823,7 +823,7 @@ void gfs2_flush_revokes(struct gfs2_sbd *sdp) void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, u64 seq, u32 tail, u32 lblock, u32 flags, - int op_flags) + blk_opf_t op_flags) { struct gfs2_log_header *lh; u32 hash, crc; @@ -905,7 +905,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, static void log_write_header(struct gfs2_sbd *sdp, u32 flags) { - int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; + blk_opf_t op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); gfs2_assert_withdraw(sdp, (state != SFS_FROZEN)); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index fc905c2af53c..653cffcbf869 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -82,7 +82,7 @@ extern void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, unsigned int *extra_revokes); extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, u64 seq, u32 tail, u32 lblock, u32 flags, - int op_flags); + blk_opf_t op_flags); extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 type); extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 6ba51cbb94cf..1902413d5d12 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -238,7 +238,7 @@ static void gfs2_end_log_write(struct bio *bio) * there is no pending bio, then this is a no-op. */ -void gfs2_log_submit_bio(struct bio **biop, int opf) +void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf) { struct bio *bio = *biop; if (bio) { @@ -292,7 +292,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, */ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, - struct bio **biop, int op, + struct bio **biop, enum req_op op, bio_end_io_t *end_io, bool flush) { struct bio *bio = *biop; @@ -452,36 +452,36 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, * @head: The journal head to start from * @done: If set, perform only cleanup, else search and set if found. * - * Find the page with 'index' in the journal's mapping. Search the page for + * Find the folio with 'index' in the journal's mapping. Search the folio for * the journal head if requested (cleanup == false). Release refs on the - * page so the page cache can reclaim it (put_page() twice). We grabbed a - * reference on this page two times, first when we did a find_or_create_page() - * to obtain the page to add it to the bio and second when we do a - * find_get_page() here to get the page to wait on while I/O on it is being + * folio so the page cache can reclaim it. We grabbed a + * reference on this folio twice, first when we did a find_or_create_page() + * to obtain the folio to add it to the bio and second when we do a + * filemap_get_folio() here to get the folio to wait on while I/O on it is being * completed. - * This function is also used to free up a page we might've grabbed but not + * This function is also used to free up a folio we might've grabbed but not * used. Maybe we added it to a bio, but not submitted it for I/O. Or we * submitted the I/O, but we already found the jhead so we only need to drop - * our references to the page. + * our references to the folio. */ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, struct gfs2_log_header_host *head, bool *done) { - struct page *page; + struct folio *folio; - page = find_get_page(jd->jd_inode->i_mapping, index); - wait_on_page_locked(page); + folio = filemap_get_folio(jd->jd_inode->i_mapping, index); - if (PageError(page)) + folio_wait_locked(folio); + if (folio_test_error(folio)) *done = true; if (!*done) - *done = gfs2_jhead_pg_srch(jd, head, page); + *done = gfs2_jhead_pg_srch(jd, head, &folio->page); - put_page(page); /* Once for find_get_page */ - put_page(page); /* Once more for find_or_create_page */ + /* filemap_get_folio() and the earlier find_or_create_page() */ + folio_put_refs(folio, 2); } static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs) diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index f707601597dc..1412ffba1d44 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -16,7 +16,7 @@ extern u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn); extern void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, struct page *page, unsigned size, unsigned offset, u64 blkno); -extern void gfs2_log_submit_bio(struct bio **biop, int opf); +extern void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 244187e3e70f..14ae9de76277 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -38,7 +38,6 @@ static void gfs2_init_inode_once(void *foo) inode_init_once(&ip->i_inode); atomic_set(&ip->i_sizehint, 0); init_rwsem(&ip->i_rw_mutex); - INIT_LIST_HEAD(&ip->i_trunc_list); INIT_LIST_HEAD(&ip->i_ordered); ip->i_qadata = NULL; gfs2_holder_mark_uninitialized(&ip->i_rgd_gh); @@ -148,7 +147,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_trans_cachep) goto fail_cachep8; - error = register_shrinker(&gfs2_qd_shrinker); + error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd"); if (error) goto fail_shrinker; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 868dcc71b581..7e70e0ba5a6c 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -34,7 +34,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc); + blk_opf_t write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); @@ -75,7 +75,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(REQ_OP_WRITE, write_flags, bh); + submit_bh(REQ_OP_WRITE | write_flags, bh); nr_underway++; } bh = next; @@ -217,14 +217,13 @@ static void gfs2_meta_read_endio(struct bio *bio) * Submit several consecutive buffer head I/O requests as a single bio I/O * request. (See submit_bh_wbc.) */ -static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], - int num) +static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num) { while (num > 0) { struct buffer_head *bh = *bhs; struct bio *bio; - bio = bio_alloc(bh->b_bdev, num, op | op_flags, GFP_NOIO); + bio = bio_alloc(bh->b_bdev, num, opf, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); while (num > 0) { bh = *bhs; @@ -288,7 +287,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } } - gfs2_submit_bhs(REQ_OP_READ, REQ_META | REQ_PRIO, bhs, num); + gfs2_submit_bhs(REQ_OP_READ | REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; @@ -527,7 +526,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh); + ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &first_bh); dblock++; extlen--; @@ -536,9 +535,8 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(REQ_OP_READ, - REQ_RAHEAD | REQ_META | REQ_PRIO, - 1, &bh); + ll_rw_block(REQ_OP_READ | REQ_RAHEAD | REQ_META | + REQ_PRIO, 1, &bh); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c9b423c874a3..549879929c84 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -106,8 +106,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mutex_init(&sdp->sd_quota_mutex); mutex_init(&sdp->sd_quota_sync_mutex); init_waitqueue_head(&sdp->sd_quota_wait); - INIT_LIST_HEAD(&sdp->sd_trunc_list); - spin_lock_init(&sdp->sd_trunc_lock); spin_lock_init(&sdp->sd_bitmap_lock); INIT_LIST_HEAD(&sdp->sd_sc_inodes_list); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 59d727a4ae2c..f201eaf59d0d 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -746,7 +746,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, if (PageUptodate(page)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); + ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; @@ -1517,25 +1517,6 @@ static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, } } -static void quotad_check_trunc_list(struct gfs2_sbd *sdp) -{ - struct gfs2_inode *ip; - - while(1) { - ip = NULL; - spin_lock(&sdp->sd_trunc_lock); - if (!list_empty(&sdp->sd_trunc_list)) { - ip = list_first_entry(&sdp->sd_trunc_list, - struct gfs2_inode, i_trunc_list); - list_del_init(&ip->i_trunc_list); - } - spin_unlock(&sdp->sd_trunc_lock); - if (ip == NULL) - return; - gfs2_glock_finish_truncate(ip); - } -} - void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) { if (!sdp->sd_statfs_force_sync) { sdp->sd_statfs_force_sync = 1; @@ -1558,7 +1539,6 @@ int gfs2_quotad(void *data) unsigned long quotad_timeo = 0; unsigned long t = 0; DEFINE_WAIT(wait); - int empty; while (!kthread_should_stop()) { @@ -1579,19 +1559,13 @@ int gfs2_quotad(void *data) quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, "ad_timeo, &tune->gt_quota_quantum); - /* Check for & recover partially truncated inodes */ - quotad_check_trunc_list(sdp); - try_to_freeze(); bypass: t = min(quotad_timeo, statfs_timeo); prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE); - spin_lock(&sdp->sd_trunc_lock); - empty = list_empty(&sdp->sd_trunc_list); - spin_unlock(&sdp->sd_trunc_lock); - if (empty && !sdp->sd_statfs_force_sync) + if (!sdp->sd_statfs_force_sync) t -= schedule_timeout(t); else t = 0; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 8a63870eef5a..f602fb844951 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1196,9 +1196,8 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd) * Returns: errno */ -int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh) +int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl) { - struct gfs2_glock *gl = gh->gh_gl; struct gfs2_rgrpd *rgd = gl->gl_object; struct gfs2_sbd *sdp = rgd->rd_sbd; unsigned int length = rgd->rd_length; @@ -2720,12 +2719,15 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate * and initialize an array of glock holders for them * @rlist: the list of resource groups + * @state: the state we're requesting + * @flags: the modifier flags * * FIXME: Don't use NOFAIL * */ -void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist) +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, + unsigned int state, u16 flags) { unsigned int x; @@ -2733,8 +2735,8 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist) sizeof(struct gfs2_holder), GFP_NOFS | __GFP_NOFAIL); for (x = 0; x < rlist->rl_rgrps; x++) - gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, LM_ST_EXCLUSIVE, - LM_FLAG_NODE_SCOPE, &rlist->rl_ghs[x]); + gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, state, flags, + &rlist->rl_ghs[x]); } /** diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 46dd94e9e085..00b30cf893af 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -31,7 +31,7 @@ extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); extern int gfs2_rindex_update(struct gfs2_sbd *sdp); extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); -extern int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh); +extern int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl); extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); @@ -64,7 +64,8 @@ struct gfs2_rgrp_list { extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, u64 block); -extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist); +extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, + unsigned int state, u16 flags); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index bdb773e5c88f..b5b0f285b27f 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1196,7 +1196,7 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) gfs2_glock_dq(gh); return false; } - return true; + return gfs2_glock_holder_ready(gh) == 0; } /** diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 0c5650fe1fd1..f6a66050380e 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1313,7 +1313,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) else goto out; - gfs2_rlist_alloc(&rlist); + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE); for (x = 0; x < rlist.rl_rgrps; x++) { rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index c0a73a6ffb28..c83fd0e8404d 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -296,10 +296,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page = read_mapping_page(mapping, block++, NULL); if (IS_ERR(page)) goto fail; - if (PageError(page)) { - put_page(page); - goto fail; - } node->page[i] = page; } diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 177fae4e6581..a5ab00e54220 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -447,10 +447,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page = read_mapping_page(mapping, block, NULL); if (IS_ERR(page)) goto fail; - if (PageError(page)) { - put_page(page); - goto fail; - } node->page[i] = page; } diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 396e73aa0961..a5db2e3b2980 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -525,7 +525,7 @@ int hfsplus_compare_dentry(const struct dentry *dentry, unsigned int len, /* wrapper.c */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, - void **data, int op, int op_flags); + void **data, blk_opf_t opf); int hfsplus_read_wrapper(struct super_block *sb); /* diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index 63164ebc52fa..9ec21664eda6 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -112,8 +112,7 @@ static int hfs_parse_new_pmap(struct super_block *sb, void *buf, if ((u8 *)pm - (u8 *)buf >= buf_size) { res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK + i, - buf, (void **)&pm, REQ_OP_READ, - 0); + buf, (void **)&pm, REQ_OP_READ); if (res) return res; } @@ -137,7 +136,7 @@ int hfs_part_find(struct super_block *sb, return -ENOMEM; res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK, - buf, &data, REQ_OP_READ, 0); + buf, &data, REQ_OP_READ); if (res) goto out; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 8479add998b5..122ed89ebf9f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -221,7 +221,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, - sbi->s_vhdr_buf, NULL, REQ_OP_WRITE, + sbi->s_vhdr_buf, NULL, REQ_OP_WRITE | REQ_SYNC); if (!error) error = error2; @@ -230,7 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, - sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE, + sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE | REQ_SYNC); if (!error) error2 = error; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 0b8ad6586df5..0b791adf02e5 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -45,8 +45,9 @@ struct hfsplus_wd { * will work correctly. */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, - void *buf, void **data, int op, int op_flags) + void *buf, void **data, blk_opf_t opf) { + const enum req_op op = opf & REQ_OP_MASK; struct bio *bio; int ret = 0; u64 io_size; @@ -63,10 +64,10 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, offset = start & (io_size - 1); sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); - bio = bio_alloc(sb->s_bdev, 1, op | op_flags, GFP_NOIO); + bio = bio_alloc(sb->s_bdev, 1, opf, GFP_NOIO); bio->bi_iter.bi_sector = sector; - if (op != WRITE && data) + if (op != REQ_OP_WRITE && data) *data = (u8 *)buf + offset; while (io_size > 0) { @@ -184,7 +185,7 @@ int hfsplus_read_wrapper(struct super_block *sb) reread: error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, sbi->s_vhdr_buf, (void **)&sbi->s_vhdr, - REQ_OP_READ, 0); + REQ_OP_READ); if (error) goto out_free_backup_vhdr; @@ -216,8 +217,7 @@ reread: error = hfsplus_submit_bio(sb, part_start + part_size - 2, sbi->s_backup_vhdr_buf, - (void **)&sbi->s_backup_vhdr, REQ_OP_READ, - 0); + (void **)&sbi->s_backup_vhdr, REQ_OP_READ); if (error) goto out_free_backup_vhdr; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index cc1bc6f93a01..07881b76d42f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -416,15 +416,15 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc) err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count); if (err != count) { - ClearPageUptodate(page); + if (err >= 0) + err = -EIO; + mapping_set_error(mapping, err); goto out; } if (base > inode->i_size) inode->i_size = base; - if (PageError(page)) - ClearPageError(page); err = 0; out: diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 62408047e8d7..fe0e374b02a3 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -11,7 +11,6 @@ #include <linux/thread_info.h> #include <asm/current.h> -#include <linux/sched/signal.h> /* remove ASAP */ #include <linux/falloc.h> #include <linux/fs.h> #include <linux/mount.h> @@ -40,7 +39,6 @@ #include <linux/uaccess.h> #include <linux/sched/mm.h> -static const struct super_operations hugetlbfs_ops; static const struct address_space_operations hugetlbfs_aops; const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; @@ -108,16 +106,6 @@ static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) } #endif -static void huge_pagevec_release(struct pagevec *pvec) -{ - int i; - - for (i = 0; i < pagevec_count(pvec); ++i) - put_page(pvec->pages[i]); - - pagevec_reinit(pvec); -} - /* * Mask used when checking the page offset value passed in via system * calls. This value will be converted to a loff_t which is signed. @@ -325,8 +313,7 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, /* * Support for read() - Find the page attached to f_mapping and copy out the - * data. Its *very* similar to generic_file_buffered_read(), we can't use that - * since it has PAGE_SIZE assumptions. + * data. This provides functionality similar to filemap_read(). */ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { @@ -480,25 +467,19 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> huge_page_shift(h); const pgoff_t end = lend >> huge_page_shift(h); - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); - pagevec_init(&pvec); + folio_batch_init(&fbatch); next = start; - while (next < end) { - /* - * When no more pages are found, we are done. - */ - if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1)) - break; - - for (i = 0; i < pagevec_count(&pvec); ++i) { - struct page *page = pvec.pages[i]; + while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); ++i) { + struct folio *folio = fbatch.folios[i]; u32 hash = 0; - index = page->index; + index = folio->index; if (!truncate_op) { /* * Only need to hold the fault mutex in the @@ -511,15 +492,15 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } /* - * If page is mapped, it was faulted in after being + * If folio is mapped, it was faulted in after being * unmapped in caller. Unmap (again) now after taking * the fault mutex. The mutex will prevent faults - * until we finish removing the page. + * until we finish removing the folio. * * This race can only happen in the hole punch case. * Getting here in a truncate operation is a bug. */ - if (unlikely(page_mapped(page))) { + if (unlikely(folio_mapped(folio))) { BUG_ON(truncate_op); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -532,7 +513,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, i_mmap_unlock_write(mapping); } - lock_page(page); + folio_lock(folio); /* * We must free the huge page and remove from page * cache (remove_huge_page) BEFORE removing the @@ -542,8 +523,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * the subpool and global reserve usage count can need * to be adjusted. */ - VM_BUG_ON(HPageRestoreReserve(page)); - remove_huge_page(page); + VM_BUG_ON(HPageRestoreReserve(&folio->page)); + remove_huge_page(&folio->page); freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, @@ -551,11 +532,11 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, hugetlb_fix_reserve_counts(inode); } - unlock_page(page); + folio_unlock(folio); if (!truncate_op) mutex_unlock(&hugetlb_fault_mutex_table[hash]); } - huge_pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } @@ -600,41 +581,79 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) remove_inode_hugepages(inode, offset, LLONG_MAX); } +static void hugetlbfs_zero_partial_page(struct hstate *h, + struct address_space *mapping, + loff_t start, + loff_t end) +{ + pgoff_t idx = start >> huge_page_shift(h); + struct folio *folio; + + folio = filemap_lock_folio(mapping, idx); + if (!folio) + return; + + start = start & ~huge_page_mask(h); + end = end & ~huge_page_mask(h); + if (!end) + end = huge_page_size(h); + + folio_zero_segment(folio, (size_t)start, (size_t)end); + + folio_unlock(folio); + folio_put(folio); +} + static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); + struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); loff_t hpage_size = huge_page_size(h); loff_t hole_start, hole_end; /* - * For hole punch round up the beginning offset of the hole and - * round down the end. + * hole_start and hole_end indicate the full pages within the hole. */ hole_start = round_up(offset, hpage_size); hole_end = round_down(offset + len, hpage_size); - if (hole_end > hole_start) { - struct address_space *mapping = inode->i_mapping; - struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); + inode_lock(inode); - inode_lock(inode); + /* protected by i_rwsem */ + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { + inode_unlock(inode); + return -EPERM; + } - /* protected by i_rwsem */ - if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { - inode_unlock(inode); - return -EPERM; - } + i_mmap_lock_write(mapping); + + /* If range starts before first full page, zero partial page. */ + if (offset < hole_start) + hugetlbfs_zero_partial_page(h, mapping, + offset, min(offset + len, hole_start)); - i_mmap_lock_write(mapping); + /* Unmap users of full pages in the hole. */ + if (hole_end > hole_start) { if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT, 0); - i_mmap_unlock_write(mapping); - remove_inode_hugepages(inode, hole_start, hole_end); - inode_unlock(inode); } + /* If range extends beyond last full page, zero partial page. */ + if ((offset + len) > hole_end && (offset + len) > hole_start) + hugetlbfs_zero_partial_page(h, mapping, + hole_end, offset + len); + + i_mmap_unlock_write(mapping); + + /* Remove full pages from the file. */ + if (hole_end > hole_start) + remove_inode_hugepages(inode, hole_start, hole_end); + + inode_unlock(inode); + return 0; } @@ -759,7 +778,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, SetHPageMigratable(page); /* - * unlock_page because locked by add_to_page_cache() + * unlock_page because locked by huge_add_to_page_cache() * put_page() due to reference from alloc_huge_page() */ unlock_page(page); @@ -970,28 +989,33 @@ static int hugetlbfs_symlink(struct user_namespace *mnt_userns, return error; } -static int hugetlbfs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, +#ifdef CONFIG_MIGRATION +static int hugetlbfs_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { int rc; - rc = migrate_huge_page_move_mapping(mapping, newpage, page); + rc = migrate_huge_page_move_mapping(mapping, dst, src); if (rc != MIGRATEPAGE_SUCCESS) return rc; - if (hugetlb_page_subpool(page)) { - hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page)); - hugetlb_set_page_subpool(page, NULL); + if (hugetlb_page_subpool(&src->page)) { + hugetlb_set_page_subpool(&dst->page, + hugetlb_page_subpool(&src->page)); + hugetlb_set_page_subpool(&src->page, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); + folio_migrate_copy(dst, src); else - migrate_page_states(newpage, page); + folio_migrate_flags(dst, src); return MIGRATEPAGE_SUCCESS; } +#else +#define hugetlbfs_migrate_folio NULL +#endif static int hugetlbfs_error_remove_page(struct address_space *mapping, struct page *page) @@ -1055,7 +1079,7 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = huge_page_size(h); if (sbinfo) { spin_lock(&sbinfo->stat_lock); - /* If no limits set, just report 0 for max/free/used + /* If no limits set, just report 0 or -1 for max/free/used * blocks, like simple_statfs() */ if (sbinfo->spool) { long free_pages; @@ -1158,7 +1182,7 @@ static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, .dirty_folio = noop_dirty_folio, - .migratepage = hugetlbfs_migrate_page, + .migrate_folio = hugetlbfs_migrate_folio, .error_remove_page = hugetlbfs_error_remove_page, }; @@ -1282,7 +1306,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par ps = memparse(param->string, &rest); ctx->hstate = size_to_hstate(ps); if (!ctx->hstate) { - pr_err("Unsupported page size %lu MB\n", ps >> 20); + pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL; } return 0; @@ -1358,7 +1382,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) /* * Allocate and initialize subpool if maximum or minimum size is * specified. Any needed reservations (for minimum size) are taken - * taken when the subpool is created. + * when the subpool is created. */ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { sbinfo->spool = hugepage_new_subpool(ctx->hstate, @@ -1528,7 +1552,7 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) } if (IS_ERR(mnt)) pr_err("Cannot mount internal hugetlbfs for page size %luK", - huge_page_size(h) >> 10); + huge_page_size(h) / SZ_1K); return mnt; } diff --git a/fs/inode.c b/fs/inode.c index bd4da9c5207e..524ee91f74a6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -604,7 +604,7 @@ void clear_inode(struct inode *inode) { /* * We have to cycle the i_pages lock here because reclaim can be in the - * process of removing the last page (in __delete_from_page_cache()) + * process of removing the last page (in __filemap_remove_folio()) * and we must not free the mapping under it. */ xa_lock_irq(&inode->i_data.i_pages); @@ -2010,67 +2010,57 @@ static int __remove_privs(struct user_namespace *mnt_userns, return notify_change(mnt_userns, dentry, &newattrs, NULL); } -/* - * Remove special file priviledges (suid, capabilities) when file is written - * to or truncated. - */ -int file_remove_privs(struct file *file) +static int __file_remove_privs(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); + int error; int kill; - int error = 0; - /* - * Fast path for nothing security related. - * As well for non-regular files, e.g. blkdev inodes. - * For example, blkdev_write_iter() might get here - * trying to remove privs which it is not allowed to. - */ if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(dentry); - if (kill < 0) + if (kill <= 0) return kill; - if (kill) - error = __remove_privs(file_mnt_user_ns(file), dentry, kill); + + if (flags & IOCB_NOWAIT) + return -EAGAIN; + + error = __remove_privs(file_mnt_user_ns(file), dentry, kill); if (!error) inode_has_no_xattr(inode); return error; } -EXPORT_SYMBOL(file_remove_privs); /** - * file_update_time - update mtime and ctime time - * @file: file accessed + * file_remove_privs - remove special file privileges (suid, capabilities) + * @file: file to remove privileges from + * + * When file is modified by a write or truncation ensure that special + * file privileges are removed. * - * Update the mtime and ctime members of an inode and mark the inode - * for writeback. Note that this function is meant exclusively for - * usage in the file write path of filesystems, and filesystems may - * choose to explicitly ignore update via this function with the - * S_NOCMTIME inode flag, e.g. for network filesystem where these - * timestamps are handled by the server. This can return an error for - * file systems who need to allocate space in order to update an inode. + * Return: 0 on success, negative errno on failure. */ +int file_remove_privs(struct file *file) +{ + return __file_remove_privs(file, 0); +} +EXPORT_SYMBOL(file_remove_privs); -int file_update_time(struct file *file) +static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) { - struct inode *inode = file_inode(file); - struct timespec64 now; int sync_it = 0; - int ret; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; - now = current_time(inode); - if (!timespec64_equal(&inode->i_mtime, &now)) + if (!timespec64_equal(&inode->i_mtime, now)) sync_it = S_MTIME; - if (!timespec64_equal(&inode->i_ctime, &now)) + if (!timespec64_equal(&inode->i_ctime, now)) sync_it |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) @@ -2079,37 +2069,127 @@ int file_update_time(struct file *file) if (!sync_it) return 0; - /* Finally allowed to write? Takes lock. */ - if (__mnt_want_write_file(file)) - return 0; + return sync_it; +} + +static int __file_update_time(struct file *file, struct timespec64 *now, + int sync_mode) +{ + int ret = 0; + struct inode *inode = file_inode(file); - ret = inode_update_time(inode, &now, sync_it); - __mnt_drop_write_file(file); + /* try to update time settings */ + if (!__mnt_want_write_file(file)) { + ret = inode_update_time(inode, now, sync_mode); + __mnt_drop_write_file(file); + } return ret; } + +/** + * file_update_time - update mtime and ctime time + * @file: file accessed + * + * Update the mtime and ctime members of an inode and mark the inode for + * writeback. Note that this function is meant exclusively for usage in + * the file write path of filesystems, and filesystems may choose to + * explicitly ignore updates via this function with the _NOCMTIME inode + * flag, e.g. for network filesystem where these imestamps are handled + * by the server. This can return an error for file systems who need to + * allocate space in order to update an inode. + * + * Return: 0 on success, negative errno on failure. + */ +int file_update_time(struct file *file) +{ + int ret; + struct inode *inode = file_inode(file); + struct timespec64 now = current_time(inode); + + ret = inode_needs_update_time(inode, &now); + if (ret <= 0) + return ret; + + return __file_update_time(file, &now, ret); +} EXPORT_SYMBOL(file_update_time); -/* Caller must hold the file's inode lock */ -int file_modified(struct file *file) +/** + * file_modified_flags - handle mandated vfs changes when modifying a file + * @file: file that was modified + * @flags: kiocb flags + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * If IOCB_NOWAIT is set, special file privileges will not be removed and + * time settings will not be updated. It will return -EAGAIN. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. + */ +static int file_modified_flags(struct file *file, int flags) { - int err; + int ret; + struct inode *inode = file_inode(file); + struct timespec64 now = current_time(inode); /* * Clear the security bits if the process is not being run by root. * This keeps people from modifying setuid and setgid binaries. */ - err = file_remove_privs(file); - if (err) - return err; + ret = __file_remove_privs(file, flags); + if (ret) + return ret; if (unlikely(file->f_mode & FMODE_NOCMTIME)) return 0; - return file_update_time(file); + ret = inode_needs_update_time(inode, &now); + if (ret <= 0) + return ret; + if (flags & IOCB_NOWAIT) + return -EAGAIN; + + return __file_update_time(file, &now, ret); +} + +/** + * file_modified - handle mandated vfs changes when modifying a file + * @file: file that was modified + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. + */ +int file_modified(struct file *file) +{ + return file_modified_flags(file, 0); } EXPORT_SYMBOL(file_modified); +/** + * kiocb_modified - handle mandated vfs changes when modifying a file + * @iocb: iocb that was modified + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. + */ +int kiocb_modified(struct kiocb *iocb) +{ + return file_modified_flags(iocb->ki_filp, iocb->ki_flags); +} +EXPORT_SYMBOL_GPL(kiocb_modified); + int inode_needs_sync(struct inode *inode) { if (IS_SYNC(inode)) diff --git a/fs/io-wq.c b/fs/io-wq.c deleted file mode 100644 index 824623bcf1a5..000000000000 --- a/fs/io-wq.c +++ /dev/null @@ -1,1424 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Basic worker thread pool for io_uring - * - * Copyright (C) 2019 Jens Axboe - * - */ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/sched/signal.h> -#include <linux/percpu.h> -#include <linux/slab.h> -#include <linux/rculist_nulls.h> -#include <linux/cpu.h> -#include <linux/task_work.h> -#include <linux/audit.h> -#include <uapi/linux/io_uring.h> - -#include "io-wq.h" - -#define WORKER_IDLE_TIMEOUT (5 * HZ) - -enum { - IO_WORKER_F_UP = 1, /* up and active */ - IO_WORKER_F_RUNNING = 2, /* account as running */ - IO_WORKER_F_FREE = 4, /* worker on free list */ - IO_WORKER_F_BOUND = 8, /* is doing bounded work */ -}; - -enum { - IO_WQ_BIT_EXIT = 0, /* wq exiting */ -}; - -enum { - IO_ACCT_STALLED_BIT = 0, /* stalled on hash */ -}; - -/* - * One for each thread in a wqe pool - */ -struct io_worker { - refcount_t ref; - unsigned flags; - struct hlist_nulls_node nulls_node; - struct list_head all_list; - struct task_struct *task; - struct io_wqe *wqe; - - struct io_wq_work *cur_work; - struct io_wq_work *next_work; - raw_spinlock_t lock; - - struct completion ref_done; - - unsigned long create_state; - struct callback_head create_work; - int create_index; - - union { - struct rcu_head rcu; - struct work_struct work; - }; -}; - -#if BITS_PER_LONG == 64 -#define IO_WQ_HASH_ORDER 6 -#else -#define IO_WQ_HASH_ORDER 5 -#endif - -#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) - -struct io_wqe_acct { - unsigned nr_workers; - unsigned max_workers; - int index; - atomic_t nr_running; - raw_spinlock_t lock; - struct io_wq_work_list work_list; - unsigned long flags; -}; - -enum { - IO_WQ_ACCT_BOUND, - IO_WQ_ACCT_UNBOUND, - IO_WQ_ACCT_NR, -}; - -/* - * Per-node worker thread pool - */ -struct io_wqe { - raw_spinlock_t lock; - struct io_wqe_acct acct[IO_WQ_ACCT_NR]; - - int node; - - struct hlist_nulls_head free_list; - struct list_head all_list; - - struct wait_queue_entry wait; - - struct io_wq *wq; - struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; - - cpumask_var_t cpu_mask; -}; - -/* - * Per io_wq state - */ -struct io_wq { - unsigned long state; - - free_work_fn *free_work; - io_wq_work_fn *do_work; - - struct io_wq_hash *hash; - - atomic_t worker_refs; - struct completion worker_done; - - struct hlist_node cpuhp_node; - - struct task_struct *task; - - struct io_wqe *wqes[]; -}; - -static enum cpuhp_state io_wq_online; - -struct io_cb_cancel_data { - work_cancel_fn *fn; - void *data; - int nr_running; - int nr_pending; - bool cancel_all; -}; - -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index); -static void io_wqe_dec_running(struct io_worker *worker); -static bool io_acct_cancel_pending_work(struct io_wqe *wqe, - struct io_wqe_acct *acct, - struct io_cb_cancel_data *match); -static void create_worker_cb(struct callback_head *cb); -static void io_wq_cancel_tw_create(struct io_wq *wq); - -static bool io_worker_get(struct io_worker *worker) -{ - return refcount_inc_not_zero(&worker->ref); -} - -static void io_worker_release(struct io_worker *worker) -{ - if (refcount_dec_and_test(&worker->ref)) - complete(&worker->ref_done); -} - -static inline struct io_wqe_acct *io_get_acct(struct io_wqe *wqe, bool bound) -{ - return &wqe->acct[bound ? IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND]; -} - -static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, - struct io_wq_work *work) -{ - return io_get_acct(wqe, !(work->flags & IO_WQ_WORK_UNBOUND)); -} - -static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker) -{ - return io_get_acct(worker->wqe, worker->flags & IO_WORKER_F_BOUND); -} - -static void io_worker_ref_put(struct io_wq *wq) -{ - if (atomic_dec_and_test(&wq->worker_refs)) - complete(&wq->worker_done); -} - -static void io_worker_cancel_cb(struct io_worker *worker) -{ - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; - - atomic_dec(&acct->nr_running); - raw_spin_lock(&worker->wqe->lock); - acct->nr_workers--; - raw_spin_unlock(&worker->wqe->lock); - io_worker_ref_put(wq); - clear_bit_unlock(0, &worker->create_state); - io_worker_release(worker); -} - -static bool io_task_worker_match(struct callback_head *cb, void *data) -{ - struct io_worker *worker; - - if (cb->func != create_worker_cb) - return false; - worker = container_of(cb, struct io_worker, create_work); - return worker == data; -} - -static void io_worker_exit(struct io_worker *worker) -{ - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; - - while (1) { - struct callback_head *cb = task_work_cancel_match(wq->task, - io_task_worker_match, worker); - - if (!cb) - break; - io_worker_cancel_cb(worker); - } - - io_worker_release(worker); - wait_for_completion(&worker->ref_done); - - raw_spin_lock(&wqe->lock); - if (worker->flags & IO_WORKER_F_FREE) - hlist_nulls_del_rcu(&worker->nulls_node); - list_del_rcu(&worker->all_list); - raw_spin_unlock(&wqe->lock); - io_wqe_dec_running(worker); - worker->flags = 0; - preempt_disable(); - current->flags &= ~PF_IO_WORKER; - preempt_enable(); - - kfree_rcu(worker, rcu); - io_worker_ref_put(wqe->wq); - do_exit(0); -} - -static inline bool io_acct_run_queue(struct io_wqe_acct *acct) -{ - bool ret = false; - - raw_spin_lock(&acct->lock); - if (!wq_list_empty(&acct->work_list) && - !test_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - ret = true; - raw_spin_unlock(&acct->lock); - - return ret; -} - -/* - * Check head of free list for an available worker. If one isn't available, - * caller must create one. - */ -static bool io_wqe_activate_free_worker(struct io_wqe *wqe, - struct io_wqe_acct *acct) - __must_hold(RCU) -{ - struct hlist_nulls_node *n; - struct io_worker *worker; - - /* - * Iterate free_list and see if we can find an idle worker to - * activate. If a given worker is on the free_list but in the process - * of exiting, keep trying. - */ - hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) { - if (!io_worker_get(worker)) - continue; - if (io_wqe_get_acct(worker) != acct) { - io_worker_release(worker); - continue; - } - if (wake_up_process(worker->task)) { - io_worker_release(worker); - return true; - } - io_worker_release(worker); - } - - return false; -} - -/* - * We need a worker. If we find a free one, we're good. If not, and we're - * below the max number of workers, create one. - */ -static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) -{ - /* - * Most likely an attempt to queue unbounded work on an io_wq that - * wasn't setup with any unbounded workers. - */ - if (unlikely(!acct->max_workers)) - pr_warn_once("io-wq is not configured for unbound workers"); - - raw_spin_lock(&wqe->lock); - if (acct->nr_workers >= acct->max_workers) { - raw_spin_unlock(&wqe->lock); - return true; - } - acct->nr_workers++; - raw_spin_unlock(&wqe->lock); - atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); - return create_io_worker(wqe->wq, wqe, acct->index); -} - -static void io_wqe_inc_running(struct io_worker *worker) -{ - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - - atomic_inc(&acct->nr_running); -} - -static void create_worker_cb(struct callback_head *cb) -{ - struct io_worker *worker; - struct io_wq *wq; - struct io_wqe *wqe; - struct io_wqe_acct *acct; - bool do_create = false; - - worker = container_of(cb, struct io_worker, create_work); - wqe = worker->wqe; - wq = wqe->wq; - acct = &wqe->acct[worker->create_index]; - raw_spin_lock(&wqe->lock); - if (acct->nr_workers < acct->max_workers) { - acct->nr_workers++; - do_create = true; - } - raw_spin_unlock(&wqe->lock); - if (do_create) { - create_io_worker(wq, wqe, worker->create_index); - } else { - atomic_dec(&acct->nr_running); - io_worker_ref_put(wq); - } - clear_bit_unlock(0, &worker->create_state); - io_worker_release(worker); -} - -static bool io_queue_worker_create(struct io_worker *worker, - struct io_wqe_acct *acct, - task_work_func_t func) -{ - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; - - /* raced with exit, just ignore create call */ - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) - goto fail; - if (!io_worker_get(worker)) - goto fail; - /* - * create_state manages ownership of create_work/index. We should - * only need one entry per worker, as the worker going to sleep - * will trigger the condition, and waking will clear it once it - * runs the task_work. - */ - if (test_bit(0, &worker->create_state) || - test_and_set_bit_lock(0, &worker->create_state)) - goto fail_release; - - atomic_inc(&wq->worker_refs); - init_task_work(&worker->create_work, func); - worker->create_index = acct->index; - if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) { - /* - * EXIT may have been set after checking it above, check after - * adding the task_work and remove any creation item if it is - * now set. wq exit does that too, but we can have added this - * work item after we canceled in io_wq_exit_workers(). - */ - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) - io_wq_cancel_tw_create(wq); - io_worker_ref_put(wq); - return true; - } - io_worker_ref_put(wq); - clear_bit_unlock(0, &worker->create_state); -fail_release: - io_worker_release(worker); -fail: - atomic_dec(&acct->nr_running); - io_worker_ref_put(wq); - return false; -} - -static void io_wqe_dec_running(struct io_worker *worker) -{ - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - - if (!(worker->flags & IO_WORKER_F_UP)) - return; - - if (!atomic_dec_and_test(&acct->nr_running)) - return; - if (!io_acct_run_queue(acct)) - return; - - atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); - io_queue_worker_create(worker, acct, create_worker_cb); -} - -/* - * Worker will start processing some work. Move it to the busy list, if - * it's currently on the freelist - */ -static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) -{ - if (worker->flags & IO_WORKER_F_FREE) { - worker->flags &= ~IO_WORKER_F_FREE; - raw_spin_lock(&wqe->lock); - hlist_nulls_del_init_rcu(&worker->nulls_node); - raw_spin_unlock(&wqe->lock); - } -} - -/* - * No work, worker going to sleep. Move to freelist, and unuse mm if we - * have one attached. Dropping the mm may potentially sleep, so we drop - * the lock in that case and return success. Since the caller has to - * retry the loop in that case (we changed task state), we don't regrab - * the lock if we return success. - */ -static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) - __must_hold(wqe->lock) -{ - if (!(worker->flags & IO_WORKER_F_FREE)) { - worker->flags |= IO_WORKER_F_FREE; - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); - } -} - -static inline unsigned int io_get_work_hash(struct io_wq_work *work) -{ - return work->flags >> IO_WQ_HASH_SHIFT; -} - -static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) -{ - struct io_wq *wq = wqe->wq; - bool ret = false; - - spin_lock_irq(&wq->hash->wait.lock); - if (list_empty(&wqe->wait.entry)) { - __add_wait_queue(&wq->hash->wait, &wqe->wait); - if (!test_bit(hash, &wq->hash->map)) { - __set_current_state(TASK_RUNNING); - list_del_init(&wqe->wait.entry); - ret = true; - } - } - spin_unlock_irq(&wq->hash->wait.lock); - return ret; -} - -static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, - struct io_worker *worker) - __must_hold(acct->lock) -{ - struct io_wq_work_node *node, *prev; - struct io_wq_work *work, *tail; - unsigned int stall_hash = -1U; - struct io_wqe *wqe = worker->wqe; - - wq_list_for_each(node, prev, &acct->work_list) { - unsigned int hash; - - work = container_of(node, struct io_wq_work, list); - - /* not hashed, can run anytime */ - if (!io_wq_is_hashed(work)) { - wq_list_del(&acct->work_list, node, prev); - return work; - } - - hash = io_get_work_hash(work); - /* all items with this hash lie in [work, tail] */ - tail = wqe->hash_tail[hash]; - - /* hashed, can run if not already running */ - if (!test_and_set_bit(hash, &wqe->wq->hash->map)) { - wqe->hash_tail[hash] = NULL; - wq_list_cut(&acct->work_list, &tail->list, prev); - return work; - } - if (stall_hash == -1U) - stall_hash = hash; - /* fast forward to a next hash, for-each will fix up @prev */ - node = &tail->list; - } - - if (stall_hash != -1U) { - bool unstalled; - - /* - * Set this before dropping the lock to avoid racing with new - * work being added and clearing the stalled bit. - */ - set_bit(IO_ACCT_STALLED_BIT, &acct->flags); - raw_spin_unlock(&acct->lock); - unstalled = io_wait_on_hash(wqe, stall_hash); - raw_spin_lock(&acct->lock); - if (unstalled) { - clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); - if (wq_has_sleeper(&wqe->wq->hash->wait)) - wake_up(&wqe->wq->hash->wait); - } - } - - return NULL; -} - -static bool io_flush_signals(void) -{ - if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) { - __set_current_state(TASK_RUNNING); - clear_notify_signal(); - if (task_work_pending(current)) - task_work_run(); - return true; - } - return false; -} - -static void io_assign_current_work(struct io_worker *worker, - struct io_wq_work *work) -{ - if (work) { - io_flush_signals(); - cond_resched(); - } - - raw_spin_lock(&worker->lock); - worker->cur_work = work; - worker->next_work = NULL; - raw_spin_unlock(&worker->lock); -} - -static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); - -static void io_worker_handle_work(struct io_worker *worker) -{ - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; - bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); - - do { - struct io_wq_work *work; - - /* - * If we got some work, mark us as busy. If we didn't, but - * the list isn't empty, it means we stalled on hashed work. - * Mark us stalled so we don't keep looking for work when we - * can't make progress, any work completion or insertion will - * clear the stalled flag. - */ - raw_spin_lock(&acct->lock); - work = io_get_next_work(acct, worker); - raw_spin_unlock(&acct->lock); - if (work) { - __io_worker_busy(wqe, worker); - - /* - * Make sure cancelation can find this, even before - * it becomes the active work. That avoids a window - * where the work has been removed from our general - * work list, but isn't yet discoverable as the - * current work item for this worker. - */ - raw_spin_lock(&worker->lock); - worker->next_work = work; - raw_spin_unlock(&worker->lock); - } else { - break; - } - io_assign_current_work(worker, work); - __set_current_state(TASK_RUNNING); - - /* handle a whole dependent link */ - do { - struct io_wq_work *next_hashed, *linked; - unsigned int hash = io_get_work_hash(work); - - next_hashed = wq_next_work(work); - - if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) - work->flags |= IO_WQ_WORK_CANCEL; - wq->do_work(work); - io_assign_current_work(worker, NULL); - - linked = wq->free_work(work); - work = next_hashed; - if (!work && linked && !io_wq_is_hashed(linked)) { - work = linked; - linked = NULL; - } - io_assign_current_work(worker, work); - if (linked) - io_wqe_enqueue(wqe, linked); - - if (hash != -1U && !next_hashed) { - /* serialize hash clear with wake_up() */ - spin_lock_irq(&wq->hash->wait.lock); - clear_bit(hash, &wq->hash->map); - clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); - spin_unlock_irq(&wq->hash->wait.lock); - if (wq_has_sleeper(&wq->hash->wait)) - wake_up(&wq->hash->wait); - } - } while (work); - } while (1); -} - -static int io_wqe_worker(void *data) -{ - struct io_worker *worker = data; - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; - bool last_timeout = false; - char buf[TASK_COMM_LEN]; - - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); - - snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid); - set_task_comm(current, buf); - - audit_alloc_kernel(current); - - while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - long ret; - - set_current_state(TASK_INTERRUPTIBLE); - while (io_acct_run_queue(acct)) - io_worker_handle_work(worker); - - raw_spin_lock(&wqe->lock); - /* timed out, exit unless we're the last worker */ - if (last_timeout && acct->nr_workers > 1) { - acct->nr_workers--; - raw_spin_unlock(&wqe->lock); - __set_current_state(TASK_RUNNING); - break; - } - last_timeout = false; - __io_worker_idle(wqe, worker); - raw_spin_unlock(&wqe->lock); - if (io_flush_signals()) - continue; - ret = schedule_timeout(WORKER_IDLE_TIMEOUT); - if (signal_pending(current)) { - struct ksignal ksig; - - if (!get_signal(&ksig)) - continue; - break; - } - last_timeout = !ret; - } - - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) - io_worker_handle_work(worker); - - audit_free(current); - io_worker_exit(worker); - return 0; -} - -/* - * Called when a worker is scheduled in. Mark us as currently running. - */ -void io_wq_worker_running(struct task_struct *tsk) -{ - struct io_worker *worker = tsk->worker_private; - - if (!worker) - return; - if (!(worker->flags & IO_WORKER_F_UP)) - return; - if (worker->flags & IO_WORKER_F_RUNNING) - return; - worker->flags |= IO_WORKER_F_RUNNING; - io_wqe_inc_running(worker); -} - -/* - * Called when worker is going to sleep. If there are no workers currently - * running and we have work pending, wake up a free one or create a new one. - */ -void io_wq_worker_sleeping(struct task_struct *tsk) -{ - struct io_worker *worker = tsk->worker_private; - - if (!worker) - return; - if (!(worker->flags & IO_WORKER_F_UP)) - return; - if (!(worker->flags & IO_WORKER_F_RUNNING)) - return; - - worker->flags &= ~IO_WORKER_F_RUNNING; - io_wqe_dec_running(worker); -} - -static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, - struct task_struct *tsk) -{ - tsk->worker_private = worker; - worker->task = tsk; - set_cpus_allowed_ptr(tsk, wqe->cpu_mask); - tsk->flags |= PF_NO_SETAFFINITY; - - raw_spin_lock(&wqe->lock); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); - list_add_tail_rcu(&worker->all_list, &wqe->all_list); - worker->flags |= IO_WORKER_F_FREE; - raw_spin_unlock(&wqe->lock); - wake_up_new_task(tsk); -} - -static bool io_wq_work_match_all(struct io_wq_work *work, void *data) -{ - return true; -} - -static inline bool io_should_retry_thread(long err) -{ - /* - * Prevent perpetual task_work retry, if the task (or its group) is - * exiting. - */ - if (fatal_signal_pending(current)) - return false; - - switch (err) { - case -EAGAIN: - case -ERESTARTSYS: - case -ERESTARTNOINTR: - case -ERESTARTNOHAND: - return true; - default: - return false; - } -} - -static void create_worker_cont(struct callback_head *cb) -{ - struct io_worker *worker; - struct task_struct *tsk; - struct io_wqe *wqe; - - worker = container_of(cb, struct io_worker, create_work); - clear_bit_unlock(0, &worker->create_state); - wqe = worker->wqe; - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); - if (!IS_ERR(tsk)) { - io_init_new_worker(wqe, worker, tsk); - io_worker_release(worker); - return; - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - - atomic_dec(&acct->nr_running); - raw_spin_lock(&wqe->lock); - acct->nr_workers--; - if (!acct->nr_workers) { - struct io_cb_cancel_data match = { - .fn = io_wq_work_match_all, - .cancel_all = true, - }; - - raw_spin_unlock(&wqe->lock); - while (io_acct_cancel_pending_work(wqe, acct, &match)) - ; - } else { - raw_spin_unlock(&wqe->lock); - } - io_worker_ref_put(wqe->wq); - kfree(worker); - return; - } - - /* re-create attempts grab a new worker ref, drop the existing one */ - io_worker_release(worker); - schedule_work(&worker->work); -} - -static void io_workqueue_create(struct work_struct *work) -{ - struct io_worker *worker = container_of(work, struct io_worker, work); - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - - if (!io_queue_worker_create(worker, acct, create_worker_cont)) - kfree(worker); -} - -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) -{ - struct io_wqe_acct *acct = &wqe->acct[index]; - struct io_worker *worker; - struct task_struct *tsk; - - __set_current_state(TASK_RUNNING); - - worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); - if (!worker) { -fail: - atomic_dec(&acct->nr_running); - raw_spin_lock(&wqe->lock); - acct->nr_workers--; - raw_spin_unlock(&wqe->lock); - io_worker_ref_put(wq); - return false; - } - - refcount_set(&worker->ref, 1); - worker->wqe = wqe; - raw_spin_lock_init(&worker->lock); - init_completion(&worker->ref_done); - - if (index == IO_WQ_ACCT_BOUND) - worker->flags |= IO_WORKER_F_BOUND; - - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); - if (!IS_ERR(tsk)) { - io_init_new_worker(wqe, worker, tsk); - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { - kfree(worker); - goto fail; - } else { - INIT_WORK(&worker->work, io_workqueue_create); - schedule_work(&worker->work); - } - - return true; -} - -/* - * Iterate the passed in list and call the specific function for each - * worker that isn't exiting - */ -static bool io_wq_for_each_worker(struct io_wqe *wqe, - bool (*func)(struct io_worker *, void *), - void *data) -{ - struct io_worker *worker; - bool ret = false; - - list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { - if (io_worker_get(worker)) { - /* no task if node is/was offline */ - if (worker->task) - ret = func(worker, data); - io_worker_release(worker); - if (ret) - break; - } - } - - return ret; -} - -static bool io_wq_worker_wake(struct io_worker *worker, void *data) -{ - __set_notify_signal(worker->task); - wake_up_process(worker->task); - return false; -} - -static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) -{ - struct io_wq *wq = wqe->wq; - - do { - work->flags |= IO_WQ_WORK_CANCEL; - wq->do_work(work); - work = wq->free_work(work); - } while (work); -} - -static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) -{ - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); - unsigned int hash; - struct io_wq_work *tail; - - if (!io_wq_is_hashed(work)) { -append: - wq_list_add_tail(&work->list, &acct->work_list); - return; - } - - hash = io_get_work_hash(work); - tail = wqe->hash_tail[hash]; - wqe->hash_tail[hash] = work; - if (!tail) - goto append; - - wq_list_add_after(&work->list, &tail->list, &acct->work_list); -} - -static bool io_wq_work_match_item(struct io_wq_work *work, void *data) -{ - return work == data; -} - -static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) -{ - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); - struct io_cb_cancel_data match; - unsigned work_flags = work->flags; - bool do_create; - - /* - * If io-wq is exiting for this task, or if the request has explicitly - * been marked as one that should not get executed, cancel it here. - */ - if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) || - (work->flags & IO_WQ_WORK_CANCEL)) { - io_run_cancel(work, wqe); - return; - } - - raw_spin_lock(&acct->lock); - io_wqe_insert_work(wqe, work); - clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); - raw_spin_unlock(&acct->lock); - - raw_spin_lock(&wqe->lock); - rcu_read_lock(); - do_create = !io_wqe_activate_free_worker(wqe, acct); - rcu_read_unlock(); - - raw_spin_unlock(&wqe->lock); - - if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || - !atomic_read(&acct->nr_running))) { - bool did_create; - - did_create = io_wqe_create_worker(wqe, acct); - if (likely(did_create)) - return; - - raw_spin_lock(&wqe->lock); - if (acct->nr_workers) { - raw_spin_unlock(&wqe->lock); - return; - } - raw_spin_unlock(&wqe->lock); - - /* fatal condition, failed to create the first worker */ - match.fn = io_wq_work_match_item, - match.data = work, - match.cancel_all = false, - - io_acct_cancel_pending_work(wqe, acct, &match); - } -} - -void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) -{ - struct io_wqe *wqe = wq->wqes[numa_node_id()]; - - io_wqe_enqueue(wqe, work); -} - -/* - * Work items that hash to the same value will not be done in parallel. - * Used to limit concurrent writes, generally hashed by inode. - */ -void io_wq_hash_work(struct io_wq_work *work, void *val) -{ - unsigned int bit; - - bit = hash_ptr(val, IO_WQ_HASH_ORDER); - work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); -} - -static bool __io_wq_worker_cancel(struct io_worker *worker, - struct io_cb_cancel_data *match, - struct io_wq_work *work) -{ - if (work && match->fn(work, match->data)) { - work->flags |= IO_WQ_WORK_CANCEL; - __set_notify_signal(worker->task); - return true; - } - - return false; -} - -static bool io_wq_worker_cancel(struct io_worker *worker, void *data) -{ - struct io_cb_cancel_data *match = data; - - /* - * Hold the lock to avoid ->cur_work going out of scope, caller - * may dereference the passed in work. - */ - raw_spin_lock(&worker->lock); - if (__io_wq_worker_cancel(worker, match, worker->cur_work) || - __io_wq_worker_cancel(worker, match, worker->next_work)) - match->nr_running++; - raw_spin_unlock(&worker->lock); - - return match->nr_running && !match->cancel_all; -} - -static inline void io_wqe_remove_pending(struct io_wqe *wqe, - struct io_wq_work *work, - struct io_wq_work_node *prev) -{ - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); - unsigned int hash = io_get_work_hash(work); - struct io_wq_work *prev_work = NULL; - - if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) { - if (prev) - prev_work = container_of(prev, struct io_wq_work, list); - if (prev_work && io_get_work_hash(prev_work) == hash) - wqe->hash_tail[hash] = prev_work; - else - wqe->hash_tail[hash] = NULL; - } - wq_list_del(&acct->work_list, &work->list, prev); -} - -static bool io_acct_cancel_pending_work(struct io_wqe *wqe, - struct io_wqe_acct *acct, - struct io_cb_cancel_data *match) -{ - struct io_wq_work_node *node, *prev; - struct io_wq_work *work; - - raw_spin_lock(&acct->lock); - wq_list_for_each(node, prev, &acct->work_list) { - work = container_of(node, struct io_wq_work, list); - if (!match->fn(work, match->data)) - continue; - io_wqe_remove_pending(wqe, work, prev); - raw_spin_unlock(&acct->lock); - io_run_cancel(work, wqe); - match->nr_pending++; - /* not safe to continue after unlock */ - return true; - } - raw_spin_unlock(&acct->lock); - - return false; -} - -static void io_wqe_cancel_pending_work(struct io_wqe *wqe, - struct io_cb_cancel_data *match) -{ - int i; -retry: - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = io_get_acct(wqe, i == 0); - - if (io_acct_cancel_pending_work(wqe, acct, match)) { - if (match->cancel_all) - goto retry; - break; - } - } -} - -static void io_wqe_cancel_running_work(struct io_wqe *wqe, - struct io_cb_cancel_data *match) -{ - rcu_read_lock(); - io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); - rcu_read_unlock(); -} - -enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, - void *data, bool cancel_all) -{ - struct io_cb_cancel_data match = { - .fn = cancel, - .data = data, - .cancel_all = cancel_all, - }; - int node; - - /* - * First check pending list, if we're lucky we can just remove it - * from there. CANCEL_OK means that the work is returned as-new, - * no completion will be posted for it. - * - * Then check if a free (going busy) or busy worker has the work - * currently running. If we find it there, we'll return CANCEL_RUNNING - * as an indication that we attempt to signal cancellation. The - * completion will run normally in this case. - * - * Do both of these while holding the wqe->lock, to ensure that - * we'll find a work item regardless of state. - */ - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wqe_cancel_pending_work(wqe, &match); - if (match.nr_pending && !match.cancel_all) - return IO_WQ_CANCEL_OK; - - raw_spin_lock(&wqe->lock); - io_wqe_cancel_running_work(wqe, &match); - raw_spin_unlock(&wqe->lock); - if (match.nr_running && !match.cancel_all) - return IO_WQ_CANCEL_RUNNING; - } - - if (match.nr_running) - return IO_WQ_CANCEL_RUNNING; - if (match.nr_pending) - return IO_WQ_CANCEL_OK; - return IO_WQ_CANCEL_NOTFOUND; -} - -static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, - int sync, void *key) -{ - struct io_wqe *wqe = container_of(wait, struct io_wqe, wait); - int i; - - list_del_init(&wait->entry); - - rcu_read_lock(); - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; - - if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - io_wqe_activate_free_worker(wqe, acct); - } - rcu_read_unlock(); - return 1; -} - -struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) -{ - int ret, node, i; - struct io_wq *wq; - - if (WARN_ON_ONCE(!data->free_work || !data->do_work)) - return ERR_PTR(-EINVAL); - if (WARN_ON_ONCE(!bounded)) - return ERR_PTR(-EINVAL); - - wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL); - if (!wq) - return ERR_PTR(-ENOMEM); - ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); - if (ret) - goto err_wq; - - refcount_inc(&data->hash->refs); - wq->hash = data->hash; - wq->free_work = data->free_work; - wq->do_work = data->do_work; - - ret = -ENOMEM; - for_each_node(node) { - struct io_wqe *wqe; - int alloc_node = node; - - if (!node_online(alloc_node)) - alloc_node = NUMA_NO_NODE; - wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node); - if (!wqe) - goto err; - if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) - goto err; - cpumask_copy(wqe->cpu_mask, cpumask_of_node(node)); - wq->wqes[node] = wqe; - wqe->node = alloc_node; - wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; - wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = - task_rlimit(current, RLIMIT_NPROC); - INIT_LIST_HEAD(&wqe->wait.entry); - wqe->wait.func = io_wqe_hash_wake; - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; - - acct->index = i; - atomic_set(&acct->nr_running, 0); - INIT_WQ_LIST(&acct->work_list); - raw_spin_lock_init(&acct->lock); - } - wqe->wq = wq; - raw_spin_lock_init(&wqe->lock); - INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); - INIT_LIST_HEAD(&wqe->all_list); - } - - wq->task = get_task_struct(data->task); - atomic_set(&wq->worker_refs, 1); - init_completion(&wq->worker_done); - return wq; -err: - io_wq_put_hash(data->hash); - cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - for_each_node(node) { - if (!wq->wqes[node]) - continue; - free_cpumask_var(wq->wqes[node]->cpu_mask); - kfree(wq->wqes[node]); - } -err_wq: - kfree(wq); - return ERR_PTR(ret); -} - -static bool io_task_work_match(struct callback_head *cb, void *data) -{ - struct io_worker *worker; - - if (cb->func != create_worker_cb && cb->func != create_worker_cont) - return false; - worker = container_of(cb, struct io_worker, create_work); - return worker->wqe->wq == data; -} - -void io_wq_exit_start(struct io_wq *wq) -{ - set_bit(IO_WQ_BIT_EXIT, &wq->state); -} - -static void io_wq_cancel_tw_create(struct io_wq *wq) -{ - struct callback_head *cb; - - while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { - struct io_worker *worker; - - worker = container_of(cb, struct io_worker, create_work); - io_worker_cancel_cb(worker); - } -} - -static void io_wq_exit_workers(struct io_wq *wq) -{ - int node; - - if (!wq->task) - return; - - io_wq_cancel_tw_create(wq); - - rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL); - } - rcu_read_unlock(); - io_worker_ref_put(wq); - wait_for_completion(&wq->worker_done); - - for_each_node(node) { - spin_lock_irq(&wq->hash->wait.lock); - list_del_init(&wq->wqes[node]->wait.entry); - spin_unlock_irq(&wq->hash->wait.lock); - } - put_task_struct(wq->task); - wq->task = NULL; -} - -static void io_wq_destroy(struct io_wq *wq) -{ - int node; - - cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - struct io_cb_cancel_data match = { - .fn = io_wq_work_match_all, - .cancel_all = true, - }; - io_wqe_cancel_pending_work(wqe, &match); - free_cpumask_var(wqe->cpu_mask); - kfree(wqe); - } - io_wq_put_hash(wq->hash); - kfree(wq); -} - -void io_wq_put_and_exit(struct io_wq *wq) -{ - WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state)); - - io_wq_exit_workers(wq); - io_wq_destroy(wq); -} - -struct online_data { - unsigned int cpu; - bool online; -}; - -static bool io_wq_worker_affinity(struct io_worker *worker, void *data) -{ - struct online_data *od = data; - - if (od->online) - cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask); - else - cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask); - return false; -} - -static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online) -{ - struct online_data od = { - .cpu = cpu, - .online = online - }; - int i; - - rcu_read_lock(); - for_each_node(i) - io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od); - rcu_read_unlock(); - return 0; -} - -static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node) -{ - struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node); - - return __io_wq_cpu_online(wq, cpu, true); -} - -static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) -{ - struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node); - - return __io_wq_cpu_online(wq, cpu, false); -} - -int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) -{ - int i; - - rcu_read_lock(); - for_each_node(i) { - struct io_wqe *wqe = wq->wqes[i]; - - if (mask) - cpumask_copy(wqe->cpu_mask, mask); - else - cpumask_copy(wqe->cpu_mask, cpumask_of_node(i)); - } - rcu_read_unlock(); - return 0; -} - -/* - * Set max number of unbounded workers, returns old value. If new_count is 0, - * then just return the old value. - */ -int io_wq_max_workers(struct io_wq *wq, int *new_count) -{ - int prev[IO_WQ_ACCT_NR]; - bool first_node = true; - int i, node; - - BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND); - BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND); - BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2); - - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - if (new_count[i] > task_rlimit(current, RLIMIT_NPROC)) - new_count[i] = task_rlimit(current, RLIMIT_NPROC); - } - - for (i = 0; i < IO_WQ_ACCT_NR; i++) - prev[i] = 0; - - rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - struct io_wqe_acct *acct; - - raw_spin_lock(&wqe->lock); - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - acct = &wqe->acct[i]; - if (first_node) - prev[i] = max_t(int, acct->max_workers, prev[i]); - if (new_count[i]) - acct->max_workers = new_count[i]; - } - raw_spin_unlock(&wqe->lock); - first_node = false; - } - rcu_read_unlock(); - - for (i = 0; i < IO_WQ_ACCT_NR; i++) - new_count[i] = prev[i]; - - return 0; -} - -static __init int io_wq_init(void) -{ - int ret; - - ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online", - io_wq_cpu_online, io_wq_cpu_offline); - if (ret < 0) - return ret; - io_wq_online = ret; - return 0; -} -subsys_initcall(io_wq_init); diff --git a/fs/io-wq.h b/fs/io-wq.h deleted file mode 100644 index ba6eee76d028..000000000000 --- a/fs/io-wq.h +++ /dev/null @@ -1,228 +0,0 @@ -#ifndef INTERNAL_IO_WQ_H -#define INTERNAL_IO_WQ_H - -#include <linux/refcount.h> - -struct io_wq; - -enum { - IO_WQ_WORK_CANCEL = 1, - IO_WQ_WORK_HASHED = 2, - IO_WQ_WORK_UNBOUND = 4, - IO_WQ_WORK_CONCURRENT = 16, - - IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ -}; - -enum io_wq_cancel { - IO_WQ_CANCEL_OK, /* cancelled before started */ - IO_WQ_CANCEL_RUNNING, /* found, running, and attempted cancelled */ - IO_WQ_CANCEL_NOTFOUND, /* work not found */ -}; - -struct io_wq_work_node { - struct io_wq_work_node *next; -}; - -struct io_wq_work_list { - struct io_wq_work_node *first; - struct io_wq_work_node *last; -}; - -#define wq_list_for_each(pos, prv, head) \ - for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) - -#define wq_list_for_each_resume(pos, prv) \ - for (; pos; prv = pos, pos = (pos)->next) - -#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) -#define INIT_WQ_LIST(list) do { \ - (list)->first = NULL; \ -} while (0) - -static inline void wq_list_add_after(struct io_wq_work_node *node, - struct io_wq_work_node *pos, - struct io_wq_work_list *list) -{ - struct io_wq_work_node *next = pos->next; - - pos->next = node; - node->next = next; - if (!next) - list->last = node; -} - -/** - * wq_list_merge - merge the second list to the first one. - * @list0: the first list - * @list1: the second list - * Return the first node after mergence. - */ -static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0, - struct io_wq_work_list *list1) -{ - struct io_wq_work_node *ret; - - if (!list0->first) { - ret = list1->first; - } else { - ret = list0->first; - list0->last->next = list1->first; - } - INIT_WQ_LIST(list0); - INIT_WQ_LIST(list1); - return ret; -} - -static inline void wq_list_add_tail(struct io_wq_work_node *node, - struct io_wq_work_list *list) -{ - node->next = NULL; - if (!list->first) { - list->last = node; - WRITE_ONCE(list->first, node); - } else { - list->last->next = node; - list->last = node; - } -} - -static inline void wq_list_add_head(struct io_wq_work_node *node, - struct io_wq_work_list *list) -{ - node->next = list->first; - if (!node->next) - list->last = node; - WRITE_ONCE(list->first, node); -} - -static inline void wq_list_cut(struct io_wq_work_list *list, - struct io_wq_work_node *last, - struct io_wq_work_node *prev) -{ - /* first in the list, if prev==NULL */ - if (!prev) - WRITE_ONCE(list->first, last->next); - else - prev->next = last->next; - - if (last == list->last) - list->last = prev; - last->next = NULL; -} - -static inline void __wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - list->last->next = to->next; - to->next = list->first; - INIT_WQ_LIST(list); -} - -static inline bool wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - if (!wq_list_empty(list)) { - __wq_list_splice(list, to); - return true; - } - return false; -} - -static inline void wq_stack_add_head(struct io_wq_work_node *node, - struct io_wq_work_node *stack) -{ - node->next = stack->next; - stack->next = node; -} - -static inline void wq_list_del(struct io_wq_work_list *list, - struct io_wq_work_node *node, - struct io_wq_work_node *prev) -{ - wq_list_cut(list, node, prev); -} - -static inline -struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) -{ - struct io_wq_work_node *node = stack->next; - - stack->next = node->next; - return node; -} - -struct io_wq_work { - struct io_wq_work_node list; - unsigned flags; - int cancel_seq; -}; - -static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) -{ - if (!work->list.next) - return NULL; - - return container_of(work->list.next, struct io_wq_work, list); -} - -typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); -typedef void (io_wq_work_fn)(struct io_wq_work *); - -struct io_wq_hash { - refcount_t refs; - unsigned long map; - struct wait_queue_head wait; -}; - -static inline void io_wq_put_hash(struct io_wq_hash *hash) -{ - if (refcount_dec_and_test(&hash->refs)) - kfree(hash); -} - -struct io_wq_data { - struct io_wq_hash *hash; - struct task_struct *task; - io_wq_work_fn *do_work; - free_work_fn *free_work; -}; - -struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); -void io_wq_exit_start(struct io_wq *wq); -void io_wq_put_and_exit(struct io_wq *wq); - -void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); -void io_wq_hash_work(struct io_wq_work *work, void *val); - -int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); -int io_wq_max_workers(struct io_wq *wq, int *new_count); - -static inline bool io_wq_is_hashed(struct io_wq_work *work) -{ - return work->flags & IO_WQ_WORK_HASHED; -} - -typedef bool (work_cancel_fn)(struct io_wq_work *, void *); - -enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, - void *data, bool cancel_all); - -#if defined(CONFIG_IO_WQ) -extern void io_wq_worker_sleeping(struct task_struct *); -extern void io_wq_worker_running(struct task_struct *); -#else -static inline void io_wq_worker_sleeping(struct task_struct *tsk) -{ -} -static inline void io_wq_worker_running(struct task_struct *tsk) -{ -} -#endif - -static inline bool io_wq_current_is_worker(void) -{ - return in_task() && (current->flags & PF_IO_WORKER) && - current->worker_private; -} -#endif diff --git a/fs/io_uring.c b/fs/io_uring.c deleted file mode 100644 index 5ff2cdb425bc..000000000000 --- a/fs/io_uring.c +++ /dev/null @@ -1,13257 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Shared application/kernel submission and completion ring pairs, for - * supporting fast/efficient IO. - * - * A note on the read/write ordering memory barriers that are matched between - * the application and kernel side. - * - * After the application reads the CQ ring tail, it must use an - * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses - * before writing the tail (using smp_load_acquire to read the tail will - * do). It also needs a smp_mb() before updating CQ head (ordering the - * entry load(s) with the head store), pairing with an implicit barrier - * through a control-dependency in io_get_cqe (smp_store_release to - * store head will do). Failure to do so could lead to reading invalid - * CQ entries. - * - * Likewise, the application must use an appropriate smp_wmb() before - * writing the SQ tail (ordering SQ entry stores with the tail store), - * which pairs with smp_load_acquire in io_get_sqring (smp_store_release - * to store the tail will do). And it needs a barrier ordering the SQ - * head load before writing new SQ entries (smp_load_acquire to read - * head will do). - * - * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application - * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* - * updating the SQ tail; a full memory barrier smp_mb() is needed - * between. - * - * Also see the examples in the liburing library: - * - * git://git.kernel.dk/liburing - * - * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens - * from data shared between the kernel and application. This is done both - * for ordering purposes, but also to ensure that once a value is loaded from - * data that the application could potentially modify, it remains stable. - * - * Copyright (C) 2018-2019 Jens Axboe - * Copyright (c) 2018-2019 Christoph Hellwig - */ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/syscalls.h> -#include <linux/compat.h> -#include <net/compat.h> -#include <linux/refcount.h> -#include <linux/uio.h> -#include <linux/bits.h> - -#include <linux/sched/signal.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/fdtable.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/percpu.h> -#include <linux/slab.h> -#include <linux/blk-mq.h> -#include <linux/bvec.h> -#include <linux/net.h> -#include <net/sock.h> -#include <net/af_unix.h> -#include <net/scm.h> -#include <linux/anon_inodes.h> -#include <linux/sched/mm.h> -#include <linux/uaccess.h> -#include <linux/nospec.h> -#include <linux/sizes.h> -#include <linux/hugetlb.h> -#include <linux/highmem.h> -#include <linux/namei.h> -#include <linux/fsnotify.h> -#include <linux/fadvise.h> -#include <linux/eventpoll.h> -#include <linux/splice.h> -#include <linux/task_work.h> -#include <linux/pagemap.h> -#include <linux/io_uring.h> -#include <linux/audit.h> -#include <linux/security.h> -#include <linux/xattr.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/io_uring.h> - -#include <uapi/linux/io_uring.h> - -#include "internal.h" -#include "io-wq.h" - -#define IORING_MAX_ENTRIES 32768 -#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 - -/* only define max */ -#define IORING_MAX_FIXED_FILES (1U << 20) -#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ - IORING_REGISTER_LAST + IORING_OP_LAST) - -#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) -#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) -#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) - -#define IORING_MAX_REG_BUFFERS (1U << 14) - -#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC) - -#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ - IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) - -#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ - REQ_F_ASYNC_DATA) - -#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ - IO_REQ_CLEAN_FLAGS) - -#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) - -#define IO_TCTX_REFS_CACHE_NR (1U << 10) - -struct io_uring { - u32 head ____cacheline_aligned_in_smp; - u32 tail ____cacheline_aligned_in_smp; -}; - -/* - * This data is shared with the application through the mmap at offsets - * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. - * - * The offsets to the member fields are published through struct - * io_sqring_offsets when calling io_uring_setup. - */ -struct io_rings { - /* - * Head and tail offsets into the ring; the offsets need to be - * masked to get valid indices. - * - * The kernel controls head of the sq ring and the tail of the cq ring, - * and the application controls tail of the sq ring and the head of the - * cq ring. - */ - struct io_uring sq, cq; - /* - * Bitmasks to apply to head and tail offsets (constant, equals - * ring_entries - 1) - */ - u32 sq_ring_mask, cq_ring_mask; - /* Ring sizes (constant, power of 2) */ - u32 sq_ring_entries, cq_ring_entries; - /* - * Number of invalid entries dropped by the kernel due to - * invalid index stored in array - * - * Written by the kernel, shouldn't be modified by the - * application (i.e. get number of "new events" by comparing to - * cached value). - * - * After a new SQ head value was read by the application this - * counter includes all submissions that were dropped reaching - * the new SQ head (and possibly more). - */ - u32 sq_dropped; - /* - * Runtime SQ flags - * - * Written by the kernel, shouldn't be modified by the - * application. - * - * The application needs a full memory barrier before checking - * for IORING_SQ_NEED_WAKEUP after updating the sq tail. - */ - atomic_t sq_flags; - /* - * Runtime CQ flags - * - * Written by the application, shouldn't be modified by the - * kernel. - */ - u32 cq_flags; - /* - * Number of completion events lost because the queue was full; - * this should be avoided by the application by making sure - * there are not more requests pending than there is space in - * the completion queue. - * - * Written by the kernel, shouldn't be modified by the - * application (i.e. get number of "new events" by comparing to - * cached value). - * - * As completion events come in out of order this counter is not - * ordered with any other data. - */ - u32 cq_overflow; - /* - * Ring buffer of completion events. - * - * The kernel writes completion events fresh every time they are - * produced, so the application is allowed to modify pending - * entries. - */ - struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; -}; - -struct io_mapped_ubuf { - u64 ubuf; - u64 ubuf_end; - unsigned int nr_bvecs; - unsigned long acct_pages; - struct bio_vec bvec[]; -}; - -struct io_ring_ctx; - -struct io_overflow_cqe { - struct list_head list; - struct io_uring_cqe cqe; -}; - -/* - * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 - * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we - * can't safely always dereference the file when the task has exited and ring - * cleanup is done. If a file is tracked and part of SCM, then unix gc on - * process exit may reap it before __io_sqe_files_unregister() is run. - */ -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#if defined(CONFIG_64BIT) -#define FFS_SCM 0x4UL -#else -#define IO_URING_SCM_ALL -#define FFS_SCM 0x0UL -#endif -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) - -struct io_fixed_file { - /* file * with additional FFS_* flags */ - unsigned long file_ptr; -}; - -struct io_rsrc_put { - struct list_head list; - u64 tag; - union { - void *rsrc; - struct file *file; - struct io_mapped_ubuf *buf; - }; -}; - -struct io_file_table { - struct io_fixed_file *files; - unsigned long *bitmap; - unsigned int alloc_hint; -}; - -struct io_rsrc_node { - struct percpu_ref refs; - struct list_head node; - struct list_head rsrc_list; - struct io_rsrc_data *rsrc_data; - struct llist_node llist; - bool done; -}; - -typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); - -struct io_rsrc_data { - struct io_ring_ctx *ctx; - - u64 **tags; - unsigned int nr; - rsrc_put_fn *do_put; - atomic_t refs; - struct completion done; - bool quiesce; -}; - -#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) -struct io_buffer_list { - /* - * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, - * then these are classic provided buffers and ->buf_list is used. - */ - union { - struct list_head buf_list; - struct { - struct page **buf_pages; - struct io_uring_buf_ring *buf_ring; - }; - }; - __u16 bgid; - - /* below is for ring provided buffers */ - __u16 buf_nr_pages; - __u16 nr_entries; - __u16 head; - __u16 mask; -}; - -struct io_buffer { - struct list_head list; - __u64 addr; - __u32 len; - __u16 bid; - __u16 bgid; -}; - -struct io_restriction { - DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); - DECLARE_BITMAP(sqe_op, IORING_OP_LAST); - u8 sqe_flags_allowed; - u8 sqe_flags_required; - bool registered; -}; - -enum { - IO_SQ_THREAD_SHOULD_STOP = 0, - IO_SQ_THREAD_SHOULD_PARK, -}; - -struct io_sq_data { - refcount_t refs; - atomic_t park_pending; - struct mutex lock; - - /* ctx's that are using this sqd */ - struct list_head ctx_list; - - struct task_struct *thread; - struct wait_queue_head wait; - - unsigned sq_thread_idle; - int sq_cpu; - pid_t task_pid; - pid_t task_tgid; - - unsigned long state; - struct completion exited; -}; - -#define IO_COMPL_BATCH 32 -#define IO_REQ_CACHE_SIZE 32 -#define IO_REQ_ALLOC_BATCH 8 - -struct io_submit_link { - struct io_kiocb *head; - struct io_kiocb *last; -}; - -struct io_submit_state { - /* inline/task_work completion list, under ->uring_lock */ - struct io_wq_work_node free_list; - /* batch completion logic */ - struct io_wq_work_list compl_reqs; - struct io_submit_link link; - - bool plug_started; - bool need_plug; - bool flush_cqes; - unsigned short submit_nr; - struct blk_plug plug; -}; - -struct io_ev_fd { - struct eventfd_ctx *cq_ev_fd; - unsigned int eventfd_async: 1; - struct rcu_head rcu; -}; - -#define BGID_ARRAY 64 - -struct io_ring_ctx { - /* const or read-mostly hot data */ - struct { - struct percpu_ref refs; - - struct io_rings *rings; - unsigned int flags; - enum task_work_notify_mode notify_method; - unsigned int compat: 1; - unsigned int drain_next: 1; - unsigned int restricted: 1; - unsigned int off_timeout_used: 1; - unsigned int drain_active: 1; - unsigned int drain_disabled: 1; - unsigned int has_evfd: 1; - unsigned int syscall_iopoll: 1; - } ____cacheline_aligned_in_smp; - - /* submission data */ - struct { - struct mutex uring_lock; - - /* - * Ring buffer of indices into array of io_uring_sqe, which is - * mmapped by the application using the IORING_OFF_SQES offset. - * - * This indirection could e.g. be used to assign fixed - * io_uring_sqe entries to operations and only submit them to - * the queue when needed. - * - * The kernel modifies neither the indices array nor the entries - * array. - */ - u32 *sq_array; - struct io_uring_sqe *sq_sqes; - unsigned cached_sq_head; - unsigned sq_entries; - struct list_head defer_list; - - /* - * Fixed resources fast path, should be accessed only under - * uring_lock, and updated through io_uring_register(2) - */ - struct io_rsrc_node *rsrc_node; - int rsrc_cached_refs; - atomic_t cancel_seq; - struct io_file_table file_table; - unsigned nr_user_files; - unsigned nr_user_bufs; - struct io_mapped_ubuf **user_bufs; - - struct io_submit_state submit_state; - - struct io_buffer_list *io_bl; - struct xarray io_bl_xa; - struct list_head io_buffers_cache; - - struct list_head timeout_list; - struct list_head ltimeout_list; - struct list_head cq_overflow_list; - struct list_head apoll_cache; - struct xarray personalities; - u32 pers_next; - unsigned sq_thread_idle; - } ____cacheline_aligned_in_smp; - - /* IRQ completion list, under ->completion_lock */ - struct io_wq_work_list locked_free_list; - unsigned int locked_free_nr; - - const struct cred *sq_creds; /* cred used for __io_sq_thread() */ - struct io_sq_data *sq_data; /* if using sq thread polling */ - - struct wait_queue_head sqo_sq_wait; - struct list_head sqd_list; - - unsigned long check_cq; - - struct { - /* - * We cache a range of free CQEs we can use, once exhausted it - * should go through a slower range setup, see __io_get_cqe() - */ - struct io_uring_cqe *cqe_cached; - struct io_uring_cqe *cqe_sentinel; - - unsigned cached_cq_tail; - unsigned cq_entries; - struct io_ev_fd __rcu *io_ev_fd; - struct wait_queue_head cq_wait; - unsigned cq_extra; - atomic_t cq_timeouts; - unsigned cq_last_tm_flush; - } ____cacheline_aligned_in_smp; - - struct { - spinlock_t completion_lock; - - spinlock_t timeout_lock; - - /* - * ->iopoll_list is protected by the ctx->uring_lock for - * io_uring instances that don't use IORING_SETUP_SQPOLL. - * For SQPOLL, only the single threaded io_sq_thread() will - * manipulate the list, hence no extra locking is needed there. - */ - struct io_wq_work_list iopoll_list; - struct hlist_head *cancel_hash; - unsigned cancel_hash_bits; - bool poll_multi_queue; - - struct list_head io_buffers_comp; - } ____cacheline_aligned_in_smp; - - struct io_restriction restrictions; - - /* slow path rsrc auxilary data, used by update/register */ - struct { - struct io_rsrc_node *rsrc_backup_node; - struct io_mapped_ubuf *dummy_ubuf; - struct io_rsrc_data *file_data; - struct io_rsrc_data *buf_data; - - struct delayed_work rsrc_put_work; - struct llist_head rsrc_put_llist; - struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; - - struct list_head io_buffers_pages; - }; - - /* Keep this last, we don't need it for the fast path */ - struct { - #if defined(CONFIG_UNIX) - struct socket *ring_sock; - #endif - /* hashed buffered write serialization */ - struct io_wq_hash *hash_map; - - /* Only used for accounting purposes */ - struct user_struct *user; - struct mm_struct *mm_account; - - /* ctx exit and cancelation */ - struct llist_head fallback_llist; - struct delayed_work fallback_work; - struct work_struct exit_work; - struct list_head tctx_list; - struct completion ref_comp; - u32 iowq_limits[2]; - bool iowq_limits_set; - }; -}; - -/* - * Arbitrary limit, can be raised if need be - */ -#define IO_RINGFD_REG_MAX 16 - -struct io_uring_task { - /* submission side */ - int cached_refs; - struct xarray xa; - struct wait_queue_head wait; - const struct io_ring_ctx *last; - struct io_wq *io_wq; - struct percpu_counter inflight; - atomic_t inflight_tracked; - atomic_t in_idle; - - spinlock_t task_lock; - struct io_wq_work_list task_list; - struct io_wq_work_list prio_task_list; - struct callback_head task_work; - struct file **registered_rings; - bool task_running; -}; - -/* - * First field must be the file pointer in all the - * iocb unions! See also 'struct kiocb' in <linux/fs.h> - */ -struct io_poll_iocb { - struct file *file; - struct wait_queue_head *head; - __poll_t events; - struct wait_queue_entry wait; -}; - -struct io_poll_update { - struct file *file; - u64 old_user_data; - u64 new_user_data; - __poll_t events; - bool update_events; - bool update_user_data; -}; - -struct io_close { - struct file *file; - int fd; - u32 file_slot; -}; - -struct io_timeout_data { - struct io_kiocb *req; - struct hrtimer timer; - struct timespec64 ts; - enum hrtimer_mode mode; - u32 flags; -}; - -struct io_accept { - struct file *file; - struct sockaddr __user *addr; - int __user *addr_len; - int flags; - u32 file_slot; - unsigned long nofile; -}; - -struct io_socket { - struct file *file; - int domain; - int type; - int protocol; - int flags; - u32 file_slot; - unsigned long nofile; -}; - -struct io_sync { - struct file *file; - loff_t len; - loff_t off; - int flags; - int mode; -}; - -struct io_cancel { - struct file *file; - u64 addr; - u32 flags; - s32 fd; -}; - -struct io_timeout { - struct file *file; - u32 off; - u32 target_seq; - struct list_head list; - /* head of the link, used by linked timeouts only */ - struct io_kiocb *head; - /* for linked completions */ - struct io_kiocb *prev; -}; - -struct io_timeout_rem { - struct file *file; - u64 addr; - - /* timeout update */ - struct timespec64 ts; - u32 flags; - bool ltimeout; -}; - -struct io_rw { - /* NOTE: kiocb has the file as the first member, so don't do it here */ - struct kiocb kiocb; - u64 addr; - u32 len; - rwf_t flags; -}; - -struct io_connect { - struct file *file; - struct sockaddr __user *addr; - int addr_len; -}; - -struct io_sr_msg { - struct file *file; - union { - struct compat_msghdr __user *umsg_compat; - struct user_msghdr __user *umsg; - void __user *buf; - }; - int msg_flags; - size_t len; - size_t done_io; - unsigned int flags; -}; - -struct io_open { - struct file *file; - int dfd; - u32 file_slot; - struct filename *filename; - struct open_how how; - unsigned long nofile; -}; - -struct io_rsrc_update { - struct file *file; - u64 arg; - u32 nr_args; - u32 offset; -}; - -struct io_fadvise { - struct file *file; - u64 offset; - u32 len; - u32 advice; -}; - -struct io_madvise { - struct file *file; - u64 addr; - u32 len; - u32 advice; -}; - -struct io_epoll { - struct file *file; - int epfd; - int op; - int fd; - struct epoll_event event; -}; - -struct io_splice { - struct file *file_out; - loff_t off_out; - loff_t off_in; - u64 len; - int splice_fd_in; - unsigned int flags; -}; - -struct io_provide_buf { - struct file *file; - __u64 addr; - __u32 len; - __u32 bgid; - __u16 nbufs; - __u16 bid; -}; - -struct io_statx { - struct file *file; - int dfd; - unsigned int mask; - unsigned int flags; - struct filename *filename; - struct statx __user *buffer; -}; - -struct io_shutdown { - struct file *file; - int how; -}; - -struct io_rename { - struct file *file; - int old_dfd; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; - int flags; -}; - -struct io_unlink { - struct file *file; - int dfd; - int flags; - struct filename *filename; -}; - -struct io_mkdir { - struct file *file; - int dfd; - umode_t mode; - struct filename *filename; -}; - -struct io_symlink { - struct file *file; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; -}; - -struct io_hardlink { - struct file *file; - int old_dfd; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; - int flags; -}; - -struct io_msg { - struct file *file; - u64 user_data; - u32 len; -}; - -struct io_async_connect { - struct sockaddr_storage address; -}; - -struct io_async_msghdr { - struct iovec fast_iov[UIO_FASTIOV]; - /* points to an allocated iov, if NULL we use fast_iov instead */ - struct iovec *free_iov; - struct sockaddr __user *uaddr; - struct msghdr msg; - struct sockaddr_storage addr; -}; - -struct io_rw_state { - struct iov_iter iter; - struct iov_iter_state iter_state; - struct iovec fast_iov[UIO_FASTIOV]; -}; - -struct io_async_rw { - struct io_rw_state s; - const struct iovec *free_iovec; - size_t bytes_done; - struct wait_page_queue wpq; -}; - -struct io_xattr { - struct file *file; - struct xattr_ctx ctx; - struct filename *filename; -}; - -enum { - REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, - REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, - REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, - REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, - REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, - REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, - REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, - - /* first byte is taken by user flags, shift it to not overlap */ - REQ_F_FAIL_BIT = 8, - REQ_F_INFLIGHT_BIT, - REQ_F_CUR_POS_BIT, - REQ_F_NOWAIT_BIT, - REQ_F_LINK_TIMEOUT_BIT, - REQ_F_NEED_CLEANUP_BIT, - REQ_F_POLLED_BIT, - REQ_F_BUFFER_SELECTED_BIT, - REQ_F_BUFFER_RING_BIT, - REQ_F_COMPLETE_INLINE_BIT, - REQ_F_REISSUE_BIT, - REQ_F_CREDS_BIT, - REQ_F_REFCOUNT_BIT, - REQ_F_ARM_LTIMEOUT_BIT, - REQ_F_ASYNC_DATA_BIT, - REQ_F_SKIP_LINK_CQES_BIT, - REQ_F_SINGLE_POLL_BIT, - REQ_F_DOUBLE_POLL_BIT, - REQ_F_PARTIAL_IO_BIT, - REQ_F_CQE32_INIT_BIT, - REQ_F_APOLL_MULTISHOT_BIT, - /* keep async read/write and isreg together and in order */ - REQ_F_SUPPORT_NOWAIT_BIT, - REQ_F_ISREG_BIT, - - /* not a real bit, just to check we're not overflowing the space */ - __REQ_F_LAST_BIT, -}; - -enum { - /* ctx owns file */ - REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), - /* drain existing IO first */ - REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), - /* linked sqes */ - REQ_F_LINK = BIT(REQ_F_LINK_BIT), - /* doesn't sever on completion < 0 */ - REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), - /* IOSQE_ASYNC */ - REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), - /* IOSQE_BUFFER_SELECT */ - REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), - /* IOSQE_CQE_SKIP_SUCCESS */ - REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), - - /* fail rest of links */ - REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), - /* on inflight list, should be cancelled and waited on exit reliably */ - REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), - /* read/write uses file position */ - REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), - /* must not punt to workers */ - REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), - /* has or had linked timeout */ - REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), - /* needs cleanup */ - REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), - /* already went through poll handler */ - REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), - /* buffer already selected */ - REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), - /* buffer selected from ring, needs commit */ - REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), - /* completion is deferred through io_comp_state */ - REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), - /* caller should reissue async */ - REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), - /* supports async reads/writes */ - REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), - /* regular file */ - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* has creds assigned */ - REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), - /* skip refcounting if not set */ - REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), - /* there is a linked timeout that has to be armed */ - REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), - /* ->async_data allocated */ - REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), - /* don't post CQEs while failing linked requests */ - REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), - /* single poll may be active */ - REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), - /* double poll may active */ - REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), - /* request has already done partial IO */ - REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), - /* fast poll multishot mode */ - REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), - /* ->extra1 and ->extra2 are initialised */ - REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), -}; - -struct async_poll { - struct io_poll_iocb poll; - struct io_poll_iocb *double_poll; -}; - -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); - -struct io_task_work { - union { - struct io_wq_work_node node; - struct llist_node fallback_node; - }; - io_req_tw_func_t func; -}; - -enum { - IORING_RSRC_FILE = 0, - IORING_RSRC_BUFFER = 1, -}; - -struct io_cqe { - __u64 user_data; - __s32 res; - /* fd initially, then cflags for completion */ - union { - __u32 flags; - int fd; - }; -}; - -enum { - IO_CHECK_CQ_OVERFLOW_BIT, - IO_CHECK_CQ_DROPPED_BIT, -}; - -/* - * NOTE! Each of the iocb union members has the file pointer - * as the first entry in their struct definition. So you can - * access the file pointer through any of the sub-structs, - * or directly as just 'file' in this struct. - */ -struct io_kiocb { - union { - struct file *file; - struct io_rw rw; - struct io_poll_iocb poll; - struct io_poll_update poll_update; - struct io_accept accept; - struct io_sync sync; - struct io_cancel cancel; - struct io_timeout timeout; - struct io_timeout_rem timeout_rem; - struct io_connect connect; - struct io_sr_msg sr_msg; - struct io_open open; - struct io_close close; - struct io_rsrc_update rsrc_update; - struct io_fadvise fadvise; - struct io_madvise madvise; - struct io_epoll epoll; - struct io_splice splice; - struct io_provide_buf pbuf; - struct io_statx statx; - struct io_shutdown shutdown; - struct io_rename rename; - struct io_unlink unlink; - struct io_mkdir mkdir; - struct io_symlink symlink; - struct io_hardlink hardlink; - struct io_msg msg; - struct io_xattr xattr; - struct io_socket sock; - struct io_uring_cmd uring_cmd; - }; - - u8 opcode; - /* polled IO has completed */ - u8 iopoll_completed; - /* - * Can be either a fixed buffer index, or used with provided buffers. - * For the latter, before issue it points to the buffer group ID, - * and after selection it points to the buffer ID itself. - */ - u16 buf_index; - unsigned int flags; - - struct io_cqe cqe; - - struct io_ring_ctx *ctx; - struct task_struct *task; - - struct io_rsrc_node *rsrc_node; - - union { - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; - - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ - struct io_buffer *kbuf; - - /* - * stores buffer ID for ring provided buffers, valid IFF - * REQ_F_BUFFER_RING is set. - */ - struct io_buffer_list *buf_list; - }; - - union { - /* used by request caches, completion batching and iopoll */ - struct io_wq_work_node comp_list; - /* cache ->apoll->events */ - __poll_t apoll_events; - }; - atomic_t refs; - atomic_t poll_refs; - struct io_task_work io_task_work; - /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ - union { - struct hlist_node hash_node; - struct { - u64 extra1; - u64 extra2; - }; - }; - /* internal polling, see IORING_FEAT_FAST_POLL */ - struct async_poll *apoll; - /* opcode allocated if it needs to store data for async defer */ - void *async_data; - /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ - struct io_kiocb *link; - /* custom credentials, valid IFF REQ_F_CREDS is set */ - const struct cred *creds; - struct io_wq_work work; -}; - -struct io_tctx_node { - struct list_head ctx_node; - struct task_struct *task; - struct io_ring_ctx *ctx; -}; - -struct io_defer_entry { - struct list_head list; - struct io_kiocb *req; - u32 seq; -}; - -struct io_cancel_data { - struct io_ring_ctx *ctx; - union { - u64 data; - struct file *file; - }; - u32 flags; - int seq; -}; - -/* - * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into - * the following sqe if SQE128 is used. - */ -#define uring_cmd_pdu_size(is_sqe128) \ - ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ - offsetof(struct io_uring_sqe, cmd)) - -struct io_op_def { - /* needs req->file assigned */ - unsigned needs_file : 1; - /* should block plug */ - unsigned plug : 1; - /* hash wq insertion if file is a regular file */ - unsigned hash_reg_file : 1; - /* unbound wq insertion if file is a non-regular file */ - unsigned unbound_nonreg_file : 1; - /* set if opcode supports polled "wait" */ - unsigned pollin : 1; - unsigned pollout : 1; - unsigned poll_exclusive : 1; - /* op supports buffer selection */ - unsigned buffer_select : 1; - /* do prep async if is going to be punted */ - unsigned needs_async_setup : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; - /* skip auditing */ - unsigned audit_skip : 1; - /* supports ioprio */ - unsigned ioprio : 1; - /* supports iopoll */ - unsigned iopoll : 1; - /* size of async data needed, if any */ - unsigned short async_size; -}; - -static const struct io_op_def io_op_defs[] = { - [IORING_OP_NOP] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_READV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .needs_async_setup = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITEV] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_FSYNC] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_READ_FIXED] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITE_FIXED] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_POLL_ADD] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_POLL_REMOVE] = { - .audit_skip = 1, - }, - [IORING_OP_SYNC_FILE_RANGE] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SENDMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .async_size = sizeof(struct io_async_msghdr), - }, - [IORING_OP_RECVMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .needs_async_setup = 1, - .async_size = sizeof(struct io_async_msghdr), - }, - [IORING_OP_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - }, - [IORING_OP_TIMEOUT_REMOVE] = { - /* used by timeout updates' prep() */ - .audit_skip = 1, - }, - [IORING_OP_ACCEPT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .poll_exclusive = 1, - .ioprio = 1, /* used for flags */ - }, - [IORING_OP_ASYNC_CANCEL] = { - .audit_skip = 1, - }, - [IORING_OP_LINK_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - }, - [IORING_OP_CONNECT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .async_size = sizeof(struct io_async_connect), - }, - [IORING_OP_FALLOCATE] = { - .needs_file = 1, - }, - [IORING_OP_OPENAT] = {}, - [IORING_OP_CLOSE] = {}, - [IORING_OP_FILES_UPDATE] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_STATX] = { - .audit_skip = 1, - }, - [IORING_OP_READ] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_FADVISE] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_MADVISE] = {}, - [IORING_OP_SEND] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .audit_skip = 1, - }, - [IORING_OP_RECV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .audit_skip = 1, - }, - [IORING_OP_OPENAT2] = { - }, - [IORING_OP_EPOLL_CTL] = { - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SPLICE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_PROVIDE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_REMOVE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_TEE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SHUTDOWN] = { - .needs_file = 1, - }, - [IORING_OP_RENAMEAT] = {}, - [IORING_OP_UNLINKAT] = {}, - [IORING_OP_MKDIRAT] = {}, - [IORING_OP_SYMLINKAT] = {}, - [IORING_OP_LINKAT] = {}, - [IORING_OP_MSG_RING] = { - .needs_file = 1, - .iopoll = 1, - }, - [IORING_OP_FSETXATTR] = { - .needs_file = 1 - }, - [IORING_OP_SETXATTR] = {}, - [IORING_OP_FGETXATTR] = { - .needs_file = 1 - }, - [IORING_OP_GETXATTR] = {}, - [IORING_OP_SOCKET] = { - .audit_skip = 1, - }, - [IORING_OP_URING_CMD] = { - .needs_file = 1, - .plug = 1, - .needs_async_setup = 1, - .async_size = uring_cmd_pdu_size(1), - }, -}; - -/* requests with any of those set should undergo io_disarm_next() */ -#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) -#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) - -static bool io_disarm_next(struct io_kiocb *req); -static void io_uring_del_tctx_node(unsigned long index); -static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all); -static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); - -static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags); -static void io_dismantle_req(struct io_kiocb *req); -static void io_queue_linked_timeout(struct io_kiocb *req); -static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, - struct io_uring_rsrc_update2 *up, - unsigned nr_args); -static void io_clean_op(struct io_kiocb *req); -static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, - unsigned issue_flags); -static struct file *io_file_get_normal(struct io_kiocb *req, int fd); -static void io_queue_sqe(struct io_kiocb *req); -static void io_rsrc_put_work(struct work_struct *work); - -static void io_req_task_queue(struct io_kiocb *req); -static void __io_submit_flush_completions(struct io_ring_ctx *ctx); -static int io_req_prep_async(struct io_kiocb *req); - -static int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index); -static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, - unsigned int offset); -static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); - -static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); -static void io_eventfd_signal(struct io_ring_ctx *ctx); -static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); - -static struct kmem_cache *req_cachep; - -static const struct file_operations io_uring_fops; - -const char *io_uring_get_opcode(u8 opcode) -{ - switch ((enum io_uring_op)opcode) { - case IORING_OP_NOP: - return "NOP"; - case IORING_OP_READV: - return "READV"; - case IORING_OP_WRITEV: - return "WRITEV"; - case IORING_OP_FSYNC: - return "FSYNC"; - case IORING_OP_READ_FIXED: - return "READ_FIXED"; - case IORING_OP_WRITE_FIXED: - return "WRITE_FIXED"; - case IORING_OP_POLL_ADD: - return "POLL_ADD"; - case IORING_OP_POLL_REMOVE: - return "POLL_REMOVE"; - case IORING_OP_SYNC_FILE_RANGE: - return "SYNC_FILE_RANGE"; - case IORING_OP_SENDMSG: - return "SENDMSG"; - case IORING_OP_RECVMSG: - return "RECVMSG"; - case IORING_OP_TIMEOUT: - return "TIMEOUT"; - case IORING_OP_TIMEOUT_REMOVE: - return "TIMEOUT_REMOVE"; - case IORING_OP_ACCEPT: - return "ACCEPT"; - case IORING_OP_ASYNC_CANCEL: - return "ASYNC_CANCEL"; - case IORING_OP_LINK_TIMEOUT: - return "LINK_TIMEOUT"; - case IORING_OP_CONNECT: - return "CONNECT"; - case IORING_OP_FALLOCATE: - return "FALLOCATE"; - case IORING_OP_OPENAT: - return "OPENAT"; - case IORING_OP_CLOSE: - return "CLOSE"; - case IORING_OP_FILES_UPDATE: - return "FILES_UPDATE"; - case IORING_OP_STATX: - return "STATX"; - case IORING_OP_READ: - return "READ"; - case IORING_OP_WRITE: - return "WRITE"; - case IORING_OP_FADVISE: - return "FADVISE"; - case IORING_OP_MADVISE: - return "MADVISE"; - case IORING_OP_SEND: - return "SEND"; - case IORING_OP_RECV: - return "RECV"; - case IORING_OP_OPENAT2: - return "OPENAT2"; - case IORING_OP_EPOLL_CTL: - return "EPOLL_CTL"; - case IORING_OP_SPLICE: - return "SPLICE"; - case IORING_OP_PROVIDE_BUFFERS: - return "PROVIDE_BUFFERS"; - case IORING_OP_REMOVE_BUFFERS: - return "REMOVE_BUFFERS"; - case IORING_OP_TEE: - return "TEE"; - case IORING_OP_SHUTDOWN: - return "SHUTDOWN"; - case IORING_OP_RENAMEAT: - return "RENAMEAT"; - case IORING_OP_UNLINKAT: - return "UNLINKAT"; - case IORING_OP_MKDIRAT: - return "MKDIRAT"; - case IORING_OP_SYMLINKAT: - return "SYMLINKAT"; - case IORING_OP_LINKAT: - return "LINKAT"; - case IORING_OP_MSG_RING: - return "MSG_RING"; - case IORING_OP_FSETXATTR: - return "FSETXATTR"; - case IORING_OP_SETXATTR: - return "SETXATTR"; - case IORING_OP_FGETXATTR: - return "FGETXATTR"; - case IORING_OP_GETXATTR: - return "GETXATTR"; - case IORING_OP_SOCKET: - return "SOCKET"; - case IORING_OP_URING_CMD: - return "URING_CMD"; - case IORING_OP_LAST: - return "INVALID"; - } - return "INVALID"; -} - -struct sock *io_uring_get_socket(struct file *file) -{ -#if defined(CONFIG_UNIX) - if (file->f_op == &io_uring_fops) { - struct io_ring_ctx *ctx = file->private_data; - - return ctx->ring_sock->sk; - } -#endif - return NULL; -} -EXPORT_SYMBOL(io_uring_get_socket); - -#if defined(CONFIG_UNIX) -static inline bool io_file_need_scm(struct file *filp) -{ -#if defined(IO_URING_SCM_ALL) - return true; -#else - return !!unix_get_socket(filp); -#endif -} -#else -static inline bool io_file_need_scm(struct file *filp) -{ - return false; -} -#endif - -static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) -{ - lockdep_assert_held(&ctx->uring_lock); - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); -} - -static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) -{ - /* - * "Normal" inline submissions always hold the uring_lock, since we - * grab it from the system call. Same is true for the SQPOLL offload. - * The only exception is when we've detached the request and issue it - * from an async worker thread, grab the lock for that case. - */ - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_lock(&ctx->uring_lock); - lockdep_assert_held(&ctx->uring_lock); -} - -static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) -{ - if (!*locked) { - mutex_lock(&ctx->uring_lock); - *locked = true; - } -} - -#define io_for_each_link(pos, head) \ - for (pos = (head); pos; pos = pos->link) - -/* - * Shamelessly stolen from the mm implementation of page reference checking, - * see commit f958d7b528b1 for details. - */ -#define req_ref_zero_or_close_to_overflow(req) \ - ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) - -static inline bool req_ref_inc_not_zero(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - return atomic_inc_not_zero(&req->refs); -} - -static inline bool req_ref_put_and_test(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_REFCOUNT))) - return true; - - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_dec_and_test(&req->refs); -} - -static inline void req_ref_get(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - atomic_inc(&req->refs); -} - -static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) -{ - if (!wq_list_empty(&ctx->submit_state.compl_reqs)) - __io_submit_flush_completions(ctx); -} - -static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) -{ - if (!(req->flags & REQ_F_REFCOUNT)) { - req->flags |= REQ_F_REFCOUNT; - atomic_set(&req->refs, nr); - } -} - -static inline void io_req_set_refcount(struct io_kiocb *req) -{ - __io_req_set_refcount(req, 1); -} - -#define IO_RSRC_REF_BATCH 100 - -static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) -{ - percpu_ref_put_many(&node->refs, nr); -} - -static inline void io_req_put_rsrc_locked(struct io_kiocb *req, - struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - struct io_rsrc_node *node = req->rsrc_node; - - if (node) { - if (node == ctx->rsrc_node) - ctx->rsrc_cached_refs++; - else - io_rsrc_put_node(node, 1); - } -} - -static inline void io_req_put_rsrc(struct io_kiocb *req) -{ - if (req->rsrc_node) - io_rsrc_put_node(req->rsrc_node, 1); -} - -static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - if (ctx->rsrc_cached_refs) { - io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); - ctx->rsrc_cached_refs = 0; - } -} - -static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; - percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); -} - -static inline void io_req_set_rsrc_node(struct io_kiocb *req, - struct io_ring_ctx *ctx, - unsigned int issue_flags) -{ - if (!req->rsrc_node) { - req->rsrc_node = ctx->rsrc_node; - - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - lockdep_assert_held(&ctx->uring_lock); - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); - } else { - percpu_ref_get(&req->rsrc_node->refs); - } - } -} - -static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) -{ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) - req->buf_list->head++; - req->flags &= ~REQ_F_BUFFER_RING; - } else { - list_add(&req->kbuf->list, list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - } - - return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); -} - -static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) -{ - lockdep_assert_held(&req->ctx->completion_lock); - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; - return __io_put_kbuf(req, &req->ctx->io_buffers_comp); -} - -static inline unsigned int io_put_kbuf(struct io_kiocb *req, - unsigned issue_flags) -{ - unsigned int cflags; - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; - - /* - * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. If we already hold this lock, add back to this - * list as we can grab it from issue as well. - * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. - */ - if (req->flags & REQ_F_BUFFER_RING) { - /* no buffers to recycle for this case */ - cflags = __io_put_kbuf(req, NULL); - } else if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); - } else { - lockdep_assert_held(&req->ctx->uring_lock); - - cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); - } - - return cflags; -} - -static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, - unsigned int bgid) -{ - if (ctx->io_bl && bgid < BGID_ARRAY) - return &ctx->io_bl[bgid]; - - return xa_load(&ctx->io_bl_xa, bgid); -} - -static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - struct io_buffer *buf; - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - /* - * For legacy provided buffer mode, don't recycle if we already did - * IO to this buffer. For ring-mapped provided buffer mode, we should - * increment ring->head to explicitly monopolize the buffer to avoid - * multiple use. - */ - if ((req->flags & REQ_F_BUFFER_SELECTED) && - (req->flags & REQ_F_PARTIAL_IO)) - return; - - /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. - * If the tail has already been incremented, hang on to it. - */ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) { - if (req->flags & REQ_F_PARTIAL_IO) { - req->buf_list->head++; - req->buf_list = NULL; - } else { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - } - } - return; - } - - io_ring_submit_lock(ctx, issue_flags); - - buf = req->kbuf; - bl = io_buffer_get_list(ctx, buf->bgid); - list_add(&buf->list, &bl->buf_list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - req->buf_index = buf->bgid; - - io_ring_submit_unlock(ctx, issue_flags); -} - -static bool io_match_task(struct io_kiocb *head, struct task_struct *task, - bool cancel_all) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_kiocb *req; - - if (task && head->task != task) - return false; - if (cancel_all) - return true; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; -} - -static bool io_match_linked(struct io_kiocb *head) -{ - struct io_kiocb *req; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; -} - -/* - * As io_match_task() but protected against racing with linked timeouts. - * User must not hold timeout_lock. - */ -static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, - bool cancel_all) -{ - bool matched; - - if (task && head->task != task) - return false; - if (cancel_all) - return true; - - if (head->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = head->ctx; - - /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); - matched = io_match_linked(head); - spin_unlock_irq(&ctx->timeout_lock); - } else { - matched = io_match_linked(head); - } - return matched; -} - -static inline bool req_has_async_data(struct io_kiocb *req) -{ - return req->flags & REQ_F_ASYNC_DATA; -} - -static inline void req_set_fail(struct io_kiocb *req) -{ - req->flags |= REQ_F_FAIL; - if (req->flags & REQ_F_CQE_SKIP) { - req->flags &= ~REQ_F_CQE_SKIP; - req->flags |= REQ_F_SKIP_LINK_CQES; - } -} - -static inline void req_fail_link_node(struct io_kiocb *req, int res) -{ - req_set_fail(req); - req->cqe.res = res; -} - -static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); -} - -static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) -{ - struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); - - complete(&ctx->ref_comp); -} - -static inline bool io_is_timeout_noseq(struct io_kiocb *req) -{ - return !req->timeout.off; -} - -static __cold void io_fallback_req_func(struct work_struct *work) -{ - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, - fallback_work.work); - struct llist_node *node = llist_del_all(&ctx->fallback_llist); - struct io_kiocb *req, *tmp; - bool locked = false; - - percpu_ref_get(&ctx->refs); - llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) - req->io_task_work.func(req, &locked); - - if (locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - } - percpu_ref_put(&ctx->refs); -} - -static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) -{ - struct io_ring_ctx *ctx; - int hash_bits; - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return NULL; - - xa_init(&ctx->io_bl_xa); - - /* - * Use 5 bits less than the max cq entries, that should give us around - * 32 entries per hash list if totally full and uniformly spread. - */ - hash_bits = ilog2(p->cq_entries); - hash_bits -= 5; - if (hash_bits <= 0) - hash_bits = 1; - ctx->cancel_hash_bits = hash_bits; - ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), - GFP_KERNEL); - if (!ctx->cancel_hash) - goto err; - __hash_init(ctx->cancel_hash, 1U << hash_bits); - - ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); - if (!ctx->dummy_ubuf) - goto err; - /* set invalid range, so io_import_fixed() fails meeting it */ - ctx->dummy_ubuf->ubuf = -1UL; - - if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) - goto err; - - ctx->flags = p->flags; - init_waitqueue_head(&ctx->sqo_sq_wait); - INIT_LIST_HEAD(&ctx->sqd_list); - INIT_LIST_HEAD(&ctx->cq_overflow_list); - INIT_LIST_HEAD(&ctx->io_buffers_cache); - INIT_LIST_HEAD(&ctx->apoll_cache); - init_completion(&ctx->ref_comp); - xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); - mutex_init(&ctx->uring_lock); - init_waitqueue_head(&ctx->cq_wait); - spin_lock_init(&ctx->completion_lock); - spin_lock_init(&ctx->timeout_lock); - INIT_WQ_LIST(&ctx->iopoll_list); - INIT_LIST_HEAD(&ctx->io_buffers_pages); - INIT_LIST_HEAD(&ctx->io_buffers_comp); - INIT_LIST_HEAD(&ctx->defer_list); - INIT_LIST_HEAD(&ctx->timeout_list); - INIT_LIST_HEAD(&ctx->ltimeout_list); - spin_lock_init(&ctx->rsrc_ref_lock); - INIT_LIST_HEAD(&ctx->rsrc_ref_list); - INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); - init_llist_head(&ctx->rsrc_put_llist); - INIT_LIST_HEAD(&ctx->tctx_list); - ctx->submit_state.free_list.next = NULL; - INIT_WQ_LIST(&ctx->locked_free_list); - INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); - INIT_WQ_LIST(&ctx->submit_state.compl_reqs); - return ctx; -err: - kfree(ctx->dummy_ubuf); - kfree(ctx->cancel_hash); - kfree(ctx->io_bl); - xa_destroy(&ctx->io_bl_xa); - kfree(ctx); - return NULL; -} - -static void io_account_cq_overflow(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; -} - -static bool req_need_defer(struct io_kiocb *req, u32 seq) -{ - if (unlikely(req->flags & REQ_F_IO_DRAIN)) { - struct io_ring_ctx *ctx = req->ctx; - - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; - } - - return false; -} - -static inline bool io_req_ffs_set(struct io_kiocb *req) -{ - return req->flags & REQ_F_FIXED_FILE; -} - -static inline void io_req_track_inflight(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_INFLIGHT)) { - req->flags |= REQ_F_INFLIGHT; - atomic_inc(&req->task->io_uring->inflight_tracked); - } -} - -static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) -{ - if (WARN_ON_ONCE(!req->link)) - return NULL; - - req->flags &= ~REQ_F_ARM_LTIMEOUT; - req->flags |= REQ_F_LINK_TIMEOUT; - - /* linked timeouts should have two refs once prep'ed */ - io_req_set_refcount(req); - __io_req_set_refcount(req->link, 2); - return req->link; -} - -static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) - return NULL; - return __io_prep_linked_timeout(req); -} - -static noinline void __io_arm_ltimeout(struct io_kiocb *req) -{ - io_queue_linked_timeout(__io_prep_linked_timeout(req)); -} - -static inline void io_arm_ltimeout(struct io_kiocb *req) -{ - if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) - __io_arm_ltimeout(req); -} - -static void io_prep_async_work(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - struct io_ring_ctx *ctx = req->ctx; - - if (!(req->flags & REQ_F_CREDS)) { - req->flags |= REQ_F_CREDS; - req->creds = get_current_cred(); - } - - req->work.list.next = NULL; - req->work.flags = 0; - req->work.cancel_seq = atomic_read(&ctx->cancel_seq); - if (req->flags & REQ_F_FORCE_ASYNC) - req->work.flags |= IO_WQ_WORK_CONCURRENT; - - if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) - io_wq_hash_work(&req->work, file_inode(req->file)); - } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { - if (def->unbound_nonreg_file) - req->work.flags |= IO_WQ_WORK_UNBOUND; - } -} - -static void io_prep_async_link(struct io_kiocb *req) -{ - struct io_kiocb *cur; - - if (req->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->timeout_lock); - io_for_each_link(cur, req) - io_prep_async_work(cur); - spin_unlock_irq(&ctx->timeout_lock); - } else { - io_for_each_link(cur, req) - io_prep_async_work(cur); - } -} - -static inline void io_req_add_compl_list(struct io_kiocb *req) -{ - struct io_submit_state *state = &req->ctx->submit_state; - - if (!(req->flags & REQ_F_CQE_SKIP)) - state->flush_cqes = true; - wq_list_add_tail(&req->comp_list, &state->compl_reqs); -} - -static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) -{ - struct io_kiocb *link = io_prep_linked_timeout(req); - struct io_uring_task *tctx = req->task->io_uring; - - BUG_ON(!tctx); - BUG_ON(!tctx->io_wq); - - /* init ->work of the whole link before punting */ - io_prep_async_link(req); - - /* - * Not expected to happen, but if we do have a bug where this _can_ - * happen, catch it here and ensure the request is marked as - * canceled. That will make io-wq go through the usual work cancel - * procedure rather than attempt to run this request (or create a new - * worker for it). - */ - if (WARN_ON_ONCE(!same_thread_group(req->task, current))) - req->work.flags |= IO_WQ_WORK_CANCEL; - - trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data, - req->opcode, req->flags, &req->work, - io_wq_is_hashed(&req->work)); - io_wq_enqueue(tctx->io_wq, &req->work); - if (link) - io_queue_linked_timeout(link); -} - -static void io_kill_timeout(struct io_kiocb *req, int status) - __must_hold(&req->ctx->completion_lock) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_timeout_data *io = req->async_data; - - if (hrtimer_try_to_cancel(&io->timer) != -1) { - if (status) - req_set_fail(req); - atomic_set(&req->ctx->cq_timeouts, - atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&req->timeout.list); - io_req_tw_post_queue(req, status, 0); - } -} - -static __cold void io_queue_deferred(struct io_ring_ctx *ctx) -{ - while (!list_empty(&ctx->defer_list)) { - struct io_defer_entry *de = list_first_entry(&ctx->defer_list, - struct io_defer_entry, list); - - if (req_need_defer(de->req, de->seq)) - break; - list_del_init(&de->list); - io_req_task_queue(de->req); - kfree(de); - } -} - -static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) - __must_hold(&ctx->completion_lock) -{ - u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - struct io_kiocb *req, *tmp; - - spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { - u32 events_needed, events_got; - - if (io_is_timeout_noseq(req)) - break; - - /* - * Since seq can easily wrap around over time, subtract - * the last seq at which timeouts were flushed before comparing. - * Assuming not more than 2^31-1 events have happened since, - * these subtractions won't have wrapped, so we can check if - * target is in [last_seq, current_seq] by comparing the two. - */ - events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush; - events_got = seq - ctx->cq_last_tm_flush; - if (events_got < events_needed) - break; - - io_kill_timeout(req, 0); - } - ctx->cq_last_tm_flush = seq; - spin_unlock_irq(&ctx->timeout_lock); -} - -static inline void io_commit_cqring(struct io_ring_ctx *ctx) -{ - /* order cqe stores with ring update */ - smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); -} - -static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) -{ - if (ctx->off_timeout_used || ctx->drain_active) { - spin_lock(&ctx->completion_lock); - if (ctx->off_timeout_used) - io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - } - if (ctx->has_evfd) - io_eventfd_signal(ctx); -} - -static inline bool io_sqring_full(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; -} - -static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) -{ - return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); -} - -/* - * writes to the cq entry need to come after reading head; the - * control dependency is enough as we're using WRITE_ONCE to - * fill the cq entry - */ -static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int shift = 0; - unsigned int free, queued, len; - - if (ctx->flags & IORING_SETUP_CQE32) - shift = 1; - - /* userspace may cheat modifying the tail, be safe and do min */ - queued = min(__io_cqring_events(ctx), ctx->cq_entries); - free = ctx->cq_entries - queued; - /* we need a contiguous range, limit based on the current array offset */ - len = min(free, ctx->cq_entries - off); - if (!len) - return NULL; - - ctx->cached_cq_tail++; - ctx->cqe_cached = &rings->cqes[off]; - ctx->cqe_sentinel = ctx->cqe_cached + len; - ctx->cqe_cached++; - return &rings->cqes[off << shift]; -} - -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) -{ - if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { - struct io_uring_cqe *cqe = ctx->cqe_cached; - - if (ctx->flags & IORING_SETUP_CQE32) { - unsigned int off = ctx->cqe_cached - ctx->rings->cqes; - - cqe += off; - } - - ctx->cached_cq_tail++; - ctx->cqe_cached++; - return cqe; - } - - return __io_get_cqe(ctx); -} - -static void io_eventfd_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - rcu_read_lock(); - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ - ev_fd = rcu_dereference(ctx->io_ev_fd); - - /* - * Check again if ev_fd exists incase an io_eventfd_unregister call - * completed between the NULL check of ctx->io_ev_fd at the start of - * the function and rcu_read_lock. - */ - if (unlikely(!ev_fd)) - goto out; - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - goto out; - - if (!ev_fd->eventfd_async || io_wq_current_is_worker()) - eventfd_signal(ev_fd->cq_ev_fd, 1); -out: - rcu_read_unlock(); -} - -static inline void io_cqring_wake(struct io_ring_ctx *ctx) -{ - /* - * wake_up_all() may seem excessive, but io_wake_function() and - * io_should_wake() handle the termination of the loop and only - * wake as many waiters as we need to. - */ - if (wq_has_sleeper(&ctx->cq_wait)) - wake_up_all(&ctx->cq_wait); -} - -/* - * This should only get called when at least one event has been posted. - * Some applications rely on the eventfd notification count only changing - * IFF a new CQE has been added to the CQ ring. There's no depedency on - * 1:1 relationship between how many times this function is called (and - * hence the eventfd count) and number of CQEs posted to the CQ ring. - */ -static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) -{ - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd)) - __io_commit_cqring_flush(ctx); - - io_cqring_wake(ctx); -} - -static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) -{ - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd)) - __io_commit_cqring_flush(ctx); - - if (ctx->flags & IORING_SETUP_SQPOLL) - io_cqring_wake(ctx); -} - -/* Returns true if there are no backlogged entries after the flush */ -static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) -{ - bool all_flushed, posted; - size_t cqe_size = sizeof(struct io_uring_cqe); - - if (!force && __io_cqring_events(ctx) == ctx->cq_entries) - return false; - - if (ctx->flags & IORING_SETUP_CQE32) - cqe_size <<= 1; - - posted = false; - spin_lock(&ctx->completion_lock); - while (!list_empty(&ctx->cq_overflow_list)) { - struct io_uring_cqe *cqe = io_get_cqe(ctx); - struct io_overflow_cqe *ocqe; - - if (!cqe && !force) - break; - ocqe = list_first_entry(&ctx->cq_overflow_list, - struct io_overflow_cqe, list); - if (cqe) - memcpy(cqe, &ocqe->cqe, cqe_size); - else - io_account_cq_overflow(ctx); - - posted = true; - list_del(&ocqe->list); - kfree(ocqe); - } - - all_flushed = list_empty(&ctx->cq_overflow_list); - if (all_flushed) { - clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); - } - - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); - return all_flushed; -} - -static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) -{ - bool ret = true; - - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { - /* iopoll syncs against uring_lock, not completion_lock */ - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&ctx->uring_lock); - ret = __io_cqring_overflow_flush(ctx, false); - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&ctx->uring_lock); - } - - return ret; -} - -static void __io_put_task(struct task_struct *task, int nr) -{ - struct io_uring_task *tctx = task->io_uring; - - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); -} - -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) -{ - if (likely(task == current)) - task->io_uring->cached_refs += nr; - else - __io_put_task(task, nr); -} - -static void io_task_refs_refill(struct io_uring_task *tctx) -{ - unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; - - percpu_counter_add(&tctx->inflight, refill); - refcount_add(refill, ¤t->usage); - tctx->cached_refs += refill; -} - -static inline void io_get_task_refs(int nr) -{ - struct io_uring_task *tctx = current->io_uring; - - tctx->cached_refs -= nr; - if (unlikely(tctx->cached_refs < 0)) - io_task_refs_refill(tctx); -} - -static __cold void io_uring_drop_tctx_refs(struct task_struct *task) -{ - struct io_uring_task *tctx = task->io_uring; - unsigned int refs = tctx->cached_refs; - - if (refs) { - tctx->cached_refs = 0; - percpu_counter_sub(&tctx->inflight, refs); - put_task_struct_many(task, refs); - } -} - -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, - u64 extra2) -{ - struct io_overflow_cqe *ocqe; - size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); - - ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); - if (!ocqe) { - /* - * If we're in ring overflow flush mode, or in task cancel mode, - * or cannot allocate an overflow entry, then we need to drop it - * on the floor. - */ - io_account_cq_overflow(ctx); - set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); - return false; - } - if (list_empty(&ctx->cq_overflow_list)) { - set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); - - } - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; - } - list_add_tail(&ocqe->list, &ctx->cq_overflow_list); - return true; -} - -static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - struct io_uring_cqe *cqe; - - if (!(ctx->flags & IORING_SETUP_CQE32)) { - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, 0, 0); - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(*cqe)); - return true; - } - - return io_cqring_event_overflow(ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - 0, 0); - } else { - u64 extra1 = 0, extra2 = 0; - - if (req->flags & REQ_F_CQE32_INIT) { - extra1 = req->extra1; - extra2 = req->extra2; - } - - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, extra1, extra2); - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); - WRITE_ONCE(cqe->big_cqe[0], extra1); - WRITE_ONCE(cqe->big_cqe[1], extra2); - return true; - } - - return io_cqring_event_overflow(ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - extra1, extra2); - } -} - -static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) -{ - struct io_uring_cqe *cqe; - - ctx->cq_extra++; - trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - WRITE_ONCE(cqe->user_data, user_data); - WRITE_ONCE(cqe->res, res); - WRITE_ONCE(cqe->flags, cflags); - - if (ctx->flags & IORING_SETUP_CQE32) { - WRITE_ONCE(cqe->big_cqe[0], 0); - WRITE_ONCE(cqe->big_cqe[1], 0); - } - return true; - } - return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); -} - -static void __io_req_complete_put(struct io_kiocb *req) -{ - /* - * If we're the last reference to this request, add to our locked - * free_list cache. - */ - if (req_ref_put_and_test(req)) { - struct io_ring_ctx *ctx = req->ctx; - - if (req->flags & IO_REQ_LINK_FLAGS) { - if (req->flags & IO_DISARM_MASK) - io_disarm_next(req); - if (req->link) { - io_req_task_queue(req->link); - req->link = NULL; - } - } - io_req_put_rsrc(req); - /* - * Selected buffer deallocation in io_clean_op() assumes that - * we don't hold ->completion_lock. Clean them here to avoid - * deadlocks. - */ - io_put_kbuf_comp(req); - io_dismantle_req(req); - io_put_task(req->task, 1); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; - } -} - -static void __io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) -{ - if (!(req->flags & REQ_F_CQE_SKIP)) { - req->cqe.res = res; - req->cqe.flags = cflags; - __io_fill_cqe_req(req->ctx, req); - } - __io_req_complete_put(req); -} - -static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) -{ - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - __io_req_complete_post(req, res, cflags); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - -static inline void io_req_complete_state(struct io_kiocb *req, s32 res, - u32 cflags) -{ - req->cqe.res = res; - req->cqe.flags = cflags; - req->flags |= REQ_F_COMPLETE_INLINE; -} - -static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, - s32 res, u32 cflags) -{ - if (issue_flags & IO_URING_F_COMPLETE_DEFER) - io_req_complete_state(req, res, cflags); - else - io_req_complete_post(req, res, cflags); -} - -static inline void io_req_complete(struct io_kiocb *req, s32 res) -{ - if (res < 0) - req_set_fail(req); - __io_req_complete(req, 0, res, 0); -} - -static void io_req_complete_failed(struct io_kiocb *req, s32 res) -{ - req_set_fail(req); - io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); -} - -/* - * Don't initialise the fields below on every allocation, but do that in - * advance and keep them valid across allocations. - */ -static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - req->ctx = ctx; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - req->cqe.res = 0; -} - -static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, - struct io_submit_state *state) -{ - spin_lock(&ctx->completion_lock); - wq_list_splice(&ctx->locked_free_list, &state->free_list); - ctx->locked_free_nr = 0; - spin_unlock(&ctx->completion_lock); -} - -static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) -{ - return !ctx->submit_state.free_list.next; -} - -/* - * A request might get retired back into the request caches even before opcode - * handlers and io_issue_sqe() are done with it, e.g. inline completion path. - * Because of that, io_alloc_req() should be called only under ->uring_lock - * and with extra caution to not get a request that is still worked on. - */ -static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; - void *reqs[IO_REQ_ALLOC_BATCH]; - int ret, i; - - /* - * If we have more than a batch's worth of requests in our IRQ side - * locked cache, grab the lock and move them over to our submission - * side cache. - */ - if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); - if (!io_req_cache_empty(ctx)) - return true; - } - - ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); - - /* - * Bulk alloc is all-or-nothing. If we fail to get a batch, - * retry single alloc to be on the safe side. - */ - if (unlikely(ret <= 0)) { - reqs[0] = kmem_cache_alloc(req_cachep, gfp); - if (!reqs[0]) - return false; - ret = 1; - } - - percpu_ref_get_many(&ctx->refs, ret); - for (i = 0; i < ret; i++) { - struct io_kiocb *req = reqs[i]; - - io_preinit_req(req, ctx); - io_req_add_to_cache(req, ctx); - } - return true; -} - -static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) -{ - if (unlikely(io_req_cache_empty(ctx))) - return __io_alloc_req_refill(ctx); - return true; -} - -static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) -{ - struct io_wq_work_node *node; - - node = wq_stack_extract(&ctx->submit_state.free_list); - return container_of(node, struct io_kiocb, comp_list); -} - -static inline void io_put_file(struct file *file) -{ - if (file) - fput(file); -} - -static inline void io_dismantle_req(struct io_kiocb *req) -{ - unsigned int flags = req->flags; - - if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) - io_clean_op(req); - if (!(flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); -} - -static __cold void io_free_req(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - io_req_put_rsrc(req); - io_dismantle_req(req); - io_put_task(req->task, 1); - - spin_lock(&ctx->completion_lock); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; - spin_unlock(&ctx->completion_lock); -} - -static inline void io_remove_next_linked(struct io_kiocb *req) -{ - struct io_kiocb *nxt = req->link; - - req->link = nxt->link; - nxt->link = NULL; -} - -static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_kiocb *link = req->link; - - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { - struct io_timeout_data *io = link->async_data; - - io_remove_next_linked(req); - link->timeout.head = NULL; - if (hrtimer_try_to_cancel(&io->timer) != -1) { - list_del(&link->timeout.list); - return link; - } - } - return NULL; -} - -static void io_fail_links(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - struct io_kiocb *nxt, *link = req->link; - bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; - - req->link = NULL; - while (link) { - long res = -ECANCELED; - - if (link->flags & REQ_F_FAIL) - res = link->cqe.res; - - nxt = link->link; - link->link = NULL; - - trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, - req->opcode, link); - - if (ignore_cqes) - link->flags |= REQ_F_CQE_SKIP; - else - link->flags &= ~REQ_F_CQE_SKIP; - __io_req_complete_post(link, res, 0); - link = nxt; - } -} - -static bool io_disarm_next(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - struct io_kiocb *link = NULL; - bool posted = false; - - if (req->flags & REQ_F_ARM_LTIMEOUT) { - link = req->link; - req->flags &= ~REQ_F_ARM_LTIMEOUT; - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { - io_remove_next_linked(req); - io_req_tw_post_queue(link, -ECANCELED, 0); - posted = true; - } - } else if (req->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->timeout_lock); - link = io_disarm_linked_timeout(req); - spin_unlock_irq(&ctx->timeout_lock); - if (link) { - posted = true; - io_req_tw_post_queue(link, -ECANCELED, 0); - } - } - if (unlikely((req->flags & REQ_F_FAIL) && - !(req->flags & REQ_F_HARDLINK))) { - posted |= (req->link != NULL); - io_fail_links(req); - } - return posted; -} - -static void __io_req_find_next_prep(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - bool posted; - - spin_lock(&ctx->completion_lock); - posted = io_disarm_next(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); -} - -static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt; - - /* - * If LINK is set, we have dependent requests in this chain. If we - * didn't fail this request, queue the first one up, moving any other - * dependencies to the next request. In case of failure, fail the rest - * of the chain. - */ - if (unlikely(req->flags & IO_DISARM_MASK)) - __io_req_find_next_prep(req); - nxt = req->link; - req->link = NULL; - return nxt; -} - -static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) -{ - if (!ctx) - return; - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (*locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - *locked = false; - } - percpu_ref_put(&ctx->refs); -} - -static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx) -{ - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - -static void handle_prev_tw_list(struct io_wq_work_node *node, - struct io_ring_ctx **ctx, bool *uring_locked) -{ - if (*ctx && !*uring_locked) - spin_lock(&(*ctx)->completion_lock); - - do { - struct io_wq_work_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - - prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - - if (req->ctx != *ctx) { - if (unlikely(!*uring_locked && *ctx)) - ctx_commit_and_unlock(*ctx); - - ctx_flush_and_put(*ctx, uring_locked); - *ctx = req->ctx; - /* if not contended, grab and improve batching */ - *uring_locked = mutex_trylock(&(*ctx)->uring_lock); - percpu_ref_get(&(*ctx)->refs); - if (unlikely(!*uring_locked)) - spin_lock(&(*ctx)->completion_lock); - } - if (likely(*uring_locked)) - req->io_task_work.func(req, uring_locked); - else - __io_req_complete_post(req, req->cqe.res, - io_put_kbuf_comp(req)); - node = next; - } while (node); - - if (unlikely(!*uring_locked)) - ctx_commit_and_unlock(*ctx); -} - -static void handle_tw_list(struct io_wq_work_node *node, - struct io_ring_ctx **ctx, bool *locked) -{ - do { - struct io_wq_work_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - - prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - - if (req->ctx != *ctx) { - ctx_flush_and_put(*ctx, locked); - *ctx = req->ctx; - /* if not contended, grab and improve batching */ - *locked = mutex_trylock(&(*ctx)->uring_lock); - percpu_ref_get(&(*ctx)->refs); - } - req->io_task_work.func(req, locked); - node = next; - } while (node); -} - -static void tctx_task_work(struct callback_head *cb) -{ - bool uring_locked = false; - struct io_ring_ctx *ctx = NULL; - struct io_uring_task *tctx = container_of(cb, struct io_uring_task, - task_work); - - while (1) { - struct io_wq_work_node *node1, *node2; - - spin_lock_irq(&tctx->task_lock); - node1 = tctx->prio_task_list.first; - node2 = tctx->task_list.first; - INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prio_task_list); - if (!node2 && !node1) - tctx->task_running = false; - spin_unlock_irq(&tctx->task_lock); - if (!node2 && !node1) - break; - - if (node1) - handle_prev_tw_list(node1, &ctx, &uring_locked); - if (node2) - handle_tw_list(node2, &ctx, &uring_locked); - cond_resched(); - - if (data_race(!tctx->task_list.first) && - data_race(!tctx->prio_task_list.first) && uring_locked) - io_submit_flush_completions(ctx); - } - - ctx_flush_and_put(ctx, &uring_locked); - - /* relaxed read is enough as only the task itself sets ->in_idle */ - if (unlikely(atomic_read(&tctx->in_idle))) - io_uring_drop_tctx_refs(current); -} - -static void __io_req_task_work_add(struct io_kiocb *req, - struct io_uring_task *tctx, - struct io_wq_work_list *list) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_wq_work_node *node; - unsigned long flags; - bool running; - - spin_lock_irqsave(&tctx->task_lock, flags); - wq_list_add_tail(&req->io_task_work.node, list); - running = tctx->task_running; - if (!running) - tctx->task_running = true; - spin_unlock_irqrestore(&tctx->task_lock, flags); - - /* task_work already pending, we're done */ - if (running) - return; - - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - - if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) - return; - - spin_lock_irqsave(&tctx->task_lock, flags); - tctx->task_running = false; - node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list); - spin_unlock_irqrestore(&tctx->task_lock, flags); - - while (node) { - req = container_of(node, struct io_kiocb, io_task_work.node); - node = node->next; - if (llist_add(&req->io_task_work.fallback_node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); - } -} - -static void io_req_task_work_add(struct io_kiocb *req) -{ - struct io_uring_task *tctx = req->task->io_uring; - - __io_req_task_work_add(req, tctx, &tctx->task_list); -} - -static void io_req_task_prio_work_add(struct io_kiocb *req) -{ - struct io_uring_task *tctx = req->task->io_uring; - - if (req->ctx->flags & IORING_SETUP_SQPOLL) - __io_req_task_work_add(req, tctx, &tctx->prio_task_list); - else - __io_req_task_work_add(req, tctx, &tctx->task_list); -} - -static void io_req_tw_post(struct io_kiocb *req, bool *locked) -{ - io_req_complete_post(req, req->cqe.res, req->cqe.flags); -} - -static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) -{ - req->cqe.res = res; - req->cqe.flags = cflags; - req->io_task_work.func = io_req_tw_post; - io_req_task_work_add(req); -} - -static void io_req_task_cancel(struct io_kiocb *req, bool *locked) -{ - /* not needed for normal modes, but SQPOLL depends on it */ - io_tw_lock(req->ctx, locked); - io_req_complete_failed(req, req->cqe.res); -} - -static void io_req_task_submit(struct io_kiocb *req, bool *locked) -{ - io_tw_lock(req->ctx, locked); - /* req->task == current here, checking PF_EXITING is safe */ - if (likely(!(req->task->flags & PF_EXITING))) - io_queue_sqe(req); - else - io_req_complete_failed(req, -EFAULT); -} - -static void io_req_task_queue_fail(struct io_kiocb *req, int ret) -{ - req->cqe.res = ret; - req->io_task_work.func = io_req_task_cancel; - io_req_task_work_add(req); -} - -static void io_req_task_queue(struct io_kiocb *req) -{ - req->io_task_work.func = io_req_task_submit; - io_req_task_work_add(req); -} - -static void io_req_task_queue_reissue(struct io_kiocb *req) -{ - req->io_task_work.func = io_queue_iowq; - io_req_task_work_add(req); -} - -static void io_queue_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt = io_req_find_next(req); - - if (nxt) - io_req_task_queue(nxt); -} - -static void io_free_batch_list(struct io_ring_ctx *ctx, - struct io_wq_work_node *node) - __must_hold(&ctx->uring_lock) -{ - struct task_struct *task = NULL; - int task_refs = 0; - - do { - struct io_kiocb *req = container_of(node, struct io_kiocb, - comp_list); - - if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { - if (req->flags & REQ_F_REFCOUNT) { - node = req->comp_list.next; - if (!req_ref_put_and_test(req)) - continue; - } - if ((req->flags & REQ_F_POLLED) && req->apoll) { - struct async_poll *apoll = req->apoll; - - if (apoll->double_poll) - kfree(apoll->double_poll); - list_add(&apoll->poll.wait.entry, - &ctx->apoll_cache); - req->flags &= ~REQ_F_POLLED; - } - if (req->flags & IO_REQ_LINK_FLAGS) - io_queue_next(req); - if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) - io_clean_op(req); - } - if (!(req->flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); - - io_req_put_rsrc_locked(req, ctx); - - if (req->task != task) { - if (task) - io_put_task(task, task_refs); - task = req->task; - task_refs = 0; - } - task_refs++; - node = req->comp_list.next; - io_req_add_to_cache(req, ctx); - } while (node); - - if (task) - io_put_task(task, task_refs); -} - -static void __io_submit_flush_completions(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - struct io_wq_work_node *node, *prev; - struct io_submit_state *state = &ctx->submit_state; - - if (state->flush_cqes) { - spin_lock(&ctx->completion_lock); - wq_list_for_each(node, prev, &state->compl_reqs) { - struct io_kiocb *req = container_of(node, struct io_kiocb, - comp_list); - - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(ctx, req); - } - - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - state->flush_cqes = false; - } - - io_free_batch_list(ctx, state->compl_reqs.first); - INIT_WQ_LIST(&state->compl_reqs); -} - -/* - * Drop reference to request, return next in chain (if there is one) if this - * was the last reference to this request. - */ -static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt = NULL; - - if (req_ref_put_and_test(req)) { - if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) - nxt = io_req_find_next(req); - io_free_req(req); - } - return nxt; -} - -static inline void io_put_req(struct io_kiocb *req) -{ - if (req_ref_put_and_test(req)) { - io_queue_next(req); - io_free_req(req); - } -} - -static unsigned io_cqring_events(struct io_ring_ctx *ctx) -{ - /* See comment at the top of this file */ - smp_rmb(); - return __io_cqring_events(ctx); -} - -static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - - /* make sure SQ entry isn't read before tail */ - return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; -} - -static inline bool io_run_task_work(void) -{ - if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) { - __set_current_state(TASK_RUNNING); - clear_notify_signal(); - if (task_work_pending(current)) - task_work_run(); - return true; - } - - return false; -} - -static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) -{ - struct io_wq_work_node *pos, *start, *prev; - unsigned int poll_flags = BLK_POLL_NOSLEEP; - DEFINE_IO_COMP_BATCH(iob); - int nr_events = 0; - - /* - * Only spin for completions if we don't have multiple devices hanging - * off our complete list. - */ - if (ctx->poll_multi_queue || force_nonspin) - poll_flags |= BLK_POLL_ONESHOT; - - wq_list_for_each(pos, start, &ctx->iopoll_list) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - struct kiocb *kiocb = &req->rw.kiocb; - int ret; - - /* - * Move completed and retryable entries to our local lists. - * If we find a request that requires polling, break out - * and complete those lists first, if we have entries there. - */ - if (READ_ONCE(req->iopoll_completed)) - break; - - ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags); - if (unlikely(ret < 0)) - return ret; - else if (ret) - poll_flags |= BLK_POLL_ONESHOT; - - /* iopoll may have completed current req */ - if (!rq_list_empty(iob.req_list) || - READ_ONCE(req->iopoll_completed)) - break; - } - - if (!rq_list_empty(iob.req_list)) - iob.complete(&iob); - else if (!pos) - return 0; - - prev = start; - wq_list_for_each_resume(pos, prev) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - - /* order with io_complete_rw_iopoll(), e.g. ->result updates */ - if (!smp_load_acquire(&req->iopoll_completed)) - break; - nr_events++; - if (unlikely(req->flags & REQ_F_CQE_SKIP)) - continue; - - req->cqe.flags = io_put_kbuf(req, 0); - __io_fill_cqe_req(req->ctx, req); - } - - if (unlikely(!nr_events)) - return 0; - - io_commit_cqring(ctx); - io_cqring_ev_posted_iopoll(ctx); - pos = start ? start->next : ctx->iopoll_list.first; - wq_list_cut(&ctx->iopoll_list, prev, start); - io_free_batch_list(ctx, pos); - return nr_events; -} - -/* - * We can't just wait for polled events to come to us, we have to actively - * find and complete them. - */ -static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) -{ - if (!(ctx->flags & IORING_SETUP_IOPOLL)) - return; - - mutex_lock(&ctx->uring_lock); - while (!wq_list_empty(&ctx->iopoll_list)) { - /* let it sleep and repeat later if can't complete a request */ - if (io_do_iopoll(ctx, true) == 0) - break; - /* - * Ensure we allow local-to-the-cpu processing to take place, - * in this case we need to ensure that we reap all events. - * Also let task_work, etc. to progress by releasing the mutex - */ - if (need_resched()) { - mutex_unlock(&ctx->uring_lock); - cond_resched(); - mutex_lock(&ctx->uring_lock); - } - } - mutex_unlock(&ctx->uring_lock); -} - -static int io_iopoll_check(struct io_ring_ctx *ctx, long min) -{ - unsigned int nr_events = 0; - int ret = 0; - unsigned long check_cq; - - /* - * Don't enter poll loop if we already have events pending. - * If we do, we can potentially be spinning for commands that - * already triggered a CQE (eg in error). - */ - check_cq = READ_ONCE(ctx->check_cq); - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - __io_cqring_overflow_flush(ctx, false); - if (io_cqring_events(ctx)) - return 0; - - /* - * Similarly do not spin if we have not informed the user of any - * dropped CQE. - */ - if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) - return -EBADR; - - do { - /* - * If a submit got punted to a workqueue, we can have the - * application entering polling for a command before it gets - * issued. That app will hold the uring_lock for the duration - * of the poll right here, so we need to take a breather every - * now and then to ensure that the issue has a chance to add - * the poll to the issued list. Otherwise we can spin here - * forever, while the workqueue is stuck trying to acquire the - * very same mutex. - */ - if (wq_list_empty(&ctx->iopoll_list)) { - u32 tail = ctx->cached_cq_tail; - - mutex_unlock(&ctx->uring_lock); - io_run_task_work(); - mutex_lock(&ctx->uring_lock); - - /* some requests don't go through iopoll_list */ - if (tail != ctx->cached_cq_tail || - wq_list_empty(&ctx->iopoll_list)) - break; - } - ret = io_do_iopoll(ctx, !min); - if (ret < 0) - break; - nr_events += ret; - ret = 0; - } while (nr_events < min && !need_resched()); - - return ret; -} - -static void kiocb_end_write(struct io_kiocb *req) -{ - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ - if (req->flags & REQ_F_ISREG) { - struct super_block *sb = file_inode(req->file)->i_sb; - - __sb_writers_acquired(sb, SB_FREEZE_WRITE); - sb_end_write(sb); - } -} - -#ifdef CONFIG_BLOCK -static bool io_resubmit_prep(struct io_kiocb *req) -{ - struct io_async_rw *rw = req->async_data; - - if (!req_has_async_data(req)) - return !io_req_prep_async(req); - iov_iter_restore(&rw->s.iter, &rw->s.iter_state); - return true; -} - -static bool io_rw_should_reissue(struct io_kiocb *req) -{ - umode_t mode = file_inode(req->file)->i_mode; - struct io_ring_ctx *ctx = req->ctx; - - if (!S_ISBLK(mode) && !S_ISREG(mode)) - return false; - if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && - !(ctx->flags & IORING_SETUP_IOPOLL))) - return false; - /* - * If ref is dying, we might be running poll reap from the exit work. - * Don't attempt to reissue from that path, just let it fail with - * -EAGAIN. - */ - if (percpu_ref_is_dying(&ctx->refs)) - return false; - /* - * Play it safe and assume not safe to re-import and reissue if we're - * not in the original thread group (or in task context). - */ - if (!same_thread_group(req->task, current) || !in_task()) - return false; - return true; -} -#else -static bool io_resubmit_prep(struct io_kiocb *req) -{ - return false; -} -static bool io_rw_should_reissue(struct io_kiocb *req) -{ - return false; -} -#endif - -static bool __io_complete_rw_common(struct io_kiocb *req, long res) -{ - if (req->rw.kiocb.ki_flags & IOCB_WRITE) { - kiocb_end_write(req); - fsnotify_modify(req->file); - } else { - fsnotify_access(req->file); - } - if (unlikely(res != req->cqe.res)) { - if ((res == -EAGAIN || res == -EOPNOTSUPP) && - io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; - return true; - } - req_set_fail(req); - req->cqe.res = res; - } - return false; -} - -static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) -{ - int res = req->cqe.res; - - if (*locked) { - io_req_complete_state(req, res, io_put_kbuf(req, 0)); - io_req_add_compl_list(req); - } else { - io_req_complete_post(req, res, - io_put_kbuf(req, IO_URING_F_UNLOCKED)); - } -} - -static void __io_complete_rw(struct io_kiocb *req, long res, - unsigned int issue_flags) -{ - if (__io_complete_rw_common(req, res)) - return; - __io_req_complete(req, issue_flags, req->cqe.res, - io_put_kbuf(req, issue_flags)); -} - -static void io_complete_rw(struct kiocb *kiocb, long res) -{ - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - - if (__io_complete_rw_common(req, res)) - return; - req->cqe.res = res; - req->io_task_work.func = io_req_task_complete; - io_req_task_prio_work_add(req); -} - -static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) -{ - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - - if (kiocb->ki_flags & IOCB_WRITE) - kiocb_end_write(req); - if (unlikely(res != req->cqe.res)) { - if (res == -EAGAIN && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; - return; - } - req->cqe.res = res; - } - - /* order with io_iopoll_complete() checking ->iopoll_completed */ - smp_store_release(&req->iopoll_completed, 1); -} - -/* - * After the iocb has been issued, it's safe to be found on the poll list. - * Adding the kiocb to the list AFTER submission ensures that we don't - * find it from a io_do_iopoll() thread before the issuer is done - * accessing the kiocb cookie. - */ -static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - - /* workqueue context doesn't hold uring_lock, grab it now */ - if (unlikely(needs_lock)) - mutex_lock(&ctx->uring_lock); - - /* - * Track whether we have multiple files in our lists. This will impact - * how we do polling eventually, not spinning if we're on potentially - * different devices. - */ - if (wq_list_empty(&ctx->iopoll_list)) { - ctx->poll_multi_queue = false; - } else if (!ctx->poll_multi_queue) { - struct io_kiocb *list_req; - - list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, - comp_list); - if (list_req->file != req->file) - ctx->poll_multi_queue = true; - } - - /* - * For fast devices, IO may have already completed. If it has, add - * it to the front so we find it first. - */ - if (READ_ONCE(req->iopoll_completed)) - wq_list_add_head(&req->comp_list, &ctx->iopoll_list); - else - wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); - - if (unlikely(needs_lock)) { - /* - * If IORING_SETUP_SQPOLL is enabled, sqes are either handle - * in sq thread task context or in io worker task context. If - * current task context is sq thread, we don't need to check - * whether should wake up sq thread. - */ - if ((ctx->flags & IORING_SETUP_SQPOLL) && - wq_has_sleeper(&ctx->sq_data->wait)) - wake_up(&ctx->sq_data->wait); - - mutex_unlock(&ctx->uring_lock); - } -} - -static bool io_bdev_nowait(struct block_device *bdev) -{ - return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); -} - -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. - */ -static bool __io_file_supports_nowait(struct file *file, umode_t mode) -{ - if (S_ISBLK(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(I_BDEV(file->f_mapping->host))) - return true; - return false; - } - if (S_ISSOCK(mode)) - return true; - if (S_ISREG(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(file->f_inode->i_sb->s_bdev) && - file->f_op != &io_uring_fops) - return true; - return false; - } - - /* any ->read/write should understand O_NONBLOCK */ - if (file->f_flags & O_NONBLOCK) - return true; - return file->f_mode & FMODE_NOWAIT; -} - -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. - */ -static unsigned int io_file_get_flags(struct file *file) -{ - umode_t mode = file_inode(file)->i_mode; - unsigned int res = 0; - - if (S_ISREG(mode)) - res |= FFS_ISREG; - if (__io_file_supports_nowait(file, mode)) - res |= FFS_NOWAIT; - if (io_file_need_scm(file)) - res |= FFS_SCM; - return res; -} - -static inline bool io_file_supports_nowait(struct io_kiocb *req) -{ - return req->flags & REQ_F_SUPPORT_NOWAIT; -} - -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct kiocb *kiocb = &req->rw.kiocb; - unsigned ioprio; - int ret; - - kiocb->ki_pos = READ_ONCE(sqe->off); - /* used for fixed read/write too - just read unconditionally */ - req->buf_index = READ_ONCE(sqe->buf_index); - - if (req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED) { - struct io_ring_ctx *ctx = req->ctx; - u16 index; - - if (unlikely(req->buf_index >= ctx->nr_user_bufs)) - return -EFAULT; - index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); - req->imu = ctx->user_bufs[index]; - io_req_set_rsrc_node(req, ctx, 0); - } - - ioprio = READ_ONCE(sqe->ioprio); - if (ioprio) { - ret = ioprio_check_cap(ioprio); - if (ret) - return ret; - - kiocb->ki_ioprio = ioprio; - } else { - kiocb->ki_ioprio = get_current_ioprio(); - } - - req->rw.addr = READ_ONCE(sqe->addr); - req->rw.len = READ_ONCE(sqe->len); - req->rw.flags = READ_ONCE(sqe->rw_flags); - return 0; -} - -static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) -{ - switch (ret) { - case -EIOCBQUEUED: - break; - case -ERESTARTSYS: - case -ERESTARTNOINTR: - case -ERESTARTNOHAND: - case -ERESTART_RESTARTBLOCK: - /* - * We can't just restart the syscall, since previously - * submitted sqes may already be in progress. Just fail this - * IO with EINTR. - */ - ret = -EINTR; - fallthrough; - default: - kiocb->ki_complete(kiocb, ret); - } -} - -static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) -{ - struct kiocb *kiocb = &req->rw.kiocb; - - if (kiocb->ki_pos != -1) - return &kiocb->ki_pos; - - if (!(req->file->f_mode & FMODE_STREAM)) { - req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = req->file->f_pos; - return &kiocb->ki_pos; - } - - kiocb->ki_pos = 0; - return NULL; -} - -static void kiocb_done(struct io_kiocb *req, ssize_t ret, - unsigned int issue_flags) -{ - struct io_async_rw *io = req->async_data; - - /* add previously done IO, if any */ - if (req_has_async_data(req) && io->bytes_done > 0) { - if (ret < 0) - ret = io->bytes_done; - else - ret += io->bytes_done; - } - - if (req->flags & REQ_F_CUR_POS) - req->file->f_pos = req->rw.kiocb.ki_pos; - if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw)) - __io_complete_rw(req, ret, issue_flags); - else - io_rw_done(&req->rw.kiocb, ret); - - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - if (io_resubmit_prep(req)) - io_req_task_queue_reissue(req); - else - io_req_task_queue_fail(req, ret); - } -} - -static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, - struct io_mapped_ubuf *imu) -{ - size_t len = req->rw.len; - u64 buf_end, buf_addr = req->rw.addr; - size_t offset; - - if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) - return -EFAULT; - /* not inside the mapped region */ - if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) - return -EFAULT; - - /* - * May not be a start of buffer, set size appropriately - * and advance us to the beginning. - */ - offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); - - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are PAGE_SIZE in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be PAGE_SIZE aligned. - */ - const struct bio_vec *bvec = imu->bvec; - - if (offset <= bvec->bv_len) { - iov_iter_advance(iter, offset); - } else { - unsigned long seg_skip; - - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> PAGE_SHIFT); - - iter->bvec = bvec + seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ~PAGE_MASK; - } - } - - return 0; -} - -static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, - unsigned int issue_flags) -{ - if (WARN_ON_ONCE(!req->imu)) - return -EFAULT; - return __io_import_fixed(req, rw, iter, req->imu); -} - -static int io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) -{ - bl->bgid = bgid; - if (bgid < BGID_ARRAY) - return 0; - - return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); -} - -static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl) -{ - if (!list_empty(&bl->buf_list)) { - struct io_buffer *kbuf; - - kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&kbuf->list); - if (*len > kbuf->len) - *len = kbuf->len; - req->flags |= REQ_F_BUFFER_SELECTED; - req->kbuf = kbuf; - req->buf_index = kbuf->bid; - return u64_to_user_ptr(kbuf->addr); - } - return NULL; -} - -static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl, - unsigned int issue_flags) -{ - struct io_uring_buf_ring *br = bl->buf_ring; - struct io_uring_buf *buf; - __u16 head = bl->head; - - if (unlikely(smp_load_acquire(&br->tail) == head)) - return NULL; - - head &= bl->mask; - if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { - buf = &br->bufs[head]; - } else { - int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; - buf = page_address(bl->buf_pages[index]); - buf += off; - } - if (*len > buf->len) - *len = buf->len; - req->flags |= REQ_F_BUFFER_RING; - req->buf_list = bl; - req->buf_index = buf->bid; - - if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) { - /* - * If we came in unlocked, we have no choice but to consume the - * buffer here. This does mean it'll be pinned until the IO - * completes. But coming in unlocked means we're in io-wq - * context, hence there should be no further retry. For the - * locked case, the caller must ensure to call the commit when - * the transfer completes (or if we get -EAGAIN and must poll - * or retry). - */ - req->buf_list = NULL; - bl->head++; - } - return u64_to_user_ptr(buf->addr); -} - -static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - void __user *ret = NULL; - - io_ring_submit_lock(req->ctx, issue_flags); - - bl = io_buffer_get_list(ctx, req->buf_index); - if (likely(bl)) { - if (bl->buf_nr_pages) - ret = io_ring_buffer_select(req, len, bl, issue_flags); - else - ret = io_provided_buffer_select(req, len, bl); - } - io_ring_submit_unlock(req->ctx, issue_flags); - return ret; -} - -#ifdef CONFIG_COMPAT -static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - struct compat_iovec __user *uiov; - compat_ssize_t clen; - void __user *buf; - size_t len; - - uiov = u64_to_user_ptr(req->rw.addr); - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) - return -EINVAL; - - len = clen; - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - req->rw.addr = (unsigned long) buf; - iov[0].iov_base = buf; - req->rw.len = iov[0].iov_len = (compat_size_t) len; - return 0; -} -#endif - -static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); - void __user *buf; - ssize_t len; - - if (copy_from_user(iov, uiov, sizeof(*uiov))) - return -EFAULT; - - len = iov[0].iov_len; - if (len < 0) - return -EINVAL; - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - req->rw.addr = (unsigned long) buf; - iov[0].iov_base = buf; - req->rw.len = iov[0].iov_len = len; - return 0; -} - -static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - iov[0].iov_base = u64_to_user_ptr(req->rw.addr); - iov[0].iov_len = req->rw.len; - return 0; - } - if (req->rw.len != 1) - return -EINVAL; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return io_compat_import(req, iov, issue_flags); -#endif - - return __io_iov_buffer_select(req, iov, issue_flags); -} - -static inline bool io_do_buffer_select(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return false; - return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); -} - -static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, - struct io_rw_state *s, - unsigned int issue_flags) -{ - struct iov_iter *iter = &s->iter; - u8 opcode = req->opcode; - struct iovec *iovec; - void __user *buf; - size_t sqe_len; - ssize_t ret; - - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(req, rw, iter, issue_flags); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - buf = u64_to_user_ptr(req->rw.addr); - sqe_len = req->rw.len; - - if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return ERR_PTR(-ENOBUFS); - req->rw.addr = (unsigned long) buf; - req->rw.len = sqe_len; - } - - ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - iovec = s->fast_iov; - if (req->flags & REQ_F_BUFFER_SELECT) { - ret = io_iov_buffer_select(req, iovec, issue_flags); - if (ret) - return ERR_PTR(ret); - iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); - return NULL; - } - - ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter, - req->ctx->compat); - if (unlikely(ret < 0)) - return ERR_PTR(ret); - return iovec; -} - -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct io_rw_state *s, - unsigned int issue_flags) -{ - *iovec = __io_import_iovec(rw, req, s, issue_flags); - if (unlikely(IS_ERR(*iovec))) - return PTR_ERR(*iovec); - - iov_iter_save_state(&s->iter, &s->iter_state); - return 0; -} - -static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) -{ - return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; -} - -/* - * For files that don't have ->read_iter() and ->write_iter(), handle them - * by looping over ->read() or ->write() manually. - */ -static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) -{ - struct kiocb *kiocb = &req->rw.kiocb; - struct file *file = req->file; - ssize_t ret = 0; - loff_t *ppos; - - /* - * Don't support polled IO through this interface, and we can't - * support non-blocking either. For the latter, this just causes - * the kiocb to be handled from an async context. - */ - if (kiocb->ki_flags & IOCB_HIPRI) - return -EOPNOTSUPP; - if ((kiocb->ki_flags & IOCB_NOWAIT) && - !(kiocb->ki_filp->f_flags & O_NONBLOCK)) - return -EAGAIN; - - ppos = io_kiocb_ppos(kiocb); - - while (iov_iter_count(iter)) { - struct iovec iovec; - ssize_t nr; - - if (!iov_iter_is_bvec(iter)) { - iovec = iov_iter_iovec(iter); - } else { - iovec.iov_base = u64_to_user_ptr(req->rw.addr); - iovec.iov_len = req->rw.len; - } - - if (rw == READ) { - nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, ppos); - } else { - nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, ppos); - } - - if (nr < 0) { - if (!ret) - ret = nr; - break; - } - ret += nr; - if (!iov_iter_is_bvec(iter)) { - iov_iter_advance(iter, nr); - } else { - req->rw.addr += nr; - req->rw.len -= nr; - if (!req->rw.len) - break; - } - if (nr != iovec.iov_len) - break; - } - - return ret; -} - -static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, struct iov_iter *iter) -{ - struct io_async_rw *rw = req->async_data; - - memcpy(&rw->s.iter, iter, sizeof(*iter)); - rw->free_iovec = iovec; - rw->bytes_done = 0; - /* can only be fixed buffers, no need to do anything */ - if (iov_iter_is_bvec(iter)) - return; - if (!iovec) { - unsigned iov_off = 0; - - rw->s.iter.iov = rw->s.fast_iov; - if (iter->iov != fast_iov) { - iov_off = iter->iov - fast_iov; - rw->s.iter.iov += iov_off; - } - if (rw->s.fast_iov != fast_iov) - memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off, - sizeof(struct iovec) * iter->nr_segs); - } else { - req->flags |= REQ_F_NEED_CLEANUP; - } -} - -static inline bool io_alloc_async_data(struct io_kiocb *req) -{ - WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); - req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); - if (req->async_data) { - req->flags |= REQ_F_ASYNC_DATA; - return false; - } - return true; -} - -static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - struct io_rw_state *s, bool force) -{ - if (!force && !io_op_defs[req->opcode].needs_async_setup) - return 0; - if (!req_has_async_data(req)) { - struct io_async_rw *iorw; - - if (io_alloc_async_data(req)) { - kfree(iovec); - return -ENOMEM; - } - - io_req_map_rw(req, iovec, s->fast_iov, &s->iter); - iorw = req->async_data; - /* we've copied and mapped the iter, ensure state is saved */ - iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); - } - return 0; -} - -static inline int io_rw_prep_async(struct io_kiocb *req, int rw) -{ - struct io_async_rw *iorw = req->async_data; - struct iovec *iov; - int ret; - - /* submission path, ->uring_lock should already be taken */ - ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); - if (unlikely(ret < 0)) - return ret; - - iorw->bytes_done = 0; - iorw->free_iovec = iov; - if (iov) - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_readv_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, READ); -} - -static int io_writev_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, WRITE); -} - -/* - * This is our waitqueue callback handler, registered through __folio_lock_async() - * when we initially tried to do the IO with the iocb armed our waitqueue. - * This gets called when the page is unlocked, and we generally expect that to - * happen when the page IO is completed and the page is now uptodate. This will - * queue a task_work based retry of the operation, attempting to copy the data - * again. If the latter fails because the page was NOT uptodate, then we will - * do a thread based blocking retry of the operation. That's the unexpected - * slow path. - */ -static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, - int sync, void *arg) -{ - struct wait_page_queue *wpq; - struct io_kiocb *req = wait->private; - struct wait_page_key *key = arg; - - wpq = container_of(wait, struct wait_page_queue, wait); - - if (!wake_page_match(wpq, key)) - return 0; - - req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; - list_del_init(&wait->entry); - io_req_task_queue(req); - return 1; -} - -/* - * This controls whether a given IO request should be armed for async page - * based retry. If we return false here, the request is handed to the async - * worker threads for retry. If we're doing buffered reads on a regular file, - * we prepare a private wait_page_queue entry and retry the operation. This - * will either succeed because the page is now uptodate and unlocked, or it - * will register a callback when the page is unlocked at IO completion. Through - * that callback, io_uring uses task_work to setup a retry of the operation. - * That retry will attempt the buffered read again. The retry will generally - * succeed, or in rare cases where it fails, we then fall back to using the - * async worker threads for a blocking retry. - */ -static bool io_rw_should_retry(struct io_kiocb *req) -{ - struct io_async_rw *rw = req->async_data; - struct wait_page_queue *wait = &rw->wpq; - struct kiocb *kiocb = &req->rw.kiocb; - - /* never retry for NOWAIT, we just complete with -EAGAIN */ - if (req->flags & REQ_F_NOWAIT) - return false; - - /* Only for buffered IO */ - if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) - return false; - - /* - * just use poll if we can, and don't attempt if the fs doesn't - * support callback based unlocks - */ - if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) - return false; - - wait->wait.func = io_async_buf_func; - wait->wait.private = req; - wait->wait.flags = 0; - INIT_LIST_HEAD(&wait->wait.entry); - kiocb->ki_flags |= IOCB_WAITQ; - kiocb->ki_flags &= ~IOCB_NOWAIT; - kiocb->ki_waitq = wait; - return true; -} - -static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) -{ - if (likely(req->file->f_op->read_iter)) - return call_read_iter(req->file, &req->rw.kiocb, iter); - else if (req->file->f_op->read) - return loop_rw_iter(READ, req, iter); - else - return -EINVAL; -} - -static bool need_read_all(struct io_kiocb *req) -{ - return req->flags & REQ_F_ISREG || - S_ISBLK(file_inode(req->file)->i_mode); -} - -static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) -{ - struct kiocb *kiocb = &req->rw.kiocb; - struct io_ring_ctx *ctx = req->ctx; - struct file *file = req->file; - int ret; - - if (unlikely(!file || !(file->f_mode & mode))) - return -EBADF; - - if (!io_req_ffs_set(req)) - req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; - - kiocb->ki_flags = iocb_flags(file); - ret = kiocb_set_rw_flags(kiocb, req->rw.flags); - if (unlikely(ret)) - return ret; - - /* - * If the file is marked O_NONBLOCK, still allow retry for it if it - * supports async. Otherwise it's impossible to use O_NONBLOCK files - * reliably. If not, or it IOCB_NOWAIT is set, don't retry. - */ - if ((kiocb->ki_flags & IOCB_NOWAIT) || - ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) - req->flags |= REQ_F_NOWAIT; - - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) - return -EOPNOTSUPP; - - kiocb->private = NULL; - kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; - kiocb->ki_complete = io_complete_rw_iopoll; - req->iopoll_completed = 0; - } else { - if (kiocb->ki_flags & IOCB_HIPRI) - return -EINVAL; - kiocb->ki_complete = io_complete_rw; - } - - return 0; -} - -static int io_read(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; - struct kiocb *kiocb = &req->rw.kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct io_async_rw *rw; - ssize_t ret, ret2; - loff_t *ppos; - - if (!req_has_async_data(req)) { - ret = io_import_iovec(READ, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - /* - * Safe and required to re-import if we're using provided - * buffers, as we dropped the selected one before retry. - */ - if (req->flags & REQ_F_BUFFER_SELECT) { - ret = io_import_iovec(READ, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } - - rw = req->async_data; - s = &rw->s; - /* - * We come here from an earlier attempt, restore our state to - * match in case it doesn't. It's cheap enough that we don't - * need to make this conditional. - */ - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } - ret = io_rw_init_file(req, FMODE_READ); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - req->cqe.res = iov_iter_count(&s->iter); - - if (force_nonblock) { - /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) { - ret = io_setup_async_rw(req, iovec, s, true); - return ret ?: -EAGAIN; - } - kiocb->ki_flags |= IOCB_NOWAIT; - } else { - /* Ensure we clear previously set non-block flag */ - kiocb->ki_flags &= ~IOCB_NOWAIT; - } - - ppos = io_kiocb_update_pos(req); - - ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - - ret = io_iter_do_read(req, &s->iter); - - if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - req->flags &= ~REQ_F_REISSUE; - /* if we can poll, just do that */ - if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) - return -EAGAIN; - /* IOPOLL retry should happen for io-wq threads */ - if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) - goto done; - /* no retry on NONBLOCK nor RWF_NOWAIT */ - if (req->flags & REQ_F_NOWAIT) - goto done; - ret = 0; - } else if (ret == -EIOCBQUEUED) { - goto out_free; - } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || - (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { - /* read all, failed, already did sync or don't want to retry */ - goto done; - } - - /* - * Don't depend on the iter state matching what was consumed, or being - * untouched in case of error. Restore it and we'll advance it - * manually if we need to. - */ - iov_iter_restore(&s->iter, &s->iter_state); - - ret2 = io_setup_async_rw(req, iovec, s, true); - if (ret2) - return ret2; - - iovec = NULL; - rw = req->async_data; - s = &rw->s; - /* - * Now use our persistent iterator and state, if we aren't already. - * We've restored and mapped the iter to match. - */ - - do { - /* - * We end up here because of a partial read, either from - * above or inside this loop. Advance the iter by the bytes - * that were consumed. - */ - iov_iter_advance(&s->iter, ret); - if (!iov_iter_count(&s->iter)) - break; - rw->bytes_done += ret; - iov_iter_save_state(&s->iter, &s->iter_state); - - /* if we can retry, do so with the callbacks armed */ - if (!io_rw_should_retry(req)) { - kiocb->ki_flags &= ~IOCB_WAITQ; - return -EAGAIN; - } - - /* - * Now retry read with the IOCB_WAITQ parts set in the iocb. If - * we get -EIOCBQUEUED, then we'll get a notification when the - * desired page gets unlocked. We can also get a partial read - * here, and if we do, then just retry at the new offset. - */ - ret = io_iter_do_read(req, &s->iter); - if (ret == -EIOCBQUEUED) - return 0; - /* we got some bytes, but not all. retry. */ - kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(&s->iter, &s->iter_state); - } while (ret > 0); -done: - kiocb_done(req, ret, issue_flags); -out_free: - /* it's faster to check here then delegate to kfree */ - if (iovec) - kfree(iovec); - return 0; -} - -static int io_write(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; - struct kiocb *kiocb = &req->rw.kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - ssize_t ret, ret2; - loff_t *ppos; - - if (!req_has_async_data(req)) { - ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - struct io_async_rw *rw = req->async_data; - - s = &rw->s; - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } - ret = io_rw_init_file(req, FMODE_WRITE); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - req->cqe.res = iov_iter_count(&s->iter); - - if (force_nonblock) { - /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) - goto copy_iov; - - /* file path doesn't support NOWAIT for non-direct_IO */ - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && - (req->flags & REQ_F_ISREG)) - goto copy_iov; - - kiocb->ki_flags |= IOCB_NOWAIT; - } else { - /* Ensure we clear previously set non-block flag */ - kiocb->ki_flags &= ~IOCB_NOWAIT; - } - - ppos = io_kiocb_update_pos(req); - - ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); - if (unlikely(ret)) - goto out_free; - - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * io_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (req->flags & REQ_F_ISREG) { - sb_start_write(file_inode(req->file)->i_sb); - __sb_writers_release(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE); - } - kiocb->ki_flags |= IOCB_WRITE; - - if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &s->iter); - else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, req, &s->iter); - else - ret2 = -EINVAL; - - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - ret2 = -EAGAIN; - } - - /* - * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just - * retry them without IOCB_NOWAIT. - */ - if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) - ret2 = -EAGAIN; - /* no retry on NONBLOCK nor RWF_NOWAIT */ - if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) - goto done; - if (!force_nonblock || ret2 != -EAGAIN) { - /* IOPOLL retry should happen for io-wq threads */ - if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) - goto copy_iov; -done: - kiocb_done(req, ret2, issue_flags); - } else { -copy_iov: - iov_iter_restore(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, false); - return ret ?: -EAGAIN; - } -out_free: - /* it's reportedly faster than delegating the null check to kfree() */ - if (iovec) - kfree(iovec); - return ret; -} - -static int io_renameat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_rename *ren = &req->rename; - const char __user *oldf, *newf; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ren->old_dfd = READ_ONCE(sqe->fd); - oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ren->new_dfd = READ_ONCE(sqe->len); - ren->flags = READ_ONCE(sqe->rename_flags); - - ren->oldpath = getname(oldf); - if (IS_ERR(ren->oldpath)) - return PTR_ERR(ren->oldpath); - - ren->newpath = getname(newf); - if (IS_ERR(ren->newpath)) { - putname(ren->oldpath); - return PTR_ERR(ren->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rename *ren = &req->rename; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, - ren->newpath, ren->flags); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; -} - -static inline void __io_xattr_finish(struct io_kiocb *req) -{ - struct io_xattr *ix = &req->xattr; - - if (ix->filename) - putname(ix->filename); - - kfree(ix->ctx.kname); - kvfree(ix->ctx.kvalue); -} - -static void io_xattr_finish(struct io_kiocb *req, int ret) -{ - req->flags &= ~REQ_F_NEED_CLEANUP; - - __io_xattr_finish(req); - io_req_complete(req, ret); -} - -static int __io_getxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = &req->xattr; - const char __user *name; - int ret; - - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ix->filename = NULL; - ix->ctx.kvalue = NULL; - name = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ix->ctx.size = READ_ONCE(sqe->len); - ix->ctx.flags = READ_ONCE(sqe->xattr_flags); - - if (ix->ctx.flags) - return -EINVAL; - - ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); - if (!ix->ctx.kname) - return -ENOMEM; - - ret = strncpy_from_user(ix->ctx.kname->name, name, - sizeof(ix->ctx.kname->name)); - if (!ret || ret == sizeof(ix->ctx.kname->name)) - ret = -ERANGE; - if (ret < 0) { - kfree(ix->ctx.kname); - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_fgetxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_getxattr_prep(req, sqe); -} - -static int io_getxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = &req->xattr; - const char __user *path; - int ret; - - ret = __io_getxattr_prep(req, sqe); - if (ret) - return ret; - - path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } - - return ret; -} - -static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = &req->xattr; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), - req->file->f_path.dentry, - &ix->ctx); - - io_xattr_finish(req, ret); - return 0; -} - -static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = &req->xattr; - unsigned int lookup_flags = LOOKUP_FOLLOW; - struct path path; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - -retry: - ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = do_getxattr(mnt_user_ns(path.mnt), - path.dentry, - &ix->ctx); - - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - - io_xattr_finish(req, ret); - return 0; -} - -static int __io_setxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = &req->xattr; - const char __user *name; - int ret; - - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ix->filename = NULL; - name = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ix->ctx.kvalue = NULL; - ix->ctx.size = READ_ONCE(sqe->len); - ix->ctx.flags = READ_ONCE(sqe->xattr_flags); - - ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); - if (!ix->ctx.kname) - return -ENOMEM; - - ret = setxattr_copy(name, &ix->ctx); - if (ret) { - kfree(ix->ctx.kname); - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_setxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = &req->xattr; - const char __user *path; - int ret; - - ret = __io_setxattr_prep(req, sqe); - if (ret) - return ret; - - path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } - - return ret; -} - -static int io_fsetxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_setxattr_prep(req, sqe); -} - -static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, - struct path *path) -{ - struct io_xattr *ix = &req->xattr; - int ret; - - ret = mnt_want_write(path->mnt); - if (!ret) { - ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); - mnt_drop_write(path->mnt); - } - - return ret; -} - -static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = __io_setxattr(req, issue_flags, &req->file->f_path); - io_xattr_finish(req, ret); - - return 0; -} - -static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = &req->xattr; - unsigned int lookup_flags = LOOKUP_FOLLOW; - struct path path; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - -retry: - ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = __io_setxattr(req, issue_flags, &path); - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - - io_xattr_finish(req, ret); - return 0; -} - -static int io_unlinkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_unlink *un = &req->unlink; - const char __user *fname; - - if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - un->dfd = READ_ONCE(sqe->fd); - - un->flags = READ_ONCE(sqe->unlink_flags); - if (un->flags & ~AT_REMOVEDIR) - return -EINVAL; - - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - un->filename = getname(fname); - if (IS_ERR(un->filename)) - return PTR_ERR(un->filename); - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_unlink *un = &req->unlink; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - if (un->flags & AT_REMOVEDIR) - ret = do_rmdir(un->dfd, un->filename); - else - ret = do_unlinkat(un->dfd, un->filename); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; -} - -static int io_mkdirat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_mkdir *mkd = &req->mkdir; - const char __user *fname; - - if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - mkd->dfd = READ_ONCE(sqe->fd); - mkd->mode = READ_ONCE(sqe->len); - - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - mkd->filename = getname(fname); - if (IS_ERR(mkd->filename)) - return PTR_ERR(mkd->filename); - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_mkdir *mkd = &req->mkdir; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; -} - -static int io_symlinkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_symlink *sl = &req->symlink; - const char __user *oldpath, *newpath; - - if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - sl->new_dfd = READ_ONCE(sqe->fd); - oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - - sl->oldpath = getname(oldpath); - if (IS_ERR(sl->oldpath)) - return PTR_ERR(sl->oldpath); - - sl->newpath = getname(newpath); - if (IS_ERR(sl->newpath)) { - putname(sl->oldpath); - return PTR_ERR(sl->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_symlink *sl = &req->symlink; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; -} - -static int io_linkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_hardlink *lnk = &req->hardlink; - const char __user *oldf, *newf; - - if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - lnk->old_dfd = READ_ONCE(sqe->fd); - lnk->new_dfd = READ_ONCE(sqe->len); - oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - lnk->flags = READ_ONCE(sqe->hardlink_flags); - - lnk->oldpath = getname(oldf); - if (IS_ERR(lnk->oldpath)) - return PTR_ERR(lnk->oldpath); - - lnk->newpath = getname(newf); - if (IS_ERR(lnk->newpath)) { - putname(lnk->oldpath); - return PTR_ERR(lnk->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_hardlink *lnk = &req->hardlink; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, - lnk->newpath, lnk->flags); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; -} - -static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) -{ - req->uring_cmd.task_work_cb(&req->uring_cmd); -} - -void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *)) -{ - struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); - - req->uring_cmd.task_work_cb = task_work_cb; - req->io_task_work.func = io_uring_cmd_work; - io_req_task_work_add(req); -} -EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); - -static inline void io_req_set_cqe32_extra(struct io_kiocb *req, - u64 extra1, u64 extra2) -{ - req->extra1 = extra1; - req->extra2 = extra2; - req->flags |= REQ_F_CQE32_INIT; -} - -/* - * Called by consumers of io_uring_cmd, if they originally returned - * -EIOCBQUEUED upon receiving the command. - */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) -{ - struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); - - if (ret < 0) - req_set_fail(req); - - if (req->ctx->flags & IORING_SETUP_CQE32) - io_req_set_cqe32_extra(req, res2, 0); - io_req_complete(req, ret); -} -EXPORT_SYMBOL_GPL(io_uring_cmd_done); - -static int io_uring_cmd_prep_async(struct io_kiocb *req) -{ - size_t cmd_size; - - cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); - - memcpy(req->async_data, req->uring_cmd.cmd, cmd_size); - return 0; -} - -static int io_uring_cmd_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_uring_cmd *ioucmd = &req->uring_cmd; - - if (sqe->rw_flags) - return -EINVAL; - ioucmd->cmd = sqe->cmd; - ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); - return 0; -} - -static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_uring_cmd *ioucmd = &req->uring_cmd; - struct io_ring_ctx *ctx = req->ctx; - struct file *file = req->file; - int ret; - - if (!req->file->f_op->uring_cmd) - return -EOPNOTSUPP; - - if (ctx->flags & IORING_SETUP_SQE128) - issue_flags |= IO_URING_F_SQE128; - if (ctx->flags & IORING_SETUP_CQE32) - issue_flags |= IO_URING_F_CQE32; - if (ctx->flags & IORING_SETUP_IOPOLL) - issue_flags |= IO_URING_F_IOPOLL; - - if (req_has_async_data(req)) - ioucmd->cmd = req->async_data; - - ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN) { - if (!req_has_async_data(req)) { - if (io_alloc_async_data(req)) - return -ENOMEM; - io_uring_cmd_prep_async(req); - } - return -EAGAIN; - } - - if (ret != -EIOCBQUEUED) - io_uring_cmd_done(ioucmd, ret, 0); - return 0; -} - -static int __io_splice_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_splice *sp = &req->splice; - unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; - - sp->len = READ_ONCE(sqe->len); - sp->flags = READ_ONCE(sqe->splice_flags); - if (unlikely(sp->flags & ~valid_flags)) - return -EINVAL; - sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); - return 0; -} - -static int io_tee_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) - return -EINVAL; - return __io_splice_prep(req, sqe); -} - -static int io_tee(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_splice *sp = &req->splice; - struct file *out = sp->file_out; - unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; - struct file *in; - long ret = 0; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); - else - in = io_file_get_normal(req, sp->splice_fd_in); - if (!in) { - ret = -EBADF; - goto done; - } - - if (sp->len) - ret = do_tee(in, out, sp->len, flags); - - if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); -done: - if (ret != sp->len) - req_set_fail(req); - __io_req_complete(req, 0, ret, 0); - return 0; -} - -static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_splice *sp = &req->splice; - - sp->off_in = READ_ONCE(sqe->splice_off_in); - sp->off_out = READ_ONCE(sqe->off); - return __io_splice_prep(req, sqe); -} - -static int io_splice(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_splice *sp = &req->splice; - struct file *out = sp->file_out; - unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; - loff_t *poff_in, *poff_out; - struct file *in; - long ret = 0; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); - else - in = io_file_get_normal(req, sp->splice_fd_in); - if (!in) { - ret = -EBADF; - goto done; - } - - poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; - poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; - - if (sp->len) - ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); - - if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); -done: - if (ret != sp->len) - req_set_fail(req); - __io_req_complete(req, 0, ret, 0); - return 0; -} - -static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - return 0; -} - -/* - * IORING_OP_NOP just posts a completion event, nothing else. - */ -static int io_nop(struct io_kiocb *req, unsigned int issue_flags) -{ - __io_req_complete(req, issue_flags, 0, 0); - return 0; -} - -static int io_msg_ring_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || - sqe->buf_index || sqe->personality)) - return -EINVAL; - - req->msg.user_data = READ_ONCE(sqe->off); - req->msg.len = READ_ONCE(sqe->len); - return 0; -} - -static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *target_ctx; - struct io_msg *msg = &req->msg; - bool filled; - int ret; - - ret = -EBADFD; - if (req->file->f_op != &io_uring_fops) - goto done; - - ret = -EOVERFLOW; - target_ctx = req->file->private_data; - - spin_lock(&target_ctx->completion_lock); - filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0); - io_commit_cqring(target_ctx); - spin_unlock(&target_ctx->completion_lock); - - if (filled) { - io_cqring_ev_posted(target_ctx); - ret = 0; - } - -done: - if (ret < 0) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - /* put file to avoid an attempt to IOPOLL the req */ - io_put_file(req->file); - req->file = NULL; - return 0; -} - -static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - req->sync.flags = READ_ONCE(sqe->fsync_flags); - if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) - return -EINVAL; - - req->sync.off = READ_ONCE(sqe->off); - req->sync.len = READ_ONCE(sqe->len); - return 0; -} - -static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) -{ - loff_t end = req->sync.off + req->sync.len; - int ret; - - /* fsync always requires a blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = vfs_fsync_range(req->file, req->sync.off, - end > 0 ? end : LLONG_MAX, - req->sync.flags & IORING_FSYNC_DATASYNC); - io_req_complete(req, ret); - return 0; -} - -static int io_fallocate_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - req->sync.off = READ_ONCE(sqe->off); - req->sync.len = READ_ONCE(sqe->addr); - req->sync.mode = READ_ONCE(sqe->len); - return 0; -} - -static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) -{ - int ret; - - /* fallocate always requiring blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, - req->sync.len); - if (ret >= 0) - fsnotify_modify(req->file); - io_req_complete(req, ret); - return 0; -} - -static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - const char __user *fname; - int ret; - - if (unlikely(sqe->buf_index)) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - /* open.how should be already initialised */ - if (!(req->open.how.flags & O_PATH) && force_o_largefile()) - req->open.how.flags |= O_LARGEFILE; - - req->open.dfd = READ_ONCE(sqe->fd); - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->open.filename = getname(fname); - if (IS_ERR(req->open.filename)) { - ret = PTR_ERR(req->open.filename); - req->open.filename = NULL; - return ret; - } - - req->open.file_slot = READ_ONCE(sqe->file_index); - if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) - return -EINVAL; - - req->open.nofile = rlimit(RLIMIT_NOFILE); - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - u64 mode = READ_ONCE(sqe->len); - u64 flags = READ_ONCE(sqe->open_flags); - - req->open.how = build_open_how(flags, mode); - return __io_openat_prep(req, sqe); -} - -static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct open_how __user *how; - size_t len; - int ret; - - how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - len = READ_ONCE(sqe->len); - if (len < OPEN_HOW_SIZE_VER0) - return -EINVAL; - - ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, - len); - if (ret) - return ret; - - return __io_openat_prep(req, sqe); -} - -static int io_file_bitmap_get(struct io_ring_ctx *ctx) -{ - struct io_file_table *table = &ctx->file_table; - unsigned long nr = ctx->nr_user_files; - int ret; - - do { - ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); - if (ret != nr) - return ret; - - if (!table->alloc_hint) - break; - - nr = table->alloc_hint; - table->alloc_hint = 0; - } while (1); - - return -ENFILE; -} - -/* - * Note when io_fixed_fd_install() returns error value, it will ensure - * fput() is called correspondingly. - */ -static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot) -{ - bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; - struct io_ring_ctx *ctx = req->ctx; - int ret; - - io_ring_submit_lock(ctx, issue_flags); - - if (alloc_slot) { - ret = io_file_bitmap_get(ctx); - if (unlikely(ret < 0)) - goto err; - file_slot = ret; - } else { - file_slot--; - } - - ret = io_install_fixed_file(req, file, issue_flags, file_slot); - if (!ret && alloc_slot) - ret = file_slot; -err: - io_ring_submit_unlock(ctx, issue_flags); - if (unlikely(ret < 0)) - fput(file); - return ret; -} - -static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) -{ - struct open_flags op; - struct file *file; - bool resolve_nonblock, nonblock_set; - bool fixed = !!req->open.file_slot; - int ret; - - ret = build_open_flags(&req->open.how, &op); - if (ret) - goto err; - nonblock_set = op.open_flag & O_NONBLOCK; - resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED; - if (issue_flags & IO_URING_F_NONBLOCK) { - /* - * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, - * it'll always -EAGAIN - */ - if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) - return -EAGAIN; - op.lookup_flags |= LOOKUP_CACHED; - op.open_flag |= O_NONBLOCK; - } - - if (!fixed) { - ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); - if (ret < 0) - goto err; - } - - file = do_filp_open(req->open.dfd, req->open.filename, &op); - if (IS_ERR(file)) { - /* - * We could hang on to this 'fd' on retrying, but seems like - * marginal gain for something that is now known to be a slower - * path. So just put it, and we'll get a new one when we retry. - */ - if (!fixed) - put_unused_fd(ret); - - ret = PTR_ERR(file); - /* only retry if RESOLVE_CACHED wasn't already set by application */ - if (ret == -EAGAIN && - (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) - return -EAGAIN; - goto err; - } - - if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) - file->f_flags &= ~O_NONBLOCK; - fsnotify_open(file); - - if (!fixed) - fd_install(ret, file); - else - ret = io_fixed_fd_install(req, issue_flags, file, - req->open.file_slot); -err: - putname(req->open.filename); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_openat(struct io_kiocb *req, unsigned int issue_flags) -{ - return io_openat2(req, issue_flags); -} - -static int io_remove_buffers_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_provide_buf *p = &req->pbuf; - u64 tmp; - - if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || - sqe->splice_fd_in) - return -EINVAL; - - tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) - return -EINVAL; - - memset(p, 0, sizeof(*p)); - p->nbufs = tmp; - p->bgid = READ_ONCE(sqe->buf_group); - return 0; -} - -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) -{ - unsigned i = 0; - - /* shouldn't happen */ - if (!nbufs) - return 0; - - if (bl->buf_nr_pages) { - int j; - - i = bl->buf_ring->tail - bl->head; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - return i; - } - - /* the head kbuf is the list itself */ - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - - nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&nxt->list); - if (++i == nbufs) - return i; - cond_resched(); - } - i++; - - return i; -} - -static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = &req->pbuf; - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->buf_nr_pages) - ret = __io_remove_buffers(ctx, bl, p->nbufs); - } - if (ret < 0) - req_set_fail(req); - - /* complete before unlock, IOPOLL may need the lock */ - __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, issue_flags); - return 0; -} - -static int io_provide_buffers_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - unsigned long size, tmp_check; - struct io_provide_buf *p = &req->pbuf; - u64 tmp; - - if (sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) - return -E2BIG; - p->nbufs = tmp; - p->addr = READ_ONCE(sqe->addr); - p->len = READ_ONCE(sqe->len); - - if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, - &size)) - return -EOVERFLOW; - if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) - return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; - if (!access_ok(u64_to_user_ptr(p->addr), size)) - return -EFAULT; - - p->bgid = READ_ONCE(sqe->buf_group); - tmp = READ_ONCE(sqe->off); - if (tmp > USHRT_MAX) - return -E2BIG; - p->bid = tmp; - return 0; -} - -static int io_refill_buffer_cache(struct io_ring_ctx *ctx) -{ - struct io_buffer *buf; - struct page *page; - int bufs_in_page; - - /* - * Completions that don't happen inline (eg not under uring_lock) will - * add to ->io_buffers_comp. If we don't have any free buffers, check - * the completion list and splice those entries first. - */ - if (!list_empty_careful(&ctx->io_buffers_comp)) { - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) { - list_splice_init(&ctx->io_buffers_comp, - &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - return 0; - } - spin_unlock(&ctx->completion_lock); - } - - /* - * No free buffers and no completion entries either. Allocate a new - * page worth of buffer entries and add those to our freelist. - */ - page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!page) - return -ENOMEM; - - list_add(&page->lru, &ctx->io_buffers_pages); - - buf = page_address(page); - bufs_in_page = PAGE_SIZE / sizeof(*buf); - while (bufs_in_page) { - list_add_tail(&buf->list, &ctx->io_buffers_cache); - buf++; - bufs_in_page--; - } - - return 0; -} - -static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, - struct io_buffer_list *bl) -{ - struct io_buffer *buf; - u64 addr = pbuf->addr; - int i, bid = pbuf->bid; - - for (i = 0; i < pbuf->nbufs; i++) { - if (list_empty(&ctx->io_buffers_cache) && - io_refill_buffer_cache(ctx)) - break; - buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, - list); - list_move_tail(&buf->list, &bl->buf_list); - buf->addr = addr; - buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); - buf->bid = bid; - buf->bgid = pbuf->bgid; - addr += pbuf->len; - bid++; - cond_resched(); - } - - return i ? 0 : -ENOMEM; -} - -static __cold int io_init_bl_list(struct io_ring_ctx *ctx) -{ - int i; - - ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), - GFP_KERNEL); - if (!ctx->io_bl) - return -ENOMEM; - - for (i = 0; i < BGID_ARRAY; i++) { - INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); - ctx->io_bl[i].bgid = i; - } - - return 0; -} - -static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = &req->pbuf; - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { - ret = io_init_bl_list(ctx); - if (ret) - goto err; - } - - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) { - ret = -ENOMEM; - goto err; - } - INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); - if (ret) { - kfree(bl); - goto err; - } - } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->buf_nr_pages) { - ret = -EINVAL; - goto err; - } - - ret = io_add_buffers(ctx, p, bl); -err: - if (ret < 0) - req_set_fail(req); - /* complete before unlock, IOPOLL may need the lock */ - __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, issue_flags); - return 0; -} - -static int io_epoll_ctl_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ -#if defined(CONFIG_EPOLL) - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - - req->epoll.epfd = READ_ONCE(sqe->fd); - req->epoll.op = READ_ONCE(sqe->len); - req->epoll.fd = READ_ONCE(sqe->off); - - if (ep_op_has_event(req->epoll.op)) { - struct epoll_event __user *ev; - - ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) - return -EFAULT; - } - - return 0; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) -{ -#if defined(CONFIG_EPOLL) - struct io_epoll *ie = &req->epoll; - int ret; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - - if (ret < 0) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ -#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - if (sqe->buf_index || sqe->off || sqe->splice_fd_in) - return -EINVAL; - - req->madvise.addr = READ_ONCE(sqe->addr); - req->madvise.len = READ_ONCE(sqe->len); - req->madvise.advice = READ_ONCE(sqe->fadvise_advice); - return 0; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) -{ -#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - struct io_madvise *ma = &req->madvise; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); - io_req_complete(req, ret); - return 0; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) - return -EINVAL; - - req->fadvise.offset = READ_ONCE(sqe->off); - req->fadvise.len = READ_ONCE(sqe->len); - req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); - return 0; -} - -static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_fadvise *fa = &req->fadvise; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) { - switch (fa->advice) { - case POSIX_FADV_NORMAL: - case POSIX_FADV_RANDOM: - case POSIX_FADV_SEQUENTIAL: - break; - default: - return -EAGAIN; - } - } - - ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); - if (ret < 0) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - const char __user *path; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (req->flags & REQ_F_FIXED_FILE) - return -EBADF; - - req->statx.dfd = READ_ONCE(sqe->fd); - req->statx.mask = READ_ONCE(sqe->len); - path = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - req->statx.flags = READ_ONCE(sqe->statx_flags); - - req->statx.filename = getname_flags(path, - getname_statx_lookup_flags(req->statx.flags), - NULL); - - if (IS_ERR(req->statx.filename)) { - int ret = PTR_ERR(req->statx.filename); - - req->statx.filename = NULL; - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_statx(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_statx *ctx = &req->statx; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, - ctx->buffer); - io_req_complete(req, ret); - return 0; -} - -static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) - return -EINVAL; - if (req->flags & REQ_F_FIXED_FILE) - return -EBADF; - - req->close.fd = READ_ONCE(sqe->fd); - req->close.file_slot = READ_ONCE(sqe->file_index); - if (req->close.file_slot && req->close.fd) - return -EINVAL; - - return 0; -} - -static int io_close(struct io_kiocb *req, unsigned int issue_flags) -{ - struct files_struct *files = current->files; - struct io_close *close = &req->close; - struct fdtable *fdt; - struct file *file; - int ret = -EBADF; - - if (req->close.file_slot) { - ret = io_close_fixed(req, issue_flags); - goto err; - } - - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (close->fd >= fdt->max_fds) { - spin_unlock(&files->file_lock); - goto err; - } - file = rcu_dereference_protected(fdt->fd[close->fd], - lockdep_is_held(&files->file_lock)); - if (!file || file->f_op == &io_uring_fops) { - spin_unlock(&files->file_lock); - goto err; - } - - /* if the file has a flush method, be safe and punt to async */ - if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { - spin_unlock(&files->file_lock); - return -EAGAIN; - } - - file = __close_fd_get_file(close->fd); - spin_unlock(&files->file_lock); - if (!file) - goto err; - - /* No ->flush() or already async, safely close from here */ - ret = filp_close(file, current->files); -err: - if (ret < 0) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - req->sync.off = READ_ONCE(sqe->off); - req->sync.len = READ_ONCE(sqe->len); - req->sync.flags = READ_ONCE(sqe->sync_range_flags); - return 0; -} - -static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) -{ - int ret; - - /* sync_file_range always requires a blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = sync_file_range(req->file, req->sync.off, req->sync.len, - req->sync.flags); - io_req_complete(req, ret); - return 0; -} - -#if defined(CONFIG_NET) -static int io_shutdown_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || - sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - req->shutdown.how = READ_ONCE(sqe->len); - return 0; -} - -static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) -{ - struct socket *sock; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - ret = __sys_shutdown_sock(sock, req->shutdown.how); - io_req_complete(req, ret); - return 0; -} - -static bool io_net_retry(struct socket *sock, int flags) -{ - if (!(flags & MSG_WAITALL)) - return false; - return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; -} - -static int io_setup_async_msg(struct io_kiocb *req, - struct io_async_msghdr *kmsg) -{ - struct io_async_msghdr *async_msg = req->async_data; - - if (async_msg) - return -EAGAIN; - if (io_alloc_async_data(req)) { - kfree(kmsg->free_iov); - return -ENOMEM; - } - async_msg = req->async_data; - req->flags |= REQ_F_NEED_CLEANUP; - memcpy(async_msg, kmsg, sizeof(*kmsg)); - async_msg->msg.msg_name = &async_msg->addr; - /* if were using fast_iov, set it to the new one */ - if (!async_msg->free_iov) - async_msg->msg.msg_iter.iov = async_msg->fast_iov; - - return -EAGAIN; -} - -static int io_sendmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - iomsg->msg.msg_name = &iomsg->addr; - iomsg->free_iov = iomsg->fast_iov; - return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, - req->sr_msg.msg_flags, &iomsg->free_iov); -} - -static int io_sendmsg_prep_async(struct io_kiocb *req) -{ - int ret; - - ret = io_sendmsg_copy_hdr(req, req->async_data); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; -} - -static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sr_msg *sr = &req->sr_msg; - - if (unlikely(sqe->file_index)) - return -EINVAL; - - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - sr->len = READ_ONCE(sqe->len); - sr->flags = READ_ONCE(sqe->addr2); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) - return -EINVAL; - sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; - if (sr->msg_flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - sr->msg_flags |= MSG_CMSG_COMPAT; -#endif - sr->done_io = 0; - return 0; -} - -static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_async_msghdr iomsg, *kmsg; - struct io_sr_msg *sr = &req->sr_msg; - struct socket *sock; - unsigned flags; - int min_ret = 0; - int ret; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); - - flags = sr->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); - - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - - if (ret < min_ret) { - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); - } - req_set_fail(req); - } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_send(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = &req->sr_msg; - struct msghdr msg; - struct iovec iov; - struct socket *sock; - unsigned flags; - int min_ret = 0; - int ret; - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); - if (unlikely(ret)) - return ret; - - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - - flags = sr->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); - - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); - if (ret < min_ret) { - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->len -= ret; - sr->buf += ret; - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return -EAGAIN; - } - req_set_fail(req); - } - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int __io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = &req->sr_msg; - struct iovec __user *uiov; - size_t iov_len; - int ret; - - ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, - &iomsg->uaddr, &uiov, &iov_len); - if (ret) - return ret; - - if (req->flags & REQ_F_BUFFER_SELECT) { - if (iov_len > 1) - return -EINVAL; - if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) - return -EFAULT; - sr->len = iomsg->fast_iov[0].iov_len; - iomsg->free_iov = NULL; - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &iomsg->free_iov, &iomsg->msg.msg_iter, - false); - if (ret > 0) - ret = 0; - } - - return ret; -} - -#ifdef CONFIG_COMPAT -static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = &req->sr_msg; - struct compat_iovec __user *uiov; - compat_uptr_t ptr; - compat_size_t len; - int ret; - - ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, - &ptr, &len); - if (ret) - return ret; - - uiov = compat_ptr(ptr); - if (req->flags & REQ_F_BUFFER_SELECT) { - compat_ssize_t clen; - - if (len > 1) - return -EINVAL; - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) - return -EINVAL; - sr->len = clen; - iomsg->free_iov = NULL; - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, (struct iovec __user *)uiov, len, - UIO_FASTIOV, &iomsg->free_iov, - &iomsg->msg.msg_iter, true); - if (ret < 0) - return ret; - } - - return 0; -} -#endif - -static int io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - iomsg->msg.msg_name = &iomsg->addr; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return __io_compat_recvmsg_copy_hdr(req, iomsg); -#endif - - return __io_recvmsg_copy_hdr(req, iomsg); -} - -static int io_recvmsg_prep_async(struct io_kiocb *req) -{ - int ret; - - ret = io_recvmsg_copy_hdr(req, req->async_data); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; -} - -static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sr_msg *sr = &req->sr_msg; - - if (unlikely(sqe->file_index)) - return -EINVAL; - - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - sr->len = READ_ONCE(sqe->len); - sr->flags = READ_ONCE(sqe->addr2); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) - return -EINVAL; - sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; - if (sr->msg_flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - sr->msg_flags |= MSG_CMSG_COMPAT; -#endif - sr->done_io = 0; - return 0; -} - -static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_async_msghdr iomsg, *kmsg; - struct io_sr_msg *sr = &req->sr_msg; - struct socket *sock; - unsigned int cflags; - unsigned flags; - int ret, min_ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - ret = io_recvmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); - - if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &sr->len, issue_flags); - if (!buf) - return -ENOBUFS; - kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = sr->len; - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - sr->len); - } - - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); - - kmsg->msg.msg_get_inq = 1; - ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); - if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); - } - req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { - req_set_fail(req); - } - - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - cflags = io_put_kbuf(req, issue_flags); - if (kmsg->msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - __io_req_complete(req, issue_flags, ret, cflags); - return 0; -} - -static int io_recv(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = &req->sr_msg; - struct msghdr msg; - struct socket *sock; - struct iovec iov; - unsigned int cflags; - unsigned flags; - int ret, min_ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &sr->len, issue_flags); - if (!buf) - return -ENOBUFS; - sr->buf = buf; - } - - ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); - if (unlikely(ret)) - goto out_free; - - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_get_inq = 1; - msg.msg_flags = 0; - msg.msg_controllen = 0; - msg.msg_iocb = NULL; - - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); - - ret = sock_recvmsg(sock, &msg, flags); - if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->len -= ret; - sr->buf += ret; - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return -EAGAIN; - } - req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { -out_free: - req_set_fail(req); - } - - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - cflags = io_put_kbuf(req, issue_flags); - if (msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - __io_req_complete(req, issue_flags, ret, cflags); - return 0; -} - -static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_accept *accept = &req->accept; - unsigned flags; - - if (sqe->len || sqe->buf_index) - return -EINVAL; - - accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - accept->flags = READ_ONCE(sqe->accept_flags); - accept->nofile = rlimit(RLIMIT_NOFILE); - flags = READ_ONCE(sqe->ioprio); - if (flags & ~IORING_ACCEPT_MULTISHOT) - return -EINVAL; - - accept->file_slot = READ_ONCE(sqe->file_index); - if (accept->file_slot) { - if (accept->flags & SOCK_CLOEXEC) - return -EINVAL; - if (flags & IORING_ACCEPT_MULTISHOT && - accept->file_slot != IORING_FILE_INDEX_ALLOC) - return -EINVAL; - } - if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) - accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - if (flags & IORING_ACCEPT_MULTISHOT) - req->flags |= REQ_F_APOLL_MULTISHOT; - return 0; -} - -static int io_accept(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_accept *accept = &req->accept; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; - bool fixed = !!accept->file_slot; - struct file *file; - int ret, fd; - -retry: - if (!fixed) { - fd = __get_unused_fd_flags(accept->flags, accept->nofile); - if (unlikely(fd < 0)) - return fd; - } - file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, - accept->flags); - if (IS_ERR(file)) { - if (!fixed) - put_unused_fd(fd); - ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) { - /* - * if it's multishot and polled, we don't need to - * return EAGAIN to arm the poll infra since it - * has already been done - */ - if ((req->flags & IO_APOLL_MULTI_POLLED) == - IO_APOLL_MULTI_POLLED) - ret = 0; - return ret; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - req_set_fail(req); - } else if (!fixed) { - fd_install(fd, file); - ret = fd; - } else { - ret = io_fixed_fd_install(req, issue_flags, file, - accept->file_slot); - } - - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - __io_req_complete(req, issue_flags, ret, 0); - return 0; - } - if (ret >= 0) { - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, - IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - goto retry; - } - ret = -ECANCELED; - } - - return ret; -} - -static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_socket *sock = &req->sock; - - if (sqe->addr || sqe->rw_flags || sqe->buf_index) - return -EINVAL; - - sock->domain = READ_ONCE(sqe->fd); - sock->type = READ_ONCE(sqe->off); - sock->protocol = READ_ONCE(sqe->len); - sock->file_slot = READ_ONCE(sqe->file_index); - sock->nofile = rlimit(RLIMIT_NOFILE); - - sock->flags = sock->type & ~SOCK_TYPE_MASK; - if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) - return -EINVAL; - if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - return 0; -} - -static int io_socket(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_socket *sock = &req->sock; - bool fixed = !!sock->file_slot; - struct file *file; - int ret, fd; - - if (!fixed) { - fd = __get_unused_fd_flags(sock->flags, sock->nofile); - if (unlikely(fd < 0)) - return fd; - } - file = __sys_socket_file(sock->domain, sock->type, sock->protocol); - if (IS_ERR(file)) { - if (!fixed) - put_unused_fd(fd); - ret = PTR_ERR(file); - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - req_set_fail(req); - } else if (!fixed) { - fd_install(fd, file); - ret = fd; - } else { - ret = io_fixed_fd_install(req, issue_flags, file, - sock->file_slot); - } - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_connect_prep_async(struct io_kiocb *req) -{ - struct io_async_connect *io = req->async_data; - struct io_connect *conn = &req->connect; - - return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); -} - -static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_connect *conn = &req->connect; - - if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - conn->addr_len = READ_ONCE(sqe->addr2); - return 0; -} - -static int io_connect(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_async_connect __io, *io; - unsigned file_flags; - int ret; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - if (req_has_async_data(req)) { - io = req->async_data; - } else { - ret = move_addr_to_kernel(req->connect.addr, - req->connect.addr_len, - &__io.address); - if (ret) - goto out; - io = &__io; - } - - file_flags = force_nonblock ? O_NONBLOCK : 0; - - ret = __sys_connect_file(req->file, &io->address, - req->connect.addr_len, file_flags); - if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - if (req_has_async_data(req)) - return -EAGAIN; - if (io_alloc_async_data(req)) { - ret = -ENOMEM; - goto out; - } - memcpy(req->async_data, &__io, sizeof(__io)); - return -EAGAIN; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; -out: - if (ret < 0) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} -#else /* !CONFIG_NET */ -#define IO_NETOP_FN(op) \ -static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \ -{ \ - return -EOPNOTSUPP; \ -} - -#define IO_NETOP_PREP(op) \ -IO_NETOP_FN(op) \ -static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \ -{ \ - return -EOPNOTSUPP; \ -} \ - -#define IO_NETOP_PREP_ASYNC(op) \ -IO_NETOP_PREP(op) \ -static int io_##op##_prep_async(struct io_kiocb *req) \ -{ \ - return -EOPNOTSUPP; \ -} - -IO_NETOP_PREP_ASYNC(sendmsg); -IO_NETOP_PREP_ASYNC(recvmsg); -IO_NETOP_PREP_ASYNC(connect); -IO_NETOP_PREP(accept); -IO_NETOP_PREP(socket); -IO_NETOP_PREP(shutdown); -IO_NETOP_FN(send); -IO_NETOP_FN(recv); -#endif /* CONFIG_NET */ - -struct io_poll_table { - struct poll_table_struct pt; - struct io_kiocb *req; - int nr_entries; - int error; -}; - -#define IO_POLL_CANCEL_FLAG BIT(31) -#define IO_POLL_REF_MASK GENMASK(30, 0) - -/* - * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can - * bump it and acquire ownership. It's disallowed to modify requests while not - * owning it, that prevents from races for enqueueing task_work's and b/w - * arming poll and wakeups. - */ -static inline bool io_poll_get_ownership(struct io_kiocb *req) -{ - return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); -} - -static void io_poll_mark_cancelled(struct io_kiocb *req) -{ - atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); -} - -static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) -{ - /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ - if (req->opcode == IORING_OP_POLL_ADD) - return req->async_data; - return req->apoll->double_poll; -} - -static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) -{ - if (req->opcode == IORING_OP_POLL_ADD) - return &req->poll; - return &req->apoll->poll; -} - -static void io_poll_req_insert(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; - - list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); -} - -static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, - wait_queue_func_t wake_func) -{ - poll->head = NULL; -#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) - /* mask in events that we always want/need */ - poll->events = events | IO_POLL_UNMASK; - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, wake_func); -} - -static inline void io_poll_remove_entry(struct io_poll_iocb *poll) -{ - struct wait_queue_head *head = smp_load_acquire(&poll->head); - - if (head) { - spin_lock_irq(&head->lock); - list_del_init(&poll->wait.entry); - poll->head = NULL; - spin_unlock_irq(&head->lock); - } -} - -static void io_poll_remove_entries(struct io_kiocb *req) -{ - /* - * Nothing to do if neither of those flags are set. Avoid dipping - * into the poll/apoll/double cachelines if we can. - */ - if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL))) - return; - - /* - * While we hold the waitqueue lock and the waitqueue is nonempty, - * wake_up_pollfree() will wait for us. However, taking the waitqueue - * lock in the first place can race with the waitqueue being freed. - * - * We solve this as eventpoll does: by taking advantage of the fact that - * all users of wake_up_pollfree() will RCU-delay the actual free. If - * we enter rcu_read_lock() and see that the pointer to the queue is - * non-NULL, we can then lock it without the memory being freed out from - * under us. - * - * Keep holding rcu_read_lock() as long as we hold the queue lock, in - * case the caller deletes the entry from the queue, leaving it empty. - * In that case, only RCU prevents the queue memory from being freed. - */ - rcu_read_lock(); - if (req->flags & REQ_F_SINGLE_POLL) - io_poll_remove_entry(io_poll_get_single(req)); - if (req->flags & REQ_F_DOUBLE_POLL) - io_poll_remove_entry(io_poll_get_double(req)); - rcu_read_unlock(); -} - -static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags); -/* - * All poll tw should go through this. Checks for poll events, manages - * references, does rewait, etc. - * - * Returns a negative error on failure. >0 when no action require, which is - * either spurious wakeup or multishot CQE is served. 0 when it's done with - * the request, then the mask is stored in req->cqe.res. - */ -static int io_poll_check_events(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int v, ret; - - /* req->task == current here, checking PF_EXITING is safe */ - if (unlikely(req->task->flags & PF_EXITING)) - return -ECANCELED; - - do { - v = atomic_read(&req->poll_refs); - - /* tw handler should be the owner, and so have some references */ - if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) - return 0; - if (v & IO_POLL_CANCEL_FLAG) - return -ECANCELED; - - if (!req->cqe.res) { - struct poll_table_struct pt = { ._key = req->apoll_events }; - req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; - } - - if ((unlikely(!req->cqe.res))) - continue; - if (req->apoll_events & EPOLLONESHOT) - return 0; - - /* multishot, just fill a CQE and proceed */ - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - __poll_t mask = mangle_poll(req->cqe.res & - req->apoll_events); - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - continue; - } - return -ECANCELED; - } - - io_tw_lock(req->ctx, locked); - if (unlikely(req->task->flags & PF_EXITING)) - return -EFAULT; - ret = io_issue_sqe(req, - IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); - if (ret) - return ret; - - /* - * Release all references, retry if someone tried to restart - * task_work while we were executing it. - */ - } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); - - return 1; -} - -static void io_poll_task_func(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = io_poll_check_events(req, locked); - if (ret > 0) - return; - - if (!ret) { - req->cqe.res = mangle_poll(req->cqe.res & req->poll.events); - } else { - req->cqe.res = ret; - req_set_fail(req); - } - - io_poll_remove_entries(req); - spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); - __io_req_complete_post(req, req->cqe.res, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - -static void io_apoll_task_func(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = io_poll_check_events(req, locked); - if (ret > 0) - return; - - io_poll_remove_entries(req); - spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); - spin_unlock(&ctx->completion_lock); - - if (!ret) - io_req_task_submit(req, locked); - else - io_req_complete_failed(req, ret); -} - -static void __io_poll_execute(struct io_kiocb *req, int mask, - __poll_t __maybe_unused events) -{ - req->cqe.res = mask; - /* - * This is useful for poll that is armed on behalf of another - * request, and where the wakeup path could be on a different - * CPU. We want to avoid pulling in req->apoll->events for that - * case. - */ - if (req->opcode == IORING_OP_POLL_ADD) - req->io_task_work.func = io_poll_task_func; - else - req->io_task_work.func = io_apoll_task_func; - - trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); - io_req_task_work_add(req); -} - -static inline void io_poll_execute(struct io_kiocb *req, int res, - __poll_t events) -{ - if (io_poll_get_ownership(req)) - __io_poll_execute(req, res, events); -} - -static void io_poll_cancel_req(struct io_kiocb *req) -{ - io_poll_mark_cancelled(req); - /* kick tw, which should complete the request */ - io_poll_execute(req, 0, 0); -} - -#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) -#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) -#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) - -static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct io_kiocb *req = wqe_to_req(wait); - struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, - wait); - __poll_t mask = key_to_poll(key); - - if (unlikely(mask & POLLFREE)) { - io_poll_mark_cancelled(req); - /* we have to kick tw in case it's not already */ - io_poll_execute(req, 0, poll->events); - - /* - * If the waitqueue is being freed early but someone is already - * holds ownership over it, we have to tear down the request as - * best we can. That means immediately removing the request from - * its waitqueue and preventing all further accesses to the - * waitqueue via the request. - */ - list_del_init(&poll->wait.entry); - - /* - * Careful: this *must* be the last step, since as soon - * as req->head is NULL'ed out, the request can be - * completed and freed, since aio_poll_complete_work() - * will no longer need to take the waitqueue lock. - */ - smp_store_release(&poll->head, NULL); - return 1; - } - - /* for instances that support it check for an event match first */ - if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) - return 0; - - if (io_poll_get_ownership(req)) { - /* optional, saves extra locking for removal in tw handler */ - if (mask && poll->events & EPOLLONESHOT) { - list_del_init(&poll->wait.entry); - poll->head = NULL; - if (wqe_is_double(wait)) - req->flags &= ~REQ_F_DOUBLE_POLL; - else - req->flags &= ~REQ_F_SINGLE_POLL; - } - __io_poll_execute(req, mask, poll->events); - } - return 1; -} - -static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, - struct wait_queue_head *head, - struct io_poll_iocb **poll_ptr) -{ - struct io_kiocb *req = pt->req; - unsigned long wqe_private = (unsigned long) req; - - /* - * The file being polled uses multiple waitqueues for poll handling - * (e.g. one for read, one for write). Setup a separate io_poll_iocb - * if this happens. - */ - if (unlikely(pt->nr_entries)) { - struct io_poll_iocb *first = poll; - - /* double add on the same waitqueue head, ignore */ - if (first->head == head) - return; - /* already have a 2nd entry, fail a third attempt */ - if (*poll_ptr) { - if ((*poll_ptr)->head == head) - return; - pt->error = -EINVAL; - return; - } - - poll = kmalloc(sizeof(*poll), GFP_ATOMIC); - if (!poll) { - pt->error = -ENOMEM; - return; - } - /* mark as double wq entry */ - wqe_private |= 1; - req->flags |= REQ_F_DOUBLE_POLL; - io_init_poll_iocb(poll, first->events, first->wait.func); - *poll_ptr = poll; - if (req->opcode == IORING_OP_POLL_ADD) - req->flags |= REQ_F_ASYNC_DATA; - } - - req->flags |= REQ_F_SINGLE_POLL; - pt->nr_entries++; - poll->head = head; - poll->wait.private = (void *) wqe_private; - - if (poll->events & EPOLLEXCLUSIVE) - add_wait_queue_exclusive(head, &poll->wait); - else - add_wait_queue(head, &poll->wait); -} - -static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - - __io_queue_proc(&pt->req->poll, pt, head, - (struct io_poll_iocb **) &pt->req->async_data); -} - -static int __io_arm_poll_handler(struct io_kiocb *req, - struct io_poll_iocb *poll, - struct io_poll_table *ipt, __poll_t mask) -{ - struct io_ring_ctx *ctx = req->ctx; - int v; - - INIT_HLIST_NODE(&req->hash_node); - req->work.cancel_seq = atomic_read(&ctx->cancel_seq); - io_init_poll_iocb(poll, mask, io_poll_wake); - poll->file = req->file; - - req->apoll_events = poll->events; - - ipt->pt._key = mask; - ipt->req = req; - ipt->error = 0; - ipt->nr_entries = 0; - - /* - * Take the ownership to delay any tw execution up until we're done - * with poll arming. see io_poll_get_ownership(). - */ - atomic_set(&req->poll_refs, 1); - mask = vfs_poll(req->file, &ipt->pt) & poll->events; - - if (mask && (poll->events & EPOLLONESHOT)) { - io_poll_remove_entries(req); - /* no one else has access to the req, forget about the ref */ - return mask; - } - if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { - io_poll_remove_entries(req); - if (!ipt->error) - ipt->error = -EINVAL; - return 0; - } - - spin_lock(&ctx->completion_lock); - io_poll_req_insert(req); - spin_unlock(&ctx->completion_lock); - - if (mask) { - /* can't multishot if failed, just queue the event we've got */ - if (unlikely(ipt->error || !ipt->nr_entries)) { - poll->events |= EPOLLONESHOT; - req->apoll_events |= EPOLLONESHOT; - ipt->error = 0; - } - __io_poll_execute(req, mask, poll->events); - return 0; - } - - /* - * Release ownership. If someone tried to queue a tw while it was - * locked, kick it off for them. - */ - v = atomic_dec_return(&req->poll_refs); - if (unlikely(v & IO_POLL_REF_MASK)) - __io_poll_execute(req, 0, poll->events); - return 0; -} - -static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - struct async_poll *apoll = pt->req->apoll; - - __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); -} - -enum { - IO_APOLL_OK, - IO_APOLL_ABORTED, - IO_APOLL_READY -}; - -static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - struct io_ring_ctx *ctx = req->ctx; - struct async_poll *apoll; - struct io_poll_table ipt; - __poll_t mask = POLLPRI | POLLERR; - int ret; - - if (!def->pollin && !def->pollout) - return IO_APOLL_ABORTED; - if (!file_can_poll(req->file)) - return IO_APOLL_ABORTED; - if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) - return IO_APOLL_ABORTED; - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) - mask |= EPOLLONESHOT; - - if (def->pollin) { - mask |= EPOLLIN | EPOLLRDNORM; - - /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ - if ((req->opcode == IORING_OP_RECVMSG) && - (req->sr_msg.msg_flags & MSG_ERRQUEUE)) - mask &= ~EPOLLIN; - } else { - mask |= EPOLLOUT | EPOLLWRNORM; - } - if (def->poll_exclusive) - mask |= EPOLLEXCLUSIVE; - if (req->flags & REQ_F_POLLED) { - apoll = req->apoll; - kfree(apoll->double_poll); - } else if (!(issue_flags & IO_URING_F_UNLOCKED) && - !list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del_init(&apoll->poll.wait.entry); - } else { - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); - if (unlikely(!apoll)) - return IO_APOLL_ABORTED; - } - apoll->double_poll = NULL; - req->apoll = apoll; - req->flags |= REQ_F_POLLED; - ipt.pt._qproc = io_async_queue_proc; - - io_kbuf_recycle(req, issue_flags); - - ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); - if (ret || ipt.error) - return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; - - trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, - mask, apoll->poll.events); - return IO_APOLL_OK; -} - -/* - * Returns true if we found and killed one or more poll requests - */ -static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, - struct task_struct *tsk, bool cancel_all) -{ - struct hlist_node *tmp; - struct io_kiocb *req; - bool found = false; - int i; - - spin_lock(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; - - list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_match_task_safe(req, tsk, cancel_all)) { - hlist_del_init(&req->hash_node); - io_poll_cancel_req(req); - found = true; - } - } - } - spin_unlock(&ctx->completion_lock); - return found; -} - -static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, - struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct hlist_head *list; - struct io_kiocb *req; - - list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; - hlist_for_each_entry(req, list, hash_node) { - if (cd->data != req->cqe.user_data) - continue; - if (poll_only && req->opcode != IORING_OP_POLL_ADD) - continue; - if (cd->flags & IORING_ASYNC_CANCEL_ALL) { - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - } - return req; - } - return NULL; -} - -static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, - struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - int i; - - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; - - list = &ctx->cancel_hash[i]; - hlist_for_each_entry(req, list, hash_node) { - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - req->file != cd->file) - continue; - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - return req; - } - } - return NULL; -} - -static bool io_poll_disarm(struct io_kiocb *req) - __must_hold(&ctx->completion_lock) -{ - if (!io_poll_get_ownership(req)) - return false; - io_poll_remove_entries(req); - hash_del(&req->hash_node); - return true; -} - -static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd); - else - req = io_poll_find(ctx, false, cd); - if (!req) - return -ENOENT; - io_poll_cancel_req(req); - return 0; -} - -static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, - unsigned int flags) -{ - u32 events; - - events = READ_ONCE(sqe->poll32_events); -#ifdef __BIG_ENDIAN - events = swahw32(events); -#endif - if (!(flags & IORING_POLL_ADD_MULTI)) - events |= EPOLLONESHOT; - return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); -} - -static int io_poll_remove_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_poll_update *upd = &req->poll_update; - u32 flags; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - flags = READ_ONCE(sqe->len); - if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | - IORING_POLL_ADD_MULTI)) - return -EINVAL; - /* meaningless without update */ - if (flags == IORING_POLL_ADD_MULTI) - return -EINVAL; - - upd->old_user_data = READ_ONCE(sqe->addr); - upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; - upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; - - upd->new_user_data = READ_ONCE(sqe->off); - if (!upd->update_user_data && upd->new_user_data) - return -EINVAL; - if (upd->update_events) - upd->events = io_poll_parse_events(sqe, flags); - else if (sqe->poll32_events) - return -EINVAL; - - return 0; -} - -static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_poll_iocb *poll = &req->poll; - u32 flags; - - if (sqe->buf_index || sqe->off || sqe->addr) - return -EINVAL; - flags = READ_ONCE(sqe->len); - if (flags & ~IORING_POLL_ADD_MULTI) - return -EINVAL; - if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) - return -EINVAL; - - io_req_set_refcount(req); - poll->events = io_poll_parse_events(sqe, flags); - return 0; -} - -static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_poll_iocb *poll = &req->poll; - struct io_poll_table ipt; - int ret; - - ipt.pt._qproc = io_poll_queue_proc; - - ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); - if (!ret && ipt.error) - req_set_fail(req); - ret = ret ?: ipt.error; - if (ret) - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_cancel_data cd = { .data = req->poll_update.old_user_data, }; - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *preq; - int ret2, ret = 0; - bool locked; - - spin_lock(&ctx->completion_lock); - preq = io_poll_find(ctx, true, &cd); - if (!preq || !io_poll_disarm(preq)) { - spin_unlock(&ctx->completion_lock); - ret = preq ? -EALREADY : -ENOENT; - goto out; - } - spin_unlock(&ctx->completion_lock); - - if (req->poll_update.update_events || req->poll_update.update_user_data) { - /* only mask one event flags, keep behavior flags */ - if (req->poll_update.update_events) { - preq->poll.events &= ~0xffff; - preq->poll.events |= req->poll_update.events & 0xffff; - preq->poll.events |= IO_POLL_UNMASK; - } - if (req->poll_update.update_user_data) - preq->cqe.user_data = req->poll_update.new_user_data; - - ret2 = io_poll_add(preq, issue_flags); - /* successfully updated, don't complete poll request */ - if (!ret2) - goto out; - } - - req_set_fail(preq); - preq->cqe.res = -ECANCELED; - locked = !(issue_flags & IO_URING_F_UNLOCKED); - io_req_task_complete(preq, &locked); -out: - if (ret < 0) - req_set_fail(req); - /* complete update request, we're done with it */ - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) -{ - struct io_timeout_data *data = container_of(timer, - struct io_timeout_data, timer); - struct io_kiocb *req = data->req; - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->timeout_lock, flags); - list_del_init(&req->timeout.list); - atomic_set(&req->ctx->cq_timeouts, - atomic_read(&req->ctx->cq_timeouts) + 1); - spin_unlock_irqrestore(&ctx->timeout_lock, flags); - - if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) - req_set_fail(req); - - req->cqe.res = -ETIME; - req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req); - return HRTIMER_NORESTART; -} - -static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, - struct io_cancel_data *cd) - __must_hold(&ctx->timeout_lock) -{ - struct io_timeout_data *io; - struct io_kiocb *req; - bool found = false; - - list_for_each_entry(req, &ctx->timeout_list, timeout.list) { - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - cd->data != req->cqe.user_data) - continue; - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - } - found = true; - break; - } - if (!found) - return ERR_PTR(-ENOENT); - - io = req->async_data; - if (hrtimer_try_to_cancel(&io->timer) == -1) - return ERR_PTR(-EALREADY); - list_del_init(&req->timeout.list); - return req; -} - -static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - - spin_lock_irq(&ctx->timeout_lock); - req = io_timeout_extract(ctx, cd); - spin_unlock_irq(&ctx->timeout_lock); - - if (IS_ERR(req)) - return PTR_ERR(req); - io_req_task_queue_fail(req, -ECANCELED); - return 0; -} - -static clockid_t io_timeout_get_clock(struct io_timeout_data *data) -{ - switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { - case IORING_TIMEOUT_BOOTTIME: - return CLOCK_BOOTTIME; - case IORING_TIMEOUT_REALTIME: - return CLOCK_REALTIME; - default: - /* can't happen, vetted at prep time */ - WARN_ON_ONCE(1); - fallthrough; - case 0: - return CLOCK_MONOTONIC; - } -} - -static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->timeout_lock) -{ - struct io_timeout_data *io; - struct io_kiocb *req; - bool found = false; - - list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { - found = user_data == req->cqe.user_data; - if (found) - break; - } - if (!found) - return -ENOENT; - - io = req->async_data; - if (hrtimer_try_to_cancel(&io->timer) == -1) - return -EALREADY; - hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); - io->timer.function = io_link_timeout_fn; - hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); - return 0; -} - -static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->timeout_lock) -{ - struct io_cancel_data cd = { .data = user_data, }; - struct io_kiocb *req = io_timeout_extract(ctx, &cd); - struct io_timeout_data *data; - - if (IS_ERR(req)) - return PTR_ERR(req); - - req->timeout.off = 0; /* noseq */ - data = req->async_data; - list_add_tail(&req->timeout.list, &ctx->timeout_list); - hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); - data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); - return 0; -} - -static int io_timeout_remove_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_timeout_rem *tr = &req->timeout_rem; - - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) - return -EINVAL; - if (sqe->buf_index || sqe->len || sqe->splice_fd_in) - return -EINVAL; - - tr->ltimeout = false; - tr->addr = READ_ONCE(sqe->addr); - tr->flags = READ_ONCE(sqe->timeout_flags); - if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { - if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) - return -EINVAL; - if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) - tr->ltimeout = true; - if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) - return -EINVAL; - if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) - return -EFAULT; - if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) - return -EINVAL; - } else if (tr->flags) { - /* timeout removal doesn't support flags */ - return -EINVAL; - } - - return 0; -} - -static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) -{ - return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS - : HRTIMER_MODE_REL; -} - -/* - * Remove or update an existing timeout command - */ -static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_timeout_rem *tr = &req->timeout_rem; - struct io_ring_ctx *ctx = req->ctx; - int ret; - - if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { - struct io_cancel_data cd = { .data = tr->addr, }; - - spin_lock(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, &cd); - spin_unlock(&ctx->completion_lock); - } else { - enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); - - spin_lock_irq(&ctx->timeout_lock); - if (tr->ltimeout) - ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); - else - ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); - spin_unlock_irq(&ctx->timeout_lock); - } - - if (ret < 0) - req_set_fail(req); - io_req_complete_post(req, ret, 0); - return 0; -} - -static int __io_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - bool is_timeout_link) -{ - struct io_timeout_data *data; - unsigned flags; - u32 off = READ_ONCE(sqe->off); - - if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) - return -EINVAL; - if (off && is_timeout_link) - return -EINVAL; - flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | - IORING_TIMEOUT_ETIME_SUCCESS)) - return -EINVAL; - /* more than one clock specified is invalid, obviously */ - if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) - return -EINVAL; - - INIT_LIST_HEAD(&req->timeout.list); - req->timeout.off = off; - if (unlikely(off && !req->ctx->off_timeout_used)) - req->ctx->off_timeout_used = true; - - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (io_alloc_async_data(req)) - return -ENOMEM; - - data = req->async_data; - data->req = req; - data->flags = flags; - - if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) - return -EFAULT; - - if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) - return -EINVAL; - - INIT_LIST_HEAD(&req->timeout.list); - data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); - - if (is_timeout_link) { - struct io_submit_link *link = &req->ctx->submit_state.link; - - if (!link->head) - return -EINVAL; - if (link->last->opcode == IORING_OP_LINK_TIMEOUT) - return -EINVAL; - req->timeout.head = link->last; - link->last->flags |= REQ_F_ARM_LTIMEOUT; - } - return 0; -} - -static int io_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_timeout_prep(req, sqe, false); -} - -static int io_link_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_timeout_prep(req, sqe, true); -} - -static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_timeout_data *data = req->async_data; - struct list_head *entry; - u32 tail, off = req->timeout.off; - - spin_lock_irq(&ctx->timeout_lock); - - /* - * sqe->off holds how many events that need to occur for this - * timeout event to be satisfied. If it isn't set, then this is - * a pure timeout request, sequence isn't used. - */ - if (io_is_timeout_noseq(req)) { - entry = ctx->timeout_list.prev; - goto add; - } - - tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - req->timeout.target_seq = tail + off; - - /* Update the last seq here in case io_flush_timeouts() hasn't. - * This is safe because ->completion_lock is held, and submissions - * and completions are never mixed in the same ->completion_lock section. - */ - ctx->cq_last_tm_flush = tail; - - /* - * Insertion sort, ensuring the first entry in the list is always - * the one we need first. - */ - list_for_each_prev(entry, &ctx->timeout_list) { - struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, - timeout.list); - - if (io_is_timeout_noseq(nxt)) - continue; - /* nxt.seq is behind @tail, otherwise would've been completed */ - if (off >= nxt->timeout.target_seq - tail) - break; - } -add: - list_add(&req->timeout.list, entry); - data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); - return 0; -} - -static bool io_cancel_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_cancel_data *cd = data; - - if (req->ctx != cd->ctx) - return false; - if (cd->flags & IORING_ASYNC_CANCEL_ANY) { - ; - } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { - if (req->file != cd->file) - return false; - } else { - if (req->cqe.user_data != cd->data) - return false; - } - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == req->work.cancel_seq) - return false; - req->work.cancel_seq = cd->seq; - } - return true; -} - -static int io_async_cancel_one(struct io_uring_task *tctx, - struct io_cancel_data *cd) -{ - enum io_wq_cancel cancel_ret; - int ret = 0; - bool all; - - if (!tctx || !tctx->io_wq) - return -ENOENT; - - all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); - switch (cancel_ret) { - case IO_WQ_CANCEL_OK: - ret = 0; - break; - case IO_WQ_CANCEL_RUNNING: - ret = -EALREADY; - break; - case IO_WQ_CANCEL_NOTFOUND: - ret = -ENOENT; - break; - } - - return ret; -} - -static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); - - ret = io_async_cancel_one(req->task->io_uring, cd); - /* - * Fall-through even for -EALREADY, as we may have poll armed - * that need unarming. - */ - if (!ret) - return 0; - - spin_lock(&ctx->completion_lock); - ret = io_poll_cancel(ctx, cd); - if (ret != -ENOENT) - goto out; - if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) - ret = io_timeout_cancel(ctx, cd); -out: - spin_unlock(&ctx->completion_lock); - return ret; -} - -#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ - IORING_ASYNC_CANCEL_ANY) - -static int io_async_cancel_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; - if (sqe->off || sqe->len || sqe->splice_fd_in) - return -EINVAL; - - req->cancel.addr = READ_ONCE(sqe->addr); - req->cancel.flags = READ_ONCE(sqe->cancel_flags); - if (req->cancel.flags & ~CANCEL_FLAGS) - return -EINVAL; - if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) { - if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY) - return -EINVAL; - req->cancel.fd = READ_ONCE(sqe->fd); - } - - return 0; -} - -static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, - unsigned int issue_flags) -{ - bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); - struct io_ring_ctx *ctx = cd->ctx; - struct io_tctx_node *node; - int ret, nr = 0; - - do { - ret = io_try_cancel(req, cd); - if (ret == -ENOENT) - break; - if (!all) - return ret; - nr++; - } while (1); - - /* slow path, try all io-wq's */ - io_ring_submit_lock(ctx, issue_flags); - ret = -ENOENT; - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - ret = io_async_cancel_one(tctx, cd); - if (ret != -ENOENT) { - if (!all) - break; - nr++; - } - } - io_ring_submit_unlock(ctx, issue_flags); - return all ? nr : ret; -} - -static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_cancel_data cd = { - .ctx = req->ctx, - .data = req->cancel.addr, - .flags = req->cancel.flags, - .seq = atomic_inc_return(&req->ctx->cancel_seq), - }; - int ret; - - if (cd.flags & IORING_ASYNC_CANCEL_FD) { - if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, req->cancel.fd, - issue_flags); - else - req->file = io_file_get_normal(req, req->cancel.fd); - if (!req->file) { - ret = -EBADF; - goto done; - } - cd.file = req->file; - } - - ret = __io_async_cancel(&cd, req, issue_flags); -done: - if (ret < 0) - req_set_fail(req); - io_req_complete_post(req, ret, 0); - return 0; -} - -static int io_files_update_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) - return -EINVAL; - if (sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - req->rsrc_update.offset = READ_ONCE(sqe->off); - req->rsrc_update.nr_args = READ_ONCE(sqe->len); - if (!req->rsrc_update.nr_args) - return -EINVAL; - req->rsrc_update.arg = READ_ONCE(sqe->addr); - return 0; -} - -static int io_files_update_with_index_alloc(struct io_kiocb *req, - unsigned int issue_flags) -{ - __s32 __user *fds = u64_to_user_ptr(req->rsrc_update.arg); - unsigned int done; - struct file *file; - int ret, fd; - - for (done = 0; done < req->rsrc_update.nr_args; done++) { - if (copy_from_user(&fd, &fds[done], sizeof(fd))) { - ret = -EFAULT; - break; - } - - file = fget(fd); - if (!file) { - ret = -EBADF; - break; - } - ret = io_fixed_fd_install(req, issue_flags, file, - IORING_FILE_INDEX_ALLOC); - if (ret < 0) - break; - if (copy_to_user(&fds[done], &ret, sizeof(ret))) { - __io_close_fixed(req, issue_flags, ret); - ret = -EFAULT; - break; - } - } - - if (done) - return done; - return ret; -} - -static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_uring_rsrc_update2 up; - int ret; - - up.offset = req->rsrc_update.offset; - up.data = req->rsrc_update.arg; - up.nr = 0; - up.tags = 0; - up.resv = 0; - up.resv2 = 0; - - if (req->rsrc_update.offset == IORING_FILE_INDEX_ALLOC) { - ret = io_files_update_with_index_alloc(req, issue_flags); - } else { - io_ring_submit_lock(ctx, issue_flags); - ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, - &up, req->rsrc_update.nr_args); - io_ring_submit_unlock(ctx, issue_flags); - } - - if (ret < 0) - req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; -} - -static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - switch (req->opcode) { - case IORING_OP_NOP: - return io_nop_prep(req, sqe); - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - return io_prep_rw(req, sqe); - case IORING_OP_POLL_ADD: - return io_poll_add_prep(req, sqe); - case IORING_OP_POLL_REMOVE: - return io_poll_remove_prep(req, sqe); - case IORING_OP_FSYNC: - return io_fsync_prep(req, sqe); - case IORING_OP_SYNC_FILE_RANGE: - return io_sfr_prep(req, sqe); - case IORING_OP_SENDMSG: - case IORING_OP_SEND: - return io_sendmsg_prep(req, sqe); - case IORING_OP_RECVMSG: - case IORING_OP_RECV: - return io_recvmsg_prep(req, sqe); - case IORING_OP_CONNECT: - return io_connect_prep(req, sqe); - case IORING_OP_TIMEOUT: - return io_timeout_prep(req, sqe); - case IORING_OP_TIMEOUT_REMOVE: - return io_timeout_remove_prep(req, sqe); - case IORING_OP_ASYNC_CANCEL: - return io_async_cancel_prep(req, sqe); - case IORING_OP_LINK_TIMEOUT: - return io_link_timeout_prep(req, sqe); - case IORING_OP_ACCEPT: - return io_accept_prep(req, sqe); - case IORING_OP_FALLOCATE: - return io_fallocate_prep(req, sqe); - case IORING_OP_OPENAT: - return io_openat_prep(req, sqe); - case IORING_OP_CLOSE: - return io_close_prep(req, sqe); - case IORING_OP_FILES_UPDATE: - return io_files_update_prep(req, sqe); - case IORING_OP_STATX: - return io_statx_prep(req, sqe); - case IORING_OP_FADVISE: - return io_fadvise_prep(req, sqe); - case IORING_OP_MADVISE: - return io_madvise_prep(req, sqe); - case IORING_OP_OPENAT2: - return io_openat2_prep(req, sqe); - case IORING_OP_EPOLL_CTL: - return io_epoll_ctl_prep(req, sqe); - case IORING_OP_SPLICE: - return io_splice_prep(req, sqe); - case IORING_OP_PROVIDE_BUFFERS: - return io_provide_buffers_prep(req, sqe); - case IORING_OP_REMOVE_BUFFERS: - return io_remove_buffers_prep(req, sqe); - case IORING_OP_TEE: - return io_tee_prep(req, sqe); - case IORING_OP_SHUTDOWN: - return io_shutdown_prep(req, sqe); - case IORING_OP_RENAMEAT: - return io_renameat_prep(req, sqe); - case IORING_OP_UNLINKAT: - return io_unlinkat_prep(req, sqe); - case IORING_OP_MKDIRAT: - return io_mkdirat_prep(req, sqe); - case IORING_OP_SYMLINKAT: - return io_symlinkat_prep(req, sqe); - case IORING_OP_LINKAT: - return io_linkat_prep(req, sqe); - case IORING_OP_MSG_RING: - return io_msg_ring_prep(req, sqe); - case IORING_OP_FSETXATTR: - return io_fsetxattr_prep(req, sqe); - case IORING_OP_SETXATTR: - return io_setxattr_prep(req, sqe); - case IORING_OP_FGETXATTR: - return io_fgetxattr_prep(req, sqe); - case IORING_OP_GETXATTR: - return io_getxattr_prep(req, sqe); - case IORING_OP_SOCKET: - return io_socket_prep(req, sqe); - case IORING_OP_URING_CMD: - return io_uring_cmd_prep(req, sqe); - } - - printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", - req->opcode); - return -EINVAL; -} - -static int io_req_prep_async(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - - /* assign early for deferred execution for non-fixed file */ - if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) - req->file = io_file_get_normal(req, req->cqe.fd); - if (!def->needs_async_setup) - return 0; - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (io_alloc_async_data(req)) - return -EAGAIN; - - switch (req->opcode) { - case IORING_OP_READV: - return io_readv_prep_async(req); - case IORING_OP_WRITEV: - return io_writev_prep_async(req); - case IORING_OP_SENDMSG: - return io_sendmsg_prep_async(req); - case IORING_OP_RECVMSG: - return io_recvmsg_prep_async(req); - case IORING_OP_CONNECT: - return io_connect_prep_async(req); - case IORING_OP_URING_CMD: - return io_uring_cmd_prep_async(req); - } - printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", - req->opcode); - return -EFAULT; -} - -static u32 io_get_sequence(struct io_kiocb *req) -{ - u32 seq = req->ctx->cached_sq_head; - struct io_kiocb *cur; - - /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(cur, req) - seq--; - return seq; -} - -static __cold void io_drain_req(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_defer_entry *de; - int ret; - u32 seq = io_get_sequence(req); - - /* Still need defer if there is pending req in defer list. */ - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); -queue: - ctx->drain_active = false; - io_req_task_queue(req); - return; - } - spin_unlock(&ctx->completion_lock); - - ret = io_req_prep_async(req); - if (ret) { -fail: - io_req_complete_failed(req, ret); - return; - } - io_prep_async_link(req); - de = kmalloc(sizeof(*de), GFP_KERNEL); - if (!de) { - ret = -ENOMEM; - goto fail; - } - - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); - kfree(de); - goto queue; - } - - trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode); - de->req = req; - de->seq = seq; - list_add_tail(&de->list, &ctx->defer_list); - spin_unlock(&ctx->completion_lock); -} - -static void io_clean_op(struct io_kiocb *req) -{ - if (req->flags & REQ_F_BUFFER_SELECTED) { - spin_lock(&req->ctx->completion_lock); - io_put_kbuf_comp(req); - spin_unlock(&req->ctx->completion_lock); - } - - if (req->flags & REQ_F_NEED_CLEANUP) { - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: { - struct io_async_rw *io = req->async_data; - - kfree(io->free_iovec); - break; - } - case IORING_OP_RECVMSG: - case IORING_OP_SENDMSG: { - struct io_async_msghdr *io = req->async_data; - - kfree(io->free_iov); - break; - } - case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: - if (req->open.filename) - putname(req->open.filename); - break; - case IORING_OP_RENAMEAT: - putname(req->rename.oldpath); - putname(req->rename.newpath); - break; - case IORING_OP_UNLINKAT: - putname(req->unlink.filename); - break; - case IORING_OP_MKDIRAT: - putname(req->mkdir.filename); - break; - case IORING_OP_SYMLINKAT: - putname(req->symlink.oldpath); - putname(req->symlink.newpath); - break; - case IORING_OP_LINKAT: - putname(req->hardlink.oldpath); - putname(req->hardlink.newpath); - break; - case IORING_OP_STATX: - if (req->statx.filename) - putname(req->statx.filename); - break; - case IORING_OP_SETXATTR: - case IORING_OP_FSETXATTR: - case IORING_OP_GETXATTR: - case IORING_OP_FGETXATTR: - __io_xattr_finish(req); - break; - } - } - if ((req->flags & REQ_F_POLLED) && req->apoll) { - kfree(req->apoll->double_poll); - kfree(req->apoll); - req->apoll = NULL; - } - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - } - if (req->flags & REQ_F_CREDS) - put_cred(req->creds); - if (req->flags & REQ_F_ASYNC_DATA) { - kfree(req->async_data); - req->async_data = NULL; - } - req->flags &= ~IO_REQ_CLEAN_FLAGS; -} - -static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) -{ - if (req->file || !io_op_defs[req->opcode].needs_file) - return true; - - if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); - else - req->file = io_file_get_normal(req, req->cqe.fd); - - return !!req->file; -} - -static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - const struct cred *creds = NULL; - int ret; - - if (unlikely(!io_assign_file(req, issue_flags))) - return -EBADF; - - if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) - creds = override_creds(req->creds); - - if (!def->audit_skip) - audit_uring_entry(req->opcode); - - switch (req->opcode) { - case IORING_OP_NOP: - ret = io_nop(req, issue_flags); - break; - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - ret = io_read(req, issue_flags); - break; - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - ret = io_write(req, issue_flags); - break; - case IORING_OP_FSYNC: - ret = io_fsync(req, issue_flags); - break; - case IORING_OP_POLL_ADD: - ret = io_poll_add(req, issue_flags); - break; - case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, issue_flags); - break; - case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, issue_flags); - break; - case IORING_OP_SENDMSG: - ret = io_sendmsg(req, issue_flags); - break; - case IORING_OP_SEND: - ret = io_send(req, issue_flags); - break; - case IORING_OP_RECVMSG: - ret = io_recvmsg(req, issue_flags); - break; - case IORING_OP_RECV: - ret = io_recv(req, issue_flags); - break; - case IORING_OP_TIMEOUT: - ret = io_timeout(req, issue_flags); - break; - case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req, issue_flags); - break; - case IORING_OP_ACCEPT: - ret = io_accept(req, issue_flags); - break; - case IORING_OP_CONNECT: - ret = io_connect(req, issue_flags); - break; - case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel(req, issue_flags); - break; - case IORING_OP_FALLOCATE: - ret = io_fallocate(req, issue_flags); - break; - case IORING_OP_OPENAT: - ret = io_openat(req, issue_flags); - break; - case IORING_OP_CLOSE: - ret = io_close(req, issue_flags); - break; - case IORING_OP_FILES_UPDATE: - ret = io_files_update(req, issue_flags); - break; - case IORING_OP_STATX: - ret = io_statx(req, issue_flags); - break; - case IORING_OP_FADVISE: - ret = io_fadvise(req, issue_flags); - break; - case IORING_OP_MADVISE: - ret = io_madvise(req, issue_flags); - break; - case IORING_OP_OPENAT2: - ret = io_openat2(req, issue_flags); - break; - case IORING_OP_EPOLL_CTL: - ret = io_epoll_ctl(req, issue_flags); - break; - case IORING_OP_SPLICE: - ret = io_splice(req, issue_flags); - break; - case IORING_OP_PROVIDE_BUFFERS: - ret = io_provide_buffers(req, issue_flags); - break; - case IORING_OP_REMOVE_BUFFERS: - ret = io_remove_buffers(req, issue_flags); - break; - case IORING_OP_TEE: - ret = io_tee(req, issue_flags); - break; - case IORING_OP_SHUTDOWN: - ret = io_shutdown(req, issue_flags); - break; - case IORING_OP_RENAMEAT: - ret = io_renameat(req, issue_flags); - break; - case IORING_OP_UNLINKAT: - ret = io_unlinkat(req, issue_flags); - break; - case IORING_OP_MKDIRAT: - ret = io_mkdirat(req, issue_flags); - break; - case IORING_OP_SYMLINKAT: - ret = io_symlinkat(req, issue_flags); - break; - case IORING_OP_LINKAT: - ret = io_linkat(req, issue_flags); - break; - case IORING_OP_MSG_RING: - ret = io_msg_ring(req, issue_flags); - break; - case IORING_OP_FSETXATTR: - ret = io_fsetxattr(req, issue_flags); - break; - case IORING_OP_SETXATTR: - ret = io_setxattr(req, issue_flags); - break; - case IORING_OP_FGETXATTR: - ret = io_fgetxattr(req, issue_flags); - break; - case IORING_OP_GETXATTR: - ret = io_getxattr(req, issue_flags); - break; - case IORING_OP_SOCKET: - ret = io_socket(req, issue_flags); - break; - case IORING_OP_URING_CMD: - ret = io_uring_cmd(req, issue_flags); - break; - default: - ret = -EINVAL; - break; - } - - if (!def->audit_skip) - audit_uring_exit(!ret, ret); - - if (creds) - revert_creds(creds); - if (ret) - return ret; - /* If the op doesn't have a file, we're not polling for it */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file) - io_iopoll_req_issued(req, issue_flags); - - return 0; -} - -static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - req = io_put_req_find_next(req); - return req ? &req->work : NULL; -} - -static void io_wq_submit_work(struct io_wq_work *work) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - const struct io_op_def *def = &io_op_defs[req->opcode]; - unsigned int issue_flags = IO_URING_F_UNLOCKED; - bool needs_poll = false; - int ret = 0, err = -ECANCELED; - - /* one will be dropped by ->io_free_work() after returning to io-wq */ - if (!(req->flags & REQ_F_REFCOUNT)) - __io_req_set_refcount(req, 2); - else - req_ref_get(req); - - io_arm_ltimeout(req); - - /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ - if (work->flags & IO_WQ_WORK_CANCEL) { -fail: - io_req_task_queue_fail(req, err); - return; - } - if (!io_assign_file(req, issue_flags)) { - err = -EBADF; - work->flags |= IO_WQ_WORK_CANCEL; - goto fail; - } - - if (req->flags & REQ_F_FORCE_ASYNC) { - bool opcode_poll = def->pollin || def->pollout; - - if (opcode_poll && file_can_poll(req->file)) { - needs_poll = true; - issue_flags |= IO_URING_F_NONBLOCK; - } - } - - do { - ret = io_issue_sqe(req, issue_flags); - if (ret != -EAGAIN) - break; - /* - * We can get EAGAIN for iopolled IO even though we're - * forcing a sync submission from here, since we can't - * wait for request slots on the block side. - */ - if (!needs_poll) { - if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) - break; - cond_resched(); - continue; - } - - if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK) - return; - /* aborted or ready, in either case retry blocking */ - needs_poll = false; - issue_flags &= ~IO_URING_F_NONBLOCK; - } while (1); - - /* avoid locking problems by failing it from a clean context */ - if (ret) - io_req_task_queue_fail(req, ret); -} - -static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, - unsigned i) -{ - return &table->files[i]; -} - -static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, - int index) -{ - struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index); - - return (struct file *) (slot->file_ptr & FFS_MASK); -} - -static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) -{ - unsigned long file_ptr = (unsigned long) file; - - file_ptr |= io_file_get_flags(file); - file_slot->file_ptr = file_ptr; -} - -static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, - unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct file *file = NULL; - unsigned long file_ptr; - - io_ring_submit_lock(ctx, issue_flags); - - if (unlikely((unsigned int)fd >= ctx->nr_user_files)) - goto out; - fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - file = (struct file *) (file_ptr & FFS_MASK); - file_ptr &= ~FFS_MASK; - /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); - io_req_set_rsrc_node(req, ctx, 0); - WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap)); -out: - io_ring_submit_unlock(ctx, issue_flags); - return file; -} - -static struct file *io_file_get_normal(struct io_kiocb *req, int fd) -{ - struct file *file = fget(fd); - - trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd); - - /* we don't allow fixed io_uring files */ - if (file && file->f_op == &io_uring_fops) - io_req_track_inflight(req); - return file; -} - -static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) -{ - struct io_kiocb *prev = req->timeout.prev; - int ret = -ENOENT; - - if (prev) { - if (!(req->task->flags & PF_EXITING)) { - struct io_cancel_data cd = { - .ctx = req->ctx, - .data = prev->cqe.user_data, - }; - - ret = io_try_cancel(req, &cd); - } - io_req_complete_post(req, ret ?: -ETIME, 0); - io_put_req(prev); - } else { - io_req_complete_post(req, -ETIME, 0); - } -} - -static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) -{ - struct io_timeout_data *data = container_of(timer, - struct io_timeout_data, timer); - struct io_kiocb *prev, *req = data->req; - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->timeout_lock, flags); - prev = req->timeout.head; - req->timeout.head = NULL; - - /* - * We don't expect the list to be empty, that will only happen if we - * race with the completion of the linked work. - */ - if (prev) { - io_remove_next_linked(prev); - if (!req_ref_inc_not_zero(prev)) - prev = NULL; - } - list_del(&req->timeout.list); - req->timeout.prev = prev; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); - - req->io_task_work.func = io_req_task_link_timeout; - io_req_task_work_add(req); - return HRTIMER_NORESTART; -} - -static void io_queue_linked_timeout(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->timeout_lock); - /* - * If the back reference is NULL, then our linked request finished - * before we got a chance to setup the timer - */ - if (req->timeout.head) { - struct io_timeout_data *data = req->async_data; - - data->timer.function = io_link_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), - data->mode); - list_add_tail(&req->timeout.list, &ctx->ltimeout_list); - } - spin_unlock_irq(&ctx->timeout_lock); - /* drop submission reference */ - io_put_req(req); -} - -static void io_queue_async(struct io_kiocb *req, int ret) - __must_hold(&req->ctx->uring_lock) -{ - struct io_kiocb *linked_timeout; - - if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { - io_req_complete_failed(req, ret); - return; - } - - linked_timeout = io_prep_linked_timeout(req); - - switch (io_arm_poll_handler(req, 0)) { - case IO_APOLL_READY: - io_req_task_queue(req); - break; - case IO_APOLL_ABORTED: - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_kbuf_recycle(req, 0); - io_queue_iowq(req, NULL); - break; - case IO_APOLL_OK: - break; - } - - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); -} - -static inline void io_queue_sqe(struct io_kiocb *req) - __must_hold(&req->ctx->uring_lock) -{ - int ret; - - ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); - - if (req->flags & REQ_F_COMPLETE_INLINE) { - io_req_add_compl_list(req); - return; - } - /* - * We async punt it if the file wasn't marked NOWAIT, or if the file - * doesn't support non-blocking read/write attempts - */ - if (likely(!ret)) - io_arm_ltimeout(req); - else - io_queue_async(req, ret); -} - -static void io_queue_sqe_fallback(struct io_kiocb *req) - __must_hold(&req->ctx->uring_lock) -{ - if (unlikely(req->flags & REQ_F_FAIL)) { - /* - * We don't submit, fail them all, for that replace hardlinks - * with normal links. Extra REQ_F_LINK is tolerated. - */ - req->flags &= ~REQ_F_HARDLINK; - req->flags |= REQ_F_LINK; - io_req_complete_failed(req, req->cqe.res); - } else if (unlikely(req->ctx->drain_active)) { - io_drain_req(req); - } else { - int ret = io_req_prep_async(req); - - if (unlikely(ret)) - io_req_complete_failed(req, ret); - else - io_queue_iowq(req, NULL); - } -} - -/* - * Check SQE restrictions (opcode and flags). - * - * Returns 'true' if SQE is allowed, 'false' otherwise. - */ -static inline bool io_check_restriction(struct io_ring_ctx *ctx, - struct io_kiocb *req, - unsigned int sqe_flags) -{ - if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) - return false; - - if ((sqe_flags & ctx->restrictions.sqe_flags_required) != - ctx->restrictions.sqe_flags_required) - return false; - - if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | - ctx->restrictions.sqe_flags_required)) - return false; - - return true; -} - -static void io_init_req_drain(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *head = ctx->submit_state.link.head; - - ctx->drain_active = true; - if (head) { - /* - * If we need to drain a request in the middle of a link, drain - * the head request and the next request/link after the current - * link. Considering sequential execution of links, - * REQ_F_IO_DRAIN will be maintained for every request of our - * link. - */ - head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; - ctx->drain_next = true; - } -} - -static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) - __must_hold(&ctx->uring_lock) -{ - const struct io_op_def *def; - unsigned int sqe_flags; - int personality; - u8 opcode; - - /* req is partially pre-initialised, see io_preinit_req() */ - req->opcode = opcode = READ_ONCE(sqe->opcode); - /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags = sqe_flags = READ_ONCE(sqe->flags); - req->cqe.user_data = READ_ONCE(sqe->user_data); - req->file = NULL; - req->rsrc_node = NULL; - req->task = current; - - if (unlikely(opcode >= IORING_OP_LAST)) { - req->opcode = 0; - return -EINVAL; - } - def = &io_op_defs[opcode]; - if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { - /* enforce forwards compatibility on users */ - if (sqe_flags & ~SQE_VALID_FLAGS) - return -EINVAL; - if (sqe_flags & IOSQE_BUFFER_SELECT) { - if (!def->buffer_select) - return -EOPNOTSUPP; - req->buf_index = READ_ONCE(sqe->buf_group); - } - if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) - ctx->drain_disabled = true; - if (sqe_flags & IOSQE_IO_DRAIN) { - if (ctx->drain_disabled) - return -EOPNOTSUPP; - io_init_req_drain(req); - } - } - if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { - if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) - return -EACCES; - /* knock it to the slow queue path, will be drained there */ - if (ctx->drain_active) - req->flags |= REQ_F_FORCE_ASYNC; - /* if there is no link, we're at "next" request and need to drain */ - if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { - ctx->drain_next = false; - ctx->drain_active = true; - req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; - } - } - - if (!def->ioprio && sqe->ioprio) - return -EINVAL; - if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - - if (def->needs_file) { - struct io_submit_state *state = &ctx->submit_state; - - req->cqe.fd = READ_ONCE(sqe->fd); - - /* - * Plug now if we have more than 2 IO left after this, and the - * target is potentially a read/write to block based storage. - */ - if (state->need_plug && def->plug) { - state->plug_started = true; - state->need_plug = false; - blk_start_plug_nr_ios(&state->plug, state->submit_nr); - } - } - - personality = READ_ONCE(sqe->personality); - if (personality) { - int ret; - - req->creds = xa_load(&ctx->personalities, personality); - if (!req->creds) - return -EINVAL; - get_cred(req->creds); - ret = security_uring_override_creds(req->creds); - if (ret) { - put_cred(req->creds); - return ret; - } - req->flags |= REQ_F_CREDS; - } - - return io_req_prep(req, sqe); -} - -static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, - struct io_kiocb *req, int ret) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_submit_link *link = &ctx->submit_state.link; - struct io_kiocb *head = link->head; - - trace_io_uring_req_failed(sqe, ctx, req, ret); - - /* - * Avoid breaking links in the middle as it renders links with SQPOLL - * unusable. Instead of failing eagerly, continue assembling the link if - * applicable and mark the head with REQ_F_FAIL. The link flushing code - * should find the flag and handle the rest. - */ - req_fail_link_node(req, ret); - if (head && !(head->flags & REQ_F_FAIL)) - req_fail_link_node(head, -ECANCELED); - - if (!(req->flags & IO_REQ_LINK_FLAGS)) { - if (head) { - link->last->link = req; - link->head = NULL; - req = head; - } - io_queue_sqe_fallback(req); - return ret; - } - - if (head) - link->last->link = req; - else - link->head = req; - link->last = req; - return 0; -} - -static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) - __must_hold(&ctx->uring_lock) -{ - struct io_submit_link *link = &ctx->submit_state.link; - int ret; - - ret = io_init_req(ctx, req, sqe); - if (unlikely(ret)) - return io_submit_fail_init(sqe, req, ret); - - /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode, - req->flags, true, - ctx->flags & IORING_SETUP_SQPOLL); - - /* - * If we already have a head request, queue this one for async - * submittal once the head completes. If we don't have a head but - * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be - * submitted sync once the chain is complete. If none of those - * conditions are true (normal request), then just queue it. - */ - if (unlikely(link->head)) { - ret = io_req_prep_async(req); - if (unlikely(ret)) - return io_submit_fail_init(sqe, req, ret); - - trace_io_uring_link(ctx, req, link->head); - link->last->link = req; - link->last = req; - - if (req->flags & IO_REQ_LINK_FLAGS) - return 0; - /* last request of the link, flush it */ - req = link->head; - link->head = NULL; - if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)) - goto fallback; - - } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS | - REQ_F_FORCE_ASYNC | REQ_F_FAIL))) { - if (req->flags & IO_REQ_LINK_FLAGS) { - link->head = req; - link->last = req; - } else { -fallback: - io_queue_sqe_fallback(req); - } - return 0; - } - - io_queue_sqe(req); - return 0; -} - -/* - * Batched submission is done, ensure local IO is flushed out. - */ -static void io_submit_state_end(struct io_ring_ctx *ctx) -{ - struct io_submit_state *state = &ctx->submit_state; - - if (unlikely(state->link.head)) - io_queue_sqe_fallback(state->link.head); - /* flush only after queuing links as they can generate completions */ - io_submit_flush_completions(ctx); - if (state->plug_started) - blk_finish_plug(&state->plug); -} - -/* - * Start submission side cache. - */ -static void io_submit_state_start(struct io_submit_state *state, - unsigned int max_ios) -{ - state->plug_started = false; - state->need_plug = max_ios > 2; - state->submit_nr = max_ios; - /* set only head, no need to init link_last in advance */ - state->link.head = NULL; -} - -static void io_commit_sqring(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - - /* - * Ensure any loads from the SQEs are done at this point, - * since once we write the new head, the application could - * write new data to them. - */ - smp_store_release(&rings->sq.head, ctx->cached_sq_head); -} - -/* - * Fetch an sqe, if one is available. Note this returns a pointer to memory - * that is mapped by userspace. This means that care needs to be taken to - * ensure that reads are stable, as we cannot rely on userspace always - * being a good citizen. If members of the sqe are validated and then later - * used, it's important that those reads are done through READ_ONCE() to - * prevent a re-load down the line. - */ -static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) -{ - unsigned head, mask = ctx->sq_entries - 1; - unsigned sq_idx = ctx->cached_sq_head++ & mask; - - /* - * The cached sq head (or cq tail) serves two purposes: - * - * 1) allows us to batch the cost of updating the user visible - * head updates. - * 2) allows the kernel side to track the head on its own, even - * though the application is the one updating it. - */ - head = READ_ONCE(ctx->sq_array[sq_idx]); - if (likely(head < ctx->sq_entries)) { - /* double index for 128-byte SQEs, twice as long */ - if (ctx->flags & IORING_SETUP_SQE128) - head <<= 1; - return &ctx->sq_sqes[head]; - } - - /* drop invalid entries */ - ctx->cq_extra--; - WRITE_ONCE(ctx->rings->sq_dropped, - READ_ONCE(ctx->rings->sq_dropped) + 1); - return NULL; -} - -static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) - __must_hold(&ctx->uring_lock) -{ - unsigned int entries = io_sqring_entries(ctx); - unsigned int left; - int ret; - - if (unlikely(!entries)) - return 0; - /* make sure SQ entry isn't read before tail */ - ret = left = min3(nr, ctx->sq_entries, entries); - io_get_task_refs(left); - io_submit_state_start(&ctx->submit_state, left); - - do { - const struct io_uring_sqe *sqe; - struct io_kiocb *req; - - if (unlikely(!io_alloc_req_refill(ctx))) - break; - req = io_alloc_req(ctx); - sqe = io_get_sqe(ctx); - if (unlikely(!sqe)) { - io_req_add_to_cache(req, ctx); - break; - } - - /* - * Continue submitting even for sqe failure if the - * ring was setup with IORING_SETUP_SUBMIT_ALL - */ - if (unlikely(io_submit_sqe(ctx, req, sqe)) && - !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { - left--; - break; - } - } while (--left); - - if (unlikely(left)) { - ret -= left; - /* try again if it submitted nothing and can't allocate a req */ - if (!ret && io_req_cache_empty(ctx)) - ret = -EAGAIN; - current->io_uring->cached_refs += left; - } - - io_submit_state_end(ctx); - /* Commit SQ ring head once we've consumed and submitted all SQEs */ - io_commit_sqring(ctx); - return ret; -} - -static inline bool io_sqd_events_pending(struct io_sq_data *sqd) -{ - return READ_ONCE(sqd->state); -} - -static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) -{ - unsigned int to_submit; - int ret = 0; - - to_submit = io_sqring_entries(ctx); - /* if we're handling multiple rings, cap submit size for fairness */ - if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) - to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - - if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { - const struct cred *creds = NULL; - - if (ctx->sq_creds != current_cred()) - creds = override_creds(ctx->sq_creds); - - mutex_lock(&ctx->uring_lock); - if (!wq_list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, true); - - /* - * Don't submit if refs are dying, good for io_uring_register(), - * but also it is relied upon by io_ring_exit_work() - */ - if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && - !(ctx->flags & IORING_SETUP_R_DISABLED)) - ret = io_submit_sqes(ctx, to_submit); - mutex_unlock(&ctx->uring_lock); - - if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) - wake_up(&ctx->sqo_sq_wait); - if (creds) - revert_creds(creds); - } - - return ret; -} - -static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) -{ - struct io_ring_ctx *ctx; - unsigned sq_thread_idle = 0; - - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); - sqd->sq_thread_idle = sq_thread_idle; -} - -static bool io_sqd_handle_event(struct io_sq_data *sqd) -{ - bool did_sig = false; - struct ksignal ksig; - - if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || - signal_pending(current)) { - mutex_unlock(&sqd->lock); - if (signal_pending(current)) - did_sig = get_signal(&ksig); - cond_resched(); - mutex_lock(&sqd->lock); - } - return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); -} - -static int io_sq_thread(void *data) -{ - struct io_sq_data *sqd = data; - struct io_ring_ctx *ctx; - unsigned long timeout = 0; - char buf[TASK_COMM_LEN]; - DEFINE_WAIT(wait); - - snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); - set_task_comm(current, buf); - - if (sqd->sq_cpu != -1) - set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); - else - set_cpus_allowed_ptr(current, cpu_online_mask); - current->flags |= PF_NO_SETAFFINITY; - - audit_alloc_kernel(current); - - mutex_lock(&sqd->lock); - while (1) { - bool cap_entries, sqt_spin = false; - - if (io_sqd_events_pending(sqd) || signal_pending(current)) { - if (io_sqd_handle_event(sqd)) - break; - timeout = jiffies + sqd->sq_thread_idle; - } - - cap_entries = !list_is_singular(&sqd->ctx_list); - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - int ret = __io_sq_thread(ctx, cap_entries); - - if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) - sqt_spin = true; - } - if (io_run_task_work()) - sqt_spin = true; - - if (sqt_spin || !time_after(jiffies, timeout)) { - cond_resched(); - if (sqt_spin) - timeout = jiffies + sqd->sq_thread_idle; - continue; - } - - prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { - bool needs_sched = true; - - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - atomic_or(IORING_SQ_NEED_WAKEUP, - &ctx->rings->sq_flags); - if ((ctx->flags & IORING_SETUP_IOPOLL) && - !wq_list_empty(&ctx->iopoll_list)) { - needs_sched = false; - break; - } - - /* - * Ensure the store of the wakeup flag is not - * reordered with the load of the SQ tail - */ - smp_mb__after_atomic(); - - if (io_sqring_entries(ctx)) { - needs_sched = false; - break; - } - } - - if (needs_sched) { - mutex_unlock(&sqd->lock); - schedule(); - mutex_lock(&sqd->lock); - } - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - atomic_andnot(IORING_SQ_NEED_WAKEUP, - &ctx->rings->sq_flags); - } - - finish_wait(&sqd->wait, &wait); - timeout = jiffies + sqd->sq_thread_idle; - } - - io_uring_cancel_generic(true, sqd); - sqd->thread = NULL; - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); - io_run_task_work(); - mutex_unlock(&sqd->lock); - - audit_free(current); - - complete(&sqd->exited); - do_exit(0); -} - -struct io_wait_queue { - struct wait_queue_entry wq; - struct io_ring_ctx *ctx; - unsigned cq_tail; - unsigned nr_timeouts; -}; - -static inline bool io_should_wake(struct io_wait_queue *iowq) -{ - struct io_ring_ctx *ctx = iowq->ctx; - int dist = ctx->cached_cq_tail - (int) iowq->cq_tail; - - /* - * Wake up if we have enough events, or if a timeout occurred since we - * started waiting. For timeouts, we always want to return to userspace, - * regardless of event count. - */ - return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; -} - -static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, - int wake_flags, void *key) -{ - struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, - wq); - - /* - * Cannot safely flush overflowed CQEs from here, ensure we wake up - * the task, and the next invocation will do it. - */ - if (io_should_wake(iowq) || - test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq)) - return autoremove_wake_function(curr, mode, wake_flags, key); - return -1; -} - -static int io_run_task_work_sig(void) -{ - if (io_run_task_work()) - return 1; - if (test_thread_flag(TIF_NOTIFY_SIGNAL)) - return -ERESTARTSYS; - if (task_sigpending(current)) - return -EINTR; - return 0; -} - -/* when returns >0, the caller should retry */ -static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - ktime_t timeout) -{ - int ret; - unsigned long check_cq; - - /* make sure we run task_work before checking for signals */ - ret = io_run_task_work_sig(); - if (ret || io_should_wake(iowq)) - return ret; - check_cq = READ_ONCE(ctx->check_cq); - /* let the caller flush overflows, retry */ - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - return 1; - if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) - return -EBADR; - if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) - return -ETIME; - return 1; -} - -/* - * Wait until events become available, if we don't already have some. The - * application must reap them itself, as they reside on the shared cq ring. - */ -static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, - const sigset_t __user *sig, size_t sigsz, - struct __kernel_timespec __user *uts) -{ - struct io_wait_queue iowq; - struct io_rings *rings = ctx->rings; - ktime_t timeout = KTIME_MAX; - int ret; - - do { - io_cqring_overflow_flush(ctx); - if (io_cqring_events(ctx) >= min_events) - return 0; - if (!io_run_task_work()) - break; - } while (1); - - if (sig) { -#ifdef CONFIG_COMPAT - if (in_compat_syscall()) - ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, - sigsz); - else -#endif - ret = set_user_sigmask(sig, sigsz); - - if (ret) - return ret; - } - - if (uts) { - struct timespec64 ts; - - if (get_timespec64(&ts, uts)) - return -EFAULT; - timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); - } - - init_waitqueue_func_entry(&iowq.wq, io_wake_function); - iowq.wq.private = current; - INIT_LIST_HEAD(&iowq.wq.entry); - iowq.ctx = ctx; - iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); - iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; - - trace_io_uring_cqring_wait(ctx, min_events); - do { - /* if we can't even flush overflow, don't wait for more */ - if (!io_cqring_overflow_flush(ctx)) { - ret = -EBUSY; - break; - } - prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, - TASK_INTERRUPTIBLE); - ret = io_cqring_wait_schedule(ctx, &iowq, timeout); - cond_resched(); - } while (ret > 0); - - finish_wait(&ctx->cq_wait, &iowq.wq); - restore_saved_sigmask_unless(ret == -EINTR); - - return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; -} - -static void io_free_page_table(void **table, size_t size) -{ - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - - for (i = 0; i < nr_tables; i++) - kfree(table[i]); - kfree(table); -} - -static __cold void **io_alloc_page_table(size_t size) -{ - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - size_t init_size = size; - void **table; - - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); - if (!table) - return NULL; - - for (i = 0; i < nr_tables; i++) { - unsigned int this_size = min_t(size_t, size, PAGE_SIZE); - - table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); - if (!table[i]) { - io_free_page_table(table, init_size); - return NULL; - } - size -= this_size; - } - return table; -} - -static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) -{ - percpu_ref_exit(&ref_node->refs); - kfree(ref_node); -} - -static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) -{ - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); - struct io_ring_ctx *ctx = node->rsrc_data->ctx; - unsigned long flags; - bool first_add = false; - unsigned long delay = HZ; - - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); - node->done = true; - - /* if we are mid-quiesce then do not delay */ - if (node->rsrc_data->quiesce) - delay = 0; - - while (!list_empty(&ctx->rsrc_ref_list)) { - node = list_first_entry(&ctx->rsrc_ref_list, - struct io_rsrc_node, node); - /* recycle ref nodes in order */ - if (!node->done) - break; - list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); - - if (first_add) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); -} - -static struct io_rsrc_node *io_rsrc_node_alloc(void) -{ - struct io_rsrc_node *ref_node; - - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; - - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) { - kfree(ref_node); - return NULL; - } - INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->rsrc_list); - ref_node->done = false; - return ref_node; -} - -static void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill) - __must_hold(&ctx->uring_lock) -{ - WARN_ON_ONCE(!ctx->rsrc_backup_node); - WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); - - io_rsrc_refs_drop(ctx); - - if (data_to_kill) { - struct io_rsrc_node *rsrc_node = ctx->rsrc_node; - - rsrc_node->rsrc_data = data_to_kill; - spin_lock_irq(&ctx->rsrc_ref_lock); - list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - spin_unlock_irq(&ctx->rsrc_ref_lock); - - atomic_inc(&data_to_kill->refs); - percpu_ref_kill(&rsrc_node->refs); - ctx->rsrc_node = NULL; - } - - if (!ctx->rsrc_node) { - ctx->rsrc_node = ctx->rsrc_backup_node; - ctx->rsrc_backup_node = NULL; - } -} - -static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) -{ - if (ctx->rsrc_backup_node) - return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(); - return ctx->rsrc_backup_node ? 0 : -ENOMEM; -} - -static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data, - struct io_ring_ctx *ctx) -{ - int ret; - - /* As we may drop ->uring_lock, other task may have started quiesce */ - if (data->quiesce) - return -ENXIO; - - data->quiesce = true; - do { - ret = io_rsrc_node_switch_start(ctx); - if (ret) - break; - io_rsrc_node_switch(ctx, data); - - /* kill initial ref, already quiesced if zero */ - if (atomic_dec_and_test(&data->refs)) - break; - mutex_unlock(&ctx->uring_lock); - flush_delayed_work(&ctx->rsrc_put_work); - ret = wait_for_completion_interruptible(&data->done); - if (!ret) { - mutex_lock(&ctx->uring_lock); - if (atomic_read(&data->refs) > 0) { - /* - * it has been revived by another thread while - * we were unlocked - */ - mutex_unlock(&ctx->uring_lock); - } else { - break; - } - } - - atomic_inc(&data->refs); - /* wait for all works potentially completing data->done */ - flush_delayed_work(&ctx->rsrc_put_work); - reinit_completion(&data->done); - - ret = io_run_task_work_sig(); - mutex_lock(&ctx->uring_lock); - } while (ret >= 0); - data->quiesce = false; - - return ret; -} - -static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) -{ - unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; - unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; - - return &data->tags[table_idx][off]; -} - -static void io_rsrc_data_free(struct io_rsrc_data *data) -{ - size_t size = data->nr * sizeof(data->tags[0][0]); - - if (data->tags) - io_free_page_table((void **)data->tags, size); - kfree(data); -} - -static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, - u64 __user *utags, unsigned nr, - struct io_rsrc_data **pdata) -{ - struct io_rsrc_data *data; - int ret = -ENOMEM; - unsigned i; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); - if (!data->tags) { - kfree(data); - return -ENOMEM; - } - - data->nr = nr; - data->ctx = ctx; - data->do_put = do_put; - if (utags) { - ret = -EFAULT; - for (i = 0; i < nr; i++) { - u64 *tag_slot = io_get_tag_slot(data, i); - - if (copy_from_user(tag_slot, &utags[i], - sizeof(*tag_slot))) - goto fail; - } - } - - atomic_set(&data->refs, 1); - init_completion(&data->done); - *pdata = data; - return 0; -fail: - io_rsrc_data_free(data); - return ret; -} - -static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) -{ - table->files = kvcalloc(nr_files, sizeof(table->files[0]), - GFP_KERNEL_ACCOUNT); - if (unlikely(!table->files)) - return false; - - table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); - if (unlikely(!table->bitmap)) { - kvfree(table->files); - return false; - } - - return true; -} - -static void io_free_file_tables(struct io_file_table *table) -{ - kvfree(table->files); - bitmap_free(table->bitmap); - table->files = NULL; - table->bitmap = NULL; -} - -static inline void io_file_bitmap_set(struct io_file_table *table, int bit) -{ - WARN_ON_ONCE(test_bit(bit, table->bitmap)); - __set_bit(bit, table->bitmap); - table->alloc_hint = bit + 1; -} - -static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) -{ - __clear_bit(bit, table->bitmap); - table->alloc_hint = bit; -} - -static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ -#if !defined(IO_URING_SCM_ALL) - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file = io_file_from_index(ctx, i); - - if (!file) - continue; - if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) - continue; - io_file_bitmap_clear(&ctx->file_table, i); - fput(file); - } -#endif - -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) - kfree_skb(skb); - } -#endif - io_free_file_tables(&ctx->file_table); - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; -} - -static int io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ - unsigned nr = ctx->nr_user_files; - int ret; - - if (!ctx->file_data) - return -ENXIO; - - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. - */ - ctx->nr_user_files = 0; - ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); - ctx->nr_user_files = nr; - if (!ret) - __io_sqe_files_unregister(ctx); - return ret; -} - -static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->lock) -{ - WARN_ON_ONCE(sqd->thread == current); - - /* - * Do the dance but not conditional clear_bit() because it'd race with - * other threads incrementing park_pending and setting the bit. - */ - clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - if (atomic_dec_return(&sqd->park_pending)) - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - mutex_unlock(&sqd->lock); -} - -static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->lock) -{ - WARN_ON_ONCE(sqd->thread == current); - - atomic_inc(&sqd->park_pending); - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); -} - -static void io_sq_thread_stop(struct io_sq_data *sqd) -{ - WARN_ON_ONCE(sqd->thread == current); - WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); - - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); - mutex_unlock(&sqd->lock); - wait_for_completion(&sqd->exited); -} - -static void io_put_sq_data(struct io_sq_data *sqd) -{ - if (refcount_dec_and_test(&sqd->refs)) { - WARN_ON_ONCE(atomic_read(&sqd->park_pending)); - - io_sq_thread_stop(sqd); - kfree(sqd); - } -} - -static void io_sq_thread_finish(struct io_ring_ctx *ctx) -{ - struct io_sq_data *sqd = ctx->sq_data; - - if (sqd) { - io_sq_thread_park(sqd); - list_del_init(&ctx->sqd_list); - io_sqd_update_thread_idle(sqd); - io_sq_thread_unpark(sqd); - - io_put_sq_data(sqd); - ctx->sq_data = NULL; - } -} - -static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) -{ - struct io_ring_ctx *ctx_attach; - struct io_sq_data *sqd; - struct fd f; - - f = fdget(p->wq_fd); - if (!f.file) - return ERR_PTR(-ENXIO); - if (f.file->f_op != &io_uring_fops) { - fdput(f); - return ERR_PTR(-EINVAL); - } - - ctx_attach = f.file->private_data; - sqd = ctx_attach->sq_data; - if (!sqd) { - fdput(f); - return ERR_PTR(-EINVAL); - } - if (sqd->task_tgid != current->tgid) { - fdput(f); - return ERR_PTR(-EPERM); - } - - refcount_inc(&sqd->refs); - fdput(f); - return sqd; -} - -static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, - bool *attached) -{ - struct io_sq_data *sqd; - - *attached = false; - if (p->flags & IORING_SETUP_ATTACH_WQ) { - sqd = io_attach_sq_data(p); - if (!IS_ERR(sqd)) { - *attached = true; - return sqd; - } - /* fall through for EPERM case, setup new sqd/task */ - if (PTR_ERR(sqd) != -EPERM) - return sqd; - } - - sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); - if (!sqd) - return ERR_PTR(-ENOMEM); - - atomic_set(&sqd->park_pending, 0); - refcount_set(&sqd->refs, 1); - INIT_LIST_HEAD(&sqd->ctx_list); - mutex_init(&sqd->lock); - init_waitqueue_head(&sqd->wait); - init_completion(&sqd->exited); - return sqd; -} - -/* - * Ensure the UNIX gc is aware of our file set, so we are certain that - * the io_uring can be safely unregistered on process exit, even if we have - * loops in the file referencing. We account only files that can hold other - * files because otherwise they can't form a loop and so are not interesting - * for GC. - */ -static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) -{ -#if defined(CONFIG_UNIX) - struct sock *sk = ctx->ring_sock->sk; - struct sk_buff_head *head = &sk->sk_receive_queue; - struct scm_fp_list *fpl; - struct sk_buff *skb; - - if (likely(!io_file_need_scm(file))) - return 0; - - /* - * See if we can merge this file into an existing skb SCM_RIGHTS - * file set. If there's no room, fall back to allocating a new skb - * and filling it in. - */ - spin_lock_irq(&head->lock); - skb = skb_peek(head); - if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) - __skb_unlink(skb, head); - else - skb = NULL; - spin_unlock_irq(&head->lock); - - if (!skb) { - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); - if (!fpl) - return -ENOMEM; - - skb = alloc_skb(0, GFP_KERNEL); - if (!skb) { - kfree(fpl); - return -ENOMEM; - } - - fpl->user = get_uid(current_user()); - fpl->max = SCM_MAX_FD; - fpl->count = 0; - - UNIXCB(skb).fp = fpl; - skb->sk = sk; - skb->destructor = unix_destruct_scm; - refcount_add(skb->truesize, &sk->sk_wmem_alloc); - } - - fpl = UNIXCB(skb).fp; - fpl->fp[fpl->count++] = get_file(file); - unix_inflight(fpl->user, file); - skb_queue_head(head, skb); - fput(file); -#endif - return 0; -} - -static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - struct file *file = prsrc->file; -#if defined(CONFIG_UNIX) - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - if (!io_file_need_scm(file)) { - fput(file); - return; - } - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. - */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#else - fput(file); -#endif -} - -static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) -{ - struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_ring_ctx *ctx = rsrc_data->ctx; - struct io_rsrc_put *prsrc, *tmp; - - list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { - list_del(&prsrc->list); - - if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&ctx->uring_lock); - - spin_lock(&ctx->completion_lock); - io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&ctx->uring_lock); - } - - rsrc_data->do_put(ctx, prsrc); - kfree(prsrc); - } - - io_rsrc_node_destroy(ref_node); - if (atomic_dec_and_test(&rsrc_data->refs)) - complete(&rsrc_data->done); -} - -static void io_rsrc_put_work(struct work_struct *work) -{ - struct io_ring_ctx *ctx; - struct llist_node *node; - - ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); - node = llist_del_all(&ctx->rsrc_put_llist); - - while (node) { - struct io_rsrc_node *ref_node; - struct llist_node *next = node->next; - - ref_node = llist_entry(node, struct io_rsrc_node, llist); - __io_rsrc_put_work(ref_node); - node = next; - } -} - -static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args, u64 __user *tags) -{ - __s32 __user *fds = (__s32 __user *) arg; - struct file *file; - int fd, ret; - unsigned i; - - if (ctx->file_data) - return -EBUSY; - if (!nr_args) - return -EINVAL; - if (nr_args > IORING_MAX_FIXED_FILES) - return -EMFILE; - if (nr_args > rlimit(RLIMIT_NOFILE)) - return -EMFILE; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, - &ctx->file_data); - if (ret) - return ret; - - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } - - for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - struct io_fixed_file *file_slot; - - if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { - ret = -EFAULT; - goto fail; - } - /* allow sparse sets */ - if (!fds || fd == -1) { - ret = -EINVAL; - if (unlikely(*io_get_tag_slot(ctx->file_data, i))) - goto fail; - continue; - } - - file = fget(fd); - ret = -EBADF; - if (unlikely(!file)) - goto fail; - - /* - * Don't allow io_uring instances to be registered. If UNIX - * isn't enabled, then this causes a reference cycle and this - * instance can never get freed. If UNIX is enabled we'll - * handle it just fine, but there's still no point in allowing - * a ring fd as it doesn't support regular read/write anyway. - */ - if (file->f_op == &io_uring_fops) { - fput(file); - goto fail; - } - ret = io_scm_file_account(ctx, file); - if (ret) { - fput(file); - goto fail; - } - file_slot = io_fixed_file_slot(&ctx->file_table, i); - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, i); - } - - io_rsrc_node_switch(ctx, NULL); - return 0; -fail: - __io_sqe_files_unregister(ctx); - return ret; -} - -static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc) -{ - u64 *tag_slot = io_get_tag_slot(data, idx); - struct io_rsrc_put *prsrc; - - prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); - if (!prsrc) - return -ENOMEM; - - prsrc->tag = *tag_slot; - *tag_slot = 0; - prsrc->rsrc = rsrc; - list_add(&prsrc->list, &node->rsrc_list); - return 0; -} - -static int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) - __must_hold(&req->ctx->uring_lock) -{ - struct io_ring_ctx *ctx = req->ctx; - bool needs_switch = false; - struct io_fixed_file *file_slot; - int ret; - - if (file->f_op == &io_uring_fops) - return -EBADF; - if (!ctx->file_data) - return -ENXIO; - if (slot_index >= ctx->nr_user_files) - return -EINVAL; - - slot_index = array_index_nospec(slot_index, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); - - if (file_slot->file_ptr) { - struct file *old_file; - - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto err; - - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, - ctx->rsrc_node, old_file); - if (ret) - goto err; - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, slot_index); - needs_switch = true; - } - - ret = io_scm_file_account(ctx, file); - if (!ret) { - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); - } -err: - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->file_data); - if (ret) - fput(file); - return ret; -} - -static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, - unsigned int offset) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_fixed_file *file_slot; - struct file *file; - int ret; - - io_ring_submit_lock(ctx, issue_flags); - ret = -ENXIO; - if (unlikely(!ctx->file_data)) - goto out; - ret = -EINVAL; - if (offset >= ctx->nr_user_files) - goto out; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto out; - - offset = array_index_nospec(offset, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, offset); - ret = -EBADF; - if (!file_slot->file_ptr) - goto out; - - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); - if (ret) - goto out; - - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, offset); - io_rsrc_node_switch(ctx, ctx->file_data); - ret = 0; -out: - io_ring_submit_unlock(ctx, issue_flags); - return ret; -} - -static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) -{ - return __io_close_fixed(req, issue_flags, req->close.file_slot - 1); -} - -static int __io_sqe_files_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update2 *up, - unsigned nr_args) -{ - u64 __user *tags = u64_to_user_ptr(up->tags); - __s32 __user *fds = u64_to_user_ptr(up->data); - struct io_rsrc_data *data = ctx->file_data; - struct io_fixed_file *file_slot; - struct file *file; - int fd, i, err = 0; - unsigned int done; - bool needs_switch = false; - - if (!ctx->file_data) - return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_files) - return -EINVAL; - - for (done = 0; done < nr_args; done++) { - u64 tag = 0; - - if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || - copy_from_user(&fd, &fds[done], sizeof(fd))) { - err = -EFAULT; - break; - } - if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { - err = -EINVAL; - break; - } - if (fd == IORING_REGISTER_FILES_SKIP) - continue; - - i = array_index_nospec(up->offset + done, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, i); - - if (file_slot->file_ptr) { - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); - if (err) - break; - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, i); - needs_switch = true; - } - if (fd != -1) { - file = fget(fd); - if (!file) { - err = -EBADF; - break; - } - /* - * Don't allow io_uring instances to be registered. If - * UNIX isn't enabled, then this causes a reference - * cycle and this instance can never get freed. If UNIX - * is enabled we'll handle it just fine, but there's - * still no point in allowing a ring fd as it doesn't - * support regular read/write anyway. - */ - if (file->f_op == &io_uring_fops) { - fput(file); - err = -EBADF; - break; - } - err = io_scm_file_account(ctx, file); - if (err) { - fput(file); - break; - } - *io_get_tag_slot(data, i) = tag; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, i); - } - } - - if (needs_switch) - io_rsrc_node_switch(ctx, data); - return done ? done : err; -} - -static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, - struct task_struct *task) -{ - struct io_wq_hash *hash; - struct io_wq_data data; - unsigned int concurrency; - - mutex_lock(&ctx->uring_lock); - hash = ctx->hash_map; - if (!hash) { - hash = kzalloc(sizeof(*hash), GFP_KERNEL); - if (!hash) { - mutex_unlock(&ctx->uring_lock); - return ERR_PTR(-ENOMEM); - } - refcount_set(&hash->refs, 1); - init_waitqueue_head(&hash->wait); - ctx->hash_map = hash; - } - mutex_unlock(&ctx->uring_lock); - - data.hash = hash; - data.task = task; - data.free_work = io_wq_free_work; - data.do_work = io_wq_submit_work; - - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - - return io_wq_create(concurrency, &data); -} - -static __cold int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx; - int ret; - - tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); - if (unlikely(!tctx)) - return -ENOMEM; - - tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX, - sizeof(struct file *), GFP_KERNEL); - if (unlikely(!tctx->registered_rings)) { - kfree(tctx); - return -ENOMEM; - } - - ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); - if (unlikely(ret)) { - kfree(tctx->registered_rings); - kfree(tctx); - return ret; - } - - tctx->io_wq = io_init_wq_offload(ctx, task); - if (IS_ERR(tctx->io_wq)) { - ret = PTR_ERR(tctx->io_wq); - percpu_counter_destroy(&tctx->inflight); - kfree(tctx->registered_rings); - kfree(tctx); - return ret; - } - - xa_init(&tctx->xa); - init_waitqueue_head(&tctx->wait); - atomic_set(&tctx->in_idle, 0); - atomic_set(&tctx->inflight_tracked, 0); - task->io_uring = tctx; - spin_lock_init(&tctx->task_lock); - INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prio_task_list); - init_task_work(&tctx->task_work, tctx_task_work); - return 0; -} - -void __io_uring_free(struct task_struct *tsk) -{ - struct io_uring_task *tctx = tsk->io_uring; - - WARN_ON_ONCE(!xa_empty(&tctx->xa)); - WARN_ON_ONCE(tctx->io_wq); - WARN_ON_ONCE(tctx->cached_refs); - - kfree(tctx->registered_rings); - percpu_counter_destroy(&tctx->inflight); - kfree(tctx); - tsk->io_uring = NULL; -} - -static __cold int io_sq_offload_create(struct io_ring_ctx *ctx, - struct io_uring_params *p) -{ - int ret; - - /* Retain compatibility with failing for an invalid attach attempt */ - if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == - IORING_SETUP_ATTACH_WQ) { - struct fd f; - - f = fdget(p->wq_fd); - if (!f.file) - return -ENXIO; - if (f.file->f_op != &io_uring_fops) { - fdput(f); - return -EINVAL; - } - fdput(f); - } - if (ctx->flags & IORING_SETUP_SQPOLL) { - struct task_struct *tsk; - struct io_sq_data *sqd; - bool attached; - - ret = security_uring_sqpoll(); - if (ret) - return ret; - - sqd = io_get_sq_data(p, &attached); - if (IS_ERR(sqd)) { - ret = PTR_ERR(sqd); - goto err; - } - - ctx->sq_creds = get_current_cred(); - ctx->sq_data = sqd; - ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); - if (!ctx->sq_thread_idle) - ctx->sq_thread_idle = HZ; - - io_sq_thread_park(sqd); - list_add(&ctx->sqd_list, &sqd->ctx_list); - io_sqd_update_thread_idle(sqd); - /* don't attach to a dying SQPOLL thread, would be racy */ - ret = (attached && !sqd->thread) ? -ENXIO : 0; - io_sq_thread_unpark(sqd); - - if (ret < 0) - goto err; - if (attached) - return 0; - - if (p->flags & IORING_SETUP_SQ_AFF) { - int cpu = p->sq_thread_cpu; - - ret = -EINVAL; - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) - goto err_sqpoll; - sqd->sq_cpu = cpu; - } else { - sqd->sq_cpu = -1; - } - - sqd->task_pid = current->pid; - sqd->task_tgid = current->tgid; - tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); - if (IS_ERR(tsk)) { - ret = PTR_ERR(tsk); - goto err_sqpoll; - } - - sqd->thread = tsk; - ret = io_uring_alloc_task_context(tsk, ctx); - wake_up_new_task(tsk); - if (ret) - goto err; - } else if (p->flags & IORING_SETUP_SQ_AFF) { - /* Can't have SQ_AFF without SQPOLL */ - ret = -EINVAL; - goto err; - } - - return 0; -err_sqpoll: - complete(&ctx->sq_data->exited); -err: - io_sq_thread_finish(ctx); - return ret; -} - -static inline void __io_unaccount_mem(struct user_struct *user, - unsigned long nr_pages) -{ - atomic_long_sub(nr_pages, &user->locked_vm); -} - -static inline int __io_account_mem(struct user_struct *user, - unsigned long nr_pages) -{ - unsigned long page_limit, cur_pages, new_pages; - - /* Don't allow more pages than we can safely lock */ - page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - do { - cur_pages = atomic_long_read(&user->locked_vm); - new_pages = cur_pages + nr_pages; - if (new_pages > page_limit) - return -ENOMEM; - } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, - new_pages) != cur_pages); - - return 0; -} - -static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) -{ - if (ctx->user) - __io_unaccount_mem(ctx->user, nr_pages); - - if (ctx->mm_account) - atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); -} - -static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) -{ - int ret; - - if (ctx->user) { - ret = __io_account_mem(ctx->user, nr_pages); - if (ret) - return ret; - } - - if (ctx->mm_account) - atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); - - return 0; -} - -static void io_mem_free(void *ptr) -{ - struct page *page; - - if (!ptr) - return; - - page = virt_to_head_page(ptr); - if (put_page_testzero(page)) - free_compound_page(page); -} - -static void *io_mem_alloc(size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - - return (void *) __get_free_pages(gfp, get_order(size)); -} - -static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset) -{ - struct io_rings *rings; - size_t off, sq_array_size; - - off = struct_size(rings, cqes, cq_entries); - if (off == SIZE_MAX) - return SIZE_MAX; - if (ctx->flags & IORING_SETUP_CQE32) { - if (check_shl_overflow(off, 1, &off)) - return SIZE_MAX; - } - -#ifdef CONFIG_SMP - off = ALIGN(off, SMP_CACHE_BYTES); - if (off == 0) - return SIZE_MAX; -#endif - - if (sq_offset) - *sq_offset = off; - - sq_array_size = array_size(sizeof(u32), sq_entries); - if (sq_array_size == SIZE_MAX) - return SIZE_MAX; - - if (check_add_overflow(off, sq_array_size, &off)) - return SIZE_MAX; - - return off; -} - -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) -{ - struct io_mapped_ubuf *imu = *slot; - unsigned int i; - - if (imu != ctx->dummy_ubuf) { - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); - kvfree(imu); - } - *slot = NULL; -} - -static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - io_buffer_unmap(ctx, &prsrc->buf); - prsrc->buf = NULL; -} - -static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - unsigned int i; - - for (i = 0; i < ctx->nr_user_bufs; i++) - io_buffer_unmap(ctx, &ctx->user_bufs[i]); - kfree(ctx->user_bufs); - io_rsrc_data_free(ctx->buf_data); - ctx->user_bufs = NULL; - ctx->buf_data = NULL; - ctx->nr_user_bufs = 0; -} - -static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - unsigned nr = ctx->nr_user_bufs; - int ret; - - if (!ctx->buf_data) - return -ENXIO; - - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. - */ - ctx->nr_user_bufs = 0; - ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); - ctx->nr_user_bufs = nr; - if (!ret) - __io_sqe_buffers_unregister(ctx); - return ret; -} - -static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, - void __user *arg, unsigned index) -{ - struct iovec __user *src; - -#ifdef CONFIG_COMPAT - if (ctx->compat) { - struct compat_iovec __user *ciovs; - struct compat_iovec ciov; - - ciovs = (struct compat_iovec __user *) arg; - if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) - return -EFAULT; - - dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); - dst->iov_len = ciov.iov_len; - return 0; - } -#endif - src = (struct iovec __user *) arg; - if (copy_from_user(dst, &src[index], sizeof(*dst))) - return -EFAULT; - return 0; -} - -/* - * Not super efficient, but this is just a registration time. And we do cache - * the last compound head, so generally we'll only do a full search if we don't - * match that one. - * - * We check if the given compound head page has already been accounted, to - * avoid double accounting it. This allows us to account the full size of the - * page, not just the constituent pages of a huge page. - */ -static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, - int nr_pages, struct page *hpage) -{ - int i, j; - - /* check current page array */ - for (i = 0; i < nr_pages; i++) { - if (!PageCompound(pages[i])) - continue; - if (compound_head(pages[i]) == hpage) - return true; - } - - /* check previously registered pages */ - for (i = 0; i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *imu = ctx->user_bufs[i]; - - for (j = 0; j < imu->nr_bvecs; j++) { - if (!PageCompound(imu->bvec[j].bv_page)) - continue; - if (compound_head(imu->bvec[j].bv_page) == hpage) - return true; - } - } - - return false; -} - -static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, - int nr_pages, struct io_mapped_ubuf *imu, - struct page **last_hpage) -{ - int i, ret; - - imu->acct_pages = 0; - for (i = 0; i < nr_pages; i++) { - if (!PageCompound(pages[i])) { - imu->acct_pages++; - } else { - struct page *hpage; - - hpage = compound_head(pages[i]); - if (hpage == *last_hpage) - continue; - *last_hpage = hpage; - if (headpage_already_acct(ctx, pages, i, hpage)) - continue; - imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; - } - } - - if (!imu->acct_pages) - return 0; - - ret = io_account_mem(ctx, imu->acct_pages); - if (ret) - imu->acct_pages = 0; - return ret; -} - -static struct page **io_pin_pages(unsigned long ubuf, unsigned long len, - int *npages) -{ - unsigned long start, end, nr_pages; - struct vm_area_struct **vmas = NULL; - struct page **pages = NULL; - int i, pret, ret = -ENOMEM; - - end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ubuf >> PAGE_SHIFT; - nr_pages = end - start; - - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - goto done; - - vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - goto done; - - ret = 0; - mmap_read_lock(current->mm); - pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - pages, vmas); - if (pret == nr_pages) { - /* don't support file backed memory */ - for (i = 0; i < nr_pages; i++) { - struct vm_area_struct *vma = vmas[i]; - - if (vma_is_shmem(vma)) - continue; - if (vma->vm_file && - !is_file_hugepages(vma->vm_file)) { - ret = -EOPNOTSUPP; - break; - } - } - *npages = nr_pages; - } else { - ret = pret < 0 ? pret : -EFAULT; - } - mmap_read_unlock(current->mm); - if (ret) { - /* - * if we did partial map, or found file backed vmas, - * release any pages we did get - */ - if (pret > 0) - unpin_user_pages(pages, pret); - goto done; - } - ret = 0; -done: - kvfree(vmas); - if (ret < 0) { - kvfree(pages); - pages = ERR_PTR(ret); - } - return pages; -} - -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) -{ - struct io_mapped_ubuf *imu = NULL; - struct page **pages = NULL; - unsigned long off; - size_t size; - int ret, nr_pages, i; - - if (!iov->iov_base) { - *pimu = ctx->dummy_ubuf; - return 0; - } - - *pimu = NULL; - ret = -ENOMEM; - - pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, - &nr_pages); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); - pages = NULL; - goto done; - } - - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); - if (!imu) - goto done; - - ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); - if (ret) { - unpin_user_pages(pages, nr_pages); - goto done; - } - - off = (unsigned long) iov->iov_base & ~PAGE_MASK; - size = iov->iov_len; - for (i = 0; i < nr_pages; i++) { - size_t vec_len; - - vec_len = min_t(size_t, size, PAGE_SIZE - off); - imu->bvec[i].bv_page = pages[i]; - imu->bvec[i].bv_len = vec_len; - imu->bvec[i].bv_offset = off; - off = 0; - size -= vec_len; - } - /* store original address for later verification */ - imu->ubuf = (unsigned long) iov->iov_base; - imu->ubuf_end = imu->ubuf + iov->iov_len; - imu->nr_bvecs = nr_pages; - *pimu = imu; - ret = 0; -done: - if (ret) - kvfree(imu); - kvfree(pages); - return ret; -} - -static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) -{ - ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); - return ctx->user_bufs ? 0 : -ENOMEM; -} - -static int io_buffer_validate(struct iovec *iov) -{ - unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); - - /* - * Don't impose further limits on the size and buffer - * constraints here, we'll -EINVAL later when IO is - * submitted if they are wrong. - */ - if (!iov->iov_base) - return iov->iov_len ? -EFAULT : 0; - if (!iov->iov_len) - return -EFAULT; - - /* arbitrary limit, but we need something */ - if (iov->iov_len > SZ_1G) - return -EFAULT; - - if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) - return -EOVERFLOW; - - return 0; -} - -static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int nr_args, u64 __user *tags) -{ - struct page *last_hpage = NULL; - struct io_rsrc_data *data; - int i, ret; - struct iovec iov; - - if (ctx->user_bufs) - return -EBUSY; - if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) - return -EINVAL; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); - if (ret) - return ret; - ret = io_buffers_map_alloc(ctx, nr_args); - if (ret) { - io_rsrc_data_free(data); - return ret; - } - - for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { - if (arg) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) - break; - ret = io_buffer_validate(&iov); - if (ret) - break; - } else { - memset(&iov, 0, sizeof(iov)); - } - - if (!iov.iov_base && *io_get_tag_slot(data, i)) { - ret = -EINVAL; - break; - } - - ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], - &last_hpage); - if (ret) - break; - } - - WARN_ON_ONCE(ctx->buf_data); - - ctx->buf_data = data; - if (ret) - __io_sqe_buffers_unregister(ctx); - else - io_rsrc_node_switch(ctx, NULL); - return ret; -} - -static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update2 *up, - unsigned int nr_args) -{ - u64 __user *tags = u64_to_user_ptr(up->tags); - struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); - struct page *last_hpage = NULL; - bool needs_switch = false; - __u32 done; - int i, err; - - if (!ctx->buf_data) - return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_bufs) - return -EINVAL; - - for (done = 0; done < nr_args; done++) { - struct io_mapped_ubuf *imu; - int offset = up->offset + done; - u64 tag = 0; - - err = io_copy_iov(ctx, &iov, iovs, done); - if (err) - break; - if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { - err = -EFAULT; - break; - } - err = io_buffer_validate(&iov); - if (err) - break; - if (!iov.iov_base && tag) { - err = -EINVAL; - break; - } - err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); - if (err) - break; - - i = array_index_nospec(offset, ctx->nr_user_bufs); - if (ctx->user_bufs[i] != ctx->dummy_ubuf) { - err = io_queue_rsrc_removal(ctx->buf_data, i, - ctx->rsrc_node, ctx->user_bufs[i]); - if (unlikely(err)) { - io_buffer_unmap(ctx, &imu); - break; - } - ctx->user_bufs[i] = NULL; - needs_switch = true; - } - - ctx->user_bufs[i] = imu; - *io_get_tag_slot(ctx->buf_data, offset) = tag; - } - - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->buf_data); - return done ? done : err; -} - -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int eventfd_async) -{ - struct io_ev_fd *ev_fd; - __s32 __user *fds = arg; - int fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) - return -EBUSY; - - if (copy_from_user(&fd, fds, sizeof(*fds))) - return -EFAULT; - - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); - if (!ev_fd) - return -ENOMEM; - - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ev_fd->cq_ev_fd)) { - int ret = PTR_ERR(ev_fd->cq_ev_fd); - kfree(ev_fd); - return ret; - } - ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - return 0; -} - -static void io_eventfd_put(struct rcu_head *rcu) -{ - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); -} - -static int io_eventfd_unregister(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) { - ctx->has_evfd = false; - rcu_assign_pointer(ctx->io_ev_fd, NULL); - call_rcu(&ev_fd->rcu, io_eventfd_put); - return 0; - } - - return -ENXIO; -} - -static void io_destroy_buffers(struct io_ring_ctx *ctx) -{ - struct io_buffer_list *bl; - unsigned long index; - int i; - - for (i = 0; i < BGID_ARRAY; i++) { - if (!ctx->io_bl) - break; - __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); - } - - xa_for_each(&ctx->io_bl_xa, index, bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - __io_remove_buffers(ctx, bl, -1U); - kfree(bl); - } - - while (!list_empty(&ctx->io_buffers_pages)) { - struct page *page; - - page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); - list_del_init(&page->lru); - __free_page(page); - } -} - -static void io_req_caches_free(struct io_ring_ctx *ctx) -{ - struct io_submit_state *state = &ctx->submit_state; - int nr = 0; - - mutex_lock(&ctx->uring_lock); - io_flush_cached_locked_reqs(ctx, state); - - while (!io_req_cache_empty(ctx)) { - struct io_wq_work_node *node; - struct io_kiocb *req; - - node = wq_stack_extract(&state->free_list); - req = container_of(node, struct io_kiocb, comp_list); - kmem_cache_free(req_cachep, req); - nr++; - } - if (nr) - percpu_ref_put_many(&ctx->refs, nr); - mutex_unlock(&ctx->uring_lock); -} - -static void io_wait_rsrc_data(struct io_rsrc_data *data) -{ - if (data && !atomic_dec_and_test(&data->refs)) - wait_for_completion(&data->done); -} - -static void io_flush_apoll_cache(struct io_ring_ctx *ctx) -{ - struct async_poll *apoll; - - while (!list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del(&apoll->poll.wait.entry); - kfree(apoll); - } -} - -static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) -{ - io_sq_thread_finish(ctx); - - if (ctx->mm_account) { - mmdrop(ctx->mm_account); - ctx->mm_account = NULL; - } - - io_rsrc_refs_drop(ctx); - /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ - io_wait_rsrc_data(ctx->buf_data); - io_wait_rsrc_data(ctx->file_data); - - mutex_lock(&ctx->uring_lock); - if (ctx->buf_data) - __io_sqe_buffers_unregister(ctx); - if (ctx->file_data) - __io_sqe_files_unregister(ctx); - if (ctx->rings) - __io_cqring_overflow_flush(ctx, true); - io_eventfd_unregister(ctx); - io_flush_apoll_cache(ctx); - mutex_unlock(&ctx->uring_lock); - io_destroy_buffers(ctx); - if (ctx->sq_creds) - put_cred(ctx->sq_creds); - - /* there are no registered resources left, nobody uses it */ - if (ctx->rsrc_node) - io_rsrc_node_destroy(ctx->rsrc_node); - if (ctx->rsrc_backup_node) - io_rsrc_node_destroy(ctx->rsrc_backup_node); - flush_delayed_work(&ctx->rsrc_put_work); - flush_delayed_work(&ctx->fallback_work); - - WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); - WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); - -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - ctx->ring_sock->file = NULL; /* so that iput() is called */ - sock_release(ctx->ring_sock); - } -#endif - WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); - - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); - - percpu_ref_exit(&ctx->refs); - free_uid(ctx->user); - io_req_caches_free(ctx); - if (ctx->hash_map) - io_wq_put_hash(ctx->hash_map); - kfree(ctx->cancel_hash); - kfree(ctx->dummy_ubuf); - kfree(ctx->io_bl); - xa_destroy(&ctx->io_bl_xa); - kfree(ctx); -} - -static __poll_t io_uring_poll(struct file *file, poll_table *wait) -{ - struct io_ring_ctx *ctx = file->private_data; - __poll_t mask = 0; - - poll_wait(file, &ctx->cq_wait, wait); - /* - * synchronizes with barrier from wq_has_sleeper call in - * io_commit_cqring - */ - smp_rmb(); - if (!io_sqring_full(ctx)) - mask |= EPOLLOUT | EPOLLWRNORM; - - /* - * Don't flush cqring overflow list here, just do a simple check. - * Otherwise there could possible be ABBA deadlock: - * CPU0 CPU1 - * ---- ---- - * lock(&ctx->uring_lock); - * lock(&ep->mtx); - * lock(&ctx->uring_lock); - * lock(&ep->mtx); - * - * Users may get EPOLLIN meanwhile seeing nothing in cqring, this - * pushs them to do the flush. - */ - if (io_cqring_events(ctx) || - test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) - mask |= EPOLLIN | EPOLLRDNORM; - - return mask; -} - -static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) -{ - const struct cred *creds; - - creds = xa_erase(&ctx->personalities, id); - if (creds) { - put_cred(creds); - return 0; - } - - return -EINVAL; -} - -struct io_tctx_exit { - struct callback_head task_work; - struct completion completion; - struct io_ring_ctx *ctx; -}; - -static __cold void io_tctx_exit_cb(struct callback_head *cb) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_exit *work; - - work = container_of(cb, struct io_tctx_exit, task_work); - /* - * When @in_idle, we're in cancellation and it's racy to remove the - * node. It'll be removed by the end of cancellation, just ignore it. - */ - if (!atomic_read(&tctx->in_idle)) - io_uring_del_tctx_node((unsigned long)work->ctx); - complete(&work->completion); -} - -static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - return req->ctx == data; -} - -static __cold void io_ring_exit_work(struct work_struct *work) -{ - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); - unsigned long timeout = jiffies + HZ * 60 * 5; - unsigned long interval = HZ / 20; - struct io_tctx_exit exit; - struct io_tctx_node *node; - int ret; - - /* - * If we're doing polled IO and end up having requests being - * submitted async (out-of-line), then completions can come in while - * we're waiting for refs to drop. We need to reap these manually, - * as nobody else will be looking for them. - */ - do { - io_uring_try_cancel_requests(ctx, NULL, true); - if (ctx->sq_data) { - struct io_sq_data *sqd = ctx->sq_data; - struct task_struct *tsk; - - io_sq_thread_park(sqd); - tsk = sqd->thread; - if (tsk && tsk->io_uring && tsk->io_uring->io_wq) - io_wq_cancel_cb(tsk->io_uring->io_wq, - io_cancel_ctx_cb, ctx, true); - io_sq_thread_unpark(sqd); - } - - io_req_caches_free(ctx); - - if (WARN_ON_ONCE(time_after(jiffies, timeout))) { - /* there is little hope left, don't run it too often */ - interval = HZ * 60; - } - } while (!wait_for_completion_timeout(&ctx->ref_comp, interval)); - - init_completion(&exit.completion); - init_task_work(&exit.task_work, io_tctx_exit_cb); - exit.ctx = ctx; - /* - * Some may use context even when all refs and requests have been put, - * and they are free to do so while still holding uring_lock or - * completion_lock, see io_req_task_submit(). Apart from other work, - * this lock/unlock section also waits them to finish. - */ - mutex_lock(&ctx->uring_lock); - while (!list_empty(&ctx->tctx_list)) { - WARN_ON_ONCE(time_after(jiffies, timeout)); - - node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, - ctx_node); - /* don't spin on a single task if cancellation failed */ - list_rotate_left(&ctx->tctx_list); - ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); - if (WARN_ON_ONCE(ret)) - continue; - - mutex_unlock(&ctx->uring_lock); - wait_for_completion(&exit.completion); - mutex_lock(&ctx->uring_lock); - } - mutex_unlock(&ctx->uring_lock); - spin_lock(&ctx->completion_lock); - spin_unlock(&ctx->completion_lock); - - io_ring_ctx_free(ctx); -} - -/* Returns true if we found and killed one or more timeouts */ -static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, - struct task_struct *tsk, bool cancel_all) -{ - struct io_kiocb *req, *tmp; - int canceled = 0; - - spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { - if (io_match_task(req, tsk, cancel_all)) { - io_kill_timeout(req, -ECANCELED); - canceled++; - } - } - spin_unlock_irq(&ctx->timeout_lock); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (canceled != 0) - io_cqring_ev_posted(ctx); - return canceled != 0; -} - -static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) -{ - unsigned long index; - struct creds *creds; - - mutex_lock(&ctx->uring_lock); - percpu_ref_kill(&ctx->refs); - if (ctx->rings) - __io_cqring_overflow_flush(ctx, true); - xa_for_each(&ctx->personalities, index, creds) - io_unregister_personality(ctx, index); - mutex_unlock(&ctx->uring_lock); - - /* failed during ring init, it couldn't have issued any requests */ - if (ctx->rings) { - io_kill_timeouts(ctx, NULL, true); - io_poll_remove_all(ctx, NULL, true); - /* if we failed setting up the ctx, we might not have any rings */ - io_iopoll_try_reap_events(ctx); - } - - INIT_WORK(&ctx->exit_work, io_ring_exit_work); - /* - * Use system_unbound_wq to avoid spawning tons of event kworkers - * if we're exiting a ton of rings at the same time. It just adds - * noise and overhead, there's no discernable change in runtime - * over using system_wq. - */ - queue_work(system_unbound_wq, &ctx->exit_work); -} - -static int io_uring_release(struct inode *inode, struct file *file) -{ - struct io_ring_ctx *ctx = file->private_data; - - file->private_data = NULL; - io_ring_ctx_wait_and_kill(ctx); - return 0; -} - -struct io_task_cancel { - struct task_struct *task; - bool all; -}; - -static bool io_cancel_task_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_task_cancel *cancel = data; - - return io_match_task_safe(req, cancel->task, cancel->all); -} - -static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all) -{ - struct io_defer_entry *de; - LIST_HEAD(list); - - spin_lock(&ctx->completion_lock); - list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_task_safe(de->req, task, cancel_all)) { - list_cut_position(&list, &ctx->defer_list, &de->list); - break; - } - } - spin_unlock(&ctx->completion_lock); - if (list_empty(&list)) - return false; - - while (!list_empty(&list)) { - de = list_first_entry(&list, struct io_defer_entry, list); - list_del_init(&de->list); - io_req_complete_failed(de->req, -ECANCELED); - kfree(de); - } - return true; -} - -static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) -{ - struct io_tctx_node *node; - enum io_wq_cancel cret; - bool ret = false; - - mutex_lock(&ctx->uring_lock); - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - /* - * io_wq will stay alive while we hold uring_lock, because it's - * killed after ctx nodes, which requires to take the lock. - */ - if (!tctx || !tctx->io_wq) - continue; - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } - mutex_unlock(&ctx->uring_lock); - - return ret; -} - -static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all) -{ - struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; - struct io_uring_task *tctx = task ? task->io_uring : NULL; - - /* failed during ring init, it couldn't have issued any requests */ - if (!ctx->rings) - return; - - while (1) { - enum io_wq_cancel cret; - bool ret = false; - - if (!task) { - ret |= io_uring_try_cancel_iowq(ctx); - } else if (tctx && tctx->io_wq) { - /* - * Cancels requests of all rings, not only @ctx, but - * it's fine as the task is in exit/exec. - */ - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, - &cancel, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } - - /* SQPOLL thread does its own polling */ - if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || - (ctx->sq_data && ctx->sq_data->thread == current)) { - while (!wq_list_empty(&ctx->iopoll_list)) { - io_iopoll_try_reap_events(ctx); - ret = true; - } - } - - ret |= io_cancel_defer_files(ctx, task, cancel_all); - ret |= io_poll_remove_all(ctx, task, cancel_all); - ret |= io_kill_timeouts(ctx, task, cancel_all); - if (task) - ret |= io_run_task_work(); - if (!ret) - break; - cond_resched(); - } -} - -static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; - int ret; - - if (unlikely(!tctx)) { - ret = io_uring_alloc_task_context(current, ctx); - if (unlikely(ret)) - return ret; - - tctx = current->io_uring; - if (ctx->iowq_limits_set) { - unsigned int limits[2] = { ctx->iowq_limits[0], - ctx->iowq_limits[1], }; - - ret = io_wq_max_workers(tctx->io_wq, limits); - if (ret) - return ret; - } - } - if (!xa_load(&tctx->xa, (unsigned long)ctx)) { - node = kmalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - node->ctx = ctx; - node->task = current; - - ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, - node, GFP_KERNEL)); - if (ret) { - kfree(node); - return ret; - } - - mutex_lock(&ctx->uring_lock); - list_add(&node->ctx_node, &ctx->tctx_list); - mutex_unlock(&ctx->uring_lock); - } - tctx->last = ctx; - return 0; -} - -/* - * Note that this task has used io_uring. We use it for cancelation purposes. - */ -static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx = current->io_uring; - - if (likely(tctx && tctx->last == ctx)) - return 0; - return __io_uring_add_tctx_node(ctx); -} - -/* - * Remove this io_uring_file -> task mapping. - */ -static __cold void io_uring_del_tctx_node(unsigned long index) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; - - if (!tctx) - return; - node = xa_erase(&tctx->xa, index); - if (!node) - return; - - WARN_ON_ONCE(current != node->task); - WARN_ON_ONCE(list_empty(&node->ctx_node)); - - mutex_lock(&node->ctx->uring_lock); - list_del(&node->ctx_node); - mutex_unlock(&node->ctx->uring_lock); - - if (tctx->last == node->ctx) - tctx->last = NULL; - kfree(node); -} - -static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) -{ - struct io_wq *wq = tctx->io_wq; - struct io_tctx_node *node; - unsigned long index; - - xa_for_each(&tctx->xa, index, node) { - io_uring_del_tctx_node(index); - cond_resched(); - } - if (wq) { - /* - * Must be after io_uring_del_tctx_node() (removes nodes under - * uring_lock) to avoid race with io_uring_try_cancel_iowq(). - */ - io_wq_put_and_exit(wq); - tctx->io_wq = NULL; - } -} - -static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) -{ - if (tracked) - return atomic_read(&tctx->inflight_tracked); - return percpu_counter_sum(&tctx->inflight); -} - -/* - * Find any io_uring ctx that this task has registered or done IO on, and cancel - * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. - */ -static __cold void io_uring_cancel_generic(bool cancel_all, - struct io_sq_data *sqd) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_ring_ctx *ctx; - s64 inflight; - DEFINE_WAIT(wait); - - WARN_ON_ONCE(sqd && sqd->thread != current); - - if (!current->io_uring) - return; - if (tctx->io_wq) - io_wq_exit_start(tctx->io_wq); - - atomic_inc(&tctx->in_idle); - do { - io_uring_drop_tctx_refs(current); - /* read completions before cancelations */ - inflight = tctx_inflight(tctx, !cancel_all); - if (!inflight) - break; - - if (!sqd) { - struct io_tctx_node *node; - unsigned long index; - - xa_for_each(&tctx->xa, index, node) { - /* sqpoll task will cancel all its requests */ - if (node->ctx->sq_data) - continue; - io_uring_try_cancel_requests(node->ctx, current, - cancel_all); - } - } else { - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_uring_try_cancel_requests(ctx, current, - cancel_all); - } - - prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); - io_run_task_work(); - io_uring_drop_tctx_refs(current); - - /* - * If we've seen completions, retry without waiting. This - * avoids a race where a completion comes in before we did - * prepare_to_wait(). - */ - if (inflight == tctx_inflight(tctx, !cancel_all)) - schedule(); - finish_wait(&tctx->wait, &wait); - } while (1); - - io_uring_clean_tctx(tctx); - if (cancel_all) { - /* - * We shouldn't run task_works after cancel, so just leave - * ->in_idle set for normal exit. - */ - atomic_dec(&tctx->in_idle); - /* for exec all current's requests should be gone, kill tctx */ - __io_uring_free(current); - } -} - -void __io_uring_cancel(bool cancel_all) -{ - io_uring_cancel_generic(cancel_all, NULL); -} - -void io_uring_unreg_ringfd(void) -{ - struct io_uring_task *tctx = current->io_uring; - int i; - - for (i = 0; i < IO_RINGFD_REG_MAX; i++) { - if (tctx->registered_rings[i]) { - fput(tctx->registered_rings[i]); - tctx->registered_rings[i] = NULL; - } - } -} - -static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, - int start, int end) -{ - struct file *file; - int offset; - - for (offset = start; offset < end; offset++) { - offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); - if (tctx->registered_rings[offset]) - continue; - - file = fget(fd); - if (!file) { - return -EBADF; - } else if (file->f_op != &io_uring_fops) { - fput(file); - return -EOPNOTSUPP; - } - tctx->registered_rings[offset] = file; - return offset; - } - - return -EBUSY; -} - -/* - * Register a ring fd to avoid fdget/fdput for each io_uring_enter() - * invocation. User passes in an array of struct io_uring_rsrc_update - * with ->data set to the ring_fd, and ->offset given for the desired - * index. If no index is desired, application may set ->offset == -1U - * and we'll find an available index. Returns number of entries - * successfully processed, or < 0 on error if none were processed. - */ -static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update __user *arg = __arg; - struct io_uring_rsrc_update reg; - struct io_uring_task *tctx; - int ret, i; - - if (!nr_args || nr_args > IO_RINGFD_REG_MAX) - return -EINVAL; - - mutex_unlock(&ctx->uring_lock); - ret = io_uring_add_tctx_node(ctx); - mutex_lock(&ctx->uring_lock); - if (ret) - return ret; - - tctx = current->io_uring; - for (i = 0; i < nr_args; i++) { - int start, end; - - if (copy_from_user(®, &arg[i], sizeof(reg))) { - ret = -EFAULT; - break; - } - - if (reg.resv) { - ret = -EINVAL; - break; - } - - if (reg.offset == -1U) { - start = 0; - end = IO_RINGFD_REG_MAX; - } else { - if (reg.offset >= IO_RINGFD_REG_MAX) { - ret = -EINVAL; - break; - } - start = reg.offset; - end = start + 1; - } - - ret = io_ring_add_registered_fd(tctx, reg.data, start, end); - if (ret < 0) - break; - - reg.offset = ret; - if (copy_to_user(&arg[i], ®, sizeof(reg))) { - fput(tctx->registered_rings[reg.offset]); - tctx->registered_rings[reg.offset] = NULL; - ret = -EFAULT; - break; - } - } - - return i ? i : ret; -} - -static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update __user *arg = __arg; - struct io_uring_task *tctx = current->io_uring; - struct io_uring_rsrc_update reg; - int ret = 0, i; - - if (!nr_args || nr_args > IO_RINGFD_REG_MAX) - return -EINVAL; - if (!tctx) - return 0; - - for (i = 0; i < nr_args; i++) { - if (copy_from_user(®, &arg[i], sizeof(reg))) { - ret = -EFAULT; - break; - } - if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { - ret = -EINVAL; - break; - } - - reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); - if (tctx->registered_rings[reg.offset]) { - fput(tctx->registered_rings[reg.offset]); - tctx->registered_rings[reg.offset] = NULL; - } - } - - return i ? i : ret; -} - -static void *io_uring_validate_mmap_request(struct file *file, - loff_t pgoff, size_t sz) -{ - struct io_ring_ctx *ctx = file->private_data; - loff_t offset = pgoff << PAGE_SHIFT; - struct page *page; - void *ptr; - - switch (offset) { - case IORING_OFF_SQ_RING: - case IORING_OFF_CQ_RING: - ptr = ctx->rings; - break; - case IORING_OFF_SQES: - ptr = ctx->sq_sqes; - break; - default: - return ERR_PTR(-EINVAL); - } - - page = virt_to_head_page(ptr); - if (sz > page_size(page)) - return ERR_PTR(-EINVAL); - - return ptr; -} - -#ifdef CONFIG_MMU - -static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - size_t sz = vma->vm_end - vma->vm_start; - unsigned long pfn; - void *ptr; - - ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - pfn = virt_to_phys(ptr) >> PAGE_SHIFT; - return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); -} - -#else /* !CONFIG_MMU */ - -static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL; -} - -static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) -{ - return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; -} - -static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - void *ptr; - - ptr = io_uring_validate_mmap_request(file, pgoff, len); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - return (unsigned long) ptr; -} - -#endif /* !CONFIG_MMU */ - -static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) -{ - DEFINE_WAIT(wait); - - do { - if (!io_sqring_full(ctx)) - break; - prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); - - if (!io_sqring_full(ctx)) - break; - schedule(); - } while (!signal_pending(current)); - - finish_wait(&ctx->sqo_sq_wait, &wait); - return 0; -} - -static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) -{ - if (flags & IORING_ENTER_EXT_ARG) { - struct io_uring_getevents_arg arg; - - if (argsz != sizeof(arg)) - return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) - return -EFAULT; - } - return 0; -} - -static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, - struct __kernel_timespec __user **ts, - const sigset_t __user **sig) -{ - struct io_uring_getevents_arg arg; - - /* - * If EXT_ARG isn't set, then we have no timespec and the argp pointer - * is just a pointer to the sigset_t. - */ - if (!(flags & IORING_ENTER_EXT_ARG)) { - *sig = (const sigset_t __user *) argp; - *ts = NULL; - return 0; - } - - /* - * EXT_ARG is set - ensure we agree on the size of it and copy in our - * timespec and sigset_t pointers if good. - */ - if (*argsz != sizeof(arg)) - return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) - return -EFAULT; - if (arg.pad) - return -EINVAL; - *sig = u64_to_user_ptr(arg.sigmask); - *argsz = arg.sigmask_sz; - *ts = u64_to_user_ptr(arg.ts); - return 0; -} - -SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, - u32, min_complete, u32, flags, const void __user *, argp, - size_t, argsz) -{ - struct io_ring_ctx *ctx; - struct fd f; - long ret; - - io_run_task_work(); - - if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | - IORING_ENTER_REGISTERED_RING))) - return -EINVAL; - - /* - * Ring fd has been registered via IORING_REGISTER_RING_FDS, we - * need only dereference our task private array to find it. - */ - if (flags & IORING_ENTER_REGISTERED_RING) { - struct io_uring_task *tctx = current->io_uring; - - if (!tctx || fd >= IO_RINGFD_REG_MAX) - return -EINVAL; - fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); - f.file = tctx->registered_rings[fd]; - f.flags = 0; - } else { - f = fdget(fd); - } - - if (unlikely(!f.file)) - return -EBADF; - - ret = -EOPNOTSUPP; - if (unlikely(f.file->f_op != &io_uring_fops)) - goto out_fput; - - ret = -ENXIO; - ctx = f.file->private_data; - if (unlikely(!percpu_ref_tryget(&ctx->refs))) - goto out_fput; - - ret = -EBADFD; - if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) - goto out; - - /* - * For SQ polling, the thread will do all submissions and completions. - * Just return the requested submit count, and wake the thread if - * we were asked to. - */ - ret = 0; - if (ctx->flags & IORING_SETUP_SQPOLL) { - io_cqring_overflow_flush(ctx); - - if (unlikely(ctx->sq_data->thread == NULL)) { - ret = -EOWNERDEAD; - goto out; - } - if (flags & IORING_ENTER_SQ_WAKEUP) - wake_up(&ctx->sq_data->wait); - if (flags & IORING_ENTER_SQ_WAIT) { - ret = io_sqpoll_wait_sq(ctx); - if (ret) - goto out; - } - ret = to_submit; - } else if (to_submit) { - ret = io_uring_add_tctx_node(ctx); - if (unlikely(ret)) - goto out; - - mutex_lock(&ctx->uring_lock); - ret = io_submit_sqes(ctx, to_submit); - if (ret != to_submit) { - mutex_unlock(&ctx->uring_lock); - goto out; - } - if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll) - goto iopoll_locked; - mutex_unlock(&ctx->uring_lock); - } - if (flags & IORING_ENTER_GETEVENTS) { - int ret2; - if (ctx->syscall_iopoll) { - /* - * We disallow the app entering submit/complete with - * polling, but we still need to lock the ring to - * prevent racing with polled issue that got punted to - * a workqueue. - */ - mutex_lock(&ctx->uring_lock); -iopoll_locked: - ret2 = io_validate_ext_arg(flags, argp, argsz); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); - ret2 = io_iopoll_check(ctx, min_complete); - } - mutex_unlock(&ctx->uring_lock); - } else { - const sigset_t __user *sig; - struct __kernel_timespec __user *ts; - - ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); - ret2 = io_cqring_wait(ctx, min_complete, sig, - argsz, ts); - } - } - - if (!ret) { - ret = ret2; - - /* - * EBADR indicates that one or more CQE were dropped. - * Once the user has been informed we can clear the bit - * as they are obviously ok with those drops. - */ - if (unlikely(ret2 == -EBADR)) - clear_bit(IO_CHECK_CQ_DROPPED_BIT, - &ctx->check_cq); - } - } - -out: - percpu_ref_put(&ctx->refs); -out_fput: - fdput(f); - return ret; -} - -#ifdef CONFIG_PROC_FS -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) -{ - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - unsigned __capi; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, "\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? " " : "", - from_kgid_munged(uns, gi->gid[g])); - } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - CAP_FOR_EACH_U32(__capi) - seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); - seq_putc(m, '\n'); - return 0; -} - -static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, - struct seq_file *m) -{ - struct io_sq_data *sq = NULL; - struct io_overflow_cqe *ocqe; - struct io_rings *r = ctx->rings; - unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; - unsigned int sq_head = READ_ONCE(r->sq.head); - unsigned int sq_tail = READ_ONCE(r->sq.tail); - unsigned int cq_head = READ_ONCE(r->cq.head); - unsigned int cq_tail = READ_ONCE(r->cq.tail); - unsigned int cq_shift = 0; - unsigned int sq_entries, cq_entries; - bool has_lock; - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - unsigned int i; - - if (is_cqe32) - cq_shift = 1; - - /* - * we may get imprecise sqe and cqe info if uring is actively running - * since we get cached_sq_head and cached_cq_tail without uring_lock - * and sq_tail and cq_head are changed by userspace. But it's ok since - * we usually use these info when it is stuck. - */ - seq_printf(m, "SqMask:\t0x%x\n", sq_mask); - seq_printf(m, "SqHead:\t%u\n", sq_head); - seq_printf(m, "SqTail:\t%u\n", sq_tail); - seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); - seq_printf(m, "CqMask:\t0x%x\n", cq_mask); - seq_printf(m, "CqHead:\t%u\n", cq_head); - seq_printf(m, "CqTail:\t%u\n", cq_tail); - seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); - seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); - sq_entries = min(sq_tail - sq_head, ctx->sq_entries); - for (i = 0; i < sq_entries; i++) { - unsigned int entry = i + sq_head; - unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); - struct io_uring_sqe *sqe; - - if (sq_idx > sq_mask) - continue; - sqe = &ctx->sq_sqes[sq_idx]; - seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", - sq_idx, sqe->opcode, sqe->fd, sqe->flags, - sqe->user_data); - } - seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); - cq_entries = min(cq_tail - cq_head, ctx->cq_entries); - for (i = 0; i < cq_entries; i++) { - unsigned int entry = i + cq_head; - struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; - - if (!is_cqe32) { - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", - entry & cq_mask, cqe->user_data, cqe->res, - cqe->flags); - } else { - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " - "extra1:%llu, extra2:%llu\n", - entry & cq_mask, cqe->user_data, cqe->res, - cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); - } - } - - /* - * Avoid ABBA deadlock between the seq lock and the io_uring mutex, - * since fdinfo case grabs it in the opposite direction of normal use - * cases. If we fail to get the lock, we just don't iterate any - * structures that could be going away outside the io_uring mutex. - */ - has_lock = mutex_trylock(&ctx->uring_lock); - - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { - sq = ctx->sq_data; - if (!sq->thread) - sq = NULL; - } - - seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); - seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); - seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); - for (i = 0; has_lock && i < ctx->nr_user_files; i++) { - struct file *f = io_file_from_index(ctx, i); - - if (f) - seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); - else - seq_printf(m, "%5u: <none>\n", i); - } - seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); - for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *buf = ctx->user_bufs[i]; - unsigned int len = buf->ubuf_end - buf->ubuf; - - seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); - } - if (has_lock && !xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } - if (has_lock) - mutex_unlock(&ctx->uring_lock); - - seq_puts(m, "PollList:\n"); - spin_lock(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list = &ctx->cancel_hash[i]; - struct io_kiocb *req; - - hlist_for_each_entry(req, list, hash_node) - seq_printf(m, " op=%d, task_works=%d\n", req->opcode, - task_work_pending(req->task)); - } - - seq_puts(m, "CqOverflowList:\n"); - list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { - struct io_uring_cqe *cqe = &ocqe->cqe; - - seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", - cqe->user_data, cqe->res, cqe->flags); - - } - - spin_unlock(&ctx->completion_lock); -} - -static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct io_ring_ctx *ctx = f->private_data; - - if (percpu_ref_tryget(&ctx->refs)) { - __io_uring_show_fdinfo(ctx, m); - percpu_ref_put(&ctx->refs); - } -} -#endif - -static const struct file_operations io_uring_fops = { - .release = io_uring_release, - .mmap = io_uring_mmap, -#ifndef CONFIG_MMU - .get_unmapped_area = io_uring_nommu_get_unmapped_area, - .mmap_capabilities = io_uring_nommu_mmap_capabilities, -#endif - .poll = io_uring_poll, -#ifdef CONFIG_PROC_FS - .show_fdinfo = io_uring_show_fdinfo, -#endif -}; - -static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, - struct io_uring_params *p) -{ - struct io_rings *rings; - size_t size, sq_array_offset; - - /* make sure these are sane, as we already accounted them */ - ctx->sq_entries = p->sq_entries; - ctx->cq_entries = p->cq_entries; - - size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); - if (size == SIZE_MAX) - return -EOVERFLOW; - - rings = io_mem_alloc(size); - if (!rings) - return -ENOMEM; - - ctx->rings = rings; - ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); - rings->sq_ring_mask = p->sq_entries - 1; - rings->cq_ring_mask = p->cq_entries - 1; - rings->sq_ring_entries = p->sq_entries; - rings->cq_ring_entries = p->cq_entries; - - if (p->flags & IORING_SETUP_SQE128) - size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); - else - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); - if (size == SIZE_MAX) { - io_mem_free(ctx->rings); - ctx->rings = NULL; - return -EOVERFLOW; - } - - ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { - io_mem_free(ctx->rings); - ctx->rings = NULL; - return -ENOMEM; - } - - return 0; -} - -static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) -{ - int ret, fd; - - fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); - if (fd < 0) - return fd; - - ret = io_uring_add_tctx_node(ctx); - if (ret) { - put_unused_fd(fd); - return ret; - } - fd_install(fd, file); - return fd; -} - -/* - * Allocate an anonymous fd, this is what constitutes the application - * visible backing of an io_uring instance. The application mmaps this - * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, - * we have to tie this fd to a socket for file garbage collection purposes. - */ -static struct file *io_uring_get_file(struct io_ring_ctx *ctx) -{ - struct file *file; -#if defined(CONFIG_UNIX) - int ret; - - ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, - &ctx->ring_sock); - if (ret) - return ERR_PTR(ret); -#endif - - file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, - O_RDWR | O_CLOEXEC, NULL); -#if defined(CONFIG_UNIX) - if (IS_ERR(file)) { - sock_release(ctx->ring_sock); - ctx->ring_sock = NULL; - } else { - ctx->ring_sock->file = file; - } -#endif - return file; -} - -static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, - struct io_uring_params __user *params) -{ - struct io_ring_ctx *ctx; - struct file *file; - int ret; - - if (!entries) - return -EINVAL; - if (entries > IORING_MAX_ENTRIES) { - if (!(p->flags & IORING_SETUP_CLAMP)) - return -EINVAL; - entries = IORING_MAX_ENTRIES; - } - - /* - * Use twice as many entries for the CQ ring. It's possible for the - * application to drive a higher depth than the size of the SQ ring, - * since the sqes are only used at submission time. This allows for - * some flexibility in overcommitting a bit. If the application has - * set IORING_SETUP_CQSIZE, it will have passed in the desired number - * of CQ ring entries manually. - */ - p->sq_entries = roundup_pow_of_two(entries); - if (p->flags & IORING_SETUP_CQSIZE) { - /* - * If IORING_SETUP_CQSIZE is set, we do the same roundup - * to a power-of-two, if it isn't already. We do NOT impose - * any cq vs sq ring sizing. - */ - if (!p->cq_entries) - return -EINVAL; - if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { - if (!(p->flags & IORING_SETUP_CLAMP)) - return -EINVAL; - p->cq_entries = IORING_MAX_CQ_ENTRIES; - } - p->cq_entries = roundup_pow_of_two(p->cq_entries); - if (p->cq_entries < p->sq_entries) - return -EINVAL; - } else { - p->cq_entries = 2 * p->sq_entries; - } - - ctx = io_ring_ctx_alloc(p); - if (!ctx) - return -ENOMEM; - - /* - * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user - * space applications don't need to do io completion events - * polling again, they can rely on io_sq_thread to do polling - * work, which can reduce cpu usage and uring_lock contention. - */ - if (ctx->flags & IORING_SETUP_IOPOLL && - !(ctx->flags & IORING_SETUP_SQPOLL)) - ctx->syscall_iopoll = 1; - - ctx->compat = in_compat_syscall(); - if (!capable(CAP_IPC_LOCK)) - ctx->user = get_uid(current_user()); - - /* - * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if - * COOP_TASKRUN is set, then IPIs are never needed by the app. - */ - ret = -EINVAL; - if (ctx->flags & IORING_SETUP_SQPOLL) { - /* IPI related flags don't make sense with SQPOLL */ - if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | - IORING_SETUP_TASKRUN_FLAG)) - goto err; - ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { - ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else { - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - goto err; - ctx->notify_method = TWA_SIGNAL; - } - - /* - * This is just grabbed for accounting purposes. When a process exits, - * the mm is exited and dropped before the files, hence we need to hang - * on to this mm purely for the purposes of being able to unaccount - * memory (locked/pinned vm). It's not used for anything else. - */ - mmgrab(current->mm); - ctx->mm_account = current->mm; - - ret = io_allocate_scq_urings(ctx, p); - if (ret) - goto err; - - ret = io_sq_offload_create(ctx, p); - if (ret) - goto err; - /* always set a rsrc node */ - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto err; - io_rsrc_node_switch(ctx, NULL); - - memset(&p->sq_off, 0, sizeof(p->sq_off)); - p->sq_off.head = offsetof(struct io_rings, sq.head); - p->sq_off.tail = offsetof(struct io_rings, sq.tail); - p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); - p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); - p->sq_off.flags = offsetof(struct io_rings, sq_flags); - p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; - - memset(&p->cq_off, 0, sizeof(p->cq_off)); - p->cq_off.head = offsetof(struct io_rings, cq.head); - p->cq_off.tail = offsetof(struct io_rings, cq.tail); - p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); - p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); - p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); - p->cq_off.cqes = offsetof(struct io_rings, cqes); - p->cq_off.flags = offsetof(struct io_rings, cq_flags); - - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | - IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | - IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | - IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | - IORING_FEAT_LINKED_FILE; - - if (copy_to_user(params, p, sizeof(*p))) { - ret = -EFAULT; - goto err; - } - - file = io_uring_get_file(ctx); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto err; - } - - /* - * Install ring fd as the very last thing, so we don't risk someone - * having closed it before we finish setup - */ - ret = io_uring_install_fd(ctx, file); - if (ret < 0) { - /* fput will clean it up */ - fput(file); - return ret; - } - - trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); - return ret; -err: - io_ring_ctx_wait_and_kill(ctx); - return ret; -} - -/* - * Sets up an aio uring context, and returns the fd. Applications asks for a - * ring size, we return the actual sq/cq ring sizes (among other things) in the - * params structure passed in. - */ -static long io_uring_setup(u32 entries, struct io_uring_params __user *params) -{ - struct io_uring_params p; - int i; - - if (copy_from_user(&p, params, sizeof(p))) - return -EFAULT; - for (i = 0; i < ARRAY_SIZE(p.resv); i++) { - if (p.resv[i]) - return -EINVAL; - } - - if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | - IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | - IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_SQE128 | IORING_SETUP_CQE32)) - return -EINVAL; - - return io_uring_create(entries, &p, params); -} - -SYSCALL_DEFINE2(io_uring_setup, u32, entries, - struct io_uring_params __user *, params) -{ - return io_uring_setup(entries, params); -} - -static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) -{ - struct io_uring_probe *p; - size_t size; - int i, ret; - - size = struct_size(p, ops, nr_args); - if (size == SIZE_MAX) - return -EOVERFLOW; - p = kzalloc(size, GFP_KERNEL); - if (!p) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(p, arg, size)) - goto out; - ret = -EINVAL; - if (memchr_inv(p, 0, size)) - goto out; - - p->last_op = IORING_OP_LAST - 1; - if (nr_args > IORING_OP_LAST) - nr_args = IORING_OP_LAST; - - for (i = 0; i < nr_args; i++) { - p->ops[i].op = i; - if (!io_op_defs[i].not_supported) - p->ops[i].flags = IO_URING_OP_SUPPORTED; - } - p->ops_len = i; - - ret = 0; - if (copy_to_user(arg, p, size)) - ret = -EFAULT; -out: - kfree(p); - return ret; -} - -static int io_register_personality(struct io_ring_ctx *ctx) -{ - const struct cred *creds; - u32 id; - int ret; - - creds = get_current_cred(); - - ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, - XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); - if (ret < 0) { - put_cred(creds); - return ret; - } - return id; -} - -static __cold int io_register_restrictions(struct io_ring_ctx *ctx, - void __user *arg, unsigned int nr_args) -{ - struct io_uring_restriction *res; - size_t size; - int i, ret; - - /* Restrictions allowed only if rings started disabled */ - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) - return -EBADFD; - - /* We allow only a single restrictions registration */ - if (ctx->restrictions.registered) - return -EBUSY; - - if (!arg || nr_args > IORING_MAX_RESTRICTIONS) - return -EINVAL; - - size = array_size(nr_args, sizeof(*res)); - if (size == SIZE_MAX) - return -EOVERFLOW; - - res = memdup_user(arg, size); - if (IS_ERR(res)) - return PTR_ERR(res); - - ret = 0; - - for (i = 0; i < nr_args; i++) { - switch (res[i].opcode) { - case IORING_RESTRICTION_REGISTER_OP: - if (res[i].register_op >= IORING_REGISTER_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].register_op, - ctx->restrictions.register_op); - break; - case IORING_RESTRICTION_SQE_OP: - if (res[i].sqe_op >= IORING_OP_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); - break; - case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: - ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; - break; - case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: - ctx->restrictions.sqe_flags_required = res[i].sqe_flags; - break; - default: - ret = -EINVAL; - goto out; - } - } - -out: - /* Reset all restrictions if an error happened */ - if (ret != 0) - memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); - else - ctx->restrictions.registered = true; - - kfree(res); - return ret; -} - -static int io_register_enable_rings(struct io_ring_ctx *ctx) -{ - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) - return -EBADFD; - - if (ctx->restrictions.registered) - ctx->restricted = 1; - - ctx->flags &= ~IORING_SETUP_R_DISABLED; - if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) - wake_up(&ctx->sq_data->wait); - return 0; -} - -static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, - struct io_uring_rsrc_update2 *up, - unsigned nr_args) -{ - __u32 tmp; - int err; - - if (check_add_overflow(up->offset, nr_args, &tmp)) - return -EOVERFLOW; - err = io_rsrc_node_switch_start(ctx); - if (err) - return err; - - switch (type) { - case IORING_RSRC_FILE: - return __io_sqe_files_update(ctx, up, nr_args); - case IORING_RSRC_BUFFER: - return __io_sqe_buffers_update(ctx, up, nr_args); - } - return -EINVAL; -} - -static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update2 up; - - if (!nr_args) - return -EINVAL; - memset(&up, 0, sizeof(up)); - if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) - return -EFAULT; - if (up.resv || up.resv2) - return -EINVAL; - return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); -} - -static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned size, unsigned type) -{ - struct io_uring_rsrc_update2 up; - - if (size != sizeof(up)) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (!up.nr || up.resv || up.resv2) - return -EINVAL; - return __io_register_rsrc_update(ctx, type, &up, up.nr); -} - -static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, - unsigned int size, unsigned int type) -{ - struct io_uring_rsrc_register rr; - - /* keep it extendible */ - if (size != sizeof(rr)) - return -EINVAL; - - memset(&rr, 0, sizeof(rr)); - if (copy_from_user(&rr, arg, size)) - return -EFAULT; - if (!rr.nr || rr.resv2) - return -EINVAL; - if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) - return -EINVAL; - - switch (type) { - case IORING_RSRC_FILE: - if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) - break; - return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), - rr.nr, u64_to_user_ptr(rr.tags)); - case IORING_RSRC_BUFFER: - if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) - break; - return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), - rr.nr, u64_to_user_ptr(rr.tags)); - } - return -EINVAL; -} - -static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, - void __user *arg, unsigned len) -{ - struct io_uring_task *tctx = current->io_uring; - cpumask_var_t new_mask; - int ret; - - if (!tctx || !tctx->io_wq) - return -EINVAL; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_clear(new_mask); - if (len > cpumask_size()) - len = cpumask_size(); - - if (in_compat_syscall()) { - ret = compat_get_bitmap(cpumask_bits(new_mask), - (const compat_ulong_t __user *)arg, - len * 8 /* CHAR_BIT */); - } else { - ret = copy_from_user(new_mask, arg, len); - } - - if (ret) { - free_cpumask_var(new_mask); - return -EFAULT; - } - - ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); - free_cpumask_var(new_mask); - return ret; -} - -static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx = current->io_uring; - - if (!tctx || !tctx->io_wq) - return -EINVAL; - - return io_wq_cpu_affinity(tctx->io_wq, NULL); -} - -static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, - void __user *arg) - __must_hold(&ctx->uring_lock) -{ - struct io_tctx_node *node; - struct io_uring_task *tctx = NULL; - struct io_sq_data *sqd = NULL; - __u32 new_count[2]; - int i, ret; - - if (copy_from_user(new_count, arg, sizeof(new_count))) - return -EFAULT; - for (i = 0; i < ARRAY_SIZE(new_count); i++) - if (new_count[i] > INT_MAX) - return -EINVAL; - - if (ctx->flags & IORING_SETUP_SQPOLL) { - sqd = ctx->sq_data; - if (sqd) { - /* - * Observe the correct sqd->lock -> ctx->uring_lock - * ordering. Fine to drop uring_lock here, we hold - * a ref to the ctx. - */ - refcount_inc(&sqd->refs); - mutex_unlock(&ctx->uring_lock); - mutex_lock(&sqd->lock); - mutex_lock(&ctx->uring_lock); - if (sqd->thread) - tctx = sqd->thread->io_uring; - } - } else { - tctx = current->io_uring; - } - - BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); - - for (i = 0; i < ARRAY_SIZE(new_count); i++) - if (new_count[i]) - ctx->iowq_limits[i] = new_count[i]; - ctx->iowq_limits_set = true; - - if (tctx && tctx->io_wq) { - ret = io_wq_max_workers(tctx->io_wq, new_count); - if (ret) - goto err; - } else { - memset(new_count, 0, sizeof(new_count)); - } - - if (sqd) { - mutex_unlock(&sqd->lock); - io_put_sq_data(sqd); - } - - if (copy_to_user(arg, new_count, sizeof(new_count))) - return -EFAULT; - - /* that's it for SQPOLL, only the SQPOLL task creates requests */ - if (sqd) - return 0; - - /* now propagate the restriction to all registered users */ - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - if (WARN_ON_ONCE(!tctx->io_wq)) - continue; - - for (i = 0; i < ARRAY_SIZE(new_count); i++) - new_count[i] = ctx->iowq_limits[i]; - /* ignore errors, it always returns zero anyway */ - (void)io_wq_max_workers(tctx->io_wq, new_count); - } - return 0; -err: - if (sqd) { - mutex_unlock(&sqd->lock); - io_put_sq_data(sqd); - } - return ret; -} - -static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -{ - struct io_uring_buf_ring *br; - struct io_uring_buf_reg reg; - struct io_buffer_list *bl; - struct page **pages; - int nr_pages; - - if (copy_from_user(®, arg, sizeof(reg))) - return -EFAULT; - - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) - return -EINVAL; - if (!is_power_of_2(reg.ring_entries)) - return -EINVAL; - - /* cannot disambiguate full vs empty due to head/tail size */ - if (reg.ring_entries >= 65536) - return -EINVAL; - - if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { - int ret = io_init_bl_list(ctx); - if (ret) - return ret; - } - - bl = io_buffer_get_list(ctx, reg.bgid); - if (bl) { - /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) - return -EEXIST; - } else { - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return -ENOMEM; - } - - pages = io_pin_pages(reg.ring_addr, - struct_size(br, bufs, reg.ring_entries), - &nr_pages); - if (IS_ERR(pages)) { - kfree(bl); - return PTR_ERR(pages); - } - - br = page_address(pages[0]); - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->nr_entries = reg.ring_entries; - bl->buf_ring = br; - bl->mask = reg.ring_entries - 1; - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; -} - -static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -{ - struct io_uring_buf_reg reg; - struct io_buffer_list *bl; - - if (copy_from_user(®, arg, sizeof(reg))) - return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - - bl = io_buffer_get_list(ctx, reg.bgid); - if (!bl) - return -ENOENT; - if (!bl->buf_nr_pages) - return -EINVAL; - - __io_remove_buffers(ctx, bl, -1U); - if (bl->bgid >= BGID_ARRAY) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree(bl); - } - return 0; -} - -static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, - void __user *arg, unsigned nr_args) - __releases(ctx->uring_lock) - __acquires(ctx->uring_lock) -{ - int ret; - - /* - * We're inside the ring mutex, if the ref is already dying, then - * someone else killed the ctx or is already going through - * io_uring_register(). - */ - if (percpu_ref_is_dying(&ctx->refs)) - return -ENXIO; - - if (ctx->restricted) { - if (opcode >= IORING_REGISTER_LAST) - return -EINVAL; - opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); - if (!test_bit(opcode, ctx->restrictions.register_op)) - return -EACCES; - } - - switch (opcode) { - case IORING_REGISTER_BUFFERS: - ret = -EFAULT; - if (!arg) - break; - ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); - break; - case IORING_UNREGISTER_BUFFERS: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_sqe_buffers_unregister(ctx); - break; - case IORING_REGISTER_FILES: - ret = -EFAULT; - if (!arg) - break; - ret = io_sqe_files_register(ctx, arg, nr_args, NULL); - break; - case IORING_UNREGISTER_FILES: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_sqe_files_unregister(ctx); - break; - case IORING_REGISTER_FILES_UPDATE: - ret = io_register_files_update(ctx, arg, nr_args); - break; - case IORING_REGISTER_EVENTFD: - ret = -EINVAL; - if (nr_args != 1) - break; - ret = io_eventfd_register(ctx, arg, 0); - break; - case IORING_REGISTER_EVENTFD_ASYNC: - ret = -EINVAL; - if (nr_args != 1) - break; - ret = io_eventfd_register(ctx, arg, 1); - break; - case IORING_UNREGISTER_EVENTFD: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_eventfd_unregister(ctx); - break; - case IORING_REGISTER_PROBE: - ret = -EINVAL; - if (!arg || nr_args > 256) - break; - ret = io_probe(ctx, arg, nr_args); - break; - case IORING_REGISTER_PERSONALITY: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_register_personality(ctx); - break; - case IORING_UNREGISTER_PERSONALITY: - ret = -EINVAL; - if (arg) - break; - ret = io_unregister_personality(ctx, nr_args); - break; - case IORING_REGISTER_ENABLE_RINGS: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_register_enable_rings(ctx); - break; - case IORING_REGISTER_RESTRICTIONS: - ret = io_register_restrictions(ctx, arg, nr_args); - break; - case IORING_REGISTER_FILES2: - ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); - break; - case IORING_REGISTER_FILES_UPDATE2: - ret = io_register_rsrc_update(ctx, arg, nr_args, - IORING_RSRC_FILE); - break; - case IORING_REGISTER_BUFFERS2: - ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); - break; - case IORING_REGISTER_BUFFERS_UPDATE: - ret = io_register_rsrc_update(ctx, arg, nr_args, - IORING_RSRC_BUFFER); - break; - case IORING_REGISTER_IOWQ_AFF: - ret = -EINVAL; - if (!arg || !nr_args) - break; - ret = io_register_iowq_aff(ctx, arg, nr_args); - break; - case IORING_UNREGISTER_IOWQ_AFF: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_unregister_iowq_aff(ctx); - break; - case IORING_REGISTER_IOWQ_MAX_WORKERS: - ret = -EINVAL; - if (!arg || nr_args != 2) - break; - ret = io_register_iowq_max_workers(ctx, arg); - break; - case IORING_REGISTER_RING_FDS: - ret = io_ringfd_register(ctx, arg, nr_args); - break; - case IORING_UNREGISTER_RING_FDS: - ret = io_ringfd_unregister(ctx, arg, nr_args); - break; - case IORING_REGISTER_PBUF_RING: - ret = -EINVAL; - if (!arg || nr_args != 1) - break; - ret = io_register_pbuf_ring(ctx, arg); - break; - case IORING_UNREGISTER_PBUF_RING: - ret = -EINVAL; - if (!arg || nr_args != 1) - break; - ret = io_unregister_pbuf_ring(ctx, arg); - break; - default: - ret = -EINVAL; - break; - } - - return ret; -} - -SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, - void __user *, arg, unsigned int, nr_args) -{ - struct io_ring_ctx *ctx; - long ret = -EBADF; - struct fd f; - - f = fdget(fd); - if (!f.file) - return -EBADF; - - ret = -EOPNOTSUPP; - if (f.file->f_op != &io_uring_fops) - goto out_fput; - - ctx = f.file->private_data; - - io_run_task_work(); - - mutex_lock(&ctx->uring_lock); - ret = __io_uring_register(ctx, opcode, arg, nr_args); - mutex_unlock(&ctx->uring_lock); - trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); -out_fput: - fdput(f); - return ret; -} - -static int __init io_uring_init(void) -{ -#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ - BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ - BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ -} while (0) - -#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ - __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename) - BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64); - BUILD_BUG_SQE_ELEM(0, __u8, opcode); - BUILD_BUG_SQE_ELEM(1, __u8, flags); - BUILD_BUG_SQE_ELEM(2, __u16, ioprio); - BUILD_BUG_SQE_ELEM(4, __s32, fd); - BUILD_BUG_SQE_ELEM(8, __u64, off); - BUILD_BUG_SQE_ELEM(8, __u64, addr2); - BUILD_BUG_SQE_ELEM(16, __u64, addr); - BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); - BUILD_BUG_SQE_ELEM(24, __u32, len); - BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); - BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); - BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); - BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); - BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); - BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); - BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); - BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); - BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); - BUILD_BUG_SQE_ELEM(28, __u32, accept_flags); - BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags); - BUILD_BUG_SQE_ELEM(28, __u32, open_flags); - BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); - BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); - BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); - BUILD_BUG_SQE_ELEM(32, __u64, user_data); - BUILD_BUG_SQE_ELEM(40, __u16, buf_index); - BUILD_BUG_SQE_ELEM(40, __u16, buf_group); - BUILD_BUG_SQE_ELEM(42, __u16, personality); - BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); - BUILD_BUG_SQE_ELEM(44, __u32, file_index); - BUILD_BUG_SQE_ELEM(48, __u64, addr3); - - BUILD_BUG_ON(sizeof(struct io_uring_files_update) != - sizeof(struct io_uring_rsrc_update)); - BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > - sizeof(struct io_uring_rsrc_update2)); - - /* ->buf_index is u16 */ - BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); - BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); - BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); - BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != - offsetof(struct io_uring_buf_ring, tail)); - - /* should fit into one byte */ - BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); - BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); - BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); - - BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); - BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); - - BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); - - BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); - - req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT); - return 0; -}; -__initcall(io_uring_init); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d2a9f699e17e..2b82c7f1de88 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -44,20 +44,28 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio) static struct bio_set iomap_ioend_bioset; static struct iomap_page * -iomap_page_create(struct inode *inode, struct folio *folio) +iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags) { struct iomap_page *iop = to_iomap_page(folio); unsigned int nr_blocks = i_blocks_per_folio(inode, folio); + gfp_t gfp; if (iop || nr_blocks <= 1) return iop; + if (flags & IOMAP_NOWAIT) + gfp = GFP_NOWAIT; + else + gfp = GFP_NOFS | __GFP_NOFAIL; + iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), - GFP_NOFS | __GFP_NOFAIL); - spin_lock_init(&iop->uptodate_lock); - if (folio_test_uptodate(folio)) - bitmap_fill(iop->uptodate, nr_blocks); - folio_attach_private(folio, iop); + gfp); + if (iop) { + spin_lock_init(&iop->uptodate_lock); + if (folio_test_uptodate(folio)) + bitmap_fill(iop->uptodate, nr_blocks); + folio_attach_private(folio, iop); + } return iop; } @@ -154,9 +162,6 @@ static void iomap_iop_set_range_uptodate(struct folio *folio, static void iomap_set_range_uptodate(struct folio *folio, struct iomap_page *iop, size_t off, size_t len) { - if (folio_test_error(folio)) - return; - if (iop) iomap_iop_set_range_uptodate(folio, iop, off, len); else @@ -226,7 +231,7 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, if (WARN_ON_ONCE(size > iomap->length)) return -EIO; if (offset > 0) - iop = iomap_page_create(iter->inode, folio); + iop = iomap_page_create(iter->inode, folio, iter->flags); else iop = to_iomap_page(folio); @@ -264,7 +269,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(iter->inode, folio); + iop = iomap_page_create(iter->inode, folio, iter->flags); iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); if (plen == 0) goto done; @@ -492,31 +497,6 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) } EXPORT_SYMBOL_GPL(iomap_invalidate_folio); -#ifdef CONFIG_MIGRATION -int -iomap_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) -{ - struct folio *folio = page_folio(page); - struct folio *newfolio = page_folio(newpage); - int ret; - - ret = folio_migrate_mapping(mapping, newfolio, folio, 0); - if (ret != MIGRATEPAGE_SUCCESS) - return ret; - - if (folio_test_private(folio)) - folio_attach_private(newfolio, folio_detach_private(folio)); - - if (mode != MIGRATE_SYNC_NO_COPY) - folio_migrate_copy(newfolio, folio); - else - folio_migrate_flags(newfolio, folio); - return MIGRATEPAGE_SUCCESS; -} -EXPORT_SYMBOL_GPL(iomap_migrate_page); -#endif /* CONFIG_MIGRATION */ - static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { @@ -547,10 +527,11 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, size_t len, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct iomap_page *iop = iomap_page_create(iter->inode, folio); + struct iomap_page *iop; loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); + unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio); size_t from = offset_in_folio(folio, pos), to = from + len; size_t poff, plen; @@ -558,6 +539,10 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, return 0; folio_clear_error(folio); + iop = iomap_page_create(iter->inode, folio, iter->flags); + if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1) + return -EAGAIN; + do { iomap_adjust_read_range(iter->inode, folio, &block_start, block_end - block_start, &poff, &plen); @@ -574,7 +559,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, return -EIO; folio_zero_segments(folio, poff, from, to, poff + plen); } else { - int status = iomap_read_folio_sync(block_start, folio, + int status; + + if (iter->flags & IOMAP_NOWAIT) + return -EAGAIN; + + status = iomap_read_folio_sync(block_start, folio, poff, plen, srcmap); if (status) return status; @@ -603,6 +593,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; int status = 0; + if (iter->flags & IOMAP_NOWAIT) + fgp |= FGP_NOWAIT; + BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); if (srcmap != &iter->iomap) BUG_ON(pos + len > srcmap->offset + srcmap->length); @@ -622,7 +615,7 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, fgp, mapping_gfp_mask(iter->inode->i_mapping)); if (!folio) { - status = -ENOMEM; + status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; goto out_no_page; } if (pos + len > folio_pos(folio) + folio_size(folio)) @@ -740,6 +733,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) loff_t pos = iter->pos; ssize_t written = 0; long status = 0; + struct address_space *mapping = iter->inode->i_mapping; + unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; do { struct folio *folio; @@ -752,6 +747,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) bytes = min_t(unsigned long, PAGE_SIZE - offset, iov_iter_count(i)); again: + status = balance_dirty_pages_ratelimited_flags(mapping, + bdp_flags); + if (unlikely(status)) + break; + if (bytes > length) bytes = length; @@ -760,6 +760,10 @@ again: * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. + * + * For async buffered writes the assumption is that the user + * page has already been faulted in. This can be optimized by + * faulting the user page. */ if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { status = -EFAULT; @@ -771,7 +775,7 @@ again: break; page = folio_file_page(folio, pos >> PAGE_SHIFT); - if (mapping_writably_mapped(iter->inode->i_mapping)) + if (mapping_writably_mapped(mapping)) flush_dcache_page(page); copied = copy_page_from_iter_atomic(page, offset, bytes, i); @@ -796,10 +800,12 @@ again: pos += status; written += status; length -= status; - - balance_dirty_pages_ratelimited(iter->inode->i_mapping); } while (iov_iter_count(i) && length); + if (status == -EAGAIN) { + iov_iter_revert(i, written); + return -EAGAIN; + } return written ? written : status; } @@ -815,6 +821,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, }; int ret; + if (iocb->ki_flags & IOCB_NOWAIT) + iter.flags |= IOMAP_NOWAIT; + while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_write_iter(&iter, i); if (iter.pos == iocb->ki_pos) @@ -917,10 +926,10 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) pos += bytes; length -= bytes; written += bytes; - if (did_zero) - *did_zero = true; } while (length > 0); + if (did_zero) + *did_zero = true; return written; } @@ -1329,7 +1338,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, struct folio *folio, u64 end_pos) { - struct iomap_page *iop = iomap_page_create(inode, folio); + struct iomap_page *iop = iomap_page_create(inode, folio, 0); struct iomap_ioend *ioend, *next; unsigned len = i_blocksize(inode); unsigned nblocks = i_blocks_per_folio(inode, folio); @@ -1478,10 +1487,10 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) pgoff_t end_index = isize >> PAGE_SHIFT; /* - * Skip the page if it's fully outside i_size, e.g. due to a - * truncate operation that's in progress. We must redirty the - * page so that reclaim stops reclaiming it. Otherwise - * iomap_release_folio() is called on it and gets confused. + * Skip the page if it's fully outside i_size, e.g. + * due to a truncate operation that's in progress. We've + * cleaned this page and truncate will finish things off for + * us. * * Note that the end_index is unsigned long. If the given * offset is greater than 16TB on a 32-bit system then if we @@ -1496,7 +1505,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) */ if (folio->index > end_index || (folio->index == end_index && poff == 0)) - goto redirty; + goto unlock; /* * The page straddles i_size. It must be zeroed out on each @@ -1514,6 +1523,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) redirty: folio_redirty_for_writepage(wbc, folio); +unlock: folio_unlock(folio); return 0; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 370c3241618a..c75d33d5c3ce 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -52,7 +52,7 @@ struct iomap_dio { }; static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter, - struct iomap_dio *dio, unsigned short nr_vecs, unsigned int opf) + struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf) { if (dio->dops && dio->dops->bio_set) return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf, @@ -212,10 +212,10 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, * mapping, and whether or not we want FUA. Note that we can end up * clearing the WRITE_FUA flag in the dio request. */ -static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio, +static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, const struct iomap *iomap, bool use_fua) { - unsigned int opflags = REQ_SYNC | REQ_IDLE; + blk_opf_t opflags = REQ_SYNC | REQ_IDLE; if (!(dio->flags & IOMAP_DIO_WRITE)) { WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); @@ -242,10 +242,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, struct inode *inode = iter->inode; unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); unsigned int fs_block_size = i_blocksize(inode), pad; - unsigned int align = iov_iter_alignment(dio->submit.iter); loff_t length = iomap_length(iter); loff_t pos = iter->pos; - unsigned int bio_opf; + blk_opf_t bio_opf; struct bio *bio; bool need_zeroout = false; bool use_fua = false; @@ -253,7 +252,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, size_t copied = 0; size_t orig_count; - if ((pos | length | align) & ((1 << blkbits) - 1)) + if ((pos | length) & ((1 << blkbits) - 1) || + !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; if (iomap->type == IOMAP_UNWRITTEN) { @@ -548,17 +548,18 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } /* for data sync or sync, we need sync completion processing */ - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb) && !(dio_flags & IOMAP_DIO_NOSYNC)) { dio->flags |= IOMAP_DIO_NEED_SYNC; - /* - * For datasync only writes, we optimistically try using FUA for - * this IO. Any non-FUA write that occurs will clear this flag, - * hence we know before completion whether a cache flush is - * necessary. - */ - if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC) - dio->flags |= IOMAP_DIO_WRITE_FUA; + /* + * For datasync only writes, we optimistically try + * using FUA for this IO. Any non-FUA write that + * occurs will clear this flag, hence we know before + * completion whether a cache flush is necessary. + */ + if (!(iocb->ki_flags & IOCB_SYNC)) + dio->flags |= IOMAP_DIO_WRITE_FUA; + } } if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 95a19f25d61c..b466172eec25 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -82,7 +82,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, return 0; } haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); - ll_rw_block(REQ_OP_READ, 0, haveblocks, bhs); + ll_rw_block(REQ_OP_READ, haveblocks, bhs); curbh = 0; curpage = 0; diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 746132998c57..51bd38da21cd 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -203,7 +203,7 @@ int jbd2_log_do_checkpoint(journal_t *journal) tid_t this_tid; int result, batch_count = 0; - jbd_debug(1, "Start checkpoint\n"); + jbd2_debug(1, "Start checkpoint\n"); /* * First thing: if there are any transactions in the log which @@ -212,7 +212,7 @@ int jbd2_log_do_checkpoint(journal_t *journal) */ result = jbd2_cleanup_journal_tail(journal); trace_jbd2_checkpoint(journal, result); - jbd_debug(1, "cleanup_journal_tail returned %d\n", result); + jbd2_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; @@ -804,5 +804,5 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact trace_jbd2_drop_transaction(journal, transaction); - jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); + jbd2_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index eb315e81f1a6..b2b2bc9b88d9 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -155,10 +155,10 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !jbd2_has_feature_async_commit(journal)) - ret = submit_bh(REQ_OP_WRITE, - REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh); + ret = submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | + REQ_FUA, bh); else - ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); + ret = submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); *cbh = bh; return ret; @@ -421,7 +421,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* Do we need to erase the effects of a prior jbd2_journal_flush? */ if (journal->j_flags & JBD2_FLUSHED) { - jbd_debug(3, "super block updated\n"); + jbd2_debug(3, "super block updated\n"); mutex_lock_io(&journal->j_checkpoint_mutex); /* * We hold j_checkpoint_mutex so tail cannot change under us. @@ -435,7 +435,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) REQ_SYNC); mutex_unlock(&journal->j_checkpoint_mutex); } else { - jbd_debug(3, "superblock not updated\n"); + jbd2_debug(3, "superblock not updated\n"); } J_ASSERT(journal->j_running_transaction != NULL); @@ -467,7 +467,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction = journal->j_running_transaction; trace_jbd2_start_commit(journal, commit_transaction); - jbd_debug(1, "JBD2: starting commit of transaction %d\n", + jbd2_debug(1, "JBD2: starting commit of transaction %d\n", commit_transaction->t_tid); write_lock(&journal->j_state_lock); @@ -540,7 +540,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) __jbd2_journal_clean_checkpoint_list(journal, false); spin_unlock(&journal->j_list_lock); - jbd_debug(3, "JBD2: commit phase 1\n"); + jbd2_debug(3, "JBD2: commit phase 1\n"); /* * Clear revoked flag to reflect there is no revoked buffers @@ -553,13 +553,13 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ jbd2_journal_switch_revoke_table(journal); + write_lock(&journal->j_state_lock); /* * Reserved credits cannot be claimed anymore, free them */ atomic_sub(atomic_read(&journal->j_reserved_credits), &commit_transaction->t_outstanding_credits); - write_lock(&journal->j_state_lock); trace_jbd2_commit_flushing(journal, commit_transaction); stats.run.rs_flushing = jiffies; stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, @@ -573,7 +573,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) wake_up(&journal->j_wait_transaction_locked); write_unlock(&journal->j_state_lock); - jbd_debug(3, "JBD2: commit phase 2a\n"); + jbd2_debug(3, "JBD2: commit phase 2a\n"); /* * Now start flushing things to disk, in the order they appear @@ -586,7 +586,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) blk_start_plug(&plug); jbd2_journal_write_revoke_records(commit_transaction, &log_bufs); - jbd_debug(3, "JBD2: commit phase 2b\n"); + jbd2_debug(3, "JBD2: commit phase 2b\n"); /* * Way to go: we have now written out all of the data for a @@ -642,7 +642,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (!descriptor) { J_ASSERT (bufs == 0); - jbd_debug(4, "JBD2: get descriptor\n"); + jbd2_debug(4, "JBD2: get descriptor\n"); descriptor = jbd2_journal_get_descriptor_buffer( commit_transaction, @@ -652,7 +652,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) continue; } - jbd_debug(4, "JBD2: got buffer %llu (%p)\n", + jbd2_debug(4, "JBD2: got buffer %llu (%p)\n", (unsigned long long)descriptor->b_blocknr, descriptor->b_data); tagp = &descriptor->b_data[sizeof(journal_header_t)]; @@ -737,7 +737,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_buffers == NULL || space_left < tag_bytes + 16 + csum_size) { - jbd_debug(4, "JBD2: Submit %d IOs\n", bufs); + jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs); /* Write an end-of-descriptor marker before submitting the IOs. "tag" still points to @@ -763,7 +763,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); + submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); } cond_resched(); @@ -839,7 +839,7 @@ start_journal_io: so we incur less scheduling load. */ - jbd_debug(3, "JBD2: commit phase 3\n"); + jbd2_debug(3, "JBD2: commit phase 3\n"); while (!list_empty(&io_bufs)) { struct buffer_head *bh = list_entry(io_bufs.prev, @@ -882,7 +882,7 @@ start_journal_io: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD2: commit phase 4\n"); + jbd2_debug(3, "JBD2: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ while (!list_empty(&log_bufs)) { @@ -906,7 +906,7 @@ start_journal_io: if (err) jbd2_journal_abort(journal, err); - jbd_debug(3, "JBD2: commit phase 5\n"); + jbd2_debug(3, "JBD2: commit phase 5\n"); write_lock(&journal->j_state_lock); J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); commit_transaction->t_state = T_COMMIT_JFLUSH; @@ -945,7 +945,7 @@ start_journal_io: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD2: commit phase 6\n"); + jbd2_debug(3, "JBD2: commit phase 6\n"); J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); @@ -1122,7 +1122,7 @@ restart_loop: /* Done with this transaction! */ - jbd_debug(3, "JBD2: commit phase 7\n"); + jbd2_debug(3, "JBD2: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); @@ -1164,7 +1164,7 @@ restart_loop: journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid); trace_jbd2_end_commit(journal, commit_transaction); - jbd_debug(1, "JBD2: commit %d complete, head %d\n", + jbd2_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); write_lock(&journal->j_state_lock); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c0cbeeaec2d1..6350d3857c89 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -49,8 +49,7 @@ #include <asm/page.h> #ifdef CONFIG_JBD2_DEBUG -ushort jbd2_journal_enable_debug __read_mostly; -EXPORT_SYMBOL(jbd2_journal_enable_debug); +static ushort jbd2_journal_enable_debug __read_mostly; module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644); MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2"); @@ -81,7 +80,6 @@ EXPORT_SYMBOL(jbd2_journal_errno); EXPORT_SYMBOL(jbd2_journal_ack_err); EXPORT_SYMBOL(jbd2_journal_clear_err); EXPORT_SYMBOL(jbd2_log_wait_commit); -EXPORT_SYMBOL(jbd2_log_start_commit); EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); @@ -115,7 +113,6 @@ void __jbd2_debug(int level, const char *file, const char *func, printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf); va_end(args); } -EXPORT_SYMBOL(__jbd2_debug); #endif /* Checksumming functions */ @@ -203,11 +200,11 @@ loop: if (journal->j_flags & JBD2_UNMOUNT) goto end_loop; - jbd_debug(1, "commit_sequence=%u, commit_request=%u\n", + jbd2_debug(1, "commit_sequence=%u, commit_request=%u\n", journal->j_commit_sequence, journal->j_commit_request); if (journal->j_commit_sequence != journal->j_commit_request) { - jbd_debug(1, "OK, requests differ\n"); + jbd2_debug(1, "OK, requests differ\n"); write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); jbd2_journal_commit_transaction(journal); @@ -222,7 +219,7 @@ loop: * good idea, because that depends on threads that may * be already stopped. */ - jbd_debug(1, "Now suspending kjournald2\n"); + jbd2_debug(1, "Now suspending kjournald2\n"); write_unlock(&journal->j_state_lock); try_to_freeze(); write_lock(&journal->j_state_lock); @@ -252,7 +249,7 @@ loop: finish_wait(&journal->j_wait_commit, &wait); } - jbd_debug(1, "kjournald2 wakes\n"); + jbd2_debug(1, "kjournald2 wakes\n"); /* * Were we woken up by a commit wakeup event? @@ -260,7 +257,7 @@ loop: transaction = journal->j_running_transaction; if (transaction && time_after_eq(jiffies, transaction->t_expires)) { journal->j_commit_request = transaction->t_tid; - jbd_debug(1, "woke because of timeout\n"); + jbd2_debug(1, "woke because of timeout\n"); } goto loop; @@ -268,7 +265,7 @@ end_loop: del_timer_sync(&journal->j_commit_timer); journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); - jbd_debug(1, "Journal thread exiting.\n"); + jbd2_debug(1, "Journal thread exiting.\n"); write_unlock(&journal->j_state_lock); return 0; } @@ -481,7 +478,7 @@ repeat: * Called with j_state_lock locked for writing. * Returns true if a transaction commit was started. */ -int __jbd2_log_start_commit(journal_t *journal, tid_t target) +static int __jbd2_log_start_commit(journal_t *journal, tid_t target) { /* Return if the txn has already requested to be committed */ if (journal->j_commit_request == target) @@ -500,7 +497,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) */ journal->j_commit_request = target; - jbd_debug(1, "JBD2: requesting commit %u/%u\n", + jbd2_debug(1, "JBD2: requesting commit %u/%u\n", journal->j_commit_request, journal->j_commit_sequence); journal->j_running_transaction->t_requested = jiffies; @@ -705,7 +702,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) } #endif while (tid_gt(tid, journal->j_commit_sequence)) { - jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n", + jbd2_debug(1, "JBD2: want %u, j_commit_sequence=%u\n", tid, journal->j_commit_sequence); read_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_commit); @@ -1117,7 +1114,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) freed += journal->j_last - journal->j_first; trace_jbd2_update_log_tail(journal, tid, block, freed); - jbd_debug(1, + jbd2_debug(1, "Cleaning journal tail from %u to %u (offset %lu), " "freeing %lu\n", journal->j_tail_sequence, tid, block, freed); @@ -1418,7 +1415,8 @@ static journal_t *journal_init_common(struct block_device *bdev, if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL)) goto err_cleanup; - if (register_shrinker(&journal->j_shrinker)) { + if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)", + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) { percpu_counter_destroy(&journal->j_checkpoint_jh_count); goto err_cleanup; } @@ -1465,7 +1463,8 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev, if (!journal) return NULL; - bdevname(journal->j_dev, journal->j_devname); + snprintf(journal->j_devname, sizeof(journal->j_devname), + "%pg", journal->j_dev); strreplace(journal->j_devname, '/', '!'); jbd2_stats_proc_init(journal); @@ -1496,7 +1495,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) return NULL; } - jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", + jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size, inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); @@ -1507,7 +1506,8 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) return NULL; journal->j_inode = inode; - bdevname(journal->j_dev, journal->j_devname); + snprintf(journal->j_devname, sizeof(journal->j_devname), + "%pg", journal->j_dev); p = strreplace(journal->j_devname, '/', '!'); sprintf(p, "-%lu", journal->j_inode->i_ino); jbd2_stats_proc_init(journal); @@ -1575,7 +1575,7 @@ static int journal_reset(journal_t *journal) * attempting a write to a potential-readonly device. */ if (sb->s_start == 0) { - jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " + jbd2_debug(1, "JBD2: Skipping superblock update on recovered sb " "(start %ld, seq %u, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); @@ -1602,7 +1602,7 @@ static int journal_reset(journal_t *journal) * This function expects that the caller will have locked the journal * buffer head, and will return with it unlocked */ -static int jbd2_write_superblock(journal_t *journal, int write_flags) +static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) { struct buffer_head *bh = journal->j_sb_buffer; journal_superblock_t *sb = journal->j_superblock; @@ -1636,7 +1636,7 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) sb->s_checksum = jbd2_superblock_csum(journal, sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(REQ_OP_WRITE, write_flags, bh); + ret = submit_bh(REQ_OP_WRITE | write_flags, bh); wait_on_buffer(bh); if (buffer_write_io_error(bh)) { clear_buffer_write_io_error(bh); @@ -1659,13 +1659,14 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) * @journal: The journal to update. * @tail_tid: TID of the new transaction at the tail of the log * @tail_block: The first block of the transaction at the tail of the log - * @write_op: With which operation should we write the journal sb + * @write_flags: Flags for the journal sb write operation * * Update a journal's superblock information about log tail and write it to * disk, waiting for the IO to complete. */ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, - unsigned long tail_block, int write_op) + unsigned long tail_block, + blk_opf_t write_flags) { journal_superblock_t *sb = journal->j_superblock; int ret; @@ -1678,14 +1679,14 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, } BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", + jbd2_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", tail_block, tail_tid); lock_buffer(journal->j_sb_buffer); sb->s_sequence = cpu_to_be32(tail_tid); sb->s_start = cpu_to_be32(tail_block); - ret = jbd2_write_superblock(journal, write_op); + ret = jbd2_write_superblock(journal, write_flags); if (ret) goto out; @@ -1702,12 +1703,12 @@ out: /** * jbd2_mark_journal_empty() - Mark on disk journal as empty. * @journal: The journal to update. - * @write_op: With which operation should we write the journal sb + * @write_flags: Flags for the journal sb write operation * * Update a journal's dynamic superblock fields to show that journal is empty. * Write updated superblock to disk waiting for IO to complete. */ -static void jbd2_mark_journal_empty(journal_t *journal, int write_op) +static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags) { journal_superblock_t *sb = journal->j_superblock; bool had_fast_commit = false; @@ -1719,7 +1720,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) return; } - jbd_debug(1, "JBD2: Marking journal as empty (seq %u)\n", + jbd2_debug(1, "JBD2: Marking journal as empty (seq %u)\n", journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); @@ -1733,7 +1734,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) had_fast_commit = true; } - jbd2_write_superblock(journal, write_op); + jbd2_write_superblock(journal, write_flags); if (had_fast_commit) jbd2_set_feature_fast_commit(journal); @@ -1862,7 +1863,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) errcode = journal->j_errno; if (errcode == -ESHUTDOWN) errcode = 0; - jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); + jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); sb->s_errno = cpu_to_be32(errcode); jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA); @@ -1898,7 +1899,7 @@ static int journal_get_superblock(journal_t *journal) J_ASSERT(bh != NULL); if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { printk(KERN_ERR @@ -2334,7 +2335,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, compat & JBD2_FEATURE_COMPAT_CHECKSUM) compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; - jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", + jbd2_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", compat, ro, incompat); sb = journal->j_superblock; @@ -2403,7 +2404,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, { journal_superblock_t *sb; - jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", + jbd2_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", compat, ro, incompat); sb = journal->j_superblock; @@ -2860,7 +2861,7 @@ static struct journal_head *journal_alloc_journal_head(void) #endif ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); if (!ret) { - jbd_debug(1, "out of memory for journal_head\n"); + jbd2_debug(1, "out of memory for journal_head\n"); pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS | __GFP_NOFAIL); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 8ca3527189f8..f548479615c6 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -100,7 +100,7 @@ static int do_readahead(journal_t *journal, unsigned int start) if (!buffer_uptodate(bh) && !buffer_locked(bh)) { bufs[nbufs++] = bh; if (nbufs == MAXBUF) { - ll_rw_block(REQ_OP_READ, 0, nbufs, bufs); + ll_rw_block(REQ_OP_READ, nbufs, bufs); journal_brelse_array(bufs, nbufs); nbufs = 0; } @@ -109,7 +109,7 @@ static int do_readahead(journal_t *journal, unsigned int start) } if (nbufs) - ll_rw_block(REQ_OP_READ, 0, nbufs, bufs); + ll_rw_block(REQ_OP_READ, nbufs, bufs); err = 0; failed: @@ -245,11 +245,11 @@ static int fc_do_one_pass(journal_t *journal, return 0; while (next_fc_block <= journal->j_fc_last) { - jbd_debug(3, "Fast commit replay: next block %ld\n", + jbd2_debug(3, "Fast commit replay: next block %ld\n", next_fc_block); err = jread(&bh, journal, next_fc_block); if (err) { - jbd_debug(3, "Fast commit replay: read error\n"); + jbd2_debug(3, "Fast commit replay: read error\n"); break; } @@ -263,7 +263,7 @@ static int fc_do_one_pass(journal_t *journal, } if (err) - jbd_debug(3, "Fast commit replay failed, err = %d\n", err); + jbd2_debug(3, "Fast commit replay failed, err = %d\n", err); return err; } @@ -297,7 +297,7 @@ int jbd2_journal_recover(journal_t *journal) */ if (!sb->s_start) { - jbd_debug(1, "No recovery required, last transaction %d\n", + jbd2_debug(1, "No recovery required, last transaction %d\n", be32_to_cpu(sb->s_sequence)); journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; return 0; @@ -309,10 +309,10 @@ int jbd2_journal_recover(journal_t *journal) if (!err) err = do_one_pass(journal, &info, PASS_REPLAY); - jbd_debug(1, "JBD2: recovery, exit status %d, " + jbd2_debug(1, "JBD2: recovery, exit status %d, " "recovered transactions %u to %u\n", err, info.start_transaction, info.end_transaction); - jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n", + jbd2_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n", info.nr_replays, info.nr_revoke_hits, info.nr_revokes); /* Restart the log at the next transaction ID, thus invalidating @@ -362,7 +362,7 @@ int jbd2_journal_skip_recovery(journal_t *journal) #ifdef CONFIG_JBD2_DEBUG int dropped = info.end_transaction - be32_to_cpu(journal->j_superblock->s_sequence); - jbd_debug(1, + jbd2_debug(1, "JBD2: ignoring %d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); #endif @@ -484,7 +484,7 @@ static int do_one_pass(journal_t *journal, if (pass == PASS_SCAN) info->start_transaction = first_commit_ID; - jbd_debug(1, "Starting recovery pass %d\n", pass); + jbd2_debug(1, "Starting recovery pass %d\n", pass); /* * Now we walk through the log, transaction by transaction, @@ -510,7 +510,7 @@ static int do_one_pass(journal_t *journal, if (tid_geq(next_commit_ID, info->end_transaction)) break; - jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", + jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", next_commit_ID, next_log_block, jbd2_has_feature_fast_commit(journal) ? journal->j_fc_last : journal->j_last); @@ -519,7 +519,7 @@ static int do_one_pass(journal_t *journal, * either the next descriptor block or the final commit * record. */ - jbd_debug(3, "JBD2: checking block %ld\n", next_log_block); + jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block); err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -542,7 +542,7 @@ static int do_one_pass(journal_t *journal, blocktype = be32_to_cpu(tmp->h_blocktype); sequence = be32_to_cpu(tmp->h_sequence); - jbd_debug(3, "Found magic %d, sequence %d\n", + jbd2_debug(3, "Found magic %d, sequence %d\n", blocktype, sequence); if (sequence != next_commit_ID) { @@ -575,7 +575,7 @@ static int do_one_pass(journal_t *journal, goto failed; } need_check_commit_time = true; - jbd_debug(1, + jbd2_debug(1, "invalid descriptor block found in %lu\n", next_log_block); } @@ -758,7 +758,7 @@ static int do_one_pass(journal_t *journal, * It likely does not belong to same journal, * just end this recovery with success. */ - jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", + jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", next_commit_ID); brelse(bh); goto done; @@ -826,7 +826,7 @@ static int do_one_pass(journal_t *journal, if (pass == PASS_SCAN && !jbd2_descriptor_block_csum_verify(journal, bh->b_data)) { - jbd_debug(1, "JBD2: invalid revoke block found in %lu\n", + jbd2_debug(1, "JBD2: invalid revoke block found in %lu\n", next_log_block); need_check_commit_time = true; } @@ -845,7 +845,7 @@ static int do_one_pass(journal_t *journal, continue; default: - jbd_debug(3, "Unrecognised magic %d, end of scan.\n", + jbd2_debug(3, "Unrecognised magic %d, end of scan.\n", blocktype); brelse(bh); goto done; diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index fa608788b93d..4556e4689024 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -398,7 +398,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, } handle->h_revoke_credits--; - jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); + jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, handle->h_transaction->t_tid); BUFFER_TRACE(bh_in, "exit"); @@ -428,7 +428,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) int did_revoke = 0; /* akpm: debug */ struct buffer_head *bh = jh2bh(jh); - jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); + jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); /* Is the existing Revoke bit valid? If so, we trust it, and * only perform the full cancel if the revoke bit is set. If @@ -444,7 +444,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) if (need_cancel) { record = find_revoke_record(journal, bh->b_blocknr); if (record) { - jbd_debug(4, "cancelled existing revoke on " + jbd2_debug(4, "cancelled existing revoke on " "blocknr %llu\n", (unsigned long long)bh->b_blocknr); spin_lock(&journal->j_revoke_lock); list_del(&record->hash); @@ -560,7 +560,7 @@ void jbd2_journal_write_revoke_records(transaction_t *transaction, } if (descriptor) flush_descriptor(journal, descriptor, offset); - jbd_debug(1, "Wrote %d revoke records\n", count); + jbd2_debug(1, "Wrote %d revoke records\n", count); } /* diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e9c308ae475f..e1be93ccd81c 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -373,7 +373,7 @@ alloc_transaction: return -ENOMEM; } - jbd_debug(3, "New handle %p going live.\n", handle); + jbd2_debug(3, "New handle %p going live.\n", handle); /* * We need to hold j_state_lock until t_updates has been incremented, @@ -453,7 +453,7 @@ repeat: handle->h_start_jiffies = jiffies; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); - jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n", + jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n", handle, blocks, atomic_read(&transaction->t_outstanding_credits), jbd2_log_space_left(journal)); @@ -674,7 +674,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records) /* Don't extend a locked-down transaction! */ if (transaction->t_state != T_RUNNING) { - jbd_debug(3, "denied handle %p %d blocks: " + jbd2_debug(3, "denied handle %p %d blocks: " "transaction not running\n", handle, nblocks); goto error_out; } @@ -689,7 +689,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records) &transaction->t_outstanding_credits); if (wanted > journal->j_max_transaction_buffers) { - jbd_debug(3, "denied handle %p %d blocks: " + jbd2_debug(3, "denied handle %p %d blocks: " "transaction too large\n", handle, nblocks); atomic_sub(nblocks, &transaction->t_outstanding_credits); goto error_out; @@ -707,7 +707,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records) handle->h_revoke_credits_requested += revoke_records; result = 0; - jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); + jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks); error_out: read_unlock(&journal->j_state_lock); return result; @@ -795,7 +795,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records, * First unlink the handle from its current transaction, and start the * commit on that. */ - jbd_debug(2, "restarting handle %p\n", handle); + jbd2_debug(2, "restarting handle %p\n", handle); stop_this_handle(handle); handle->h_transaction = NULL; @@ -979,7 +979,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, journal = transaction->t_journal; - jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); + jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: @@ -1271,7 +1271,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) struct journal_head *jh = jbd2_journal_add_journal_head(bh); int err; - jbd_debug(5, "journal_head %p\n", jh); + jbd2_debug(5, "journal_head %p\n", jh); err = -EROFS; if (is_handle_aborted(handle)) goto out; @@ -1486,8 +1486,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; int ret = 0; - if (is_handle_aborted(handle)) - return -EROFS; if (!buffer_jbd(bh)) return -EUCLEAN; @@ -1496,7 +1494,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * of the running transaction. */ jh = bh2jh(bh); - jbd_debug(5, "journal_head %p\n", jh); + jbd2_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); /* @@ -1534,6 +1532,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) journal = transaction->t_journal; spin_lock(&jh->b_state_lock); + if (is_handle_aborted(handle)) { + /* + * Check journal aborting with @jh->b_state_lock locked, + * since 'jh->b_transaction' could be replaced with + * 'jh->b_next_transaction' during old transaction + * committing if journal aborted, which may fail + * assertion on 'jh->b_frozen_data == NULL'. + */ + ret = -EROFS; + goto out_unlock_bh; + } + if (jh->b_modified == 0) { /* * This buffer's got modified and becoming part @@ -1818,7 +1828,7 @@ int jbd2_journal_stop(handle_t *handle) pid_t pid; if (--handle->h_ref > 0) { - jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, handle->h_ref); if (is_handle_aborted(handle)) return -EIO; @@ -1838,7 +1848,7 @@ int jbd2_journal_stop(handle_t *handle) if (is_handle_aborted(handle)) err = -EIO; - jbd_debug(4, "Handle %p going down\n", handle); + jbd2_debug(4, "Handle %p going down\n", handle); trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, tid, handle->h_type, handle->h_line_no, jiffies - handle->h_start_jiffies, @@ -1916,7 +1926,7 @@ int jbd2_journal_stop(handle_t *handle) * completes the commit thread, it just doesn't write * anything to disk. */ - jbd_debug(2, "transaction too old, requesting commit for " + jbd2_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ jbd2_log_start_commit(journal, tid); @@ -2662,7 +2672,7 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, return -EROFS; journal = transaction->t_journal; - jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, + jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, transaction->t_tid); spin_lock(&journal->j_list_lock); diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 1d732fd223d4..332dc9ac47a9 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -95,14 +95,14 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (rc) return rc; - if (is_quota_modification(inode, iattr)) { + if (is_quota_modification(mnt_userns, inode, iattr)) { rc = dquot_initialize(inode); if (rc) return rc; } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { - rc = dquot_transfer(inode, iattr); + rc = dquot_transfer(mnt_userns, inode, iattr); if (rc) return rc; } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 259326556ada..d1ec920aa030 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -301,13 +301,25 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping, { int ret; - ret = nobh_write_begin(mapping, pos, len, pagep, fsdata, jfs_get_block); + ret = block_write_begin(mapping, pos, len, pagep, jfs_get_block); if (unlikely(ret)) jfs_write_failed(mapping, pos + len); return ret; } +static int jfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret < len) + jfs_write_failed(mapping, pos + len); + return ret; +} + static sector_t jfs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, jfs_get_block); @@ -346,7 +358,7 @@ const struct address_space_operations jfs_aops = { .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, - .write_end = nobh_write_end, + .write_end = jfs_write_end, .bmap = jfs_bmap, .direct_IO = jfs_direct_IO, }; @@ -399,7 +411,7 @@ void jfs_truncate(struct inode *ip) { jfs_info("jfs_truncate: size = 0x%lx", (ulong) ip->i_size); - nobh_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block); + block_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block); IWRITE_LOCK(ip, RDWRLOCK_NORMAL); jfs_truncate_nolock(ip, ip->i_size); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 387652ae14c2..2e8461ce74de 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -618,7 +618,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, SetPageUptodate(page); } else { page = read_mapping_page(mapping, page_index, NULL); - if (IS_ERR(page) || !PageUptodate(page)) { + if (IS_ERR(page)) { jfs_err("read_mapping_page failed!"); return NULL; } diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 1b07550485b9..5d826274570c 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -29,15 +29,15 @@ * change between calls to kernel_read_file(). * * Returns number of bytes read (no single read will be bigger - * than INT_MAX), or negative on error. + * than SSIZE_MAX), or negative on error. * */ -int kernel_read_file(struct file *file, loff_t offset, void **buf, - size_t buf_size, size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file(struct file *file, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) { loff_t i_size, pos; - size_t copied; + ssize_t copied; void *allocated = NULL; bool whole_file; int ret; @@ -58,7 +58,7 @@ int kernel_read_file(struct file *file, loff_t offset, void **buf, goto out; } /* The file is too big for sane activities. */ - if (i_size > INT_MAX) { + if (i_size > SSIZE_MAX) { ret = -EFBIG; goto out; } @@ -124,12 +124,12 @@ out: } EXPORT_SYMBOL_GPL(kernel_read_file); -int kernel_read_file_from_path(const char *path, loff_t offset, void **buf, - size_t buf_size, size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file_from_path(const char *path, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) { struct file *file; - int ret; + ssize_t ret; if (!path || !*path) return -EINVAL; @@ -144,14 +144,14 @@ int kernel_read_file_from_path(const char *path, loff_t offset, void **buf, } EXPORT_SYMBOL_GPL(kernel_read_file_from_path); -int kernel_read_file_from_path_initns(const char *path, loff_t offset, - void **buf, size_t buf_size, - size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset, + void **buf, size_t buf_size, + size_t *file_size, + enum kernel_read_file_id id) { struct file *file; struct path root; - int ret; + ssize_t ret; if (!path || !*path) return -EINVAL; @@ -171,12 +171,12 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset, } EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns); -int kernel_read_file_from_fd(int fd, loff_t offset, void **buf, - size_t buf_size, size_t *file_size, - enum kernel_read_file_id id) +ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf, + size_t buf_size, size_t *file_size, + enum kernel_read_file_id id) { struct fd f = fdget(fd); - int ret = -EBADF; + ssize_t ret = -EBADF; if (!f.file || !(f.file->f_mode & FMODE_READ)) goto out; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 6eca72cfa1f2..1cc88ba6de90 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1343,14 +1343,17 @@ static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos; + /* Short-circuit if non-root @kn has already finished removal. */ + if (!kn) + return; + lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); /* - * Short-circuit if non-root @kn has already finished removal. * This is for kernfs_remove_self() which plays with active ref * after removal. */ - if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb))) + if (kn->parent && RB_EMPTY_NODE(&kn->rb)) return; pr_debug("kernfs %s: removing\n", kn->name); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e3abfa843879..b3ec34386b43 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -18,21 +18,8 @@ #include "kernfs-internal.h" -/* - * There's one kernfs_open_file for each open file and one kernfs_open_node - * for each kernfs_node with one or more open files. - * - * kernfs_node->attr.open points to kernfs_open_node. attr.open is - * protected by kernfs_open_node_lock. - * - * filp->private_data points to seq_file whose ->private points to - * kernfs_open_file. kernfs_open_files are chained at - * kernfs_open_node->files, which is protected by kernfs_open_file_mutex. - */ -static DEFINE_SPINLOCK(kernfs_open_node_lock); -static DEFINE_MUTEX(kernfs_open_file_mutex); - struct kernfs_open_node { + struct rcu_head rcu_head; atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ @@ -51,6 +38,70 @@ struct kernfs_open_node { static DEFINE_SPINLOCK(kernfs_notify_lock); static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; +static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn) +{ + int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS); + + return &kernfs_locks->open_file_mutex[idx]; +} + +static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn) +{ + struct mutex *lock; + + lock = kernfs_open_file_mutex_ptr(kn); + + mutex_lock(lock); + + return lock; +} + +/** + * kernfs_deref_open_node - Get kernfs_open_node corresponding to @kn. + * + * @of: associated kernfs_open_file instance. + * @kn: target kernfs_node. + * + * Fetch and return ->attr.open of @kn if @of->list is non empty. + * If @of->list is not empty we can safely assume that @of is on + * @kn->attr.open->files list and this guarantees that @kn->attr.open + * will not vanish i.e. dereferencing outside RCU read-side critical + * section is safe here. + * + * The caller needs to make sure that @of->list is not empty. + */ +static struct kernfs_open_node * +kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + + on = rcu_dereference_check(kn->attr.open, !list_empty(&of->list)); + + return on; +} + +/** + * kernfs_deref_open_node_protected - Get kernfs_open_node corresponding to @kn + * + * @kn: target kernfs_node. + * + * Fetch and return ->attr.open of @kn when caller holds the + * kernfs_open_file_mutex_ptr(kn). + * + * Update of ->attr.open happens under kernfs_open_file_mutex_ptr(kn). So when + * the caller guarantees that this mutex is being held, other updaters can't + * change ->attr.open and this means that we can safely deref ->attr.open + * outside RCU read-side critical section. + * + * The caller needs to make sure that kernfs_open_file_mutex is held. + */ +static struct kernfs_open_node * +kernfs_deref_open_node_protected(struct kernfs_node *kn) +{ + return rcu_dereference_protected(kn->attr.open, + lockdep_is_held(kernfs_open_file_mutex_ptr(kn))); +} + static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; @@ -156,8 +207,12 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v) static int kernfs_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; + struct kernfs_open_node *on = kernfs_deref_open_node(of, of->kn); - of->event = atomic_read(&of->kn->attr.open->event); + if (!on) + return -EINVAL; + + of->event = atomic_read(&on->event); return of->kn->attr.ops->seq_show(sf, v); } @@ -180,6 +235,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; + struct kernfs_open_node *on; char *buf; buf = of->prealloc_buf; @@ -201,7 +257,15 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) goto out_free; } - of->event = atomic_read(&of->kn->attr.open->event); + on = kernfs_deref_open_node(of, of->kn); + if (!on) { + len = -EINVAL; + mutex_unlock(&of->mutex); + goto out_free; + } + + of->event = atomic_read(&on->event); + ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, iocb->ki_pos); @@ -243,7 +307,7 @@ static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come on * the first write. Hint: if you're writing a value, first read the file, - * modify only the the value you're changing, then write entire buffer + * modify only the value you're changing, then write entire buffer * back. */ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) @@ -484,7 +548,6 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) * It is not possible to successfully wrap close. * So error if someone is trying to use close. */ - rc = -EINVAL; if (vma->vm_ops && vma->vm_ops->close) goto out_put; @@ -518,37 +581,31 @@ static int kernfs_get_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { struct kernfs_open_node *on, *new_on = NULL; + struct mutex *mutex = NULL; - retry: - mutex_lock(&kernfs_open_file_mutex); - spin_lock_irq(&kernfs_open_node_lock); - - if (!kn->attr.open && new_on) { - kn->attr.open = new_on; - new_on = NULL; - } - - on = kn->attr.open; - if (on) - list_add_tail(&of->list, &on->files); - - spin_unlock_irq(&kernfs_open_node_lock); - mutex_unlock(&kernfs_open_file_mutex); + mutex = kernfs_open_file_mutex_lock(kn); + on = kernfs_deref_open_node_protected(kn); if (on) { - kfree(new_on); + list_add_tail(&of->list, &on->files); + mutex_unlock(mutex); return 0; + } else { + /* not there, initialize a new one */ + new_on = kmalloc(sizeof(*new_on), GFP_KERNEL); + if (!new_on) { + mutex_unlock(mutex); + return -ENOMEM; + } + atomic_set(&new_on->event, 1); + init_waitqueue_head(&new_on->poll); + INIT_LIST_HEAD(&new_on->files); + list_add_tail(&of->list, &new_on->files); + rcu_assign_pointer(kn->attr.open, new_on); } + mutex_unlock(mutex); - /* not there, initialize a new one and retry */ - new_on = kmalloc(sizeof(*new_on), GFP_KERNEL); - if (!new_on) - return -ENOMEM; - - atomic_set(&new_on->event, 1); - init_waitqueue_head(&new_on->poll); - INIT_LIST_HEAD(&new_on->files); - goto retry; + return 0; } /** @@ -567,24 +624,26 @@ static int kernfs_get_open_node(struct kernfs_node *kn, static void kernfs_unlink_open_file(struct kernfs_node *kn, struct kernfs_open_file *of) { - struct kernfs_open_node *on = kn->attr.open; - unsigned long flags; + struct kernfs_open_node *on; + struct mutex *mutex = NULL; - mutex_lock(&kernfs_open_file_mutex); - spin_lock_irqsave(&kernfs_open_node_lock, flags); + mutex = kernfs_open_file_mutex_lock(kn); + + on = kernfs_deref_open_node_protected(kn); + if (!on) { + mutex_unlock(mutex); + return; + } if (of) list_del(&of->list); - if (list_empty(&on->files)) - kn->attr.open = NULL; - else - on = NULL; - - spin_unlock_irqrestore(&kernfs_open_node_lock, flags); - mutex_unlock(&kernfs_open_file_mutex); + if (list_empty(&on->files)) { + rcu_assign_pointer(kn->attr.open, NULL); + kfree_rcu(on, rcu_head); + } - kfree(on); + mutex_unlock(mutex); } static int kernfs_fop_open(struct inode *inode, struct file *file) @@ -722,11 +781,11 @@ static void kernfs_release_file(struct kernfs_node *kn, /* * @of is guaranteed to have no other file operations in flight and * we just want to synchronize release and drain paths. - * @kernfs_open_file_mutex is enough. @of->mutex can't be used + * @kernfs_open_file_mutex_ptr(kn) is enough. @of->mutex can't be used * here because drain path may be called from places which can * cause circular dependency. */ - lockdep_assert_held(&kernfs_open_file_mutex); + lockdep_assert_held(kernfs_open_file_mutex_ptr(kn)); if (!of->released) { /* @@ -743,11 +802,12 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); + struct mutex *mutex = NULL; if (kn->flags & KERNFS_HAS_RELEASE) { - mutex_lock(&kernfs_open_file_mutex); + mutex = kernfs_open_file_mutex_lock(kn); kernfs_release_file(kn, of); - mutex_unlock(&kernfs_open_file_mutex); + mutex_unlock(mutex); } kernfs_unlink_open_file(kn, of); @@ -762,6 +822,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; + struct mutex *mutex = NULL; if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE))) return; @@ -771,20 +832,19 @@ void kernfs_drain_open_files(struct kernfs_node *kn) * ->attr.open at this point of time. This check allows early bail out * if ->attr.open is already NULL. kernfs_unlink_open_file makes * ->attr.open NULL only while holding kernfs_open_file_mutex so below - * check under kernfs_open_file_mutex will ensure bailing out if + * check under kernfs_open_file_mutex_ptr(kn) will ensure bailing out if * ->attr.open became NULL while waiting for the mutex. */ - if (!kn->attr.open) + if (!rcu_access_pointer(kn->attr.open)) return; - mutex_lock(&kernfs_open_file_mutex); - if (!kn->attr.open) { - mutex_unlock(&kernfs_open_file_mutex); + mutex = kernfs_open_file_mutex_lock(kn); + on = kernfs_deref_open_node_protected(kn); + if (!on) { + mutex_unlock(mutex); return; } - on = kn->attr.open; - list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); @@ -795,7 +855,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn) kernfs_release_file(kn, of); } - mutex_unlock(&kernfs_open_file_mutex); + mutex_unlock(mutex); } /* @@ -815,7 +875,10 @@ void kernfs_drain_open_files(struct kernfs_node *kn) __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) { struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry); - struct kernfs_open_node *on = kn->attr.open; + struct kernfs_open_node *on = kernfs_deref_open_node(of, kn); + + if (!on) + return EPOLLERR; poll_wait(of->file, &on->poll, wait); @@ -922,13 +985,13 @@ void kernfs_notify(struct kernfs_node *kn) return; /* kick poll immediately */ - spin_lock_irqsave(&kernfs_open_node_lock, flags); - on = kn->attr.open; + rcu_read_lock(); + on = rcu_dereference(kn->attr.open); if (on) { atomic_inc(&on->event); wake_up_interruptible(&on->poll); } - spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + rcu_read_unlock(); /* schedule work to kick fsnotify */ spin_lock_irqsave(&kernfs_notify_lock, flags); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index eeaa779b929c..3ae214d02d44 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -164,4 +164,8 @@ void kernfs_drain_open_files(struct kernfs_node *kn); */ extern const struct inode_operations kernfs_symlink_iops; +/* + * kernfs locks + */ +extern struct kernfs_global_locks *kernfs_locks; #endif /* __KERNFS_INTERNAL_H */ diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index cfa79715fc1a..d0859f72d2d6 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -20,6 +20,7 @@ #include "kernfs-internal.h" struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; +struct kernfs_global_locks *kernfs_locks; static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { @@ -387,6 +388,22 @@ void kernfs_kill_sb(struct super_block *sb) kfree(info); } +static void __init kernfs_mutex_init(void) +{ + int count; + + for (count = 0; count < NR_KERNFS_LOCKS; count++) + mutex_init(&kernfs_locks->open_file_mutex[count]); +} + +static void __init kernfs_lock_init(void) +{ + kernfs_locks = kmalloc(sizeof(struct kernfs_global_locks), GFP_KERNEL); + WARN_ON(!kernfs_locks); + + kernfs_mutex_init(); +} + void __init kernfs_init(void) { kernfs_node_cache = kmem_cache_create("kernfs_node_cache", @@ -397,4 +414,6 @@ void __init kernfs_init(void) kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache", sizeof(struct kernfs_iattrs), 0, SLAB_PANIC, NULL); + + kernfs_lock_init(); } diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index e6f4ccc12f49..353f047e783c 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6490,6 +6490,7 @@ int smb2_write(struct ksmbd_work *work) goto out; } + ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags)); if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) writethrough = true; @@ -6505,10 +6506,6 @@ int smb2_write(struct ksmbd_work *work) data_buf = (char *)(((char *)&req->hdr.ProtocolId) + le16_to_cpu(req->DataOffset)); - ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags)); - if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) - writethrough = true; - ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n", fp->filp->f_path.dentry, offset, length); err = ksmbd_vfs_write(work, fp, data_buf, length, &offset, @@ -7703,7 +7700,7 @@ int smb2_ioctl(struct ksmbd_work *work) { struct file_zero_data_information *zero_data; struct ksmbd_file *fp; - loff_t off, len; + loff_t off, len, bfz; if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { ksmbd_debug(SMB, @@ -7720,19 +7717,26 @@ int smb2_ioctl(struct ksmbd_work *work) zero_data = (struct file_zero_data_information *)&req->Buffer[0]; - fp = ksmbd_lookup_fd_fast(work, id); - if (!fp) { - ret = -ENOENT; + off = le64_to_cpu(zero_data->FileOffset); + bfz = le64_to_cpu(zero_data->BeyondFinalZero); + if (off > bfz) { + ret = -EINVAL; goto out; } - off = le64_to_cpu(zero_data->FileOffset); - len = le64_to_cpu(zero_data->BeyondFinalZero) - off; + len = bfz - off; + if (len) { + fp = ksmbd_lookup_fd_fast(work, id); + if (!fp) { + ret = -ENOENT; + goto out; + } - ret = ksmbd_vfs_zero_data(work, fp, off, len); - ksmbd_fd_put(work, fp); - if (ret < 0) - goto out; + ret = ksmbd_vfs_zero_data(work, fp, off, len); + ksmbd_fd_put(work, fp); + if (ret < 0) + goto out; + } break; } case FSCTL_QUERY_ALLOCATED_RANGES: @@ -7806,14 +7810,24 @@ int smb2_ioctl(struct ksmbd_work *work) src_off = le64_to_cpu(dup_ext->SourceFileOffset); dst_off = le64_to_cpu(dup_ext->TargetFileOffset); length = le64_to_cpu(dup_ext->ByteCount); - cloned = vfs_clone_file_range(fp_in->filp, src_off, fp_out->filp, - dst_off, length, 0); + /* + * XXX: It is not clear if FSCTL_DUPLICATE_EXTENTS_TO_FILE + * should fall back to vfs_copy_file_range(). This could be + * beneficial when re-exporting nfs/smb mount, but note that + * this can result in partial copy that returns an error status. + * If/when FSCTL_DUPLICATE_EXTENTS_TO_FILE_EX is implemented, + * fall back to vfs_copy_file_range(), should be avoided when + * the flag DUPLICATE_EXTENTS_DATA_EX_SOURCE_ATOMIC is set. + */ + cloned = vfs_clone_file_range(fp_in->filp, src_off, + fp_out->filp, dst_off, length, 0); if (cloned == -EXDEV || cloned == -EOPNOTSUPP) { ret = -EOPNOTSUPP; goto dup_ext_out; } else if (cloned != length) { cloned = vfs_copy_file_range(fp_in->filp, src_off, - fp_out->filp, dst_off, length, 0); + fp_out->filp, dst_off, + length, 0); if (cloned != length) { if (cloned < 0) ret = cloned; diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index d035e060c2f0..35b55ee94fe5 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -5,16 +5,6 @@ * * Author(s): Long Li <longli@microsoft.com>, * Hyunchul Lee <hyc.lee@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #define SUBMOD_NAME "smb_direct" diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index 8fef9de787d3..143bba4e4db8 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -230,7 +230,7 @@ static int ksmbd_kthread_fn(void *p) break; } ret = kernel_accept(iface->ksmbd_socket, &client_sk, - O_NONBLOCK); + SOCK_NONBLOCK); mutex_unlock(&iface->sock_release_lock); if (ret) { if (ret == -EAGAIN) diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index dcdd07c6efff..7c849024999f 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -963,7 +963,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, */ int ksmbd_vfs_setxattr(struct user_namespace *user_ns, struct dentry *dentry, const char *attr_name, - const void *attr_value, size_t attr_size, int flags) + void *attr_value, size_t attr_size, int flags) { int err; @@ -1015,7 +1015,9 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len); - return vfs_fallocate(fp->filp, FALLOC_FL_ZERO_RANGE, off, len); + return vfs_fallocate(fp->filp, + FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, + off, len); } int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, @@ -1046,7 +1048,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, *out_count = 0; end = start + length; while (start < end && *out_count < in_count) { - extent_start = f->f_op->llseek(f, start, SEEK_DATA); + extent_start = vfs_llseek(f, start, SEEK_DATA); if (extent_start < 0) { if (extent_start != -ENXIO) ret = (int)extent_start; @@ -1056,7 +1058,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, if (extent_start >= end) break; - extent_end = f->f_op->llseek(f, extent_start, SEEK_HOLE); + extent_end = vfs_llseek(f, extent_start, SEEK_HOLE); if (extent_end < 0) { if (extent_end != -ENXIO) ret = (int)extent_end; @@ -1777,6 +1779,10 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, ret = vfs_copy_file_range(src_fp->filp, src_off, dst_fp->filp, dst_off, len, 0); + if (ret == -EOPNOTSUPP || ret == -EXDEV) + ret = generic_copy_file_range(src_fp->filp, src_off, + dst_fp->filp, dst_off, + len, 0); if (ret < 0) return ret; diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 8c37aaf936ab..70da4c0ba7ad 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -109,7 +109,7 @@ ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, int attr_name_len); int ksmbd_vfs_setxattr(struct user_namespace *user_ns, struct dentry *dentry, const char *attr_name, - const void *attr_value, size_t attr_size, int flags); + void *attr_value, size_t attr_size, int flags); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 0a22a2faf552..e1c4617de771 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -176,7 +176,7 @@ nlm_delete_file(struct nlm_file *file) } } -static int nlm_unlock_files(struct nlm_file *file) +static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner) { struct file_lock lock; @@ -184,6 +184,7 @@ static int nlm_unlock_files(struct nlm_file *file) lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; + lock.fl_owner = owner; if (file->f_file[O_RDONLY] && vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL)) goto out_err; @@ -225,7 +226,7 @@ again: if (match(lockhost, host)) { spin_unlock(&flctx->flc_lock); - if (nlm_unlock_files(file)) + if (nlm_unlock_files(file, fl->fl_owner)) return 1; goto again; } @@ -282,11 +283,10 @@ nlm_file_inuse(struct nlm_file *file) static void nlm_close_files(struct nlm_file *file) { - struct file *f; - - for (f = file->f_file[0]; f <= file->f_file[1]; f++) - if (f) - nlmsvc_ops->fclose(f); + if (file->f_file[O_RDONLY]) + nlmsvc_ops->fclose(file->f_file[O_RDONLY]); + if (file->f_file[O_WRONLY]) + nlmsvc_ops->fclose(file->f_file[O_WRONLY]); } /* diff --git a/fs/locks.c b/fs/locks.c index ca28e0e50e56..c266cfdc3291 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -425,21 +425,9 @@ static inline int flock_translate_cmd(int cmd) { } /* Fill in a file_lock structure with an appropriate FLOCK lock. */ -static struct file_lock * -flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl) +static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) { - int type = flock_translate_cmd(cmd); - - if (type < 0) - return ERR_PTR(type); - - if (fl == NULL) { - fl = locks_alloc_lock(); - if (fl == NULL) - return ERR_PTR(-ENOMEM); - } else { - locks_init_lock(fl); - } + locks_init_lock(fl); fl->fl_file = filp; fl->fl_owner = filp; @@ -447,8 +435,6 @@ flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl) fl->fl_flags = FL_FLOCK; fl->fl_type = type; fl->fl_end = OFFSET_MAX; - - return fl; } static int assign_type(struct file_lock *fl, long type) @@ -2097,21 +2083,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait); */ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { - struct fd f = fdget(fd); - struct file_lock *lock; - int can_sleep, unlock; - int error; - - error = -EBADF; - if (!f.file) - goto out; - - can_sleep = !(cmd & LOCK_NB); - cmd &= ~LOCK_NB; - unlock = (cmd == LOCK_UN); - - if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) - goto out_putf; + int can_sleep, error, type; + struct file_lock fl; + struct fd f; /* * LOCK_MAND locks were broken for a long time in that they never @@ -2123,36 +2097,41 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ if (cmd & LOCK_MAND) { pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n"); - error = 0; - goto out_putf; + return 0; } - lock = flock_make_lock(f.file, cmd, NULL); - if (IS_ERR(lock)) { - error = PTR_ERR(lock); + type = flock_translate_cmd(cmd & ~LOCK_NB); + if (type < 0) + return type; + + error = -EBADF; + f = fdget(fd); + if (!f.file) + return error; + + if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE))) goto out_putf; - } - if (can_sleep) - lock->fl_flags |= FL_SLEEP; + flock_make_lock(f.file, &fl, type); - error = security_file_lock(f.file, lock->fl_type); + error = security_file_lock(f.file, fl.fl_type); if (error) - goto out_free; + goto out_putf; + + can_sleep = !(cmd & LOCK_NB); + if (can_sleep) + fl.fl_flags |= FL_SLEEP; if (f.file->f_op->flock) error = f.file->f_op->flock(f.file, - (can_sleep) ? F_SETLKW : F_SETLK, - lock); + (can_sleep) ? F_SETLKW : F_SETLK, + &fl); else - error = locks_lock_file_wait(f.file, lock); - - out_free: - locks_free_lock(lock); + error = locks_lock_file_wait(f.file, &fl); out_putf: fdput(f); - out: + return error; } @@ -2614,7 +2593,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) if (list_empty(&flctx->flc_flock)) return; - flock_make_lock(filp, LOCK_UN, &fl); + flock_make_lock(filp, &fl, F_UNLCK); fl.fl_flags |= FL_CLOSE; if (filp->f_op->flock) diff --git a/fs/mbcache.c b/fs/mbcache.c index 97c54d3a2227..47ccfcbe0a22 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -11,7 +11,7 @@ /* * Mbcache is a simple key-value store. Keys need not be unique, however * key-value pairs are expected to be unique (we use this fact in - * mb_cache_entry_delete()). + * mb_cache_entry_delete_or_get()). * * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. * Ext4 also uses it for deduplication of xattr values stored in inodes. @@ -90,7 +90,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, return -ENOMEM; INIT_LIST_HEAD(&entry->e_list); - /* One ref for hash, one ref returned */ + /* Initial hash reference */ atomic_set(&entry->e_refcnt, 1); entry->e_key = key; entry->e_value = value; @@ -106,25 +106,45 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, } } hlist_bl_add_head(&entry->e_hash_list, head); - hlist_bl_unlock(head); - + /* + * Add entry to LRU list before it can be found by + * mb_cache_entry_delete() to avoid races + */ spin_lock(&cache->c_list_lock); list_add_tail(&entry->e_list, &cache->c_list); - /* Grab ref for LRU list */ - atomic_inc(&entry->e_refcnt); cache->c_entry_count++; spin_unlock(&cache->c_list_lock); + hlist_bl_unlock(head); return 0; } EXPORT_SYMBOL(mb_cache_entry_create); -void __mb_cache_entry_free(struct mb_cache_entry *entry) +void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry) { + struct hlist_bl_head *head; + + head = mb_cache_entry_head(cache, entry->e_key); + hlist_bl_lock(head); + hlist_bl_del(&entry->e_hash_list); + hlist_bl_unlock(head); kmem_cache_free(mb_entry_cache, entry); } EXPORT_SYMBOL(__mb_cache_entry_free); +/* + * mb_cache_entry_wait_unused - wait to be the last user of the entry + * + * @entry - entry to work on + * + * Wait to be the last user of the entry. + */ +void mb_cache_entry_wait_unused(struct mb_cache_entry *entry) +{ + wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 2); +} +EXPORT_SYMBOL(mb_cache_entry_wait_unused); + static struct mb_cache_entry *__entry_find(struct mb_cache *cache, struct mb_cache_entry *entry, u32 key) @@ -142,10 +162,9 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache, while (node) { entry = hlist_bl_entry(node, struct mb_cache_entry, e_hash_list); - if (entry->e_key == key && entry->e_reusable) { - atomic_inc(&entry->e_refcnt); + if (entry->e_key == key && entry->e_reusable && + atomic_inc_not_zero(&entry->e_refcnt)) goto out; - } node = node->next; } entry = NULL; @@ -205,10 +224,9 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(entry, node, head, e_hash_list) { - if (entry->e_key == key && entry->e_value == value) { - atomic_inc(&entry->e_refcnt); + if (entry->e_key == key && entry->e_value == value && + atomic_inc_not_zero(&entry->e_refcnt)) goto out; - } } entry = NULL; out: @@ -217,42 +235,42 @@ out: } EXPORT_SYMBOL(mb_cache_entry_get); -/* mb_cache_entry_delete - remove a cache entry +/* mb_cache_entry_delete_or_get - remove a cache entry if it has no users * @cache - cache we work with * @key - key * @value - value * - * Remove entry from cache @cache with key @key and value @value. + * Remove entry from cache @cache with key @key and value @value. The removal + * happens only if the entry is unused. The function returns NULL in case the + * entry was successfully removed or there's no entry in cache. Otherwise the + * function grabs reference of the entry that we failed to delete because it + * still has users and return it. */ -void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value) +struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache, + u32 key, u64 value) { - struct hlist_bl_node *node; - struct hlist_bl_head *head; struct mb_cache_entry *entry; - head = mb_cache_entry_head(cache, key); - hlist_bl_lock(head); - hlist_bl_for_each_entry(entry, node, head, e_hash_list) { - if (entry->e_key == key && entry->e_value == value) { - /* We keep hash list reference to keep entry alive */ - hlist_bl_del_init(&entry->e_hash_list); - hlist_bl_unlock(head); - spin_lock(&cache->c_list_lock); - if (!list_empty(&entry->e_list)) { - list_del_init(&entry->e_list); - if (!WARN_ONCE(cache->c_entry_count == 0, - "mbcache: attempt to decrement c_entry_count past zero")) - cache->c_entry_count--; - atomic_dec(&entry->e_refcnt); - } - spin_unlock(&cache->c_list_lock); - mb_cache_entry_put(cache, entry); - return; - } - } - hlist_bl_unlock(head); + entry = mb_cache_entry_get(cache, key, value); + if (!entry) + return NULL; + + /* + * Drop the ref we got from mb_cache_entry_get() and the initial hash + * ref if we are the last user + */ + if (atomic_cmpxchg(&entry->e_refcnt, 2, 0) != 2) + return entry; + + spin_lock(&cache->c_list_lock); + if (!list_empty(&entry->e_list)) + list_del_init(&entry->e_list); + cache->c_entry_count--; + spin_unlock(&cache->c_list_lock); + __mb_cache_entry_free(cache, entry); + return NULL; } -EXPORT_SYMBOL(mb_cache_entry_delete); +EXPORT_SYMBOL(mb_cache_entry_delete_or_get); /* mb_cache_entry_touch - cache entry got used * @cache - cache the entry belongs to @@ -281,34 +299,24 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, unsigned long nr_to_scan) { struct mb_cache_entry *entry; - struct hlist_bl_head *head; unsigned long shrunk = 0; spin_lock(&cache->c_list_lock); while (nr_to_scan-- && !list_empty(&cache->c_list)) { entry = list_first_entry(&cache->c_list, struct mb_cache_entry, e_list); - if (entry->e_referenced) { + /* Drop initial hash reference if there is no user */ + if (entry->e_referenced || + atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) { entry->e_referenced = 0; list_move_tail(&entry->e_list, &cache->c_list); continue; } list_del_init(&entry->e_list); cache->c_entry_count--; - /* - * We keep LRU list reference so that entry doesn't go away - * from under us. - */ spin_unlock(&cache->c_list_lock); - head = mb_cache_entry_head(cache, entry->e_key); - hlist_bl_lock(head); - if (!hlist_bl_unhashed(&entry->e_hash_list)) { - hlist_bl_del_init(&entry->e_hash_list); - atomic_dec(&entry->e_refcnt); - } - hlist_bl_unlock(head); - if (mb_cache_entry_put(cache, entry)) - shrunk++; + __mb_cache_entry_free(cache, entry); + shrunk++; cond_resched(); spin_lock(&cache->c_list_lock); } @@ -367,7 +375,7 @@ struct mb_cache *mb_cache_create(int bucket_bits) cache->c_shrink.count_objects = mb_cache_count; cache->c_shrink.scan_objects = mb_cache_scan; cache->c_shrink.seeks = DEFAULT_SEEKS; - if (register_shrinker(&cache->c_shrink)) { + if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) { kfree(cache->c_hash); kfree(cache); goto err_out; @@ -400,11 +408,6 @@ void mb_cache_destroy(struct mb_cache *cache) * point. */ list_for_each_entry_safe(entry, next, &cache->c_list, e_list) { - if (!hlist_bl_unhashed(&entry->e_hash_list)) { - hlist_bl_del_init(&entry->e_hash_list); - atomic_dec(&entry->e_refcnt); - } else - WARN_ON(1); list_del(&entry->e_list); WARN_ON(atomic_read(&entry->e_refcnt) != 1); mb_cache_entry_put(cache, entry); diff --git a/fs/mount.h b/fs/mount.h index 0b6e08cf8afb..130c07c2f8d2 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -100,7 +100,6 @@ static inline int is_mounted(struct vfsmount *mnt) extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); -extern bool legitimize_mnt(struct vfsmount *, unsigned); static inline bool __path_is_mountpoint(const struct path *path) { diff --git a/fs/mpage.c b/fs/mpage.c index 0d25f44f5707..0f8ae954a579 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -75,26 +75,28 @@ static struct bio *mpage_bio_submit(struct bio *bio) * them. So when the buffer is up to date and the page size == block size, * this marks the page up to date instead of adding new buffers. */ -static void -map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) +static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, + int page_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct buffer_head *page_bh, *head; int block = 0; - if (!page_has_buffers(page)) { + head = folio_buffers(folio); + if (!head) { /* * don't make any buffers if there is only one buffer on - * the page and the page just needs to be set up to date + * the folio and the folio just needs to be set up to date */ if (inode->i_blkbits == PAGE_SHIFT && buffer_uptodate(bh)) { - SetPageUptodate(page); + folio_mark_uptodate(folio); return; } - create_empty_buffers(page, i_blocksize(inode), 0); + create_empty_buffers(&folio->page, i_blocksize(inode), 0); + head = folio_buffers(folio); } - head = page_buffers(page); + page_bh = head; do { if (block == page_block) { @@ -110,7 +112,7 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) struct mpage_readpage_args { struct bio *bio; - struct page *page; + struct folio *folio; unsigned int nr_pages; bool is_readahead; sector_t last_block_in_bio; @@ -130,8 +132,8 @@ struct mpage_readpage_args { */ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) { - struct page *page = args->page; - struct inode *inode = page->mapping->host; + struct folio *folio = args->folio; + struct inode *inode = folio->mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; @@ -145,20 +147,23 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) struct block_device *bdev = NULL; int length; int fully_mapped = 1; - int op = REQ_OP_READ; + blk_opf_t opf = REQ_OP_READ; unsigned nblocks; unsigned relative_block; - gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); + + /* MAX_BUF_PER_PAGE, for example */ + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); if (args->is_readahead) { - op |= REQ_RAHEAD; + opf |= REQ_RAHEAD; gfp |= __GFP_NORETRY | __GFP_NOWARN; } - if (page_has_buffers(page)) + if (folio_buffers(folio)) goto confused; - block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); + block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + args->nr_pages * blocks_per_page; last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) @@ -191,9 +196,9 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) } /* - * Then do more get_blocks calls until we are done with this page. + * Then do more get_blocks calls until we are done with this folio. */ - map_bh->b_page = page; + map_bh->b_page = &folio->page; while (page_block < blocks_per_page) { map_bh->b_state = 0; map_bh->b_size = 0; @@ -216,12 +221,12 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) /* some filesystems will copy data into the page during * the get_block call, in which case we don't want to - * read it again. map_buffer_to_page copies the data - * we just collected from get_block into the page's buffers - * so readpage doesn't have to repeat the get_block call + * read it again. map_buffer_to_folio copies the data + * we just collected from get_block into the folio's buffers + * so read_folio doesn't have to repeat the get_block call */ if (buffer_uptodate(map_bh)) { - map_buffer_to_page(page, map_bh, page_block); + map_buffer_to_folio(folio, map_bh, page_block); goto confused; } @@ -246,18 +251,18 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) } if (first_hole != blocks_per_page) { - zero_user_segment(page, first_hole << blkbits, PAGE_SIZE); + folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE); if (first_hole == 0) { - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); goto out; } } else if (fully_mapped) { - SetPageMappedToDisk(page); + folio_set_mappedtodisk(folio); } /* - * This page will go to BIO. Do we need to send this BIO off first? + * This folio will go to BIO. Do we need to send this BIO off first? */ if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) args->bio = mpage_bio_submit(args->bio); @@ -266,10 +271,10 @@ alloc_new: if (args->bio == NULL) { if (first_hole == blocks_per_page) { if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), - page)) + &folio->page)) goto out; } - args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), op, + args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf, gfp); if (args->bio == NULL) goto confused; @@ -277,7 +282,7 @@ alloc_new: } length = first_hole << blkbits; - if (bio_add_page(args->bio, page, length, 0) < length) { + if (!bio_add_folio(args->bio, folio, length, 0)) { args->bio = mpage_bio_submit(args->bio); goto alloc_new; } @@ -295,10 +300,10 @@ out: confused: if (args->bio) args->bio = mpage_bio_submit(args->bio); - if (!PageUptodate(page)) - block_read_full_folio(page_folio(page), args->get_block); + if (!folio_test_uptodate(folio)) + block_read_full_folio(folio, args->get_block); else - unlock_page(page); + folio_unlock(folio); goto out; } @@ -343,18 +348,17 @@ confused: */ void mpage_readahead(struct readahead_control *rac, get_block_t get_block) { - struct page *page; + struct folio *folio; struct mpage_readpage_args args = { .get_block = get_block, .is_readahead = true, }; - while ((page = readahead_page(rac))) { - prefetchw(&page->flags); - args.page = page; + while ((folio = readahead_folio(rac))) { + prefetchw(&folio->flags); + args.folio = folio; args.nr_pages = readahead_count(rac); args.bio = do_mpage_readpage(&args); - put_page(page); } if (args.bio) mpage_bio_submit(args.bio); @@ -367,13 +371,11 @@ EXPORT_SYMBOL(mpage_readahead); int mpage_read_folio(struct folio *folio, get_block_t get_block) { struct mpage_readpage_args args = { - .page = &folio->page, + .folio = folio, .nr_pages = 1, .get_block = get_block, }; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - args.bio = do_mpage_readpage(&args); if (args.bio) mpage_bio_submit(args.bio); @@ -402,7 +404,6 @@ struct mpage_data { struct bio *bio; sector_t last_block_in_bio; get_block_t *get_block; - unsigned use_writepage; }; /* @@ -622,15 +623,10 @@ confused: if (bio) bio = mpage_bio_submit(bio); - if (mpd->use_writepage) { - ret = mapping->a_ops->writepage(page, wbc); - } else { - ret = -EAGAIN; - goto out; - } /* * The caller has a ref on the inode, so *mapping is stable */ + ret = block_write_full_page(page, mpd->get_block, wbc); mapping_set_error(mapping, ret); out: mpd->bio = bio; @@ -642,8 +638,6 @@ out: * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @get_block: the filesystem's block mapper function. - * If this is NULL then use a_ops->writepage. Otherwise, go - * direct-to-BIO. * * This is a library function, which implements the writepages() * address_space_operation. @@ -660,42 +654,17 @@ int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block) { + struct mpage_data mpd = { + .get_block = get_block, + }; struct blk_plug plug; int ret; blk_start_plug(&plug); - - if (!get_block) - ret = generic_writepages(mapping, wbc); - else { - struct mpage_data mpd = { - .bio = NULL, - .last_block_in_bio = 0, - .get_block = get_block, - .use_writepage = 1, - }; - - ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); - if (mpd.bio) - mpage_bio_submit(mpd.bio); - } - blk_finish_plug(&plug); - return ret; -} -EXPORT_SYMBOL(mpage_writepages); - -int mpage_writepage(struct page *page, get_block_t get_block, - struct writeback_control *wbc) -{ - struct mpage_data mpd = { - .bio = NULL, - .last_block_in_bio = 0, - .get_block = get_block, - .use_writepage = 0, - }; - int ret = __mpage_writepage(page, wbc, &mpd); + ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); if (mpd.bio) mpage_bio_submit(mpd.bio); + blk_finish_plug(&plug); return ret; } -EXPORT_SYMBOL(mpage_writepage); +EXPORT_SYMBOL(mpage_writepages); diff --git a/fs/namei.c b/fs/namei.c index 1f28d3f463c3..ed3ffd9b22a3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -567,7 +567,7 @@ struct nameidata { struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags, state; - unsigned seq, m_seq, r_seq; + unsigned seq, next_seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; @@ -665,6 +665,13 @@ static void drop_links(struct nameidata *nd) } } +static void leave_rcu(struct nameidata *nd) +{ + nd->flags &= ~LOOKUP_RCU; + nd->seq = nd->next_seq = 0; + rcu_read_unlock(); +} + static void terminate_walk(struct nameidata *nd) { drop_links(nd); @@ -678,8 +685,7 @@ static void terminate_walk(struct nameidata *nd) nd->state &= ~ND_ROOT_GRABBED; } } else { - nd->flags &= ~LOOKUP_RCU; - rcu_read_unlock(); + leave_rcu(nd); } nd->depth = 0; nd->path.mnt = NULL; @@ -765,14 +771,13 @@ static bool try_to_unlazy(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); - nd->flags &= ~LOOKUP_RCU; if (unlikely(!legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; if (unlikely(!legitimize_root(nd))) goto out; - rcu_read_unlock(); + leave_rcu(nd); BUG_ON(nd->inode != parent->d_inode); return true; @@ -780,7 +785,7 @@ out1: nd->path.mnt = NULL; nd->path.dentry = NULL; out: - rcu_read_unlock(); + leave_rcu(nd); return false; } @@ -788,7 +793,6 @@ out: * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into - * @seq: seq number to check @dentry against * Returns: true on success, false on failure * * Similar to try_to_unlazy(), but here we have the next dentry already @@ -797,15 +801,19 @@ out: * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). */ -static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq) +static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { + int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); - nd->flags &= ~LOOKUP_RCU; if (unlikely(!legitimize_links(nd))) goto out2; - if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq))) - goto out2; + res = __legitimize_mnt(nd->path.mnt, nd->m_seq); + if (unlikely(res)) { + if (res > 0) + goto out2; + goto out1; + } if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; @@ -818,7 +826,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; - if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) + if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is @@ -826,7 +834,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi */ if (unlikely(!legitimize_root(nd))) goto out_dput; - rcu_read_unlock(); + leave_rcu(nd); return true; out2: @@ -834,10 +842,10 @@ out2: out1: nd->path.dentry = NULL; out: - rcu_read_unlock(); + leave_rcu(nd); return false; out_dput: - rcu_read_unlock(); + leave_rcu(nd); dput(dentry); return false; } @@ -962,7 +970,7 @@ static int nd_jump_root(struct nameidata *nd) d = nd->path.dentry; nd->inode = d->d_inode; nd->seq = nd->root_seq; - if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq))) + if (read_seqcount_retry(&d->d_seq, nd->seq)) return -ECHILD; } else { path_put(&nd->path); @@ -1466,8 +1474,7 @@ EXPORT_SYMBOL(follow_down); * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ -static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode, unsigned *seqp) +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; @@ -1496,15 +1503,12 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; - *seqp = read_seqcount_begin(&dentry->d_seq); - *inode = dentry->d_inode; - /* - * We don't need to re-check ->d_seq after this - * ->d_inode read - there will be an RCU delay - * between mount hash removal and ->mnt_root - * becoming unpinned. - */ + nd->next_seq = read_seqcount_begin(&dentry->d_seq); flags = dentry->d_flags; + // makes sure that non-RCU pathwalk could reach + // this state. + if (read_seqretry(&mount_lock, nd->m_seq)) + return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq)) @@ -1515,8 +1519,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, - struct path *path, struct inode **inode, - unsigned int *seqp) + struct path *path) { bool jumped; int ret; @@ -1524,16 +1527,15 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, path->mnt = nd->path.mnt; path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { - unsigned int seq = *seqp; - if (unlikely(!*inode)) - return -ENOENT; - if (likely(__follow_mount_rcu(nd, path, inode, seqp))) + unsigned int seq = nd->next_seq; + if (likely(__follow_mount_rcu(nd, path))) return 0; - if (!try_to_unlazy_next(nd, dentry, seq)) - return -ECHILD; - // *path might've been clobbered by __follow_mount_rcu() + // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; + nd->next_seq = seq; + if (!try_to_unlazy_next(nd, dentry)) + return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { @@ -1546,9 +1548,6 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); - } else { - *inode = d_backing_inode(path->dentry); - *seqp = 0; /* out of RCU mode, so the value doesn't matter */ } return ret; } @@ -1607,9 +1606,7 @@ static struct dentry *__lookup_hash(const struct qstr *name, return dentry; } -static struct dentry *lookup_fast(struct nameidata *nd, - struct inode **inode, - unsigned *seqp) +static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; @@ -1620,8 +1617,7 @@ static struct dentry *lookup_fast(struct nameidata *nd, * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { - unsigned seq; - dentry = __d_lookup_rcu(parent, &nd->last, &seq); + dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); if (unlikely(!dentry)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); @@ -1629,28 +1625,16 @@ static struct dentry *lookup_fast(struct nameidata *nd, } /* - * This sequence count validates that the inode matches - * the dentry name information from lookup. - */ - *inode = d_backing_inode(dentry); - if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) - return ERR_PTR(-ECHILD); - - /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. - * - * The memory barrier in read_seqcount_begin of child is - * enough, we can use __read_seqcount_retry here. */ - if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq))) + if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); - *seqp = seq; status = d_revalidate(dentry, nd->flags); if (likely(status > 0)) return dentry; - if (!try_to_unlazy_next(nd, dentry, seq)) + if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ @@ -1731,7 +1715,7 @@ static inline int may_lookup(struct user_namespace *mnt_userns, return inode_permission(mnt_userns, nd->inode, MAY_EXEC); } -static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) +static int reserve_stack(struct nameidata *nd, struct path *link) { if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; @@ -1746,7 +1730,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. And we can't skip // unlazy even if we fail to grab the link - cleanup needs it - bool grabbed_link = legitimize_path(nd, link, seq); + bool grabbed_link = legitimize_path(nd, link, nd->next_seq); if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; @@ -1760,11 +1744,11 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static const char *pick_link(struct nameidata *nd, struct path *link, - struct inode *inode, unsigned seq, int flags) + struct inode *inode, int flags) { struct saved *last; const char *res; - int error = reserve_stack(nd, link, seq); + int error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) @@ -1774,7 +1758,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link, last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); - last->seq = seq; + last->seq = nd->next_seq; if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); @@ -1836,43 +1820,50 @@ all_done: // pure jump * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. + * + * NOTE: dentry must be what nd->next_seq had been sampled from. */ static const char *step_into(struct nameidata *nd, int flags, - struct dentry *dentry, struct inode *inode, unsigned seq) + struct dentry *dentry) { struct path path; - int err = handle_mounts(nd, dentry, &path, &inode, &seq); + struct inode *inode; + int err = handle_mounts(nd, dentry, &path); if (err < 0) return ERR_PTR(err); + inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ - if (!(nd->flags & LOOKUP_RCU)) { + if (nd->flags & LOOKUP_RCU) { + if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + if (unlikely(!inode)) + return ERR_PTR(-ENOENT); + } else { dput(nd->path.dentry); if (nd->path.mnt != path.mnt) mntput(nd->path.mnt); } nd->path = path; nd->inode = inode; - nd->seq = seq; + nd->seq = nd->next_seq; return NULL; } if (nd->flags & LOOKUP_RCU) { /* make sure that d_is_symlink above matches inode */ - if (read_seqcount_retry(&path.dentry->d_seq, seq)) + if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); } else { if (path.mnt == nd->path.mnt) mntget(path.mnt); } - return pick_link(nd, &path, inode, seq, flags); + return pick_link(nd, &path, inode, flags); } -static struct dentry *follow_dotdot_rcu(struct nameidata *nd, - struct inode **inodep, - unsigned *seqp) +static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; @@ -1889,30 +1880,30 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + // makes sure that non-RCU pathwalk could reach this state + if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ } old = nd->path.dentry; parent = old->d_parent; - *inodep = parent->d_inode; - *seqp = read_seqcount_begin(&parent->d_seq); - if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) + nd->next_seq = read_seqcount_begin(&parent->d_seq); + // makes sure that non-RCU pathwalk could reach this state + if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); return parent; in_root: - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); - return NULL; + nd->next_seq = nd->seq; + return nd->path.dentry; } -static struct dentry *follow_dotdot(struct nameidata *nd, - struct inode **inodep, - unsigned *seqp) +static struct dentry *follow_dotdot(struct nameidata *nd) { struct dentry *parent; @@ -1936,15 +1927,12 @@ static struct dentry *follow_dotdot(struct nameidata *nd, dput(parent); return ERR_PTR(-ENOENT); } - *seqp = 0; - *inodep = parent->d_inode; return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); - dget(nd->path.dentry); - return NULL; + return dget(nd->path.dentry); } static const char *handle_dots(struct nameidata *nd, int type) @@ -1952,8 +1940,6 @@ static const char *handle_dots(struct nameidata *nd, int type) if (type == LAST_DOTDOT) { const char *error = NULL; struct dentry *parent; - struct inode *inode; - unsigned seq; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); @@ -1961,17 +1947,12 @@ static const char *handle_dots(struct nameidata *nd, int type) return error; } if (nd->flags & LOOKUP_RCU) - parent = follow_dotdot_rcu(nd, &inode, &seq); + parent = follow_dotdot_rcu(nd); else - parent = follow_dotdot(nd, &inode, &seq); + parent = follow_dotdot(nd); if (IS_ERR(parent)) return ERR_CAST(parent); - if (unlikely(!parent)) - error = step_into(nd, WALK_NOFOLLOW, - nd->path.dentry, nd->inode, nd->seq); - else - error = step_into(nd, WALK_NOFOLLOW, - parent, inode, seq); + error = step_into(nd, WALK_NOFOLLOW, parent); if (unlikely(error)) return error; @@ -1983,9 +1964,9 @@ static const char *handle_dots(struct nameidata *nd, int type) * some fallback). */ smp_rmb(); - if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))) + if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)) return ERR_PTR(-EAGAIN); - if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))) + if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)) return ERR_PTR(-EAGAIN); } } @@ -1995,8 +1976,6 @@ static const char *handle_dots(struct nameidata *nd, int type) static const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; - struct inode *inode; - unsigned seq; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and @@ -2007,7 +1986,7 @@ static const char *walk_component(struct nameidata *nd, int flags) put_link(nd); return handle_dots(nd, nd->last_type); } - dentry = lookup_fast(nd, &inode, &seq); + dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (unlikely(!dentry)) { @@ -2017,7 +1996,7 @@ static const char *walk_component(struct nameidata *nd, int flags) } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); - return step_into(nd, flags, dentry, inode, seq); + return step_into(nd, flags, dentry); } /* @@ -2372,6 +2351,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); + else + nd->seq = nd->next_seq = 0; nd->flags = flags; nd->state |= ND_JUMPED; @@ -2473,8 +2454,8 @@ static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); - return PTR_ERR(step_into(nd, WALK_NOFOLLOW, - nd->path.dentry, nd->inode, nd->seq)); + nd->next_seq = nd->seq; + return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ @@ -3393,8 +3374,6 @@ static const char *open_last_lookups(struct nameidata *nd, struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; - unsigned seq; - struct inode *inode; struct dentry *dentry; const char *res; @@ -3410,7 +3389,7 @@ static const char *open_last_lookups(struct nameidata *nd, if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ - dentry = lookup_fast(nd, &inode, &seq); + dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (likely(dentry)) @@ -3464,7 +3443,7 @@ static const char *open_last_lookups(struct nameidata *nd, finish_lookup: if (nd->depth) put_link(nd); - res = step_into(nd, WALK_TRAILING, dentry, inode, seq); + res = step_into(nd, WALK_TRAILING, dentry); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; diff --git a/fs/namespace.c b/fs/namespace.c index e6a7e769d25d..68789f896f08 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -648,7 +648,7 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) } /* call under rcu_read_lock */ -bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { int res = __legitimize_mnt(bastard, seq); if (likely(!res)) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 42f892c5712e..0ce535852151 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -319,8 +319,9 @@ zero_out: * conflicting writes once the folio is grabbed and locked. It is passed a * pointer to the fsdata cookie that gets returned to the VM to be passed to * write_end. It is permitted to sleep. It should return 0 if the request - * should go ahead; unlock the folio and return -EAGAIN to cause the folio to - * be regot; or return an error. + * should go ahead or it may return an error. It may also unlock and put the + * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0 + * will cause the folio to be re-got and the process to be retried. * * The calling netfs must initialise a netfs context contiguous to the vfs * inode before calling this. @@ -348,13 +349,13 @@ retry: if (ctx->ops->check_write_begin) { /* Allow the netfs (eg. ceph) to flush conflicts. */ - ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata); + ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata); if (ret < 0) { trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin); - if (ret == -EAGAIN) - goto retry; goto error; } + if (!folio) + goto retry; } if (folio_test_uptodate(folio)) @@ -416,8 +417,10 @@ have_folio_no_wait: error_put: netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); error: - folio_unlock(folio); - folio_put(folio); + if (folio) { + folio_unlock(folio); + folio_put(folio); + } _leave(" = %d", ret); return ret; } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 79a8b451791f..943aeea1eb16 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -121,7 +121,7 @@ static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map) } static struct bio * -do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, +do_add_page_to_bio(struct bio *bio, int npg, enum req_op op, sector_t isect, struct page *page, struct pnfs_block_dev_map *map, struct pnfs_block_extent *be, bio_end_io_t end_io, struct parallel_io *par, unsigned int offset, int *len) @@ -131,7 +131,7 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, u64 disk_addr, end; dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, - npg, rw, (unsigned long long)isect, offset, *len); + npg, (__force u32)op, (unsigned long long)isect, offset, *len); /* translate to device offset */ isect += be->be_v_offset; @@ -154,7 +154,7 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, retry: if (!bio) { - bio = bio_alloc(map->bdev, bio_max_segs(npg), rw, GFP_NOIO); + bio = bio_alloc(map->bdev, bio_max_segs(npg), op, GFP_NOIO); bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT; bio->bi_end_io = end_io; bio->bi_private = par; @@ -291,7 +291,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) } else { bio = do_add_page_to_bio(bio, header->page_array.npages - i, - READ, + REQ_OP_READ, isect, pages[i], &map, &be, bl_end_io_read, par, pg_offset, &pg_len); @@ -420,9 +420,8 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) pg_len = PAGE_SIZE; bio = do_add_page_to_bio(bio, header->page_array.npages - i, - WRITE, isect, pages[i], &map, &be, - bl_end_io_write, par, - 0, &pg_len); + REQ_OP_WRITE, isect, pages[i], &map, + &be, bl_end_io_write, par, 0, &pg_len); if (IS_ERR(bio)) { header->pnfs_error = PTR_ERR(bio); bio = NULL; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2d72b1b7ed74..549baed76351 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -533,9 +533,7 @@ const struct address_space_operations nfs_file_aops = { .write_end = nfs_write_end, .invalidate_folio = nfs_invalidate_folio, .release_folio = nfs_release_folio, -#ifdef CONFIG_MIGRATION - .migratepage = nfs_migrate_page, -#endif + .migrate_folio = nfs_migrate_folio, .launder_folio = nfs_launder_folio, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 8f8cd6e2d4db..437ebe544aaf 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -578,8 +578,10 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) #endif #ifdef CONFIG_MIGRATION -extern int nfs_migrate_page(struct address_space *, - struct page *, struct page *, enum migrate_mode); +int nfs_migrate_folio(struct address_space *, struct folio *dst, + struct folio *src, enum migrate_mode); +#else +#define nfs_migrate_folio NULL #endif static inline int diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index e7b34f7e0614..a9bf09fdf2c3 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -1017,15 +1017,16 @@ int __init nfs4_xattr_cache_init(void) if (ret) goto out2; - ret = register_shrinker(&nfs4_xattr_cache_shrinker); + ret = register_shrinker(&nfs4_xattr_cache_shrinker, "nfs-xattr_cache"); if (ret) goto out1; - ret = register_shrinker(&nfs4_xattr_entry_shrinker); + ret = register_shrinker(&nfs4_xattr_entry_shrinker, "nfs-xattr_entry"); if (ret) goto out; - ret = register_shrinker(&nfs4_xattr_large_entry_shrinker); + ret = register_shrinker(&nfs4_xattr_large_entry_shrinker, + "nfs-xattr_large_entry"); if (!ret) return 0; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c0fdcf8c0032..bb0e84a46d61 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4012,22 +4012,29 @@ static int _nfs4_discover_trunking(struct nfs_server *server, } page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); - if (page == NULL || locations == NULL) - goto out; + if (!locations) + goto out_free; + locations->fattr = nfs_alloc_fattr(); + if (!locations->fattr) + goto out_free_2; status = nfs4_proc_get_locations(server, fhandle, locations, page, cred); if (status) - goto out; + goto out_free_3; for (i = 0; i < locations->nlocations; i++) test_fs_location_for_trunking(&locations->locations[i], clp, server); -out: - if (page) - __free_page(page); +out_free_3: + kfree(locations->fattr); +out_free_2: kfree(locations); +out_free: + __free_page(page); return status; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2540b35ec187..9bab3e9c702a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2753,5 +2753,6 @@ again: goto again; nfs_put_client(clp); + module_put_and_kthread_exit(0); return 0; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 5a9b043662e9..8ae2c8d1219d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -120,12 +120,8 @@ static void nfs_readpage_release(struct nfs_page *req, int error) if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT) SetPageError(page); if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { - struct address_space *mapping = page_file_mapping(page); - if (PageUptodate(page)) nfs_fscache_write_page(inode, page); - else if (!PageError(page) && !PagePrivate(page)) - generic_error_remove_page(mapping, page); unlock_page(page); } nfs_release_request(req); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 6ab5eeb000dc..82944e14fcea 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -149,7 +149,7 @@ int __init register_nfs_fs(void) ret = nfs_register_sysctl(); if (ret < 0) goto error_2; - ret = register_shrinker(&acl_shrinker); + ret = register_shrinker(&acl_shrinker, "nfs-acl"); if (ret < 0) goto error_3; #ifdef CONFIG_NFS_V4_2 diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1c706465d090..69569696dde0 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -2119,27 +2119,27 @@ out_error: } #ifdef CONFIG_MIGRATION -int nfs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) +int nfs_migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode) { /* - * If PagePrivate is set, then the page is currently associated with + * If the private flag is set, the folio is currently associated with * an in-progress read or write request. Don't try to migrate it. * * FIXME: we could do this in principle, but we'll need a way to ensure * that we can safely release the inode reference while holding - * the page lock. + * the folio lock. */ - if (PagePrivate(page)) + if (folio_test_private(src)) return -EBUSY; - if (PageFsCache(page)) { + if (folio_test_fscache(src)) { if (mode == MIGRATE_ASYNC) return -EBUSY; - wait_on_page_fscache(page); + folio_wait_fscache(src); } - return migrate_page(mapping, newpage, page, mode); + return migrate_folio(mapping, dst, src, mode); } #endif diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 9cb2d590c036..a605c0e39b09 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -670,7 +670,7 @@ nfsd_file_cache_init(void) goto out_err; } - ret = register_shrinker(&nfsd_file_shrinker); + ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache"); if (ret) { pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret); goto out_lru; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 61b2aae81abb..2acea7792bb2 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -470,6 +470,15 @@ nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen, return nfserr_bad_xdr; } } + if (bmval[1] & FATTR4_WORD1_TIME_CREATE) { + struct timespec64 ts; + + /* No Linux filesystem supports setting this attribute. */ + bmval[1] &= ~FATTR4_WORD1_TIME_CREATE; + status = nfsd4_decode_nfstime4(argp, &ts); + if (status) + return status; + } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { u32 set_it; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 7da88bdc0d6c..9b31e1103e7b 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -176,7 +176,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; nn->nfsd_reply_cache_shrinker.seeks = 1; - status = register_shrinker(&nn->nfsd_reply_cache_shrinker); + status = register_shrinker(&nn->nfsd_reply_cache_shrinker, + "nfsd-reply:%s", nn->nfsd_name); if (status) goto out_stats_destroy; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 847b482155ae..9a8b09afc173 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -465,7 +465,8 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval) (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL) #define NFSD_WRITEABLE_ATTRS_WORD1 \ (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ - | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_CREATE \ + | FATTR4_WORD1_TIME_MODIFY_SET) #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #define MAYBE_FATTR4_WORD2_SECURITY_LABEL \ FATTR4_WORD2_SECURITY_LABEL diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 840e3af63a6f..d79db56475d4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -577,6 +577,7 @@ out_err: ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, u64 dst_pos, u64 count) { + ssize_t ret; /* * Limit copy to 4MB to prevent indefinitely blocking an nfsd @@ -587,7 +588,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, * limit like this and pipeline multiple COPY requests. */ count = min_t(u64, count, 1 << 22); - return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); + ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); + + if (ret == -EOPNOTSUPP || ret == -EXDEV) + ret = generic_copy_file_range(src, src_pos, dst, dst_pos, + count, 0); + return ret; } __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -1173,6 +1179,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, nfsd_copy_write_verifier(verf, nn); err2 = filemap_check_wb_err(nf->nf_file->f_mapping, since); + err = nfserrno(err2); break; case -EINVAL: err = nfserr_notsupp; @@ -1180,8 +1187,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset, default: nfsd_reset_write_verifier(nn); trace_nfsd_writeverf_reset(nn, rqstp, err2); + err = nfserrno(err2); } - err = nfserrno(err2); } else nfsd_copy_write_verifier(verf, nn); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index ca611ac09f7c..e74fda212620 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -70,7 +70,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) } int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, - sector_t pblocknr, int mode, int mode_flags, + sector_t pblocknr, blk_opf_t opf, struct buffer_head **pbh, sector_t *submit_ptr) { struct buffer_head *bh; @@ -103,13 +103,13 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, } } - if (mode_flags & REQ_RAHEAD) { + if (opf & REQ_RAHEAD) { if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) { err = -EBUSY; /* internal code */ brelse(bh); goto out_locked; } - } else { /* mode == READ */ + } else { /* opf == REQ_OP_READ */ lock_buffer(bh); } if (buffer_uptodate(bh)) { @@ -122,7 +122,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(mode, mode_flags, bh); + submit_bh(opf, bh); bh->b_blocknr = blocknr; /* set back to the given block address */ *submit_ptr = pblocknr; err = 0; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index bd5544e63a01..4bc5612dff94 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -34,8 +34,8 @@ void nilfs_init_btnc_inode(struct inode *btnc_inode); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr); -int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int, - int, struct buffer_head **, sector_t *); +int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, + blk_opf_t, struct buffer_head **, sector_t *); void nilfs_btnode_delete(struct buffer_head *); int nilfs_btnode_prepare_change_key(struct address_space *, struct nilfs_btnode_chkey_ctxt *); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index f544c22fff78..9f4d9432d38a 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -477,7 +477,7 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, sector_t submit_ptr = 0; int ret; - ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, 0, &bh, + ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, &bh, &submit_ptr); if (ret) { if (ret != -EEXIST) @@ -495,8 +495,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax); ret = nilfs_btnode_submit_block(btnc, ptr2, 0, - REQ_OP_READ, REQ_RAHEAD, - &ra_bh, &submit_ptr); + REQ_OP_READ | REQ_RAHEAD, + &ra_bh, &submit_ptr); if (likely(!ret || ret == -EEXIST)) brelse(ra_bh); else if (ret != -EBUSY) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index f8f4c2ff52f4..decd6471300b 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -194,7 +194,7 @@ static struct page *nilfs_get_page(struct inode *dir, unsigned long n) if (!IS_ERR(page)) { kmap(page); if (unlikely(!PageChecked(page))) { - if (PageError(page) || !nilfs_check_page(page)) + if (!nilfs_check_page(page)) goto fail; } } diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 04fdd420eae7..b0d22ff24b67 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -92,7 +92,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, bh->b_blocknr = pbn; bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); if (vbn) bh->b_blocknr = vbn; out: @@ -129,9 +129,8 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, struct inode *btnc_inode = NILFS_I(inode)->i_assoc_inode; int ret; - ret = nilfs_btnode_submit_block(btnc_inode->i_mapping, - vbn ? : pbn, pbn, REQ_OP_READ, 0, - out_bh, &pbn); + ret = nilfs_btnode_submit_block(btnc_inode->i_mapping, vbn ? : pbn, pbn, + REQ_OP_READ, out_bh, &pbn); if (ret == -EEXIST) /* internal code (cache hit) */ ret = 0; return ret; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index d29a0f2b9c16..cbf4fa60eea2 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -111,8 +111,8 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, } static int -nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, - int mode, int mode_flags, struct buffer_head **out_bh) +nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, blk_opf_t opf, + struct buffer_head **out_bh) { struct buffer_head *bh; __u64 blknum = 0; @@ -126,12 +126,12 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, if (buffer_uptodate(bh)) goto out; - if (mode_flags & REQ_RAHEAD) { + if (opf & REQ_RAHEAD) { if (!trylock_buffer(bh)) { ret = -EBUSY; goto failed_bh; } - } else /* mode == READ */ + } else /* opf == REQ_OP_READ */ lock_buffer(bh); if (buffer_uptodate(bh)) { @@ -148,10 +148,11 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(mode, mode_flags, bh); + submit_bh(opf, bh); ret = 0; - trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode); + trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, + opf & REQ_OP_MASK); out: get_bh(bh); *out_bh = bh; @@ -172,7 +173,7 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; int err; - err = nilfs_mdt_submit_block(inode, block, REQ_OP_READ, 0, &first_bh); + err = nilfs_mdt_submit_block(inode, block, REQ_OP_READ, &first_bh); if (err == -EEXIST) /* internal code */ goto out; @@ -182,8 +183,8 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, if (readahead) { blkoff = block + 1; for (i = 0; i < nr_ra_blocks; i++, blkoff++) { - err = nilfs_mdt_submit_block(inode, blkoff, REQ_OP_READ, - REQ_RAHEAD, &bh); + err = nilfs_mdt_submit_block(inode, blkoff, + REQ_OP_READ | REQ_RAHEAD, &bh); if (likely(!err || err == -EEXIST)) brelse(bh); else if (err != -EBUSY) diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 1344f7d475d3..aecda4fc95f5 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -198,6 +198,9 @@ static inline int nilfs_acl_chmod(struct inode *inode) static inline int nilfs_init_acl(struct inode *inode, struct inode *dir) { + if (S_ISLNK(inode->i_mode)) + return 0; + inode->i_mode &= ~current_umask(); return 0; } diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index a8e88cc38e16..3267e96c256c 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -294,57 +294,57 @@ repeat: void nilfs_copy_back_pages(struct address_space *dmap, struct address_space *smap) { - struct pagevec pvec; + struct folio_batch fbatch; unsigned int i, n; - pgoff_t index = 0; + pgoff_t start = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: - n = pagevec_lookup(&pvec, smap, &index); + n = filemap_get_folios(smap, &start, ~0UL, &fbatch); if (!n) return; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i], *dpage; - pgoff_t offset = page->index; - - lock_page(page); - dpage = find_lock_page(dmap, offset); - if (dpage) { - /* overwrite existing page in the destination cache */ - WARN_ON(PageDirty(dpage)); - nilfs_copy_page(dpage, page, 0); - unlock_page(dpage); - put_page(dpage); - /* Do we not need to remove page from smap here? */ + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i], *dfolio; + pgoff_t index = folio->index; + + folio_lock(folio); + dfolio = filemap_lock_folio(dmap, index); + if (dfolio) { + /* overwrite existing folio in the destination cache */ + WARN_ON(folio_test_dirty(dfolio)); + nilfs_copy_page(&dfolio->page, &folio->page, 0); + folio_unlock(dfolio); + folio_put(dfolio); + /* Do we not need to remove folio from smap here? */ } else { - struct page *p; + struct folio *f; - /* move the page to the destination cache */ + /* move the folio to the destination cache */ xa_lock_irq(&smap->i_pages); - p = __xa_erase(&smap->i_pages, offset); - WARN_ON(page != p); + f = __xa_erase(&smap->i_pages, index); + WARN_ON(folio != f); smap->nrpages--; xa_unlock_irq(&smap->i_pages); xa_lock_irq(&dmap->i_pages); - p = __xa_store(&dmap->i_pages, offset, page, GFP_NOFS); - if (unlikely(p)) { + f = __xa_store(&dmap->i_pages, index, folio, GFP_NOFS); + if (unlikely(f)) { /* Probably -ENOMEM */ - page->mapping = NULL; - put_page(page); + folio->mapping = NULL; + folio_put(folio); } else { - page->mapping = dmap; + folio->mapping = dmap; dmap->nrpages++; - if (PageDirty(page)) - __xa_set_mark(&dmap->i_pages, offset, + if (folio_test_dirty(folio)) + __xa_set_mark(&dmap->i_pages, index, PAGECACHE_TAG_DIRTY); } xa_unlock_irq(&dmap->i_pages); } - unlock_page(page); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); goto repeat; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 4f897e109547..cd7d09a569ff 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -295,12 +295,13 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, const void *data, int data_type, struct inode *dir) { - __u32 marks_mask = 0, marks_ignored_mask = 0; + __u32 marks_mask = 0, marks_ignore_mask = 0; __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS | FANOTIFY_EVENT_FLAGS; const struct path *path = fsnotify_data_path(data, data_type); unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); struct fsnotify_mark *mark; + bool ondir = event_mask & FAN_ONDIR; int type; pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", @@ -315,19 +316,21 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, return 0; } else if (!(fid_mode & FAN_REPORT_FID)) { /* Do we have a directory inode to report? */ - if (!dir && !(event_mask & FS_ISDIR)) + if (!dir && !ondir) return 0; } fsnotify_foreach_iter_mark_type(iter_info, mark, type) { - /* Apply ignore mask regardless of mark's ISDIR flag */ - marks_ignored_mask |= mark->ignored_mask; + /* + * Apply ignore mask depending on event flags in ignore mask. + */ + marks_ignore_mask |= + fsnotify_effective_ignore_mask(mark, ondir, type); /* - * If the event is on dir and this mark doesn't care about - * events on dir, don't send it! + * Send the event depending on event flags in mark mask. */ - if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR)) + if (!fsnotify_mask_applicable(mark->mask, ondir, type)) continue; marks_mask |= mark->mask; @@ -336,7 +339,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, *match_mask |= 1U << type; } - test_mask = event_mask & marks_mask & ~marks_ignored_mask; + test_mask = event_mask & marks_mask & ~marks_ignore_mask; /* * For dirent modification events (create/delete/move) that do not carry diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 80e0ec95b113..1d9f11255c64 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -499,6 +499,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF) mflags |= FAN_MARK_EVICTABLE; + if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + mflags |= FAN_MARK_IGNORE; return mflags; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index c2255b440df9..f0e49a406ffa 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1009,10 +1009,10 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, mask &= ~umask; spin_lock(&fsn_mark->lock); oldmask = fsnotify_calc_mask(fsn_mark); - if (!(flags & FAN_MARK_IGNORED_MASK)) { + if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) { fsn_mark->mask &= ~mask; } else { - fsn_mark->ignored_mask &= ~mask; + fsn_mark->ignore_mask &= ~mask; } newmask = fsnotify_calc_mask(fsn_mark); /* @@ -1021,7 +1021,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, * changes to the mask. * Destroy mark when only umask bits remain. */ - *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask); + *destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask); spin_unlock(&fsn_mark->lock); return oldmask & ~newmask; @@ -1085,15 +1085,24 @@ static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, unsigned int fan_flags) { bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE); + unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS; bool recalc = false; /* + * When using FAN_MARK_IGNORE for the first time, mark starts using + * independent event flags in ignore mask. After that, trying to + * update the ignore mask with the old FAN_MARK_IGNORED_MASK API + * will result in EEXIST error. + */ + if (ignore == FAN_MARK_IGNORE) + fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS; + + /* * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to * the removal of the FS_MODIFY bit in calculated mask if it was set - * because of an ignored mask that is now going to survive FS_MODIFY. + * because of an ignore mask that is now going to survive FS_MODIFY. */ - if ((fan_flags & FAN_MARK_IGNORED_MASK) && - (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) { fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; if (!(fsn_mark->mask & FS_MODIFY)) @@ -1120,10 +1129,10 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, bool recalc; spin_lock(&fsn_mark->lock); - if (!(fan_flags & FAN_MARK_IGNORED_MASK)) + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS)) fsn_mark->mask |= mask; else - fsn_mark->ignored_mask |= mask; + fsn_mark->ignore_mask |= mask; recalc = fsnotify_calc_mask(fsn_mark) & ~fsnotify_conn_mask(fsn_mark->connector); @@ -1187,6 +1196,37 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group) sizeof(struct fanotify_error_event)); } +static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, + unsigned int fan_flags) +{ + /* + * Non evictable mark cannot be downgraded to evictable mark. + */ + if (fan_flags & FAN_MARK_EVICTABLE && + !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) + return -EEXIST; + + /* + * New ignore mask semantics cannot be downgraded to old semantics. + */ + if (fan_flags & FAN_MARK_IGNORED_MASK && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + return -EEXIST; + + /* + * An ignore mask that survives modify could never be downgraded to not + * survive modify. With new FAN_MARK_IGNORE semantics we make that rule + * explicit and return an error when trying to update the ignore mask + * without the original FAN_MARK_IGNORED_SURV_MODIFY value. + */ + if (fan_flags & FAN_MARK_IGNORE && + !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) + return -EEXIST; + + return 0; +} + static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, __u32 mask, unsigned int fan_flags, @@ -1208,19 +1248,18 @@ static int fanotify_add_mark(struct fsnotify_group *group, } /* - * Non evictable mark cannot be downgraded to evictable mark. + * Check if requested mark flags conflict with an existing mark flags. */ - if (fan_flags & FAN_MARK_EVICTABLE && - !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) { - ret = -EEXIST; + ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags); + if (ret) goto out; - } /* * Error events are pre-allocated per group, only if strictly * needed (i.e. FAN_FS_ERROR was requested). */ - if (!(fan_flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) { + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) && + (mask & FAN_FS_ERROR)) { ret = fanotify_group_init_error_pool(group); if (ret) goto out; @@ -1261,10 +1300,10 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, /* * If some other task has this inode open for write we should not add - * an ignored mark, unless that ignored mark is supposed to survive + * an ignore mask, unless that ignore mask is supposed to survive * modification changes anyway. */ - if ((flags & FAN_MARK_IGNORED_MASK) && + if ((flags & FANOTIFY_MARK_IGNORE_BITS) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && inode_is_open_for_write(inode)) return 0; @@ -1513,8 +1552,16 @@ static int fanotify_test_fid(struct dentry *dentry) return 0; } -static int fanotify_events_supported(struct path *path, __u64 mask) +static int fanotify_events_supported(struct fsnotify_group *group, + struct path *path, __u64 mask, + unsigned int flags) { + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ + bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || + (mask & FAN_RENAME) || + (flags & FAN_MARK_IGNORE); + /* * Some filesystems such as 'proc' acquire unusual locks when opening * files. For them fanotify permission events have high chances of @@ -1526,6 +1573,16 @@ static int fanotify_events_supported(struct path *path, __u64 mask) if (mask & FANOTIFY_PERM_EVENTS && path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM) return -EINVAL; + + /* + * We shouldn't have allowed setting dirent events and the directory + * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode, + * but because we always allowed it, error only when using new APIs. + */ + if (strict_dir_events && mark_type == FAN_MARK_INODE && + !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) + return -ENOTDIR; + return 0; } @@ -1540,7 +1597,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, __kernel_fsid_t __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; - bool ignored = flags & FAN_MARK_IGNORED_MASK; + unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; + unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; u32 umask = 0; int ret; @@ -1569,7 +1627,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EINVAL; } - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { + switch (mark_cmd) { case FAN_MARK_ADD: case FAN_MARK_REMOVE: if (!mask) @@ -1589,9 +1647,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & ~valid_mask) return -EINVAL; - /* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */ - if (ignored) + + /* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */ + if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK)) + return -EINVAL; + + /* + * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with + * FAN_MARK_IGNORED_MASK. + */ + if (ignore == FAN_MARK_IGNORED_MASK) { mask &= ~FANOTIFY_EVENT_FLAGS; + umask = FANOTIFY_EVENT_FLAGS; + } f = fdget(fanotify_fd); if (unlikely(!f.file)) @@ -1655,7 +1723,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) goto fput_and_out; - if (flags & FAN_MARK_FLUSH) { + if (mark_cmd == FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) fsnotify_clear_vfsmount_marks_by_group(group); @@ -1671,8 +1739,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (ret) goto fput_and_out; - if (flags & FAN_MARK_ADD) { - ret = fanotify_events_supported(&path, mask); + if (mark_cmd == FAN_MARK_ADD) { + ret = fanotify_events_supported(group, &path, mask, flags); if (ret) goto path_put_and_out; } @@ -1695,17 +1763,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, else mnt = path.mnt; - /* - * FAN_RENAME is not allowed on non-dir (for now). - * We shouldn't have allowed setting any dirent events in mask of - * non-dir, but because we always allowed it, error only if group - * was initialized with the new flag FAN_REPORT_TARGET_FID. - */ - ret = -ENOTDIR; - if (inode && !S_ISDIR(inode->i_mode) && - ((mask & FAN_RENAME) || - ((mask & FANOTIFY_DIRENT_EVENTS) && - FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID)))) + ret = mnt ? -EINVAL : -EISDIR; + /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ + if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE && + (mnt || S_ISDIR(inode->i_mode)) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) goto path_put_and_out; /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ @@ -1717,12 +1779,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * events with parent/name info for non-directory. */ if ((fid_mode & FAN_REPORT_DIR_FID) && - (flags & FAN_MARK_ADD) && !ignored) + (flags & FAN_MARK_ADD) && !ignore) mask |= FAN_EVENT_ON_CHILD; } /* create/update an inode mark */ - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { + switch (mark_cmd) { case FAN_MARK_ADD: if (mark_type == FAN_MARK_MOUNT) ret = fanotify_add_vfsmount_mark(group, mnt, mask, @@ -1800,7 +1862,7 @@ static int __init fanotify_user_setup(void) BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 10); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 59fb40abe33d..55081ae3a6ec 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -113,7 +113,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) return; seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", inode->i_ino, inode->i_sb->s_dev, - mflags, mark->mask, mark->ignored_mask); + mflags, mark->mask, mark->ignore_mask); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); @@ -121,12 +121,12 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct mount *mnt = fsnotify_conn_mount(mark->connector); seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", - mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); + mnt->mnt_id, mflags, mark->mask, mark->ignore_mask); } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) { struct super_block *sb = fsnotify_conn_sb(mark->connector); seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n", - sb->s_dev, mflags, mark->mask, mark->ignored_mask); + sb->s_dev, mflags, mark->mask, mark->ignore_mask); } } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 0b3e74935cb4..7974e91ffe13 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -100,7 +100,7 @@ void fsnotify_sb_delete(struct super_block *sb) * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the - * parent cares. Thus when an event happens on a child it can quickly tell if + * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. */ void __fsnotify_update_child_dentry_flags(struct inode *inode) @@ -324,7 +324,8 @@ static int send_to_group(__u32 mask, const void *data, int data_type, struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; - __u32 marks_ignored_mask = 0; + __u32 marks_ignore_mask = 0; + bool is_dir = mask & FS_ISDIR; struct fsnotify_mark *mark; int type; @@ -336,7 +337,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type, fsnotify_foreach_iter_mark_type(iter_info, mark, type) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) - mark->ignored_mask = 0; + mark->ignore_mask = 0; } } @@ -344,14 +345,15 @@ static int send_to_group(__u32 mask, const void *data, int data_type, fsnotify_foreach_iter_mark_type(iter_info, mark, type) { group = mark->group; marks_mask |= mark->mask; - marks_ignored_mask |= mark->ignored_mask; + marks_ignore_mask |= + fsnotify_effective_ignore_mask(mark, is_dir, type); } - pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", - __func__, group, mask, marks_mask, marks_ignored_mask, + pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", + __func__, group, mask, marks_mask, marks_ignore_mask, data, data_type, dir, cookie); - if (!(test_mask & marks_mask & ~marks_ignored_mask)) + if (!(test_mask & marks_mask & ~marks_ignore_mask)) return 0; if (group->ops->handle_event) { @@ -423,7 +425,8 @@ static bool fsnotify_iter_select_report_types( * But is *this mark* watching children? */ if (type == FSNOTIFY_ITER_TYPE_PARENT && - !(mark->mask & FS_EVENT_ON_CHILD)) + !(mark->mask & FS_EVENT_ON_CHILD) && + !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD)) continue; fsnotify_iter_set_report_type(iter_info, type); @@ -532,8 +535,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, /* - * If this is a modify event we may need to clear some ignored masks. - * In that case, the object with ignored masks will have the FS_MODIFY + * If this is a modify event we may need to clear some ignore masks. + * In that case, the object with ignore masks will have the FS_MODIFY * event in its mask. * Otherwise, return if none of the marks care about this type of event. */ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ed42a189faa2..1c4bfdab008d 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -136,7 +136,7 @@ static inline u32 inotify_mask_to_arg(__u32 mask) IN_Q_OVERFLOW); } -/* intofiy userspace file descriptor functions */ +/* inotify userspace file descriptor functions */ static __poll_t inotify_poll(struct file *file, poll_table *wait) { struct fsnotify_group *group = file->private_data; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 9e3964ea2ea0..9364d35b4a10 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -342,7 +342,7 @@ handle_zblock: for (i = 0; i < nr; i++) { tbh = arr[i]; if (likely(!buffer_uptodate(tbh))) - submit_bh(REQ_OP_READ, 0, tbh); + submit_bh(REQ_OP_READ, tbh); else ntfs_end_buffer_async_read(tbh, 1); } @@ -859,7 +859,7 @@ lock_retry_remap: do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); need_end_writeback = false; } bh = next; @@ -1187,7 +1187,7 @@ lock_retry_remap: BUG_ON(!buffer_mapped(tbh)); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, 0, tbh); + submit_bh(REQ_OP_WRITE, tbh); } /* Synchronize the mft mirror now if not @sync. */ if (is_mft && !sync) @@ -1659,7 +1659,7 @@ const struct address_space_operations ntfs_normal_aops = { .dirty_folio = block_dirty_folio, #endif /* NTFS_RW */ .bmap = ntfs_bmap, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -1673,7 +1673,7 @@ const struct address_space_operations ntfs_compressed_aops = { .writepage = ntfs_writepage, .dirty_folio = block_dirty_folio, #endif /* NTFS_RW */ - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -1688,7 +1688,7 @@ const struct address_space_operations ntfs_mst_aops = { .writepage = ntfs_writepage, /* Write dirty page to disk. */ .dirty_folio = filemap_dirty_folio, #endif /* NTFS_RW */ - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h index 934d5f79b9e7..0cac5458c023 100644 --- a/fs/ntfs/aops.h +++ b/fs/ntfs/aops.h @@ -74,13 +74,8 @@ static inline struct page *ntfs_map_page(struct address_space *mapping, { struct page *page = read_mapping_page(mapping, index, NULL); - if (!IS_ERR(page)) { + if (!IS_ERR(page)) kmap(page); - if (!PageError(page)) - return page; - ntfs_unmap_page(page); - return ERR_PTR(-EIO); - } return page; } diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 4de597a83b88..52615e6090e1 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -592,8 +592,12 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, a = (ATTR_RECORD*)((u8*)ctx->attr + le32_to_cpu(ctx->attr->length)); for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_allocated)) + u8 *mrec_end = (u8 *)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated); + u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end || + name_end > mrec_end) break; ctx->attr = a; if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index a60f543e7557..587e9b187873 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -658,7 +658,7 @@ lock_retry_remap: } get_bh(tbh); tbh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, 0, tbh); + submit_bh(REQ_OP_READ, tbh); } /* Wait for io completion on all buffer heads. */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index a8abe2296514..58b660dbbee9 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -219,11 +219,6 @@ do_non_resident_extend: err = PTR_ERR(page); goto init_err_out; } - if (unlikely(PageError(page))) { - put_page(page); - err = -EIO; - goto init_err_out; - } /* * Update the initialized size in the ntfs inode. This is * enough to make ntfs_writepage() work. @@ -537,7 +532,7 @@ static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) lock_buffer(bh); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - return submit_bh(REQ_OP_READ, 0, bh); + return submit_bh(REQ_OP_READ, bh); } /** diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index bc1bf217b38e..6ce60ffc6ac0 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -807,7 +807,7 @@ map_vcn: * completed ignore errors afterwards as we can assume * that if one buffer worked all of them will work. */ - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); if (should_wait) { should_wait = false; wait_on_buffer(bh); diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 0d62cd5bb7f8..f7bf5ce960cc 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -583,7 +583,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, clear_buffer_dirty(tbh); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, 0, tbh); + submit_bh(REQ_OP_WRITE, tbh); } /* Wait on i/o completion of buffers. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { @@ -780,7 +780,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) clear_buffer_dirty(tbh); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, 0, tbh); + submit_bh(REQ_OP_WRITE, tbh); } /* Synchronize the mft mirror now if not @sync. */ if (!sync && ni->mft_no < vol->mftmirr_size) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 8e9d2b35175f..4a21745711fe 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -242,7 +242,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) lock_buffer(bh); bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 3de5700a9b83..1835e35199c2 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -1448,7 +1448,7 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, */ int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, struct page **pages, u32 nr_pages, u64 vbo, u32 bytes, - u32 op) + enum req_op op) { int err = 0; struct bio *new, *bio = NULL; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index be4ebdd8048b..80104afeb2cd 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -629,7 +629,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, bh->b_size = block_size; off = vbo & (PAGE_SIZE - 1); set_bh_page(bh, page, off); - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { err = -EIO; @@ -851,12 +851,10 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) static int ntfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct inode *inode = mapping->host; - struct ntfs_inode *ni = ntfs_i(inode); /* Redirect call to 'ntfs_writepage' for resident files. */ - get_block_t *get_block = is_resident(ni) ? NULL : &ntfs_get_block; - - return mpage_writepages(mapping, wbc, get_block); + if (is_resident(ntfs_i(mapping->host))) + return generic_writepages(mapping, wbc); + return mpage_writepages(mapping, wbc, ntfs_get_block); } static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn, diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 8de129a6419b..8dbdca03e1af 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -617,7 +617,7 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, struct ntfs_buffers *nb, int sync); int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, struct page **pages, u32 nr_pages, u64 vbo, u32 bytes, - u32 op); + enum req_op op); int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run); int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, u64 *lbo, u64 *bytes); @@ -896,13 +896,8 @@ static inline struct page *ntfs_map_page(struct address_space *mapping, { struct page *page = read_mapping_page(mapping, index, NULL); - if (!IS_ERR(page)) { + if (!IS_ERR(page)) kmap(page); - if (!PageError(page)) - return page; - ntfs_unmap_page(page); - return ERR_PTR(-EIO); - } return page; } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 35d40a67204c..af4157f61927 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -277,16 +277,14 @@ out: static int ocfs2_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start = (loff_t)page->index << PAGE_SHIFT; + loff_t start = folio_pos(folio); int ret, unlock = 1; - trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, - (page ? page->index : 0)); + trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, folio->index); - ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); + ret = ocfs2_inode_lock_with_page(inode, NULL, 0, &folio->page); if (ret != 0) { if (ret == AOP_TRUNCATED_PAGE) unlock = 0; @@ -296,11 +294,11 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) if (down_read_trylock(&oi->ip_alloc_sem) == 0) { /* - * Unlock the page and cycle ip_alloc_sem so that we don't + * Unlock the folio and cycle ip_alloc_sem so that we don't * busyloop waiting for ip_alloc_sem to unlock */ ret = AOP_TRUNCATED_PAGE; - unlock_page(page); + folio_unlock(folio); unlock = 0; down_read(&oi->ip_alloc_sem); up_read(&oi->ip_alloc_sem); @@ -313,21 +311,21 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) * block_read_full_folio->get_block freaks out if it is asked to read * beyond the end of a file, so we check here. Callers * (generic_file_read, vm_ops->fault) are clever enough to check i_size - * and notice that the page they just read isn't needed. + * and notice that the folio they just read isn't needed. * * XXX sys_readahead() seems to get that wrong? */ if (start >= i_size_read(inode)) { - zero_user(page, 0, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); ret = 0; goto out_alloc; } if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - ret = ocfs2_readpage_inline(inode, page); + ret = ocfs2_readpage_inline(inode, &folio->page); else - ret = block_read_full_folio(page_folio(page), ocfs2_get_block); + ret = block_read_full_folio(folio, ocfs2_get_block); unlock = 0; out_alloc: @@ -336,7 +334,7 @@ out_inode_unlock: ocfs2_inode_unlock(inode, 0); out: if (unlock) - unlock_page(page); + folio_unlock(folio); return ret; } @@ -638,7 +636,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, !buffer_new(bh) && ocfs2_should_read_blk(inode, page, block_start) && (block_start < from || block_end > to)) { - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); *wait_bh++=bh; } @@ -2464,7 +2462,7 @@ const struct address_space_operations ocfs2_aops = { .direct_IO = ocfs2_direct_IO, .invalidate_folio = block_invalidate_folio, .release_folio = ocfs2_release_folio, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index e7758778abef..196638a22b48 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -64,7 +64,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); wait_on_buffer(bh); @@ -147,7 +147,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, get_bh(bh); /* for end_buffer_read_sync() */ bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); } read_failure: @@ -328,7 +328,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, if (validate) set_buffer_needs_validate(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, 0, bh); + submit_bh(REQ_OP_READ, bh); continue; } } @@ -449,7 +449,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check); - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); wait_on_buffer(bh); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ea0e70c0fce0..b13d344d40b6 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -235,8 +235,6 @@ struct o2hb_region { * (hr_steady_iterations == 0) within hr_unsteady_iterations */ atomic_t hr_unsteady_iterations; - char hr_dev_name[BDEVNAME_SIZE]; - unsigned int hr_timeout_ms; /* randomized as the region goes up and down so that a node @@ -287,8 +285,8 @@ static void o2hb_write_timeout(struct work_struct *work) container_of(work, struct o2hb_region, hr_write_timeout_work.work); - mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " - "milliseconds\n", reg->hr_dev_name, + mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u " + "milliseconds\n", reg->hr_bdev, jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); if (o2hb_global_heartbeat_active()) { @@ -383,9 +381,9 @@ static void o2hb_nego_timeout(struct work_struct *work) if (master_node == o2nm_this_node()) { if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { - printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n", + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n", o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, - config_item_name(®->hr_item), reg->hr_dev_name); + config_item_name(®->hr_item), reg->hr_bdev); set_bit(master_node, reg->hr_nego_node_bitmap); } if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap, @@ -399,8 +397,8 @@ static void o2hb_nego_timeout(struct work_struct *work) return; } - printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n", - config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n", + config_item_name(®->hr_item), reg->hr_bdev); /* approve negotiate timeout request. */ o2hb_arm_timeout(reg); @@ -419,9 +417,9 @@ static void o2hb_nego_timeout(struct work_struct *work) } } else { /* negotiate timeout with master node. */ - printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n", + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n", o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(®->hr_item), - reg->hr_dev_name, master_node); + reg->hr_bdev, master_node); ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG, master_node); if (ret) @@ -437,8 +435,8 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data, struct o2hb_nego_msg *nego_msg; nego_msg = (struct o2hb_nego_msg *)msg->buf; - printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n", - nego_msg->node_num, config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n", + nego_msg->node_num, config_item_name(®->hr_item), reg->hr_bdev); if (nego_msg->node_num < O2NM_MAX_NODES) set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap); else @@ -452,8 +450,8 @@ static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data, { struct o2hb_region *reg = data; - printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n", - config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n", + config_item_name(®->hr_item), reg->hr_bdev); o2hb_arm_timeout(reg); return 0; } @@ -503,8 +501,7 @@ static void o2hb_bio_end_io(struct bio *bio) static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, struct o2hb_bio_wait_ctxt *wc, unsigned int *current_slot, - unsigned int max_slots, int op, - int op_flags) + unsigned int max_slots, blk_opf_t opf) { int len, current_page; unsigned int vec_len, vec_start; @@ -518,7 +515,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, * GFP_KERNEL that the local node can get fenced. It would be * nicest if we could pre-allocate these bios and avoid this * all together. */ - bio = bio_alloc(reg->hr_bdev, 16, op | op_flags, GFP_ATOMIC); + bio = bio_alloc(reg->hr_bdev, 16, opf, GFP_ATOMIC); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -566,7 +563,7 @@ static int o2hb_read_slots(struct o2hb_region *reg, while(current_slot < max_slots) { bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots, - REQ_OP_READ, 0); + REQ_OP_READ); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -598,8 +595,8 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, slot = o2nm_this_node(); - bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE, - REQ_SYNC); + bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, + REQ_OP_WRITE | REQ_SYNC); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -689,8 +686,8 @@ static int o2hb_check_own_slot(struct o2hb_region *reg) else errstr = ERRSTR3; - mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), " - "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name, + mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), " + "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_bdev, slot->ds_node_num, (unsigned long long)slot->ds_last_generation, (unsigned long long)slot->ds_last_time, hb_block->hb_node, (unsigned long long)le64_to_cpu(hb_block->hb_generation), @@ -863,8 +860,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) sizeof(o2hb_live_node_bitmap))) goto unlock; - printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n", - config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n", + config_item_name(®->hr_item), reg->hr_bdev); set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); @@ -922,8 +919,8 @@ static int o2hb_check_slot(struct o2hb_region *reg, /* The node is live but pushed out a bad crc. We * consider it a transient miss but don't populate any * other values as they may be junk. */ - mlog(ML_ERROR, "Node %d has written a bad crc to %s\n", - slot->ds_node_num, reg->hr_dev_name); + mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n", + slot->ds_node_num, reg->hr_bdev); o2hb_dump_slot(hb_block); slot->ds_equal_samples++; @@ -1002,11 +999,11 @@ fire_callbacks: slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); if (slot_dead_ms && slot_dead_ms != dead_ms) { /* TODO: Perhaps we can fail the region here. */ - mlog(ML_ERROR, "Node %d on device %s has a dead count " + mlog(ML_ERROR, "Node %d on device %pg has a dead count " "of %u ms, but our count is %u ms.\n" "Please double check your configuration values " "for 'O2CB_HEARTBEAT_THRESHOLD'\n", - slot->ds_node_num, reg->hr_dev_name, slot_dead_ms, + slot->ds_node_num, reg->hr_bdev, slot_dead_ms, dead_ms); } goto out; @@ -1145,8 +1142,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) /* Do not re-arm the write timeout on I/O error - we * can't be sure that the new block ever made it to * disk */ - mlog(ML_ERROR, "Write error %d on device \"%s\"\n", - write_wc.wc_error, reg->hr_dev_name); + mlog(ML_ERROR, "Write error %d on device \"%pg\"\n", + write_wc.wc_error, reg->hr_bdev); ret = write_wc.wc_error; goto bail; } @@ -1170,9 +1167,9 @@ bail: if (atomic_read(®->hr_steady_iterations) != 0) { if (atomic_dec_and_test(®->hr_unsteady_iterations)) { printk(KERN_NOTICE "o2hb: Unable to stabilize " - "heartbeat on region %s (%s)\n", + "heartbeat on region %s (%pg)\n", config_item_name(®->hr_item), - reg->hr_dev_name); + reg->hr_bdev); atomic_set(®->hr_steady_iterations, 0); reg->hr_aborted_start = 1; wake_up(&o2hb_steady_queue); @@ -1494,7 +1491,7 @@ static void o2hb_region_release(struct config_item *item) struct page *page; struct o2hb_region *reg = to_o2hb_region(item); - mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); + mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg->hr_bdev); kfree(reg->hr_tmp_block); @@ -1641,7 +1638,7 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) unsigned int ret = 0; if (to_o2hb_region(item)->hr_bdev) - ret = sprintf(page, "%s\n", to_o2hb_region(item)->hr_dev_name); + ret = sprintf(page, "%pg\n", to_o2hb_region(item)->hr_bdev); return ret; } @@ -1798,8 +1795,6 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, goto out2; } - bdevname(reg->hr_bdev, reg->hr_dev_name); - sectsize = bdev_logical_block_size(reg->hr_bdev); if (sectsize != reg->hr_block_bytes) { mlog(ML_ERROR, @@ -1895,8 +1890,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, ret = -EIO; if (hb_task && o2hb_global_heartbeat_active()) - printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n", - config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n", + config_item_name(®->hr_item), reg->hr_bdev); out3: if (ret < 0) { @@ -2088,10 +2083,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, quorum_region = 1; clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); spin_unlock(&o2hb_live_lock); - printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n", + printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n", ((atomic_read(®->hr_steady_iterations) == 0) ? "stopped" : "start aborted"), config_item_name(item), - reg->hr_dev_name); + reg->hr_bdev); } /* diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index e360543ad7e7..8b2020f92b5f 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -296,17 +296,25 @@ static void dlmfs_evict_inode(struct inode *inode) { int status; struct dlmfs_inode_private *ip; + struct user_lock_res *lockres; + int teardown; clear_inode(inode); mlog(0, "inode %lu\n", inode->i_ino); ip = DLMFS_I(inode); + lockres = &ip->ip_lockres; if (S_ISREG(inode->i_mode)) { - status = user_dlm_destroy_lock(&ip->ip_lockres); - if (status < 0) - mlog_errno(status); + spin_lock(&lockres->l_lock); + teardown = !!(lockres->l_flags & USER_LOCK_IN_TEARDOWN); + spin_unlock(&lockres->l_lock); + if (!teardown) { + status = user_dlm_destroy_lock(lockres); + if (status < 0) + mlog_errno(status); + } iput(ip->ip_parent); goto clear_fields; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7497cd592258..9c67edd215d5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1146,7 +1146,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (status) return status; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { status = dquot_initialize(inode); if (status) return status; diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 9099d8fc7599..22da768e65b7 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -2,12 +2,13 @@ /* * heartbeat.c * - * Register ourselves with the heartbaet service, keep our node maps + * Register ourselves with the heartbeat service, keep our node maps * up to date, and fire off recovery when needed. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ +#include <linux/bitmap.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/highmem.h> @@ -24,18 +25,12 @@ #include "buffer_head_io.h" -static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, - int bit); -static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, - int bit); - /* special case -1 for now * TODO: should *really* make sure the calling func never passes -1!! */ static void ocfs2_node_map_init(struct ocfs2_node_map *map) { map->num_nodes = OCFS2_NODE_MAP_MAX_NODES; - memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) * - sizeof(unsigned long)); + bitmap_zero(map->map, OCFS2_NODE_MAP_MAX_NODES); } void ocfs2_init_node_maps(struct ocfs2_super *osb) @@ -65,12 +60,6 @@ void ocfs2_do_node_down(int node_num, void *data) ocfs2_recovery_thread(osb, node_num); } -static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, - int bit) -{ - set_bit(bit, map->map); -} - void ocfs2_node_map_set_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit) @@ -79,16 +68,10 @@ void ocfs2_node_map_set_bit(struct ocfs2_super *osb, return; BUG_ON(bit >= map->num_nodes); spin_lock(&osb->node_map_lock); - __ocfs2_node_map_set_bit(map, bit); + set_bit(bit, map->map); spin_unlock(&osb->node_map_lock); } -static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, - int bit) -{ - clear_bit(bit, map->map); -} - void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit) @@ -97,7 +80,7 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, return; BUG_ON(bit >= map->num_nodes); spin_lock(&osb->node_map_lock); - __ocfs2_node_map_clear_bit(map, bit); + clear_bit(bit, map->map); spin_unlock(&osb->node_map_lock); } diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 337527571461..740b64238312 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -277,7 +277,6 @@ enum ocfs2_mount_options OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ - OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -673,8 +672,7 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) static inline int ocfs2_mount_local(struct ocfs2_super *osb) { - return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT) - || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER)); + return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 0b6f551a342a..dc9f76ab7e13 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -412,7 +412,7 @@ out_unlock: goto out_err; } -/* Write information to global quota file. Expects exlusive lock on quota +/* Write information to global quota file. Expects exclusive lock on quota * file inode and quota info */ static int __ocfs2_global_write_info(struct super_block *sb, int type) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index e04358a46b68..1358981e80a3 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3146,48 +3146,18 @@ int ocfs2_cow_sync_writeback(struct super_block *sb, struct inode *inode, u32 cpos, u32 num_clusters) { - int ret = 0; - loff_t offset, end, map_end; - pgoff_t page_index; - struct page *page; + int ret; + loff_t start, end; if (ocfs2_should_order_data(inode)) return 0; - offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; - end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); + start = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = start + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits) - 1; - ret = filemap_fdatawrite_range(inode->i_mapping, - offset, end - 1); - if (ret < 0) { + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret < 0) mlog_errno(ret); - return ret; - } - - while (offset < end) { - page_index = offset >> PAGE_SHIFT; - map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; - if (map_end > end) - map_end = end; - - page = find_or_create_page(inode->i_mapping, - page_index, GFP_NOFS); - BUG_ON(!page); - - wait_on_page_writeback(page); - if (PageError(page)) { - ret = -EIO; - mlog_errno(ret); - } else - mark_page_accessed(page); - - unlock_page(page); - put_page(page); - page = NULL; - offset = map_end; - if (ret) - break; - } return ret; } diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 0b0ae3ebb0cf..da7718cef735 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -252,16 +252,14 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, int i, ret = -ENOSPC; if ((preferred >= 0) && (preferred < si->si_num_slots)) { - if (!si->si_slots[preferred].sl_valid || - !si->si_slots[preferred].sl_node_num) { + if (!si->si_slots[preferred].sl_valid) { ret = preferred; goto out; } } for(i = 0; i < si->si_num_slots; i++) { - if (!si->si_slots[i].sl_valid || - !si->si_slots[i].sl_node_num) { + if (!si->si_slots[i].sl_valid) { ret = i; break; } @@ -456,30 +454,24 @@ int ocfs2_find_slot(struct ocfs2_super *osb) spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - if (ocfs2_mount_local(osb)) - /* use slot 0 directly in local mode */ - slot = 0; - else { - /* search for ourselves first and take the slot if it already - * exists. Perhaps we need to mark this in a variable for our - * own journal recovery? Possibly not, though we certainly - * need to warn to the user */ - slot = __ocfs2_node_num_to_slot(si, osb->node_num); + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); + if (slot < 0) { + /* if no slot yet, then just take 1st available + * one. */ + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); if (slot < 0) { - /* if no slot yet, then just take 1st available - * one. */ - slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); - if (slot < 0) { - spin_unlock(&osb->osb_lock); - mlog(ML_ERROR, "no free slots available!\n"); - status = -EINVAL; - goto bail; - } - } else - printk(KERN_INFO "ocfs2: Slot %d on device (%s) was " - "already allocated to this node!\n", - slot, osb->dev_str); - } + spin_unlock(&osb->osb_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " + "allocated to this node!\n", slot, osb->dev_str); ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f7298816d8d9..013a727bd7c8 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -172,7 +172,6 @@ enum { Opt_dir_resv_level, Opt_journal_async_commit, Opt_err_cont, - Opt_nocluster, Opt_err, }; @@ -206,7 +205,6 @@ static const match_table_t tokens = { {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_err_cont, "errors=continue"}, - {Opt_nocluster, "nocluster"}, {Opt_err, NULL} }; @@ -618,13 +616,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } - tmp = OCFS2_MOUNT_NOCLUSTER; - if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { - ret = -EINVAL; - mlog(ML_ERROR, "Cannot change nocluster option on remount\n"); - goto out; - } - tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { @@ -865,7 +856,6 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, } if (ocfs2_userspace_stack(osb) && - !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && strncmp(osb->osb_cluster_stack, mopt->cluster_stack, OCFS2_STACK_LABEL_LEN)) { mlog(ML_ERROR, @@ -1137,11 +1127,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : "ordered"); - if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && - !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)) - printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted " - "without cluster aware mode.\n", osb->dev_str); - atomic_set(&osb->vol_state, VOLUME_MOUNTED); wake_up(&osb->osb_mount_event); @@ -1452,9 +1437,6 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_journal_async_commit: mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; break; - case Opt_nocluster: - mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER; - break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1566,9 +1548,6 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) seq_printf(s, ",journal_async_commit"); - if (opts & OCFS2_MOUNT_NOCLUSTER) - seq_printf(s, ",nocluster"); - return 0; } @@ -1785,7 +1764,7 @@ static int ocfs2_get_sector(struct super_block *sb, if (!buffer_dirty(*bh)) clear_buffer_uptodate(*bh); unlock_buffer(*bh); - ll_rw_block(REQ_OP_READ, 0, 1, bh); + ll_rw_block(REQ_OP_READ, 1, bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { mlog_errno(-EIO); diff --git a/fs/open.c b/fs/open.c index 1d57fbde2feb..8a813fa5ca56 100644 --- a/fs/open.c +++ b/fs/open.c @@ -663,6 +663,42 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) return do_fchmodat(AT_FDCWD, filename, mode); } +/** + * setattr_vfsuid - check and set ia_fsuid attribute + * @kuid: new inode owner + * + * Check whether @kuid is valid and if so generate and set vfsuid_t in + * ia_vfsuid. + * + * Return: true if @kuid is valid, false if not. + */ +static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) +{ + if (!uid_valid(kuid)) + return false; + attr->ia_valid |= ATTR_UID; + attr->ia_vfsuid = VFSUIDT_INIT(kuid); + return true; +} + +/** + * setattr_vfsgid - check and set ia_fsgid attribute + * @kgid: new inode owner + * + * Check whether @kgid is valid and if so generate and set vfsgid_t in + * ia_vfsgid. + * + * Return: true if @kgid is valid, false if not. + */ +static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) +{ + if (!gid_valid(kgid)) + return false; + attr->ia_valid |= ATTR_GID; + attr->ia_vfsgid = VFSGIDT_INIT(kgid); + return true; +} + int chown_common(const struct path *path, uid_t user, gid_t group) { struct user_namespace *mnt_userns, *fs_userns; @@ -678,28 +714,22 @@ int chown_common(const struct path *path, uid_t user, gid_t group) mnt_userns = mnt_user_ns(path->mnt); fs_userns = i_user_ns(inode); - uid = mapped_kuid_user(mnt_userns, fs_userns, uid); - gid = mapped_kgid_user(mnt_userns, fs_userns, gid); retry_deleg: newattrs.ia_valid = ATTR_CTIME; - if (user != (uid_t) -1) { - if (!uid_valid(uid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_UID; - newattrs.ia_uid = uid; - } - if (group != (gid_t) -1) { - if (!gid_valid(gid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_GID; - newattrs.ia_gid = gid; - } + if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) + return -EINVAL; + if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) + return -EINVAL; if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; inode_lock(inode); - error = security_path_chown(path, uid, gid); + /* Continue to send actual fs values, not the mount values. */ + error = security_path_chown( + path, + from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid), + from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid)); if (!error) error = notify_change(mnt_userns, path->dentry, &newattrs, &delegated_inode); @@ -858,10 +888,13 @@ static int do_dentry_open(struct file *f, if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; + if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek) + f->f_mode &= ~FMODE_LSEEK; if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 5ce27dde3c79..7a8c0c6e698d 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -307,7 +307,7 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, folio_size(folio), inode->i_size, NULL, NULL, file); - /* this will only zero remaining unread portions of the page data */ + /* this will only zero remaining unread portions of the folio data */ iov_iter_zero(~0U, &iter); /* takes care of potential aliasing */ flush_dcache_folio(folio); @@ -315,8 +315,6 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) folio_set_error(folio); } else { folio_mark_uptodate(folio); - if (folio_test_error(folio)) - folio_clear_error(folio); ret = 0; } /* unlock the folio after the ->read_folio() routine completes */ diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 714ec569d25b..fdde6c56cc3d 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -226,8 +226,7 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old, /* Couldn't clone, so now we try to copy the data */ /* Check if lower fs supports seek operation */ - if (old_file->f_mode & FMODE_LSEEK && - old_file->f_op->llseek) + if (old_file->f_mode & FMODE_LSEEK) skip_hole = true; while (len) { @@ -331,8 +330,8 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry, if (!err) { struct iattr attr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = stat->uid, - .ia_gid = stat->gid, + .ia_vfsuid = VFSUIDT_INIT(stat->uid), + .ia_vfsgid = VFSGIDT_INIT(stat->gid), }; err = ovl_do_notify_change(ofs, upperdentry, &attr); } diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 2eada97bbd23..e065a5b9a442 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -259,7 +259,7 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, return FILEID_INVALID; dentry = d_find_any_alias(inode); - if (WARN_ON(!dentry)) + if (!dentry) return FILEID_INVALID; bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 492eddeb481f..b45fea69fff3 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -454,24 +454,97 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return res; } +#ifdef CONFIG_FS_POSIX_ACL +/* + * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone + * of the POSIX ACLs retrieved from the lower layer to this function to not + * alter the POSIX ACLs for the underlying filesystem. + */ +static void ovl_idmap_posix_acl(struct user_namespace *mnt_userns, + struct posix_acl *acl) +{ + for (unsigned int i = 0; i < acl->a_count; i++) { + vfsuid_t vfsuid; + vfsgid_t vfsgid; + + struct posix_acl_entry *e = &acl->a_entries[i]; + switch (e->e_tag) { + case ACL_USER: + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, e->e_uid); + e->e_uid = vfsuid_into_kuid(vfsuid); + break; + case ACL_GROUP: + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, e->e_gid); + e->e_gid = vfsgid_into_kgid(vfsgid); + break; + } + } +} + +/* + * When the relevant layer is an idmapped mount we need to take the idmapping + * of the layer into account and translate any ACL_{GROUP,USER} values + * according to the idmapped mount. + * + * We cannot alter the ACLs returned from the relevant layer as that would + * alter the cached values filesystem wide for the lower filesystem. Instead we + * can clone the ACLs and then apply the relevant idmapping of the layer. + * + * This is obviously only relevant when idmapped layers are used. + */ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) { struct inode *realinode = ovl_inode_real(inode); - const struct cred *old_cred; - struct posix_acl *acl; + struct posix_acl *acl, *clone; + struct path realpath; - if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) + if (!IS_POSIXACL(realinode)) return NULL; - if (rcu) - return get_cached_acl_rcu(realinode, type); + /* Careful in RCU walk mode */ + ovl_i_path_real(inode, &realpath); + if (!realpath.dentry) { + WARN_ON(!rcu); + return ERR_PTR(-ECHILD); + } - old_cred = ovl_override_creds(inode->i_sb); - acl = get_acl(realinode, type); - revert_creds(old_cred); + if (rcu) { + acl = get_cached_acl_rcu(realinode, type); + } else { + const struct cred *old_cred; + + old_cred = ovl_override_creds(inode->i_sb); + acl = get_acl(realinode, type); + revert_creds(old_cred); + } + /* + * If there are no POSIX ACLs, or we encountered an error, + * or the layer isn't idmapped we don't need to do anything. + */ + if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl)) + return acl; - return acl; + /* + * We only get here if the layer is idmapped. So drop out of RCU path + * walk so we can clone the ACLs. There's no need to release the ACLs + * since get_cached_acl_rcu() doesn't take a reference on the ACLs. + */ + if (rcu) + return ERR_PTR(-ECHILD); + + clone = posix_acl_clone(acl, GFP_KERNEL); + if (!clone) + clone = ERR_PTR(-ENOMEM); + else + ovl_idmap_posix_acl(mnt_user_ns(realpath.mnt), clone); + /* + * Since we're not in RCU path walk we always need to release the + * original ACLs. + */ + posix_acl_release(acl); + return clone; } +#endif int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) { diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 65c4346a5b43..69dc577974f8 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -42,7 +42,7 @@ static int ovl_check_redirect(struct path *path, struct ovl_lookup_data *d, * One of the ancestor path elements in an absolute path * lookup in ovl_lookup_layer() could have been opaque and * that will stop further lookup in lower layers (d->stop=true) - * But we have found an absolute redirect in decendant path + * But we have found an absolute redirect in descendant path * element and that should force continue lookup in lower * layers (reset d->stop). */ @@ -648,7 +648,7 @@ static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) * If the index dentry for a copy up origin inode is positive, but points * to an inode different than the upper inode, then either the upper inode * has been copied up and not indexed or it was indexed, but since then - * index dir was cleared. Either way, that index cannot be used to indentify + * index dir was cleared. Either way, that index cannot be used to identify * the overlay inode. */ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 4f34b7e02eee..87759165d32b 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -139,17 +139,7 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs, struct dentry *upperdentry, struct iattr *attr) { - struct user_namespace *upper_mnt_userns = ovl_upper_mnt_userns(ofs); - struct user_namespace *fs_userns = i_user_ns(d_inode(upperdentry)); - - if (attr->ia_valid & ATTR_UID) - attr->ia_uid = mapped_kuid_user(upper_mnt_userns, - fs_userns, attr->ia_uid); - if (attr->ia_valid & ATTR_GID) - attr->ia_gid = mapped_kgid_user(upper_mnt_userns, - fs_userns, attr->ia_gid); - - return notify_change(upper_mnt_userns, upperdentry, attr, NULL); + return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL); } static inline int ovl_do_rmdir(struct ovl_fs *ofs, @@ -259,7 +249,8 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, value, size, flags); + int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, + (void *)value, size, flags); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", dentry, name, min((int)size, 48), value, size, flags, err); @@ -599,7 +590,13 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); + +#ifdef CONFIG_FS_POSIX_ACL struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu); +#else +#define ovl_get_acl NULL +#endif + int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); bool ovl_is_private_xattr(struct super_block *sb, const char *name); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index e0a2e0468ee7..ec746d447f1b 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -301,7 +301,7 @@ static int ovl_sync_fs(struct super_block *sb, int wait) /** * ovl_statfs - * @sb: The overlayfs super block + * @dentry: The dentry to query * @buf: The struct kstatfs to fill in with stats * * Get the filesystem statistics. As writes always target the upper layer @@ -349,6 +349,8 @@ static inline int ovl_xino_def(void) /** * ovl_show_options + * @m: the seq_file handle + * @dentry: The dentry to query * * Prints the mount options for a given superblock. * Returns zero; does not fail. @@ -1412,11 +1414,12 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, */ err = ovl_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1); if (err) { + pr_warn("failed to set xattr on upper\n"); ofs->noxattr = true; if (ofs->config.index || ofs->config.metacopy) { ofs->config.index = false; ofs->config.metacopy = false; - pr_warn("upper fs does not support xattr, falling back to index=off,metacopy=off.\n"); + pr_warn("...falling back to index=off,metacopy=off.\n"); } /* * xattr support is required for persistent st_ino. @@ -1424,8 +1427,10 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, */ if (ofs->config.xino == OVL_XINO_AUTO) { ofs->config.xino = OVL_XINO_OFF; - pr_warn("upper fs does not support xattr, falling back to xino=off.\n"); + pr_warn("...falling back to xino=off.\n"); } + if (err == -EPERM && !ofs->config.userxattr) + pr_info("try mounting with 'userxattr' option\n"); err = 0; } else { ovl_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE); @@ -2032,7 +2037,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth = 0; sb->s_maxbytes = MAX_LFS_FILESIZE; atomic_long_set(&ofs->last_ino, 1); - /* Assume underlaying fs uses 32bit inodes unless proven otherwise */ + /* Assume underlying fs uses 32bit inodes unless proven otherwise */ if (ofs->config.xino != OVL_XINO_OFF) { ofs->xino_mode = BITS_PER_LONG - 32; if (!ofs->xino_mode) { diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 962d32468eb4..1d17d7b13dcd 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -199,7 +199,7 @@ EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. */ -static struct posix_acl * +struct posix_acl * posix_acl_clone(const struct posix_acl *acl, gfp_t flags) { struct posix_acl *clone = NULL; @@ -213,6 +213,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) } return clone; } +EXPORT_SYMBOL_GPL(posix_acl_clone); /* * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. @@ -361,8 +362,8 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, { const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; - kuid_t uid; - kgid_t gid; + vfsuid_t vfsuid; + vfsgid_t vfsgid; want &= MAY_READ | MAY_WRITE | MAY_EXEC; @@ -370,30 +371,28 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ - uid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(uid, current_fsuid())) + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto check_perm; break; case ACL_USER: - uid = mapped_kuid_fs(mnt_userns, - i_user_ns(inode), + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pa->e_uid); - if (uid_eq(uid, current_fsuid())) + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: - gid = i_gid_into_mnt(mnt_userns, inode); - if (in_group_p(gid)) { + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_GROUP: - gid = mapped_kgid_fs(mnt_userns, - i_user_ns(inode), + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pa->e_gid); - if (in_group_p(gid)) { + if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; @@ -699,7 +698,7 @@ int posix_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; @@ -710,46 +709,127 @@ EXPORT_SYMBOL(posix_acl_update_mode); /* * Fix up the uids and gids in posix acl extended attributes in place. */ -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - struct user_namespace *mnt_userns, - void *value, size_t size, bool from_user) +static int posix_acl_fix_xattr_common(void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + int count; + + if (!header) + return -EINVAL; + if (size < sizeof(struct posix_acl_xattr_header)) + return -EINVAL; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return -EINVAL; + + count = posix_acl_xattr_count(size); + if (count < 0) + return -EINVAL; + if (count == 0) + return -EINVAL; + + return count; +} + +void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size) { struct posix_acl_xattr_header *header = value; struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; int count; + vfsuid_t vfsuid; + vfsgid_t vfsgid; kuid_t uid; kgid_t gid; - if (!value) + if (no_idmapping(mnt_userns, i_user_ns(inode))) return; - if (size < sizeof(struct posix_acl_xattr_header)) + + count = posix_acl_fix_xattr_common(value, size); + if (count < 0) return; - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + + for (end = entry + count; entry != end; entry++) { + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, uid); + entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, + vfsuid_into_kuid(vfsuid))); + break; + case ACL_GROUP: + gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, gid); + entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, + vfsgid_into_kgid(vfsgid))); + break; + default: + break; + } + } +} + +void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + int count; + vfsuid_t vfsuid; + vfsgid_t vfsgid; + kuid_t uid; + kgid_t gid; + + if (no_idmapping(mnt_userns, i_user_ns(inode))) return; - count = posix_acl_xattr_count(size); + count = posix_acl_fix_xattr_common(value, size); if (count < 0) return; - if (count == 0) + + for (end = entry + count; entry != end; entry++) { + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsuid = VFSUIDT_INIT(uid); + uid = from_vfsuid(mnt_userns, &init_user_ns, vfsuid); + entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid)); + break; + case ACL_GROUP: + gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsgid = VFSGIDT_INIT(gid); + gid = from_vfsgid(mnt_userns, &init_user_ns, vfsgid); + entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid)); + break; + default: + break; + } + } +} + +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + int count; + kuid_t uid; + kgid_t gid; + + count = posix_acl_fix_xattr_common(value, size); + if (count < 0) return; for (end = entry + count; entry != end; entry++) { switch(le16_to_cpu(entry->e_tag)) { case ACL_USER: uid = make_kuid(from, le32_to_cpu(entry->e_id)); - if (from_user) - uid = mapped_kuid_user(mnt_userns, &init_user_ns, uid); - else - uid = mapped_kuid_fs(mnt_userns, &init_user_ns, uid); entry->e_id = cpu_to_le32(from_kuid(to, uid)); break; case ACL_GROUP: gid = make_kgid(from, le32_to_cpu(entry->e_id)); - if (from_user) - gid = mapped_kgid_user(mnt_userns, &init_user_ns, gid); - else - gid = mapped_kgid_fs(mnt_userns, &init_user_ns, gid); entry->e_id = cpu_to_le32(from_kgid(to, gid)); break; default: @@ -758,34 +838,20 @@ static void posix_acl_fix_xattr_userns( } } -void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +void posix_acl_fix_xattr_from_user(void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - - /* Leave ids untouched on non-idmapped mounts. */ - if (no_idmapping(mnt_userns, i_user_ns(inode))) - mnt_userns = &init_user_ns; - if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) + if (user_ns == &init_user_ns) return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value, - size, true); + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); } -void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +void posix_acl_fix_xattr_to_user(void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - - /* Leave ids untouched on non-idmapped mounts. */ - if (no_idmapping(mnt_userns, i_user_ns(inode))) - mnt_userns = &init_user_ns; - if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) + if (user_ns == &init_user_ns) return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value, - size, false); + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); } /* diff --git a/fs/proc/array.c b/fs/proc/array.c index eb815759842c..99fcbfda8e25 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -69,7 +69,6 @@ #include <linux/sched/cputime.h> #include <linux/proc_fs.h> #include <linux/ioport.h> -#include <linux/uaccess.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/hugetlb.h> @@ -100,6 +99,10 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) { char tcomm[64]; + /* + * Test before PF_KTHREAD because all workqueue worker threads are + * kernel threads. + */ if (p->flags & PF_WQ_WORKER) wq_worker_comm(tcomm, sizeof(tcomm), p); else if (p->flags & PF_KTHREAD) diff --git a/fs/proc/base.c b/fs/proc/base.c index 8dfa36a99c74..93f7e3d971e4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1885,7 +1885,7 @@ void proc_pid_evict_inode(struct proc_inode *ei) put_pid(pid); } -struct inode *proc_pid_make_inode(struct super_block * sb, +struct inode *proc_pid_make_inode(struct super_block *sb, struct task_struct *task, umode_t mode) { struct inode * inode; @@ -1914,11 +1914,6 @@ struct inode *proc_pid_make_inode(struct super_block * sb, /* Let the pid remember us for quick removal */ ei->pid = pid; - if (S_ISDIR(mode)) { - spin_lock(&pid->lock); - hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); - spin_unlock(&pid->lock); - } task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); @@ -1931,6 +1926,39 @@ out_unlock: return NULL; } +/* + * Generating an inode and adding it into @pid->inodes, so that task will + * invalidate inode's dentry before being released. + * + * This helper is used for creating dir-type entries under '/proc' and + * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>' + * can be released by invalidating '/proc/<tgid>' dentry. + * In theory, dentries under '/proc/<tgid>/task' can also be released by + * invalidating '/proc/<tgid>' dentry, we reserve it to handle single + * thread exiting situation: Any one of threads should invalidate its + * '/proc/<tgid>/task/<pid>' dentry before released. + */ +static struct inode *proc_pid_make_base_inode(struct super_block *sb, + struct task_struct *task, umode_t mode) +{ + struct inode *inode; + struct proc_inode *ei; + struct pid *pid; + + inode = proc_pid_make_inode(sb, task, mode); + if (!inode) + return NULL; + + /* Let proc_flush_pid find this directory inode */ + ei = PROC_I(inode); + pid = ei->pid; + spin_lock(&pid->lock); + hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); + spin_unlock(&pid->lock); + + return inode; +} + int pid_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -3369,7 +3397,8 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry, { struct inode *inode; - inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); + inode = proc_pid_make_base_inode(dentry->d_sb, task, + S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); @@ -3671,7 +3700,8 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { struct inode *inode; - inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); + inode = proc_pid_make_base_inode(dentry->d_sb, task, + S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 73aeb4e6d32e..f130499ad843 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -26,8 +26,6 @@ #include <linux/mount.h> #include <linux/bug.h> -#include <linux/uaccess.h> - #include "internal.h" static void proc_evict_inode(struct inode *inode) @@ -214,7 +212,15 @@ static void unuse_pde(struct proc_dir_entry *pde) complete(pde->pde_unload_completion); } -/* pde is locked on entry, unlocked on exit */ +/* + * At most 2 contexts can enter this function: the one doing the last + * close on the descriptor and whoever is deleting PDE itself. + * + * First to enter calls ->proc_release hook and signals its completion + * to the second one which waits and then does nothing. + * + * PDE is locked on entry, unlocked on exit. + */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) __releases(&pde->pde_unload_lock) { @@ -224,9 +230,6 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) * * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: * "struct file" needs to be available at the right moment. - * - * Therefore, first process to enter this function does ->release() and - * signals its completion to the other process which does nothing. */ if (pdeo->closing) { /* somebody else is doing that, just wait */ @@ -240,10 +243,12 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); + file = pdeo->file; pde->proc_ops->proc_release(file_inode(file), file); + spin_lock(&pde->pde_unload_lock); - /* After ->release. */ + /* Strictly after ->proc_release, see above. */ list_del(&pdeo->lh); c = pdeo->c; spin_unlock(&pde->pde_unload_lock); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index b38ad552887f..592e6dc7c110 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -15,7 +15,6 @@ #include <linux/fs.h> #include <linux/syslog.h> -#include <linux/uaccess.h> #include <asm/io.h> extern wait_queue_head_t log_wait; diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 13452b32e2bd..4d3493579458 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -21,7 +21,6 @@ #include <linux/seq_file.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> -#include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/div64.h> #include "internal.h" diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 913e5acefbb6..856839b8ae8b 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -8,9 +8,6 @@ * * proc net directory handling functions */ - -#include <linux/uaccess.h> - #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> @@ -353,6 +350,12 @@ static __net_init int proc_net_ns_init(struct net *net) kgid_t gid; int err; + /* + * This PDE acts only as an anchor for /proc/${pid}/net hierarchy. + * Corresponding inode (PDE(inode) == net->proc_net) is never + * instantiated therefore blanket zeroing is fine. + * net->proc_net_stat inode is instantiated normally. + */ err = -ENOMEM; netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index c69ff191e5d8..5c6a5ceab2f1 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -4,8 +4,6 @@ * * Copyright 1997, Theodore Ts'o */ - -#include <linux/uaccess.h> #include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> diff --git a/fs/proc/root.c b/fs/proc/root.c index c7e3b1350ef8..3c2ee3eb1138 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -6,9 +6,6 @@ * * proc root directory handling functions */ - -#include <linux/uaccess.h> - #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> @@ -305,6 +302,11 @@ void __init proc_root_init(void) proc_mkdir("bus", NULL); proc_sys_init(); + /* + * Last things last. It is not like userspace processes eager + * to open /proc files exist at this point but register last + * anyway. + */ register_filesystem(&proc_fs_type); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2d04e3470d4c..a3398d0f1927 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -406,6 +406,7 @@ struct mem_size_stats { u64 pss_anon; u64 pss_file; u64 pss_shmem; + u64 pss_dirty; u64 pss_locked; u64 swap_pss; }; @@ -427,6 +428,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, mss->pss_locked += pss; if (dirty || PageDirty(page)) { + mss->pss_dirty += pss; if (private) mss->private_dirty += size; else @@ -808,6 +810,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, { SEQ_PUT_DEC("Rss: ", mss->resident); SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT); if (rollup_mode) { /* * These are meaningful only for smaps_rollup, otherwise two of @@ -860,7 +863,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", - transparent_hugepage_active(vma)); + hugepage_vma_check(vma, vma->vm_flags, true, false)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); @@ -1792,7 +1795,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, return NULL; page = vm_normal_page(vma, addr, pte); - if (!page) + if (!page || is_zone_device_page(page)) return NULL; if (PageReserved(page)) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 4eaeb645e759..f2aa86c421f2 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -25,7 +25,6 @@ #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> -#include <linux/uaccess.h> #include <linux/uio.h> #include <linux/cc_platform.h> #include <asm/io.h> diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 14658b009f1b..ffbadb8b3032 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -55,6 +55,7 @@ static void free_pstore_private(struct pstore_private *private) return; if (private->record) { kfree(private->record->buf); + kfree(private->record->priv); kfree(private->record); } kfree(private); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index e26162f102ff..b2fd3c20e7c2 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -28,11 +28,14 @@ #include <linux/crypto.h> #include <linux/string.h> #include <linux/timer.h> +#include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/jiffies.h> #include <linux/workqueue.h> +#include <crypto/acompress.h> + #include "internal.h" /* @@ -90,7 +93,8 @@ module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); /* Compression parameters */ -static struct crypto_comp *tfm; +static struct crypto_acomp *tfm; +static struct acomp_req *creq; struct pstore_zbackend { int (*zbufsize)(size_t size); @@ -268,12 +272,21 @@ static const struct pstore_zbackend zbackends[] = { static int pstore_compress(const void *in, void *out, unsigned int inlen, unsigned int outlen) { + struct scatterlist src, dst; int ret; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS)) return -EINVAL; - ret = crypto_comp_compress(tfm, in, inlen, out, &outlen); + sg_init_table(&src, 1); + sg_set_buf(&src, in, inlen); + + sg_init_table(&dst, 1); + sg_set_buf(&dst, out, outlen); + + acomp_request_set_params(creq, &src, &dst, inlen, outlen); + + ret = crypto_acomp_compress(creq); if (ret) { pr_err("crypto_comp_compress failed, ret = %d!\n", ret); return ret; @@ -284,7 +297,7 @@ static int pstore_compress(const void *in, void *out, static void allocate_buf_for_compression(void) { - struct crypto_comp *ctx; + struct crypto_acomp *acomp; int size; char *buf; @@ -296,7 +309,7 @@ static void allocate_buf_for_compression(void) if (!psinfo || tfm) return; - if (!crypto_has_comp(zbackend->name, 0, 0)) { + if (!crypto_has_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC)) { pr_err("Unknown compression: %s\n", zbackend->name); return; } @@ -315,16 +328,24 @@ static void allocate_buf_for_compression(void) return; } - ctx = crypto_alloc_comp(zbackend->name, 0, 0); - if (IS_ERR_OR_NULL(ctx)) { + acomp = crypto_alloc_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR_OR_NULL(acomp)) { kfree(buf); pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name, - PTR_ERR(ctx)); + PTR_ERR(acomp)); + return; + } + + creq = acomp_request_alloc(acomp); + if (!creq) { + crypto_free_acomp(acomp); + kfree(buf); + pr_err("acomp_request_alloc('%s') failed\n", zbackend->name); return; } /* A non-NULL big_oops_buf indicates compression is available. */ - tfm = ctx; + tfm = acomp; big_oops_buf_sz = size; big_oops_buf = buf; @@ -334,7 +355,8 @@ static void allocate_buf_for_compression(void) static void free_buf_for_compression(void) { if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) { - crypto_free_comp(tfm); + acomp_request_free(creq); + crypto_free_acomp(tfm); tfm = NULL; } kfree(big_oops_buf); @@ -671,6 +693,8 @@ static void decompress_record(struct pstore_record *record) int ret; int unzipped_len; char *unzipped, *workspace; + struct acomp_req *dreq; + struct scatterlist src, dst; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed) return; @@ -694,16 +718,30 @@ static void decompress_record(struct pstore_record *record) if (!workspace) return; + dreq = acomp_request_alloc(tfm); + if (!dreq) { + kfree(workspace); + return; + } + + sg_init_table(&src, 1); + sg_set_buf(&src, record->buf, record->size); + + sg_init_table(&dst, 1); + sg_set_buf(&dst, workspace, unzipped_len); + + acomp_request_set_params(dreq, &src, &dst, record->size, unzipped_len); + /* After decompression "unzipped_len" is almost certainly smaller. */ - ret = crypto_comp_decompress(tfm, record->buf, record->size, - workspace, &unzipped_len); + ret = crypto_acomp_decompress(dreq); if (ret) { - pr_err("crypto_comp_decompress failed, ret = %d!\n", ret); + pr_err("crypto_acomp_decompress failed, ret = %d!\n", ret); kfree(workspace); return; } /* Append ECC notice to decompressed buffer. */ + unzipped_len = dreq->dlen; memcpy(workspace + unzipped_len, record->buf + record->size, record->ecc_notice_size); @@ -711,6 +749,7 @@ static void decompress_record(struct pstore_record *record) unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size, GFP_KERNEL); kfree(workspace); + acomp_request_free(dreq); if (!unzipped) return; @@ -769,6 +808,7 @@ void pstore_get_backend_records(struct pstore_info *psi, if (rc) { /* pstore_mkfile() did not take record, so free it. */ kfree(record->buf); + kfree(record->priv); kfree(record); if (rc != -EEXIST || !quiet) failed++; diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 7c8f8feac6c3..017d0d4ad329 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -363,7 +363,7 @@ static int psz_kmsg_recover_data(struct psz_context *cxt) rcnt = info->read((char *)buf, zone->buffer_size + sizeof(*buf), zone->off); if (rcnt != zone->buffer_size + sizeof(*buf)) - return (int)rcnt < 0 ? (int)rcnt : -EIO; + return rcnt < 0 ? rcnt : -EIO; } return 0; } @@ -372,7 +372,7 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt) { struct pstore_zone_info *info = cxt->pstore_zone_info; struct pstore_zone *zone; - size_t rcnt, len; + ssize_t rcnt, len; struct psz_buffer *buf; struct psz_kmsg_header *hdr; struct timespec64 time = { }; @@ -400,7 +400,7 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt) continue; } else if (rcnt != len) { pr_err("read %s with id %lu failed\n", zone->name, i); - return (int)rcnt < 0 ? (int)rcnt : -EIO; + return rcnt < 0 ? rcnt : -EIO; } if (buf->sig != zone->buffer->sig) { @@ -502,7 +502,7 @@ static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone) rcnt = info->read((char *)&tmpbuf, len, zone->off); if (rcnt != len) { pr_debug("read zone %s failed\n", zone->name); - return (int)rcnt < 0 ? (int)rcnt : -EIO; + return rcnt < 0 ? rcnt : -EIO; } if (tmpbuf.sig != zone->buffer->sig) { @@ -544,7 +544,7 @@ static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone) rcnt = info->read(buf, len - start, off + start); if (rcnt != len - start) { pr_err("read zone %s failed\n", zone->name); - ret = (int)rcnt < 0 ? (int)rcnt : -EIO; + ret = rcnt < 0 ? rcnt : -EIO; goto free_oldbuf; } @@ -552,7 +552,7 @@ static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone) rcnt = info->read(buf + len - start, start, off); if (rcnt != start) { pr_err("read zone %s failed\n", zone->name); - ret = (int)rcnt < 0 ? (int)rcnt : -EIO; + ret = rcnt < 0 ? rcnt : -EIO; goto free_oldbuf; } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 09d1307959d0..0427b44bfee5 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2085,7 +2085,8 @@ EXPORT_SYMBOL(__dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ -int dquot_transfer(struct inode *inode, struct iattr *iattr) +int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, + struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; struct dquot *dquot; @@ -2095,8 +2096,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (!dquot_active(inode)) return 0; - if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){ - dquot = dqget(sb, make_kqid_uid(iattr->ia_uid)); + if (i_uid_needs_update(mnt_userns, iattr, inode)) { + kuid_t kuid = from_vfsuid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsuid); + + dquot = dqget(sb, make_kqid_uid(kuid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); @@ -2106,8 +2110,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } transfer_to[USRQUOTA] = dquot; } - if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){ - dquot = dqget(sb, make_kqid_gid(iattr->ia_gid)); + if (i_gid_needs_update(mnt_userns, iattr, inode)) { + kgid_t kgid = from_vfsgid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsgid); + + dquot = dqget(sb, make_kqid_gid(kgid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); @@ -2995,7 +3002,7 @@ static int __init dquot_init(void) pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); - if (register_shrinker(&dqcache_shrinker)) + if (register_shrinker(&dqcache_shrinker, "dquota-cache")) panic("Cannot register dquot shrinker"); return 0; diff --git a/fs/read_write.c b/fs/read_write.c index b1b1cdfee9d3..ea59dd0095c2 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -227,12 +227,6 @@ loff_t noop_llseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(noop_llseek); -loff_t no_llseek(struct file *file, loff_t offset, int whence) -{ - return -ESPIPE; -} -EXPORT_SYMBOL(no_llseek); - loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); @@ -290,14 +284,9 @@ EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { - loff_t (*fn)(struct file *, loff_t, int); - - fn = no_llseek; - if (file->f_mode & FMODE_LSEEK) { - if (file->f_op->llseek) - fn = file->f_op->llseek; - } - return fn(file, offset, whence); + if (!(file->f_mode & FMODE_LSEEK)) + return -ESPIPE; + return file->f_op->llseek(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); @@ -1263,6 +1252,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, count, fl); file_end_write(out.file); } else { + if (out.file->f_flags & O_NONBLOCK) + fl |= SPLICE_F_NONBLOCK; + retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl); } @@ -1397,28 +1389,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(generic_copy_file_range); -static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags) -{ - /* - * Although we now allow filesystems to handle cross sb copy, passing - * a file of the wrong filesystem type to filesystem driver can result - * in an attempt to dereference the wrong type of ->private_data, so - * avoid doing that until we really have a good reason. NFS defines - * several different file_system_type structures, but they all end up - * using the same ->copy_file_range() function pointer. - */ - if (file_out->f_op->copy_file_range && - file_out->f_op->copy_file_range == file_in->f_op->copy_file_range) - return file_out->f_op->copy_file_range(file_in, pos_in, - file_out, pos_out, - len, flags); - - return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, - flags); -} - /* * Performs necessary checks before doing a file copy * @@ -1440,6 +1410,24 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, if (ret) return ret; + /* + * We allow some filesystems to handle cross sb copy, but passing + * a file of the wrong filesystem type to filesystem driver can result + * in an attempt to dereference the wrong type of ->private_data, so + * avoid doing that until we really have a good reason. + * + * nfs and cifs define several different file_system_type structures + * and several different sets of file_operations, but they all end up + * using the same ->copy_file_range() function pointer. + */ + if (file_out->f_op->copy_file_range) { + if (file_in->f_op->copy_file_range != + file_out->f_op->copy_file_range) + return -EXDEV; + } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { + return -EXDEV; + } + /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; @@ -1505,26 +1493,41 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, file_start_write(file_out); /* - * Try cloning first, this is supported by more file systems, and - * more efficient if both clone and copy are supported (e.g. NFS). + * Cloning is supported by more file systems, so we implement copy on + * same sb using clone, but for filesystems where both clone and copy + * are supported (e.g. nfs,cifs), we only call the copy method. */ + if (file_out->f_op->copy_file_range) { + ret = file_out->f_op->copy_file_range(file_in, pos_in, + file_out, pos_out, + len, flags); + goto done; + } + if (file_in->f_op->remap_file_range && file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) { - loff_t cloned; - - cloned = file_in->f_op->remap_file_range(file_in, pos_in, + ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); - if (cloned > 0) { - ret = cloned; + if (ret > 0) goto done; - } } - ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len, - flags); - WARN_ON_ONCE(ret == -EOPNOTSUPP); + /* + * We can get here for same sb copy of filesystems that do not implement + * ->copy_file_range() in case filesystem does not support clone or in + * case filesystem supports clone but rejected the clone request (e.g. + * because it was not block aligned). + * + * In both cases, fall back to kernel copy so we are able to maintain a + * consistent story about which filesystems support copy_file_range() + * and which filesystems do not, that will allow userspace tools to + * make consistent desicions w.r.t using copy_file_range(). + */ + ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, + flags); + done: if (ret > 0) { fsnotify_access(file_in); @@ -1649,7 +1652,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + if ((iocb->ki_flags & IOCB_NOWAIT) && + !((iocb->ki_flags & IOCB_DIRECT) || + (file->f_mode & FMODE_BUF_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0cffe054b78e..b9580a6515ee 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -290,7 +290,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, struct buffer_head *bh; struct item_head *ih, tmp_ih; b_blocknr_t blocknr; - char *p = NULL; + char *p; int chars; int ret; int result; @@ -305,8 +305,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, result = search_for_position_by_key(inode->i_sb, &key, &path); if (result != POSITION_FOUND) { pathrelse(&path); - if (p) - kunmap(bh_result->b_page); if (result == IO_ERROR) return -EIO; /* @@ -352,8 +350,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, } pathrelse(&path); - if (p) - kunmap(bh_result->b_page); return ret; } /* requested data are in direct item(s) */ @@ -363,8 +359,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, * when it is stored in direct item(s) */ pathrelse(&path); - if (p) - kunmap(bh_result->b_page); return -ENOENT; } @@ -396,9 +390,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, * sure we need to. But, this means the item might move if * kmap schedules */ - if (!p) - p = (char *)kmap(bh_result->b_page); - + p = (char *)kmap(bh_result->b_page); p += offset; memset(p, 0, inode->i_sb->s_blocksize); do { @@ -2664,7 +2656,7 @@ static int reiserfs_write_full_page(struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); nr++; } put_bh(bh); @@ -2724,7 +2716,7 @@ fail: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); nr++; } put_bh(bh); @@ -3284,7 +3276,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { error = dquot_initialize(inode); if (error) return error; @@ -3367,7 +3359,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, reiserfs_write_unlock(inode->i_sb); if (error) goto out; - error = dquot_transfer(inode, attr); + error = dquot_transfer(mnt_userns, inode, attr); reiserfs_write_lock(inode->i_sb); if (error) { journal_end(&th); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index d8cc9a366124..94addfcefede 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -650,7 +650,7 @@ static void submit_logged_buffer(struct buffer_head *bh) BUG(); if (!buffer_uptodate(bh)) BUG(); - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); } static void submit_ordered_buffer(struct buffer_head *bh) @@ -660,7 +660,7 @@ static void submit_ordered_buffer(struct buffer_head *bh) clear_buffer_dirty(bh); if (!buffer_uptodate(bh)) BUG(); - submit_bh(REQ_OP_WRITE, 0, bh); + submit_bh(REQ_OP_WRITE, bh); } #define CHUNK_SIZE 32 @@ -868,7 +868,7 @@ loop_next: */ if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { spin_unlock(lock); - ll_rw_block(REQ_OP_WRITE, 0, 1, &bh); + ll_rw_block(REQ_OP_WRITE, 1, &bh); spin_lock(lock); } put_bh(bh); @@ -1054,7 +1054,7 @@ static int flush_commit_list(struct super_block *s, if (tbh) { if (buffer_dirty(tbh)) { depth = reiserfs_write_unlock_nested(s); - ll_rw_block(REQ_OP_WRITE, 0, 1, &tbh); + ll_rw_block(REQ_OP_WRITE, 1, &tbh); reiserfs_write_lock_nested(s, depth); } put_bh(tbh) ; @@ -2240,7 +2240,7 @@ abort_replay: } } /* read in the log blocks, memcpy to the corresponding real block */ - ll_rw_block(REQ_OP_READ, 0, get_desc_trans_len(desc), log_blocks); + ll_rw_block(REQ_OP_READ, get_desc_trans_len(desc), log_blocks); for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(log_blocks[i]); @@ -2342,7 +2342,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, } else bhlist[j++] = bh; } - ll_rw_block(REQ_OP_READ, 0, j, bhlist); + ll_rw_block(REQ_OP_READ, j, bhlist); for (i = 1; i < j; i++) brelse(bhlist[i]); bh = bhlist[0]; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index ef42729216d1..9a293609a022 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -579,7 +579,7 @@ static int search_by_key_reada(struct super_block *s, if (!buffer_uptodate(bh[j])) { if (depth == -1) depth = reiserfs_write_unlock_nested(s); - ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, bh + j); + ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, bh + j); } brelse(bh[j]); } @@ -685,7 +685,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, if (!buffer_uptodate(bh) && depth == -1) depth = reiserfs_write_unlock_nested(sb); - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); wait_on_buffer(bh); if (depth != -1) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index cfb7c44c7366..c88cd2ce0665 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1702,7 +1702,7 @@ static int read_super_block(struct super_block *s, int offset) /* after journal replay, reread all bitmap and super blocks */ static int reread_meta_blocks(struct super_block *s) { - ll_rw_block(REQ_OP_READ, 0, 1, &SB_BUFFER_WITH_SB(s)); + ll_rw_block(REQ_OP_READ, 1, &SB_BUFFER_WITH_SB(s)); wait_on_buffer(SB_BUFFER_WITH_SB(s)); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index bd073836e141..436641369283 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -440,16 +440,9 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n) */ mapping_set_gfp_mask(mapping, GFP_NOFS); page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL); - if (!IS_ERR(page)) { + if (!IS_ERR(page)) kmap(page); - if (PageError(page)) - goto fail; - } return page; - -fail: - reiserfs_put_page(page); - return ERR_PTR(-EIO); } static inline __u32 xattr_hash(const char *msg, int len) diff --git a/fs/remap_range.c b/fs/remap_range.c index e112b5424cdb..654912d06862 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -14,6 +14,7 @@ #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> +#include <linux/dax.h> #include "internal.h" #include <linux/uaccess.h> @@ -71,7 +72,8 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, * Otherwise, make sure the count is also block-aligned, having * already confirmed the starting offsets' block alignment. */ - if (pos_in + count == size_in) { + if (pos_in + count == size_in && + (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) { bcount = ALIGN(size_in, bs) - pos_in; } else { if (!IS_ALIGNED(count, bs)) @@ -148,16 +150,7 @@ static int generic_remap_check_len(struct inode *inode_in, /* Read a page's worth of file data into the page cache. */ static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos) { - struct folio *folio; - - folio = read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file); - if (IS_ERR(folio)) - return folio; - if (!folio_test_uptodate(folio)) { - folio_put(folio); - return ERR_PTR(-EIO); - } - return folio; + return read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file); } /* @@ -271,9 +264,11 @@ out_error: * If there's an error, then the usual negative error code is returned. * Otherwise returns 0 with *len set to the request length. */ -int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) +int +__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *dax_read_ops) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -333,8 +328,18 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; - ret = vfs_dedupe_file_range_compare(file_in, pos_in, - file_out, pos_out, *len, &is_same); + if (*len == 0) + return 0; + + if (!IS_DAX(inode_in)) + ret = vfs_dedupe_file_range_compare(file_in, pos_in, + file_out, pos_out, *len, &is_same); + else if (dax_read_ops) + ret = dax_dedupe_file_range_compare(inode_in, pos_in, + inode_out, pos_out, *len, &is_same, + dax_read_ops); + else + return -EINVAL; if (ret) return ret; if (!is_same) @@ -352,6 +357,14 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, return ret; } + +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + return __generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, NULL); +} EXPORT_SYMBOL(generic_remap_file_range_prep); loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, diff --git a/fs/splice.c b/fs/splice.c index 047b79db8eb5..93a2c9bf6249 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -814,17 +814,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, { struct pipe_inode_info *pipe; long ret, bytes; - umode_t i_mode; size_t len; int i, flags, more; /* - * We require the input being a regular file, as we don't want to - * randomly drop data for eg socket -> socket splicing. Use the - * piped splicing for that! + * We require the input to be seekable, as we don't want to randomly + * drop data for eg socket -> socket splicing. Use the piped splicing + * for that! */ - i_mode = file_inode(in)->i_mode; - if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) + if (unlikely(!(in->f_mode & FMODE_LSEEK))) return -EINVAL; /* diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 7bd9b8b856d0..477c89a519ee 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,9 +5,9 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o decompressor.o +squashfs-y += namei.o super.o symlink.o decompressor.o page_actor.o squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o -squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o +squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 8879d052f96c..833aca92301f 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -34,12 +34,15 @@ static int copy_bio_to_actor(struct bio *bio, struct squashfs_page_actor *actor, int offset, int req_length) { - void *actor_addr = squashfs_first_page(actor); + void *actor_addr; struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); int copied_bytes = 0; int actor_offset = 0; + squashfs_actor_nobuff(actor); + actor_addr = squashfs_first_page(actor); + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) return 0; @@ -49,8 +52,9 @@ static int copy_bio_to_actor(struct bio *bio, bytes_to_copy = min_t(int, bytes_to_copy, req_length - copied_bytes); - memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset, - bytes_to_copy); + if (!IS_ERR(actor_addr)) + memcpy(actor_addr + actor_offset, bvec_virt(bvec) + + offset, bytes_to_copy); actor_offset += bytes_to_copy; copied_bytes += bytes_to_copy; diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 1b9ccfd0aa51..19ab60834389 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -20,6 +20,7 @@ struct squashfs_decompressor { struct bio *, int, int, struct squashfs_page_actor *); int id; char *name; + int alloc_buffer; int supported; }; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index a8e495d8eb86..98e64fec75b7 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -39,6 +39,7 @@ #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "page_actor.h" /* * Locate cache slot in range [offset, index] for specified inode. If @@ -454,7 +455,7 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) int expected = index == file_end ? (i_size_read(inode) & (msblk->block_size - 1)) : msblk->block_size; - int res; + int res = 0; void *pageaddr; TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", @@ -467,14 +468,15 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) if (index < file_end || squashfs_i(inode)->fragment_block == SQUASHFS_INVALID_BLK) { u64 block = 0; - int bsize = read_blocklist(inode, index, &block); - if (bsize < 0) + + res = read_blocklist(inode, index, &block); + if (res < 0) goto error_out; - if (bsize == 0) + if (res == 0) res = squashfs_readpage_sparse(page, expected); else - res = squashfs_readpage_block(page, block, bsize, expected); + res = squashfs_readpage_block(page, block, res, expected); } else res = squashfs_readpage_fragment(page, expected); @@ -488,14 +490,144 @@ out: memset(pageaddr, 0, PAGE_SIZE); kunmap_atomic(pageaddr); flush_dcache_page(page); - if (!PageError(page)) + if (res == 0) SetPageUptodate(page); unlock_page(page); - return 0; + return res; } +static int squashfs_readahead_fragment(struct page **page, + unsigned int pages, unsigned int expected) +{ + struct inode *inode = page[0]->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; + + if (buffer->error) + goto out; + + expected += squashfs_i(inode)->fragment_offset; + + for (n = 0; n < pages; n++) { + unsigned int base = (page[n]->index & mask) << PAGE_SHIFT; + unsigned int offset = base + squashfs_i(inode)->fragment_offset; + + if (expected > offset) { + unsigned int avail = min_t(unsigned int, expected - + offset, PAGE_SIZE); + + squashfs_fill_page(page[n], buffer, offset, avail); + } + + unlock_page(page[n]); + put_page(page[n]); + } + +out: + squashfs_cache_put(buffer); + return buffer->error; +} + +static void squashfs_readahead(struct readahead_control *ractl) +{ + struct inode *inode = ractl->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + size_t mask = (1UL << msblk->block_log) - 1; + unsigned short shift = msblk->block_log - PAGE_SHIFT; + loff_t start = readahead_pos(ractl) & ~mask; + size_t len = readahead_length(ractl) + readahead_pos(ractl) - start; + struct squashfs_page_actor *actor; + unsigned int nr_pages = 0; + struct page **pages; + int i, file_end = i_size_read(inode) >> msblk->block_log; + unsigned int max_pages = 1UL << shift; + + readahead_expand(ractl, start, (len | mask) + 1); + + pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL); + if (!pages) + return; + + for (;;) { + pgoff_t index; + int res, bsize; + u64 block = 0; + unsigned int expected; + + nr_pages = __readahead_batch(ractl, pages, max_pages); + if (!nr_pages) + break; + + if (readahead_pos(ractl) >= i_size_read(inode)) + goto skip_pages; + + index = pages[0]->index >> shift; + if ((pages[nr_pages - 1]->index >> shift) != index) + goto skip_pages; + + expected = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + + if (index == file_end && squashfs_i(inode)->fragment_block != + SQUASHFS_INVALID_BLK) { + res = squashfs_readahead_fragment(pages, nr_pages, + expected); + if (res) + goto skip_pages; + continue; + } + + bsize = read_blocklist(inode, index, &block); + if (bsize == 0) + goto skip_pages; + + actor = squashfs_page_actor_init_special(msblk, pages, nr_pages, + expected); + if (!actor) + goto skip_pages; + + res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + + kfree(actor); + + if (res == expected) { + int bytes; + + /* Last page (if present) may have trailing bytes not filled */ + bytes = res % PAGE_SIZE; + if (pages[nr_pages - 1]->index == file_end && bytes) + memzero_page(pages[nr_pages - 1], bytes, + PAGE_SIZE - bytes); + + for (i = 0; i < nr_pages; i++) { + flush_dcache_page(pages[i]); + SetPageUptodate(pages[i]); + } + } + + for (i = 0; i < nr_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + } + + kfree(pages); + return; + +skip_pages: + for (i = 0; i < nr_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + kfree(pages); +} const struct address_space_operations squashfs_aops = { - .read_folio = squashfs_read_folio + .read_folio = squashfs_read_folio, + .readahead = squashfs_readahead }; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index a4894cc59447..be4b12d31e0c 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -18,9 +18,6 @@ #include "squashfs.h" #include "page_actor.h" -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page, int bytes); - /* Read separately compressed datablock directly into page cache */ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int expected) @@ -33,7 +30,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; int start_index = target_page->index & ~mask; int end_index = start_index | mask; - int i, n, pages, missing_pages, bytes, res = -ENOMEM; + int i, n, pages, bytes, res = -ENOMEM; struct page **page; struct squashfs_page_actor *actor; void *pageaddr; @@ -47,50 +44,38 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, if (page == NULL) return res; - /* - * Create a "page actor" which will kmap and kunmap the - * page cache pages appropriately within the decompressor - */ - actor = squashfs_page_actor_init_special(page, pages, 0); - if (actor == NULL) - goto out; - /* Try to grab all the pages covered by the Squashfs block */ - for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { + for (i = 0, n = start_index; n <= end_index; n++) { page[i] = (n == target_page->index) ? target_page : grab_cache_page_nowait(target_page->mapping, n); - if (page[i] == NULL) { - missing_pages++; + if (page[i] == NULL) continue; - } if (PageUptodate(page[i])) { unlock_page(page[i]); put_page(page[i]); - page[i] = NULL; - missing_pages++; + continue; } + + i++; } - if (missing_pages) { - /* - * Couldn't get one or more pages, this page has either - * been VM reclaimed, but others are still in the page cache - * and uptodate, or we're racing with another thread in - * squashfs_readpage also trying to grab them. Fall back to - * using an intermediate buffer. - */ - res = squashfs_read_cache(target_page, block, bsize, pages, - page, expected); - if (res < 0) - goto mark_errored; + pages = i; + /* + * Create a "page actor" which will kmap and kunmap the + * page cache pages appropriately within the decompressor + */ + actor = squashfs_page_actor_init_special(msblk, page, pages, expected); + if (actor == NULL) goto out; - } /* Decompress directly into the page cache buffers */ res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + + kfree(actor); + if (res < 0) goto mark_errored; @@ -99,12 +84,12 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, goto mark_errored; } - /* Last page may have trailing bytes not filled */ + /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (bytes) { - pageaddr = kmap_atomic(page[pages - 1]); + if (page[pages - 1]->index == end_index && bytes) { + pageaddr = kmap_local_page(page[pages - 1]); memset(pageaddr + bytes, 0, PAGE_SIZE - bytes); - kunmap_atomic(pageaddr); + kunmap_local(pageaddr); } /* Mark pages as uptodate, unlock and release */ @@ -116,7 +101,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, put_page(page[i]); } - kfree(actor); kfree(page); return 0; @@ -135,40 +119,6 @@ mark_errored: } out: - kfree(actor); kfree(page); return res; } - - -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page, int bytes) -{ - struct inode *i = target_page->mapping->host; - struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, - block, bsize); - int res = buffer->error, n, offset = 0; - - if (res) { - ERROR("Unable to read page, block %llx, size %x\n", block, - bsize); - goto out; - } - - for (n = 0; n < pages && bytes > 0; n++, - bytes -= PAGE_SIZE, offset += PAGE_SIZE) { - int avail = min_t(int, bytes, PAGE_SIZE); - - if (page[n] == NULL) - continue; - - squashfs_fill_page(page[n], buffer, offset, avail); - unlock_page(page[n]); - if (page[n] != target_page) - put_page(page[n]); - } - -out: - squashfs_cache_put(buffer); - return res; -} diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index b685b6238316..49797729f143 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -119,10 +119,12 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, buff = stream->output; while (data) { if (bytes <= PAGE_SIZE) { - memcpy(data, buff, bytes); + if (!IS_ERR(data)) + memcpy(data, buff, bytes); break; } - memcpy(data, buff, PAGE_SIZE); + if (!IS_ERR(data)) + memcpy(data, buff, PAGE_SIZE); buff += PAGE_SIZE; bytes -= PAGE_SIZE; data = squashfs_next_page(output); @@ -139,5 +141,6 @@ const struct squashfs_decompressor squashfs_lz4_comp_ops = { .decompress = lz4_uncompress, .id = LZ4_COMPRESSION, .name = "lz4", + .alloc_buffer = 0, .supported = 1 }; diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index cb510a631968..d216aeefa865 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -93,10 +93,12 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, buff = stream->output; while (data) { if (bytes <= PAGE_SIZE) { - memcpy(data, buff, bytes); + if (!IS_ERR(data)) + memcpy(data, buff, bytes); break; } else { - memcpy(data, buff, PAGE_SIZE); + if (!IS_ERR(data)) + memcpy(data, buff, PAGE_SIZE); buff += PAGE_SIZE; bytes -= PAGE_SIZE; data = squashfs_next_page(output); @@ -116,5 +118,6 @@ const struct squashfs_decompressor squashfs_lzo_comp_ops = { .decompress = lzo_uncompress, .id = LZO_COMPRESSION, .name = "lzo", + .alloc_buffer = 0, .supported = 1 }; diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 520d323a99ce..b23b780d8f42 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -7,6 +7,8 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include "squashfs_fs_sb.h" +#include "decompressor.h" #include "page_actor.h" /* @@ -57,29 +59,62 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, } /* Implementation of page_actor for decompressing directly into page cache. */ +static void *handle_next_page(struct squashfs_page_actor *actor) +{ + int max_pages = (actor->length + PAGE_SIZE - 1) >> PAGE_SHIFT; + + if (actor->returned_pages == max_pages) + return NULL; + + if ((actor->next_page == actor->pages) || + (actor->next_index != actor->page[actor->next_page]->index)) { + if (actor->alloc_buffer) { + void *tmp_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); + + if (tmp_buffer) { + actor->tmp_buffer = tmp_buffer; + actor->next_index++; + actor->returned_pages++; + return tmp_buffer; + } + } + + actor->next_index++; + actor->returned_pages++; + return ERR_PTR(-ENOMEM); + } + + actor->next_index++; + actor->returned_pages++; + return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]); +} + static void *direct_first_page(struct squashfs_page_actor *actor) { - actor->next_page = 1; - return actor->pageaddr = kmap_atomic(actor->page[0]); + return handle_next_page(actor); } static void *direct_next_page(struct squashfs_page_actor *actor) { if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); + kunmap_local(actor->pageaddr); + + kfree(actor->tmp_buffer); + actor->pageaddr = actor->tmp_buffer = NULL; - return actor->pageaddr = actor->next_page == actor->pages ? NULL : - kmap_atomic(actor->page[actor->next_page++]); + return handle_next_page(actor); } static void direct_finish_page(struct squashfs_page_actor *actor) { if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); + kunmap_local(actor->pageaddr); + + kfree(actor->tmp_buffer); } -struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, - int pages, int length) +struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk, + struct page **page, int pages, int length) { struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); @@ -90,7 +125,11 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, actor->page = page; actor->pages = pages; actor->next_page = 0; + actor->returned_pages = 0; + actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1); actor->pageaddr = NULL; + actor->tmp_buffer = NULL; + actor->alloc_buffer = msblk->decompressor->alloc_buffer; actor->squashfs_first_page = direct_first_page; actor->squashfs_next_page = direct_next_page; actor->squashfs_finish_page = direct_finish_page; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 2e3073ace009..24841d28bc0f 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -6,63 +6,29 @@ * Phillip Lougher <phillip@squashfs.org.uk> */ -#ifndef CONFIG_SQUASHFS_FILE_DIRECT -struct squashfs_page_actor { - void **page; - int pages; - int length; - int next_page; -}; - -static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, - int pages, int length) -{ - struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); - - if (actor == NULL) - return NULL; - - actor->length = length ? : pages * PAGE_SIZE; - actor->page = page; - actor->pages = pages; - actor->next_page = 0; - return actor; -} - -static inline void *squashfs_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->page[0]; -} - -static inline void *squashfs_next_page(struct squashfs_page_actor *actor) -{ - return actor->next_page == actor->pages ? NULL : - actor->page[actor->next_page++]; -} - -static inline void squashfs_finish_page(struct squashfs_page_actor *actor) -{ - /* empty */ -} -#else struct squashfs_page_actor { union { void **buffer; struct page **page; }; void *pageaddr; + void *tmp_buffer; void *(*squashfs_first_page)(struct squashfs_page_actor *); void *(*squashfs_next_page)(struct squashfs_page_actor *); void (*squashfs_finish_page)(struct squashfs_page_actor *); int pages; int length; int next_page; + int alloc_buffer; + int returned_pages; + pgoff_t next_index; }; -extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int); -extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page - **, int, int); +extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, + int pages, int length); +extern struct squashfs_page_actor *squashfs_page_actor_init_special( + struct squashfs_sb_info *msblk, + struct page **page, int pages, int length); static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { return actor->squashfs_first_page(actor); @@ -75,5 +41,8 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor) { actor->squashfs_finish_page(actor); } -#endif +static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) +{ + actor->alloc_buffer = 0; +} #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 6d594ba2ed28..32565dafa7f3 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -29,7 +29,6 @@ #include <linux/module.h> #include <linux/magic.h> #include <linux/xattr.h> -#include <linux/backing-dev.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -113,24 +112,6 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( return decompressor; } -static int squashfs_bdi_init(struct super_block *sb) -{ - int err; - unsigned int major = MAJOR(sb->s_dev); - unsigned int minor = MINOR(sb->s_dev); - - bdi_put(sb->s_bdi); - sb->s_bdi = &noop_backing_dev_info; - - err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); - if (err) - return err; - - sb->s_bdi->ra_pages = 0; - sb->s_bdi->io_pages = 0; - - return 0; -} static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { @@ -146,20 +127,6 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Entered squashfs_fill_superblock\n"); - /* - * squashfs provides 'backing_dev_info' in order to disable read-ahead. For - * squashfs, I/O is not deferred, it is done immediately in read_folio, - * which means the user would always have to wait their own I/O. So the effect - * of readahead is very weak for squashfs. squashfs_bdi_init will set - * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for - * squashfs. - */ - err = squashfs_bdi_init(sb); - if (err) { - errorf(fc, "squashfs init bdi failed"); - return err; - } - sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 68f6d09bb3a2..6c49481a2f8c 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -131,6 +131,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->buf.out_pos = 0; stream->buf.out_size = PAGE_SIZE; stream->buf.out = squashfs_first_page(output); + if (IS_ERR(stream->buf.out)) { + error = PTR_ERR(stream->buf.out); + goto finish; + } for (;;) { enum xz_ret xz_err; @@ -156,7 +160,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->buf.out_pos == stream->buf.out_size) { stream->buf.out = squashfs_next_page(output); - if (stream->buf.out != NULL) { + if (IS_ERR(stream->buf.out)) { + error = PTR_ERR(stream->buf.out); + break; + } else if (stream->buf.out != NULL) { stream->buf.out_pos = 0; total += PAGE_SIZE; } @@ -171,6 +178,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } } +finish: squashfs_finish_page(output); return error ? error : total + stream->buf.out_pos; @@ -183,5 +191,6 @@ const struct squashfs_decompressor squashfs_xz_comp_ops = { .decompress = squashfs_xz_uncompress, .id = XZ_COMPRESSION, .name = "xz", + .alloc_buffer = 1, .supported = 1 }; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index a20e9042146b..cbb7afe7bc46 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -62,6 +62,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->next_out = squashfs_first_page(output); stream->avail_in = 0; + if (IS_ERR(stream->next_out)) { + error = PTR_ERR(stream->next_out); + goto finish; + } + for (;;) { int zlib_err; @@ -85,7 +90,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->avail_out == 0) { stream->next_out = squashfs_next_page(output); - if (stream->next_out != NULL) + if (IS_ERR(stream->next_out)) { + error = PTR_ERR(stream->next_out); + break; + } else if (stream->next_out != NULL) stream->avail_out = PAGE_SIZE; } @@ -107,6 +115,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, } } +finish: squashfs_finish_page(output); if (!error) @@ -122,6 +131,7 @@ const struct squashfs_decompressor squashfs_zlib_comp_ops = { .decompress = zlib_uncompress, .id = ZLIB_COMPRESSION, .name = "zlib", + .alloc_buffer = 1, .supported = 1 }; diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index c40445dbf38c..0e407c4d8b3b 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -80,6 +80,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, out_buf.size = PAGE_SIZE; out_buf.dst = squashfs_first_page(output); + if (IS_ERR(out_buf.dst)) { + error = PTR_ERR(out_buf.dst); + goto finish; + } for (;;) { size_t zstd_err; @@ -104,7 +108,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, if (out_buf.pos == out_buf.size) { out_buf.dst = squashfs_next_page(output); - if (out_buf.dst == NULL) { + if (IS_ERR(out_buf.dst)) { + error = PTR_ERR(out_buf.dst); + break; + } else if (out_buf.dst == NULL) { /* Shouldn't run out of pages * before stream is done. */ @@ -129,6 +136,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, } } +finish: + squashfs_finish_page(output); return error ? error : total_out; @@ -140,5 +149,6 @@ const struct squashfs_decompressor squashfs_zstd_comp_ops = { .decompress = zstd_uncompress, .id = ZSTD_COMPRESSION, .name = "zstd", + .alloc_buffer = 1, .supported = 1 }; diff --git a/fs/super.c b/fs/super.c index 60f57c7bc0a6..734ed584a946 100644 --- a/fs/super.c +++ b/fs/super.c @@ -265,7 +265,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; - if (prealloc_shrinker(&s->s_shrink)) + if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name)) goto fail; if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) goto fail; @@ -423,6 +423,35 @@ bool trylock_super(struct super_block *sb) } /** + * retire_super - prevents superblock from being reused + * @sb: superblock to retire + * + * The function marks superblock to be ignored in superblock test, which + * prevents it from being reused for any new mounts. If the superblock has + * a private bdi, it also unregisters it, but doesn't reduce the refcount + * of the superblock to prevent potential races. The refcount is reduced + * by generic_shutdown_super(). The function can not be called + * concurrently with generic_shutdown_super(). It is safe to call the + * function multiple times, subsequent calls have no effect. + * + * The marker will affect the re-use only for block-device-based + * superblocks. Other superblocks will still get marked if this function + * is used, but that will not affect their reusability. + */ +void retire_super(struct super_block *sb) +{ + WARN_ON(!sb->s_bdev); + down_write(&sb->s_umount); + if (sb->s_iflags & SB_I_PERSB_BDI) { + bdi_unregister(sb->s_bdi); + sb->s_iflags &= ~SB_I_PERSB_BDI; + } + sb->s_iflags |= SB_I_RETIRED; + up_write(&sb->s_umount); +} +EXPORT_SYMBOL(retire_super); + +/** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill * @@ -1216,7 +1245,7 @@ static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc) static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc) { - return s->s_bdev == fc->sget_key; + return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key; } /** @@ -1288,6 +1317,8 @@ int get_tree_bdev(struct fs_context *fc, } else { s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", + fc->fs_type->name, s->s_id); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, fc); if (error) { @@ -1307,7 +1338,7 @@ EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { - return (void *)s->s_bdev == data; + return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data; } struct dentry *mount_bdev(struct file_system_type *fs_type, @@ -1363,6 +1394,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } else { s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", + fs_type->name, s->s_id); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 04ced154960f..f2353dd676ef 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1461,29 +1461,6 @@ static bool ubifs_dirty_folio(struct address_space *mapping, return ret; } -#ifdef CONFIG_MIGRATION -static int ubifs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) -{ - int rc; - - rc = migrate_page_move_mapping(mapping, newpage, page, 0); - if (rc != MIGRATEPAGE_SUCCESS) - return rc; - - if (PagePrivate(page)) { - detach_page_private(page); - attach_page_private(newpage, (void *)1); - } - - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); - return MIGRATEPAGE_SUCCESS; -} -#endif - static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags) { struct inode *inode = folio->mapping->host; @@ -1649,10 +1626,8 @@ const struct address_space_operations ubifs_file_address_operations = { .write_end = ubifs_write_end, .invalidate_folio = ubifs_invalidate_folio, .dirty_folio = ubifs_dirty_folio, -#ifdef CONFIG_MIGRATION - .migratepage = ubifs_migrate_page, -#endif - .release_folio = ubifs_release_folio, + .migrate_folio = filemap_migrate_folio, + .release_folio = ubifs_release_folio, }; const struct inode_operations ubifs_file_inode_operations = { diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 0978d01b0ea4..d0c9a09988bc 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2430,7 +2430,7 @@ static int __init ubifs_init(void) if (!ubifs_inode_slab) return -ENOMEM; - err = register_shrinker(&ubifs_shrinker_info); + err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab"); if (err) goto out_slab; diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 42e3e551fa4c..cad3772f9dbe 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -130,7 +130,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) brelse(tmp); } if (num) { - ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha); + ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 73720320f0ab..a2adf6293093 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -89,7 +89,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, brelse(tmp); } if (num) { - ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha); + ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index edc88716751a..8d06daed549f 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1214,7 +1214,7 @@ struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, if (buffer_uptodate(bh)) return bh; - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 075d3d9114c8..bd810d8239f2 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -296,7 +296,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, if (!buffer_mapped(bh)) map_bh(bh, inode->i_sb, oldb + pos); if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, 0, 1, &bh); + ll_rw_block(REQ_OP_READ, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ufs_error(inode->i_sb, __func__, diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index b721d0bda5e5..391efaf1d528 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -193,7 +193,7 @@ static struct page *ufs_get_page(struct inode *dir, unsigned long n) if (!IS_ERR(page)) { kmap(page); if (unlikely(!PageChecked(page))) { - if (PageError(page) || !ufs_check_page(page)) + if (!ufs_check_page(page)) goto fail; } } diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 4fa633f84274..08ddf41eaaad 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -264,17 +264,6 @@ struct page *ufs_get_locked_page(struct address_space *mapping, put_page(page); return NULL; } - - if (!PageUptodate(page) || PageError(page)) { - unlock_page(page); - put_page(page); - - printk(KERN_ERR "ufs_change_blocknr: " - "can not read page: ino %lu, index: %lu\n", - inode->i_ino, index); - - return ERR_PTR(-EIO); - } } if (!page_has_buffers(page)) create_empty_buffers(page, 1 << inode->i_blkbits, 0); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e943370107d0..1c44bf75f916 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -192,17 +192,19 @@ static inline void msg_init(struct uffd_msg *msg) } static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned long real_address, unsigned int flags, unsigned long reason, unsigned int features) { struct uffd_msg msg; + msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; - if (!(features & UFFD_FEATURE_EXACT_ADDRESS)) - address &= PAGE_MASK; - msg.arg.pagefault.address = address; + msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ? + real_address : address; + /* * These flags indicate why the userfault occurred: * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. @@ -488,8 +490,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason, - ctx->features); + uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, + reason, ctx->features); uwq.ctx = ctx; uwq.waken = false; @@ -1923,10 +1925,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - features = uffdio_api.features; - ret = -EINVAL; - if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) - goto err_out; + /* Ignore unsupported features (userspace built against newer kernel) */ + features = uffdio_api.features & UFFD_API_FEATURES; ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) goto err_out; diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index 54598cd80145..aad1f1d998b9 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -14,11 +14,11 @@ config FS_VERITY help This option enables fs-verity. fs-verity is the dm-verity mechanism implemented at the file level. On supported - filesystems (currently EXT4 and F2FS), userspace can use an - ioctl to enable verity for a file, which causes the filesystem - to build a Merkle tree for the file. The filesystem will then - transparently verify any data read from the file against the - Merkle tree. The file is also made read-only. + filesystems (currently ext4, f2fs, and btrfs), userspace can + use an ioctl to enable verity for a file, which causes the + filesystem to build a Merkle tree for the file. The filesystem + will then transparently verify any data read from the file + against the Merkle tree. The file is also made read-only. This serves as an integrity check, but the availability of the Merkle tree root hash also allows efficiently supporting diff --git a/fs/xattr.c b/fs/xattr.c index e8dd03e4561e..a1f4998bc6be 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -282,9 +282,15 @@ out: } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); +static inline bool is_posix_acl_xattr(const char *name) +{ + return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); +} + int vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, - const char *name, const void *value, size_t size, int flags) + const char *name, void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; @@ -292,12 +298,16 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(mnt_userns, dentry, &value, size); + error = cap_convert_nscap(mnt_userns, dentry, + (const void **)&value, size); if (error < 0) return error; size = error; } + if (size && is_posix_acl_xattr(name)) + posix_acl_setxattr_idmapped_mnt(mnt_userns, inode, value, size); + retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, @@ -431,7 +441,10 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, return ret; } nolsm: - return __vfs_getxattr(dentry, inode, name, value, size); + error = __vfs_getxattr(dentry, inode, name, value, size); + if (error > 0 && is_posix_acl_xattr(name)) + posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size); + return error; } EXPORT_SYMBOL_GPL(vfs_getxattr); @@ -577,8 +590,7 @@ static void setxattr_convert(struct user_namespace *mnt_userns, if (ctx->size && ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) - posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), - ctx->kvalue, ctx->size); + posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size); } int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, @@ -695,8 +707,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d), - ctx->kvalue, error); + posix_acl_fix_xattr_to_user(ctx->kvalue, error); if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index b056cfc6398e..03135a1c31b6 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -106,6 +106,7 @@ xfs-y += xfs_log.o \ xfs_icreate_item.o \ xfs_inode_item.o \ xfs_inode_item_recover.o \ + xfs_iunlink_item.o \ xfs_refcount_item.o \ xfs_rmap_item.o \ xfs_log_recover.o \ @@ -129,6 +130,11 @@ xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o +# notify failure +ifeq ($(CONFIG_MEMORY_FAILURE),y) +xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o +endif + # online scrub/repair ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 3e920cf1b454..bb0c700afe3c 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -120,18 +120,18 @@ xfs_initialize_perag_data( for (index = 0; index < agcount; index++) { /* - * read the agf, then the agi. This gets us - * all the information we need and populates the - * per-ag structures for us. + * Read the AGF and AGI buffers to populate the per-ag + * structures for us. */ - error = xfs_alloc_pagf_init(mp, NULL, index, 0); - if (error) + pag = xfs_perag_get(mp, index); + error = xfs_alloc_read_agf(pag, NULL, 0, NULL); + if (!error) + error = xfs_ialloc_read_agi(pag, NULL, NULL); + if (error) { + xfs_perag_put(pag); return error; + } - error = xfs_ialloc_pagi_init(mp, NULL, index); - if (error) - return error; - pag = xfs_perag_get(mp, index); ifree += pag->pagi_freecount; ialloc += pag->pagi_count; bfree += pag->pagf_freeblks; @@ -194,17 +194,76 @@ xfs_free_perag( XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); } } +/* Find the size of the AG, in blocks. */ +static xfs_agblock_t +__xfs_ag_block_count( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agnumber_t agcount, + xfs_rfsblock_t dblocks) +{ + ASSERT(agno < agcount); + + if (agno < agcount - 1) + return mp->m_sb.sb_agblocks; + return dblocks - (agno * mp->m_sb.sb_agblocks); +} + +xfs_agblock_t +xfs_ag_block_count( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + return __xfs_ag_block_count(mp, agno, mp->m_sb.sb_agcount, + mp->m_sb.sb_dblocks); +} + +/* Calculate the first and last possible inode number in an AG. */ +static void +__xfs_agino_range( + struct xfs_mount *mp, + xfs_agblock_t eoag, + xfs_agino_t *first, + xfs_agino_t *last) +{ + xfs_agblock_t bno; + + /* + * Calculate the first inode, which will be in the first + * cluster-aligned block after the AGFL. + */ + bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align); + *first = XFS_AGB_TO_AGINO(mp, bno); + + /* + * Calculate the last inode, which will be at the end of the + * last (aligned) cluster that can be allocated in the AG. + */ + bno = round_down(eoag, M_IGEO(mp)->cluster_align); + *last = XFS_AGB_TO_AGINO(mp, bno) - 1; +} + +void +xfs_agino_range( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t *first, + xfs_agino_t *last) +{ + return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last); +} + int xfs_initialize_perag( struct xfs_mount *mp, xfs_agnumber_t agcount, + xfs_rfsblock_t dblocks, xfs_agnumber_t *maxagi) { struct xfs_perag *pag; @@ -263,13 +322,18 @@ xfs_initialize_perag( if (error) goto out_remove_pag; - error = xfs_iunlink_init(pag); - if (error) - goto out_hash_destroy; - /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; + + /* + * Pre-calculated geometry + */ + pag->block_count = __xfs_ag_block_count(mp, index, agcount, + dblocks); + pag->min_block = XFS_AGFL_BLOCK(mp); + __xfs_agino_range(mp, pag->block_count, &pag->agino_min, + &pag->agino_max); } index = xfs_set_inode_alloc(mp, agcount); @@ -280,8 +344,6 @@ xfs_initialize_perag( mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; -out_hash_destroy: - xfs_buf_hash_destroy(pag); out_remove_pag: radix_tree_delete(&mp->m_perag_tree, index); out_free_pag: @@ -293,7 +355,6 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); - xfs_iunlink_destroy(pag); kmem_free(pag); } return error; @@ -321,12 +382,6 @@ xfs_get_aghdr_buf( return 0; } -static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id) -{ - return mp->m_sb.sb_logstart > 0 && - id->agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); -} - /* * Generic btree root block init function */ @@ -352,7 +407,7 @@ xfs_freesp_init_recs( arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); - if (is_log_ag(mp, id)) { + if (xfs_ag_contains_log(mp, id->agno)) { struct xfs_alloc_rec *nrec; xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart); @@ -479,7 +534,7 @@ xfs_rmaproot_init( } /* account for the log space */ - if (is_log_ag(mp, id)) { + if (xfs_ag_contains_log(mp, id->agno)) { rrec = XFS_RMAP_REC_ADDR(block, be16_to_cpu(block->bb_numrecs) + 1); rrec->rm_startblock = cpu_to_be32( @@ -550,7 +605,7 @@ xfs_agfblock_init( agf->agf_refcount_blocks = cpu_to_be32(1); } - if (is_log_ag(mp, id)) { + if (xfs_ag_contains_log(mp, id->agno)) { int64_t logblocks = mp->m_sb.sb_logblocks; be32_add_cpu(&agf->agf_freeblks, -logblocks); @@ -761,11 +816,11 @@ xfs_ag_init_headers( int xfs_ag_shrink_space( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans **tpp, - xfs_agnumber_t agno, xfs_extlen_t delta) { + struct xfs_mount *mp = pag->pag_mount; struct xfs_alloc_arg args = { .tp = *tpp, .mp = mp, @@ -782,14 +837,14 @@ xfs_ag_shrink_space( xfs_agblock_t aglen; int error, err2; - ASSERT(agno == mp->m_sb.sb_agcount - 1); - error = xfs_ialloc_read_agi(mp, *tpp, agno, &agibp); + ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1); + error = xfs_ialloc_read_agi(pag, *tpp, &agibp); if (error) return error; agi = agibp->b_addr; - error = xfs_alloc_read_agf(mp, *tpp, agno, 0, &agfbp); + error = xfs_alloc_read_agf(pag, *tpp, 0, &agfbp); if (error) return error; @@ -801,13 +856,14 @@ xfs_ag_shrink_space( if (delta >= aglen) return -EINVAL; - args.fsbno = XFS_AGB_TO_FSB(mp, agno, aglen - delta); + args.fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta); /* * Make sure that the last inode cluster cannot overlap with the new * end of the AG, even if it's sparse. */ - error = xfs_ialloc_check_shrink(*tpp, agno, agibp, aglen - delta); + error = xfs_ialloc_check_shrink(*tpp, pag->pag_agno, agibp, + aglen - delta); if (error) return error; @@ -815,7 +871,7 @@ xfs_ag_shrink_space( * Disable perag reservations so it doesn't cause the allocation request * to fail. We'll reestablish reservation before we return. */ - error = xfs_ag_resv_free(agibp->b_pag); + error = xfs_ag_resv_free(pag); if (error) return error; @@ -844,7 +900,7 @@ xfs_ag_shrink_space( be32_add_cpu(&agi->agi_length, -delta); be32_add_cpu(&agf->agf_length, -delta); - err2 = xfs_ag_resv_init(agibp->b_pag, *tpp); + err2 = xfs_ag_resv_init(pag, *tpp); if (err2) { be32_add_cpu(&agi->agi_length, delta); be32_add_cpu(&agf->agf_length, delta); @@ -868,8 +924,9 @@ xfs_ag_shrink_space( xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); return 0; + resv_init_out: - err2 = xfs_ag_resv_init(agibp->b_pag, *tpp); + err2 = xfs_ag_resv_init(pag, *tpp); if (!err2) return error; resv_err: @@ -883,9 +940,8 @@ resv_err: */ int xfs_ag_extend_space( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, - struct aghdr_init_data *id, xfs_extlen_t len) { struct xfs_buf *bp; @@ -893,23 +949,20 @@ xfs_ag_extend_space( struct xfs_agf *agf; int error; - /* - * Change the agi length. - */ - error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp); + ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1); + + error = xfs_ialloc_read_agi(pag, tp, &bp); if (error) return error; agi = bp->b_addr; be32_add_cpu(&agi->agi_length, len); - ASSERT(id->agno == mp->m_sb.sb_agcount - 1 || - be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); /* * Change agf length. */ - error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp); + error = xfs_alloc_read_agf(pag, tp, 0, &bp); if (error) return error; @@ -924,49 +977,49 @@ xfs_ag_extend_space( * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that * this doesn't actually exist in the rmap btree. */ - error = xfs_rmap_free(tp, bp, bp->b_pag, - be32_to_cpu(agf->agf_length) - len, + error = xfs_rmap_free(tp, bp, pag, be32_to_cpu(agf->agf_length) - len, len, &XFS_RMAP_OINFO_SKIP_UPDATE); if (error) return error; - return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno, + error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno, be32_to_cpu(agf->agf_length) - len), len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE); + if (error) + return error; + + /* Update perag geometry */ + pag->block_count = be32_to_cpu(agf->agf_length); + __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, + &pag->agino_max); + return 0; } /* Retrieve AG geometry. */ int xfs_ag_get_geometry( - struct xfs_mount *mp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_ag_geometry *ageo) { struct xfs_buf *agi_bp; struct xfs_buf *agf_bp; struct xfs_agi *agi; struct xfs_agf *agf; - struct xfs_perag *pag; unsigned int freeblks; int error; - if (agno >= mp->m_sb.sb_agcount) - return -EINVAL; - /* Lock the AG headers. */ - error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp); + error = xfs_ialloc_read_agi(pag, NULL, &agi_bp); if (error) return error; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp); + error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp); if (error) goto out_agi; - pag = agi_bp->b_pag; - /* Fill out form. */ memset(ageo, 0, sizeof(*ageo)); - ageo->ag_number = agno; + ageo->ag_number = pag->pag_agno; agi = agi_bp->b_addr; ageo->ag_icount = be32_to_cpu(agi->agi_count); diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index e411d51c2589..517a138faa66 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -67,6 +67,12 @@ struct xfs_perag { /* for rcu-safe freeing */ struct rcu_head rcu_head; + /* Precalculated geometry info */ + xfs_agblock_t block_count; + xfs_agblock_t min_block; + xfs_agino_t agino_min; + xfs_agino_t agino_max; + #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ @@ -97,17 +103,11 @@ struct xfs_perag { /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; - /* - * Unlinked inode information. This incore information reflects - * data stored in the AGI, so callers must hold the AGI buffer lock - * or have some other means to control concurrency. - */ - struct rhashtable pagi_unlinked_hash; #endif /* __KERNEL__ */ }; int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, - xfs_agnumber_t *maxagi); + xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); void xfs_free_perag(struct xfs_mount *mp); @@ -117,6 +117,56 @@ struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, void xfs_perag_put(struct xfs_perag *pag); /* + * Per-ag geometry infomation and validation + */ +xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); +void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t *first, xfs_agino_t *last); + +static inline bool +xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) +{ + if (agbno >= pag->block_count) + return false; + if (agbno <= pag->min_block) + return false; + return true; +} + +/* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata. + */ +static inline bool +xfs_verify_agino(struct xfs_perag *pag, xfs_agino_t agino) +{ + if (agino < pag->agino_min) + return false; + if (agino > pag->agino_max) + return false; + return true; +} + +/* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata, or is NULLAGINO. + */ +static inline bool +xfs_verify_agino_or_null(struct xfs_perag *pag, xfs_agino_t agino) +{ + if (agino == NULLAGINO) + return true; + return xfs_verify_agino(pag, agino); +} + +static inline bool +xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno) +{ + return mp->m_sb.sb_logstart > 0 && + agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); +} + +/* * Perag iteration APIs */ static inline struct xfs_perag * @@ -168,11 +218,10 @@ struct aghdr_init_data { }; int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); -int xfs_ag_shrink_space(struct xfs_mount *mp, struct xfs_trans **tpp, - xfs_agnumber_t agno, xfs_extlen_t delta); -int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp, - struct aghdr_init_data *id, xfs_extlen_t len); -int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_ag_geometry *ageo); +int xfs_ag_shrink_space(struct xfs_perag *pag, struct xfs_trans **tpp, + xfs_extlen_t delta); +int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp, + xfs_extlen_t len); +int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); #endif /* __LIBXFS_AG_H */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index fe94058d4e9e..5af123d13a63 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -322,7 +322,7 @@ out: * address. */ if (has_resv) { - error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0); + error2 = xfs_alloc_read_agf(pag, tp, 0, NULL); if (error2) return error2; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index d3f2886fdc08..e2bdf089c0a3 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -84,7 +84,7 @@ xfs_prealloc_blocks( /* * The number of blocks per AG that we withhold from xfs_mod_fdblocks to * guarantee that we can refill the AGFL prior to allocating space in a nearly - * full AG. Although the the space described by the free space btrees, the + * full AG. Although the space described by the free space btrees, the * blocks used by the freesp btrees themselves, and the blocks owned by the * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk * free space in the AG drop so low that the free space btrees cannot refill an @@ -248,7 +248,7 @@ xfs_alloc_get_rec( int *stat) /* output: success/failure */ { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; @@ -263,11 +263,11 @@ xfs_alloc_get_rec( goto out_bad_rec; /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(mp, agno, *bno)) + if (!xfs_verify_agbno(pag, *bno)) goto out_bad_rec; if (*bno > *bno + *len) goto out_bad_rec; - if (!xfs_verify_agbno(mp, agno, *bno + *len - 1)) + if (!xfs_verify_agbno(pag, *bno + *len - 1)) goto out_bad_rec; return 0; @@ -275,7 +275,8 @@ xfs_alloc_get_rec( out_bad_rec: xfs_warn(mp, "%s Freespace BTree record corruption in AG %d detected!", - cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno); + cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", + pag->pag_agno); xfs_warn(mp, "start block 0x%x block count 0x%x", *bno, *len); return -EFSCORRUPTED; @@ -703,20 +704,19 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = { /* * Read in the allocation group free block array. */ -int /* error */ +int xfs_alloc_read_agfl( - xfs_mount_t *mp, /* mount point structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp) /* buffer for the ag free block array */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf **bpp) { - struct xfs_buf *bp; /* return value */ - int error; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_buf *bp; + int error; - ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; @@ -1075,7 +1075,8 @@ xfs_alloc_ag_vextent_small( be32_to_cpu(agf->agf_flcount) <= args->minleft) goto out; - error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); + error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp, + &fbno, 0); if (error) goto error; if (fbno == NULLAGBLOCK) @@ -2609,7 +2610,7 @@ xfs_alloc_fix_freelist( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); if (!pag->pagf_init) { - error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ if (error == -EAGAIN) @@ -2639,7 +2640,7 @@ xfs_alloc_fix_freelist( * Can fail if we're not blocking on locks, and it's held. */ if (!agbp) { - error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ if (error == -EAGAIN) @@ -2697,7 +2698,7 @@ xfs_alloc_fix_freelist( else targs.oinfo = XFS_RMAP_OINFO_AG; while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { - error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); + error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); if (error) goto out_agbp_relse; @@ -2712,7 +2713,7 @@ xfs_alloc_fix_freelist( targs.alignment = targs.minlen = targs.prod = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; - error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); + error = xfs_alloc_read_agfl(pag, tp, &agflbp); if (error) goto out_agbp_relse; @@ -2741,7 +2742,7 @@ xfs_alloc_fix_freelist( * Put each allocated block on the list. */ for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { - error = xfs_alloc_put_freelist(tp, agbp, + error = xfs_alloc_put_freelist(pag, tp, agbp, agflbp, bno, 0); if (error) goto out_agflbp_relse; @@ -2767,6 +2768,7 @@ out_no_agbp: */ int xfs_alloc_get_freelist( + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agblock_t *bnop, @@ -2779,7 +2781,6 @@ xfs_alloc_get_freelist( int error; uint32_t logflags; struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; /* * Freelist is empty, give up. @@ -2791,8 +2792,7 @@ xfs_alloc_get_freelist( /* * Read the array of free blocks. */ - error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), - &agflbp); + error = xfs_alloc_read_agfl(pag, tp, &agflbp); if (error) return error; @@ -2807,7 +2807,6 @@ xfs_alloc_get_freelist( if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) agf->agf_flfirst = 0; - pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, -1); pag->pagf_flcount--; @@ -2868,29 +2867,11 @@ xfs_alloc_log_agf( } /* - * Interface for inode allocation to force the pag data to be initialized. - */ -int /* error */ -xfs_alloc_pagf_init( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags) /* XFS_ALLOC_FLAGS_... */ -{ - struct xfs_buf *bp; - int error; - - error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp); - if (!error) - xfs_trans_brelse(tp, bp); - return error; -} - -/* * Put the block on the freelist for the allocation group. */ int xfs_alloc_put_freelist( + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, struct xfs_buf *agflbp, @@ -2899,21 +2880,22 @@ xfs_alloc_put_freelist( { struct xfs_mount *mp = tp->t_mountp; struct xfs_agf *agf = agbp->b_addr; - struct xfs_perag *pag; __be32 *blockp; int error; uint32_t logflags; __be32 *agfl_bno; int startoff; - if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, - be32_to_cpu(agf->agf_seqno), &agflbp))) - return error; + if (!agflbp) { + error = xfs_alloc_read_agfl(pag, tp, &agflbp); + if (error) + return error; + } + be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) agf->agf_fllast = 0; - pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, 1); pag->pagf_flcount++; @@ -3070,61 +3052,57 @@ const struct xfs_buf_ops xfs_agf_buf_ops = { /* * Read in the allocation group header (free/alloc section). */ -int /* error */ +int xfs_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_BUF_ */ - struct xfs_buf **bpp) /* buffer for the ag freelist header */ + struct xfs_perag *pag, + struct xfs_trans *tp, + int flags, + struct xfs_buf **agfbpp) { - int error; + struct xfs_mount *mp = pag->pag_mount; + int error; - trace_xfs_read_agf(mp, agno); + trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); - ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); + XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); if (error) return error; - ASSERT(!(*bpp)->b_error); - xfs_buf_set_ref(*bpp, XFS_AGF_REF); + xfs_buf_set_ref(*agfbpp, XFS_AGF_REF); return 0; } /* - * Read in the allocation group header (free/alloc section). + * Read in the allocation group header (free/alloc section) and initialise the + * perag structure if necessary. If the caller provides @agfbpp, then return the + * locked buffer to the caller, otherwise free it. */ -int /* error */ +int xfs_alloc_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_ALLOC_FLAG_... */ - struct xfs_buf **bpp) /* buffer for the ag freelist header */ + struct xfs_perag *pag, + struct xfs_trans *tp, + int flags, + struct xfs_buf **agfbpp) { - struct xfs_agf *agf; /* ag freelist header */ - struct xfs_perag *pag; /* per allocation group data */ + struct xfs_buf *agfbp; + struct xfs_agf *agf; int error; int allocbt_blks; - trace_xfs_alloc_read_agf(mp, agno); + trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); /* We don't support trylock when freeing. */ ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); - ASSERT(agno != NULLAGNUMBER); - error = xfs_read_agf(mp, tp, agno, + error = xfs_read_agf(pag, tp, (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, - bpp); + &agfbp); if (error) return error; - ASSERT(!(*bpp)->b_error); - agf = (*bpp)->b_addr; - pag = (*bpp)->b_pag; + agf = agfbp->b_addr; if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); @@ -3138,7 +3116,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); pag->pagf_init = 1; - pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf); + pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf); /* * Update the in-core allocbt counter. Filter out the rmapbt @@ -3148,13 +3126,14 @@ xfs_alloc_read_agf( * counter only tracks non-root blocks. */ allocbt_blks = pag->pagf_btreeblks; - if (xfs_has_rmapbt(mp)) + if (xfs_has_rmapbt(pag->pag_mount)) allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; if (allocbt_blks > 0) - atomic64_add(allocbt_blks, &mp->m_allocbt_blks); + atomic64_add(allocbt_blks, + &pag->pag_mount->m_allocbt_blks); } #ifdef DEBUG - else if (!xfs_is_shutdown(mp)) { + else if (!xfs_is_shutdown(pag->pag_mount)) { ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); @@ -3165,6 +3144,10 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); } #endif + if (agfbpp) + *agfbpp = agfbp; + else + xfs_trans_brelse(tp, agfbp); return 0; } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 84ca09b2223f..2c3f762dfb58 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -95,6 +95,11 @@ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag, xfs_extlen_t need, xfs_extlen_t reserved); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); +int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf *agfbp, xfs_agblock_t *bnop, int btreeblk); +int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf *agfbp, struct xfs_buf *agflbp, + xfs_agblock_t bno, int btreeblk); /* * Compute and fill in value of m_alloc_maxlevels. @@ -104,17 +109,6 @@ xfs_alloc_compute_maxlevels( struct xfs_mount *mp); /* file system mount structure */ /* - * Get a block from the freelist. - * Returns with the buffer for the block gotten. - */ -int /* error */ -xfs_alloc_get_freelist( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop, /* block address retrieved from freelist */ - int btreeblk); /* destination is a AGF btree */ - -/* * Log the given fields from the agf structure. */ void @@ -124,38 +118,6 @@ xfs_alloc_log_agf( uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */ /* - * Interface for inode allocation to force the pag data to be initialized. - */ -int /* error */ -xfs_alloc_pagf_init( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags); /* XFS_ALLOC_FLAGS_... */ - -/* - * Put the block on the freelist for the allocation group. - */ -int /* error */ -xfs_alloc_put_freelist( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for a.g. freelist header */ - struct xfs_buf *agflbp,/* buffer for a.g. free block array */ - xfs_agblock_t bno, /* block being freed */ - int btreeblk); /* owner was a AGF btree */ - -/* - * Read in the allocation group header (free/alloc section). - */ -int /* error */ -xfs_alloc_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_ALLOC_FLAG_... */ - struct xfs_buf **bpp); /* buffer for the ag freelist header */ - -/* * Allocate an extent (variable-size). */ int /* error */ @@ -206,10 +168,12 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ -int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); -int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_buf **bpp); +int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, + struct xfs_buf **agfbpp); +int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, + struct xfs_buf **agfbpp); +int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf **bpp); int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, struct xfs_buf *, struct xfs_owner_info *); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 8c9f73cc0bee..549a3cba0234 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -60,8 +60,8 @@ xfs_allocbt_alloc_block( xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, - &bno, 1); + error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp, + cur->bc_ag.agbp, &bno, 1); if (error) return error; @@ -71,7 +71,7 @@ xfs_allocbt_alloc_block( } atomic64_inc(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agbp->b_pag, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false); new->s = cpu_to_be32(bno); @@ -89,7 +89,8 @@ xfs_allocbt_free_block( int error; bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); - error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL, + bno, 1); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 1824f61621a2..e28d93d232de 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -50,7 +50,7 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); -STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); +STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args); /* * Internal routines when attribute list is more than one block. @@ -67,12 +67,10 @@ int xfs_inode_hasattr( struct xfs_inode *ip) { - if (!XFS_IFORK_Q(ip)) + if (!xfs_inode_has_attr_fork(ip)) return 0; - if (!ip->i_afp) - return 0; - if (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0) + if (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_af.if_nextents == 0) return 0; return 1; } @@ -85,7 +83,7 @@ bool xfs_attr_is_leaf( struct xfs_inode *ip) { - struct xfs_ifork *ifp = ip->i_afp; + struct xfs_ifork *ifp = &ip->i_af; struct xfs_iext_cursor icur; struct xfs_bmbt_irec imap; @@ -231,7 +229,7 @@ xfs_attr_get_ilocked( if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; - if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) + if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_getvalue(args); if (xfs_attr_is_leaf(args->dp)) return xfs_attr_leaf_get(args); @@ -354,7 +352,7 @@ xfs_attr_try_sf_addname( /* * Build initial attribute list (if required). */ - if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS) + if (dp->i_af.if_format == XFS_DINODE_FMT_EXTENTS) xfs_attr_shortform_create(args); error = xfs_attr_shortform_addname(args); @@ -393,16 +391,10 @@ xfs_attr_sf_addname( * It won't fit in the shortform, transform to a leaf block. GROT: * another possible req'mt for a double-split btree op. */ - error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp); + error = xfs_attr_shortform_to_leaf(args); if (error) return error; - /* - * Prevent the leaf buffer from being unlocked so that a concurrent AIL - * push cannot grab the half-baked leaf buffer and run into problems - * with the write verifier. - */ - xfs_trans_bhold(args->trans, attr->xattri_leaf_bp); attr->xattri_dela_state = XFS_DAS_LEAF_ADD; out: trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp); @@ -447,11 +439,9 @@ xfs_attr_leaf_addname( /* * Use the leaf buffer we may already hold locked as a result of - * a sf-to-leaf conversion. The held buffer is no longer valid - * after this call, regardless of the result. + * a sf-to-leaf conversion. */ - error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp); - attr->xattri_leaf_bp = NULL; + error = xfs_attr_leaf_try_add(args); if (error == -ENOSPC) { error = xfs_attr3_leaf_to_node(args); @@ -497,8 +487,6 @@ xfs_attr_node_addname( struct xfs_da_args *args = attr->xattri_da_args; int error; - ASSERT(!attr->xattri_leaf_bp); - error = xfs_attr_node_addname_find_attr(attr); if (error) return error; @@ -874,7 +862,7 @@ xfs_attr_lookup( if (!xfs_inode_hasattr(dp)) return -ENOATTR; - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) + if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_sf_findname(args, NULL, NULL); if (xfs_attr_is_leaf(dp)) { @@ -1011,7 +999,7 @@ xfs_attr_set( * If the inode doesn't have an attribute fork, add one. * (inode must not be locked when we call this routine) */ - if (XFS_IFORK_Q(dp) == 0) { + if (xfs_inode_has_attr_fork(dp) == 0) { int sf_size = sizeof(struct xfs_attr_sf_hdr) + xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); @@ -1111,7 +1099,7 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) { struct xfs_attr_shortform *sf; - sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; return be16_to_cpu(sf->hdr.totsize); } @@ -1215,24 +1203,14 @@ xfs_attr_restore_rmt_blk( */ STATIC int xfs_attr_leaf_try_add( - struct xfs_da_args *args, - struct xfs_buf *bp) + struct xfs_da_args *args) { + struct xfs_buf *bp; int error; - /* - * If the caller provided a buffer to us, it is locked and held in - * the transaction because it just did a shortform to leaf conversion. - * Hence we don't need to read it again. Otherwise read in the leaf - * buffer. - */ - if (bp) { - xfs_trans_bhold_release(args->trans, bp); - } else { - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - return error; - } + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; /* * Look up the xattr name to set the insertion point for the new xattr. @@ -1578,7 +1556,7 @@ xfs_attr_node_get( * If not in a transaction, we have to release all the buffers. */ out_release: - for (i = 0; state != NULL && i < state->path.active; i++) { + for (i = 0; i < state->path.active; i++) { xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index b4a2fc77017e..81be9b3e4004 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -515,11 +515,6 @@ struct xfs_attr_intent { */ struct xfs_attri_log_nameval *xattri_nameval; - /* - * Used by xfs_attr_set to hold a leaf buffer across a transaction roll - */ - struct xfs_buf *xattri_leaf_bp; - /* Used to keep track of current state of delayed operation */ enum xfs_delattr_state xattri_dela_state; @@ -565,9 +560,9 @@ static inline bool xfs_attr_is_shortform( struct xfs_inode *ip) { - return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL || - (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0); + return ip->i_af.if_format == XFS_DINODE_FMT_LOCAL || + (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_af.if_nextents == 0); } static inline enum xfs_delattr_state @@ -578,10 +573,10 @@ xfs_attr_init_add_state(struct xfs_da_args *args) * next state, the attribute fork may be null. This can occur only occur * on a pure remove, but we grab the next state before we check if a * replace operation is being performed. If we are called from any other - * context, i_afp is guaranteed to exist. Hence if the attr fork is + * context, i_af is guaranteed to exist. Hence if the attr fork is * null, we were called from a pure remove operation and so we are done. */ - if (!args->dp->i_afp) + if (!xfs_inode_has_attr_fork(args->dp)) return XFS_DAS_DONE; args->op_flags |= XFS_DA_OP_ADDNAME; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 37e7c33f6283..beee51ad75ce 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -289,6 +289,23 @@ xfs_attr3_leaf_verify_entry( return NULL; } +/* + * Validate an attribute leaf block. + * + * Empty leaf blocks can occur under the following circumstances: + * + * 1. setxattr adds a new extended attribute to a file; + * 2. The file has zero existing attributes; + * 3. The attribute is too large to fit in the attribute fork; + * 4. The attribute is small enough to fit in a leaf block; + * 5. A log flush occurs after committing the transaction that creates + * the (empty) leaf block; and + * 6. The filesystem goes down after the log flush but before the new + * attribute can be committed to the leaf block. + * + * Hence we need to ensure that we don't fail the validation purely + * because the leaf is empty. + */ static xfs_failaddr_t xfs_attr3_leaf_verify( struct xfs_buf *bp) @@ -311,15 +328,6 @@ xfs_attr3_leaf_verify( return fa; /* - * Empty leaf blocks should never occur; they imply the existence of a - * software bug that needs fixing. xfs_repair also flags them as a - * corruption that needs fixing, so we should never let these go to - * disk. - */ - if (ichdr.count == 0) - return __this_address; - - /* * firstused is the block offset of the first name info structure. * Make sure it doesn't go off the block or crash into the header. */ @@ -582,7 +590,7 @@ xfs_attr_shortform_bytesfit( * to real extents, or the delalloc conversion will take care of the * literal area rebalancing. */ - if (bytes <= XFS_IFORK_ASIZE(dp)) + if (bytes <= xfs_inode_attr_fork_size(dp)) return dp->i_forkoff; /* @@ -674,7 +682,7 @@ xfs_attr_shortform_create( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; - struct xfs_ifork *ifp = dp->i_afp; + struct xfs_ifork *ifp = &dp->i_af; struct xfs_attr_sf_hdr *hdr; trace_xfs_attr_sf_create(args); @@ -711,7 +719,7 @@ xfs_attr_sf_findname( int end; int i; - sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; sfe = &sf->list[0]; end = sf->hdr.count; for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), @@ -756,7 +764,7 @@ xfs_attr_shortform_add( mp = dp->i_mount; dp->i_forkoff = forkoff; - ifp = dp->i_afp; + ifp = &dp->i_af; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) @@ -789,11 +797,9 @@ xfs_attr_fork_remove( struct xfs_inode *ip, struct xfs_trans *tp) { - ASSERT(ip->i_afp->if_nextents == 0); + ASSERT(ip->i_af.if_nextents == 0); - xfs_idestroy_fork(ip->i_afp); - kmem_cache_free(xfs_ifork_cache, ip->i_afp); - ip->i_afp = NULL; + xfs_ifork_zap_attr(ip); ip->i_forkoff = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -817,7 +823,7 @@ xfs_attr_sf_removename( dp = args->dp; mp = dp->i_mount; - sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; error = xfs_attr_sf_findname(args, &sfe, &base); @@ -881,7 +887,7 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) trace_xfs_attr_sf_lookup(args); - ifp = args->dp->i_afp; + ifp = &args->dp->i_af; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; sfe = &sf->list[0]; @@ -909,8 +915,8 @@ xfs_attr_shortform_getvalue( struct xfs_attr_sf_entry *sfe; int i; - ASSERT(args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; + ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL); + sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = xfs_attr_sf_nextentry(sfe), i++) { @@ -922,14 +928,10 @@ xfs_attr_shortform_getvalue( return -ENOATTR; } -/* - * Convert from using the shortform to the leaf. On success, return the - * buffer so that we can keep it locked until we're totally done with it. - */ +/* Convert from using the shortform to the leaf format. */ int xfs_attr_shortform_to_leaf( - struct xfs_da_args *args, - struct xfs_buf **leaf_bp) + struct xfs_da_args *args) { struct xfs_inode *dp; struct xfs_attr_shortform *sf; @@ -944,7 +946,7 @@ xfs_attr_shortform_to_leaf( trace_xfs_attr_sf_to_leaf(args); dp = args->dp; - ifp = dp->i_afp; + ifp = &dp->i_af; sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = be16_to_cpu(sf->hdr.totsize); tmpbuffer = kmem_alloc(size, 0); @@ -991,7 +993,6 @@ xfs_attr_shortform_to_leaf( sfe = xfs_attr_sf_nextentry(sfe); } error = 0; - *leaf_bp = bp; out: kmem_free(tmpbuffer); return error; @@ -1052,8 +1053,8 @@ xfs_attr_shortform_verify( int i; int64_t size; - ASSERT(ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL); - ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); + ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL); + ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = ifp->if_bytes; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index efa757f1e912..368f4d9fa1d5 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -49,8 +49,7 @@ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); -int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, - struct xfs_buf **leaf_bp); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_sf_removename(struct xfs_da_args *args); int xfs_attr_sf_findname(struct xfs_da_args *args, struct xfs_attr_sf_entry **sfep, diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 7298c148f848..d440393b40eb 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -543,6 +543,7 @@ xfs_attr_rmtval_stale( { struct xfs_mount *mp = ip->i_mount; struct xfs_buf *bp; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -550,14 +551,18 @@ xfs_attr_rmtval_stale( XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) return -EFSCORRUPTED; - bp = xfs_buf_incore(mp->m_ddev_targp, + error = xfs_buf_incore(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map->br_startblock), - XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); + XFS_FSB_TO_BB(mp, map->br_blockcount), + incore_flags, &bp); + if (error) { + if (error == -ENOENT) + return 0; + return error; } + xfs_buf_stale(bp); + xfs_buf_relse(bp); return 0; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6833110d1bd4..e56723dc9cd5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -128,7 +128,7 @@ xfs_bmbt_lookup_first( */ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); return whichfork != XFS_COW_FORK && ifp->if_format == XFS_DINODE_FMT_EXTENTS && @@ -140,7 +140,7 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) */ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); return whichfork != XFS_COW_FORK && ifp->if_format == XFS_DINODE_FMT_BTREE && @@ -319,7 +319,7 @@ xfs_bmap_check_leaf_extents( int whichfork) /* data or attr fork */ { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; /* current btree block */ xfs_fsblock_t bno; /* block # of "block" */ struct xfs_buf *bp; /* buffer for "block" */ @@ -538,7 +538,7 @@ xfs_bmap_btree_to_extents( int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_block *rblock = ifp->if_broot; struct xfs_btree_block *cblock;/* child btree block */ @@ -616,7 +616,7 @@ xfs_bmap_extents_to_btree( mp = ip->i_mount; ASSERT(whichfork != XFS_COW_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS); /* @@ -745,7 +745,7 @@ xfs_bmap_local_to_extents_empty( struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); @@ -785,7 +785,7 @@ xfs_bmap_local_to_extents( * So sending the data fork of a regular inode is invalid. */ ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK)); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); if (!ifp->if_bytes) { @@ -880,7 +880,7 @@ xfs_bmap_add_attrfork_btree( mp = ip->i_mount; - if (XFS_BMAP_BMDR_SPACE(block) <= XFS_IFORK_DSIZE(ip)) + if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip)) *flags |= XFS_ILOG_DBROOT; else { cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); @@ -920,7 +920,7 @@ xfs_bmap_add_attrfork_extents( int error; /* error return value */ if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <= - XFS_IFORK_DSIZE(ip)) + xfs_inode_data_fork_size(ip)) return 0; cur = NULL; error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags, @@ -951,7 +951,7 @@ xfs_bmap_add_attrfork_local( { struct xfs_da_args dargs; /* args for dir/attr code */ - if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) + if (ip->i_df.if_bytes <= xfs_inode_data_fork_size(ip)) return 0; if (S_ISDIR(VFS_I(ip)->i_mode)) { @@ -1023,7 +1023,7 @@ xfs_bmap_add_attrfork( int logflags; /* logging flags */ int error; /* error return value */ - ASSERT(XFS_IFORK_Q(ip) == 0); + ASSERT(xfs_inode_has_attr_fork(ip) == 0); mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); @@ -1034,16 +1034,15 @@ xfs_bmap_add_attrfork( rsvd, &tp); if (error) return error; - if (XFS_IFORK_Q(ip)) + if (xfs_inode_has_attr_fork(ip)) goto trans_cancel; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_bmap_set_attrforkoff(ip, size, &version); if (error) goto trans_cancel; - ASSERT(ip->i_afp == NULL); - ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0); + xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); logflags = 0; switch (ip->i_df.if_format) { case XFS_DINODE_FMT_LOCAL: @@ -1116,7 +1115,7 @@ xfs_iread_bmbt_block( xfs_extnum_t num_recs; xfs_extnum_t j; int whichfork = cur->bc_ino.whichfork; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); block = xfs_btree_get_block(cur, level, &bp); @@ -1164,7 +1163,7 @@ xfs_iread_extents( int whichfork) { struct xfs_iread_state ir; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_cur *cur; int error; @@ -1208,7 +1207,7 @@ xfs_bmap_first_unused( xfs_fileoff_t *first_unused, /* unused block */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; xfs_fileoff_t lastaddr = 0; @@ -1255,7 +1254,7 @@ xfs_bmap_last_before( xfs_fileoff_t *last_block, /* last block */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; int error; @@ -1289,7 +1288,7 @@ xfs_bmap_last_extent( struct xfs_bmbt_irec *rec, int *is_empty) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_iext_cursor icur; int error; @@ -1355,7 +1354,7 @@ xfs_bmap_last_offset( xfs_fileoff_t *last_block, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec rec; int is_empty; int error; @@ -1389,7 +1388,7 @@ xfs_bmap_add_extent_delay_real( int whichfork) { struct xfs_mount *mp = bma->ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ int i; /* temp state */ @@ -1955,7 +1954,7 @@ xfs_bmap_add_extent_unwritten_real( *logflagsp = 0; cur = *curp; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(!isnullstartblock(new->br_startblock)); @@ -2480,7 +2479,7 @@ xfs_bmap_add_extent_hole_delay( uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t temp; /* temp for indirect calculations */ - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(isnullstartblock(new->br_startblock)); /* @@ -2616,7 +2615,7 @@ xfs_bmap_add_extent_hole_real( int *logflagsp, uint32_t flags) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_cur *cur = *curp; int error; /* error return value */ @@ -3185,7 +3184,8 @@ xfs_bmap_longest_free_extent( pag = xfs_perag_get(mp, ag); if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); + error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK, + NULL); if (error) { /* Couldn't lock the AGF, so skip this AG. */ if (error == -EAGAIN) { @@ -3866,7 +3866,7 @@ xfs_bmapi_read( { struct xfs_mount *mp = ip->i_mount; int whichfork = xfs_bmapi_whichfork(flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; xfs_fileoff_t obno; xfs_fileoff_t end; @@ -3959,7 +3959,7 @@ xfs_bmapi_reserve_delalloc( int eof) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_extlen_t alen; xfs_extlen_t indlen; int error; @@ -4086,7 +4086,7 @@ xfs_bmapi_allocate( { struct xfs_mount *mp = bma->ip->i_mount; int whichfork = xfs_bmapi_whichfork(bma->flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4185,7 +4185,7 @@ xfs_bmapi_convert_unwritten( uint32_t flags) { int whichfork = xfs_bmapi_whichfork(flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4262,7 +4262,7 @@ xfs_bmapi_minleft( struct xfs_inode *ip, int fork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, fork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, fork); if (tp && tp->t_firstblock != NULLFSBLOCK) return 0; @@ -4283,7 +4283,7 @@ xfs_bmapi_finish( int whichfork, int error) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); if ((bma->logflags & xfs_ilog_fext(whichfork)) && ifp->if_format != XFS_DINODE_FMT_EXTENTS) @@ -4322,7 +4322,7 @@ xfs_bmapi_write( }; struct xfs_mount *mp = ip->i_mount; int whichfork = xfs_bmapi_whichfork(flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t end; /* end of mapped file region */ bool eof = false; /* after the end of extents */ int error; /* error return */ @@ -4503,7 +4503,7 @@ xfs_bmapi_convert_delalloc( struct iomap *iomap, unsigned int *seq) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmalloca bma = { NULL }; @@ -4640,7 +4640,7 @@ xfs_bmapi_remap( int whichfork = xfs_bmapi_whichfork(flags); int logflags = 0, error; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(len > 0); ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -4797,7 +4797,7 @@ xfs_bmap_del_extent_delay( struct xfs_bmbt_irec *del) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec new; int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; @@ -4924,7 +4924,7 @@ xfs_bmap_del_extent_cow( struct xfs_bmbt_irec *del) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); struct xfs_bmbt_irec new; xfs_fileoff_t del_endoff, got_endoff; uint32_t state = BMAP_COWFORK; @@ -5022,7 +5022,7 @@ xfs_bmap_del_extent_real( mp = ip->i_mount; XFS_STATS_INC(mp, xs_del_exlist); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(del->br_blockcount > 0); xfs_iext_get_extent(ifp, icur, &got); ASSERT(got.br_startoff <= del->br_startoff); @@ -5288,7 +5288,7 @@ __xfs_bunmapi( whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) return -EFSCORRUPTED; if (xfs_is_shutdown(mp)) @@ -5629,7 +5629,7 @@ xfs_bmse_merge( struct xfs_btree_cur *cur, int *logflags) /* output */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; @@ -5750,7 +5750,7 @@ xfs_bmap_collapse_extents( { int whichfork = XFS_DATA_FORK; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got, prev; struct xfs_iext_cursor icur; @@ -5865,7 +5865,7 @@ xfs_bmap_insert_extents( { int whichfork = XFS_DATA_FORK; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got, next; struct xfs_iext_cursor icur; @@ -5965,7 +5965,7 @@ xfs_bmap_split_extent( xfs_fileoff_t split_fsb) { int whichfork = XFS_DATA_FORK; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got; struct xfs_bmbt_irec new; /* split extent */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 2b77d45c215f..cfa052d40105 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -304,7 +304,7 @@ xfs_bmbt_get_minrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + ifp = xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, @@ -322,7 +322,7 @@ xfs_bmbt_get_maxrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + ifp = xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, @@ -550,7 +550,7 @@ xfs_bmbt_init_cursor( struct xfs_inode *ip, /* inode owning the btree */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; ASSERT(whichfork != XFS_COW_FORK); @@ -564,7 +564,7 @@ xfs_bmbt_init_cursor( if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); cur->bc_ino.ip = ip; cur->bc_ino.allocated = 0; cur->bc_ino.flags = 0; @@ -664,7 +664,7 @@ xfs_bmbt_change_owner( ASSERT(tp || buffer_list); ASSERT(!(tp && buffer_list)); - ASSERT(XFS_IFORK_PTR(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE); + ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 2eecc49fc1b2..4c16c8c31fcb 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -91,10 +91,9 @@ xfs_btree_check_lblock_siblings( static inline xfs_failaddr_t xfs_btree_check_sblock_siblings( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_btree_cur *cur, int level, - xfs_agnumber_t agno, xfs_agblock_t agbno, __be32 dsibling) { @@ -110,7 +109,7 @@ xfs_btree_check_sblock_siblings( if (!xfs_btree_check_sptr(cur, sibling, level + 1)) return __this_address; } else { - if (!xfs_verify_agbno(mp, agno, sibling)) + if (!xfs_verify_agbno(pag, sibling)) return __this_address; } return NULL; @@ -195,11 +194,11 @@ __xfs_btree_check_sblock( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; + struct xfs_perag *pag = cur->bc_ag.pag; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_has_crc(mp); xfs_failaddr_t fa; xfs_agblock_t agbno = NULLAGBLOCK; - xfs_agnumber_t agno = NULLAGNUMBER; if (crc) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -217,16 +216,14 @@ __xfs_btree_check_sblock( cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (bp) { + if (bp) agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); - agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); - } - fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno, + fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, block->bb_u.s.bb_leftsib); if (!fa) - fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, - agbno, block->bb_u.s.bb_rightsib); + fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, + block->bb_u.s.bb_rightsib); return fa; } @@ -288,7 +285,7 @@ xfs_btree_check_sptr( { if (level <= 0) return false; - return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno); + return xfs_verify_agbno(cur->bc_ag.pag, agbno); } /* @@ -725,7 +722,7 @@ xfs_btree_ifork_ptr( if (cur->bc_flags & XFS_BTREE_STAGING) return cur->bc_ino.ifake->if_fork; - return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork); + return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); } /* @@ -3559,7 +3556,7 @@ xfs_btree_kill_iroot( { int whichfork = cur->bc_ino.whichfork; struct xfs_inode *ip = cur->bc_ino.ip; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; union xfs_btree_key *kp; @@ -4595,7 +4592,6 @@ xfs_btree_sblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - xfs_agnumber_t agno; xfs_agblock_t agbno; xfs_failaddr_t fa; @@ -4604,12 +4600,11 @@ xfs_btree_sblock_verify( return __this_address; /* sibling pointer verification */ - agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, block->bb_u.s.bb_leftsib); if (!fa) - fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, block->bb_u.s.bb_rightsib); return fa; } diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 3cd51fa3837b..76eedc2756b3 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -193,7 +193,7 @@ xfs_dir_isempty( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (dp->i_disk_size == 0) /* might happen during shutdown. */ return 1; - if (dp->i_disk_size > XFS_IFORK_DSIZE(dp)) + if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) return 0; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; return !sfp->count; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index df0869bba275..00f960a703b2 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -842,7 +842,7 @@ xfs_dir2_block_removename( * See if the size as a shortform is good enough. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) + if (size > xfs_inode_data_fork_size(dp)) return 0; /* @@ -1055,7 +1055,7 @@ xfs_dir2_leaf_to_block( * Now see if the resulting block can be shrunken to shortform. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) + if (size > xfs_inode_data_fork_size(dp)) return 0; return xfs_dir2_block_to_sf(args, dbp, size, &sfh); @@ -1071,7 +1071,7 @@ xfs_dir2_sf_to_block( struct xfs_trans *tp = args->trans; struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); struct xfs_da_geometry *geo = args->geo; xfs_dir2_db_t blkno; /* dir-relative block # (0) */ xfs_dir2_data_hdr_t *hdr; /* block header */ diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 5a97a87eaa20..003812fd7d35 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -237,7 +237,7 @@ xfs_dir2_block_sfsize( (i8count ? /* inumber */ count * XFS_INO64_SIZE : count * XFS_INO32_SIZE); - if (size > XFS_IFORK_DSIZE(dp)) + if (size > xfs_inode_data_fork_size(dp)) return size; /* size value is a failure */ } /* @@ -406,7 +406,7 @@ xfs_dir2_sf_addname( * Won't fit as shortform any more (due to size), * or the pick routine says it won't (due to offset values). */ - if (new_isize > XFS_IFORK_DSIZE(dp) || + if (new_isize > xfs_inode_data_fork_size(dp) || (pick = xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) { /* @@ -710,7 +710,7 @@ xfs_dir2_sf_verify( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; @@ -1031,7 +1031,7 @@ xfs_dir2_sf_replace_needblock( newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; return inum > XFS_DIR2_MAX_SHORT_INUM && - sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp); + sfp->i8count == 0 && newsize > xfs_inode_data_fork_size(dp); } /* diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index afdfc8108c5f..b55bdfa9c8a8 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -704,7 +704,7 @@ struct xfs_agfl { * When the bigtime feature is enabled, ondisk inode timestamps become an * unsigned 64-bit nanoseconds counter. This means that the bigtime inode * timestamp epoch is the start of the classic timestamp range, which is - * Dec 31 20:45:52 UTC 1901. Because the epochs are not the same, callers + * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers * /must/ use the bigtime conversion functions when encoding and decoding raw * timestamps. */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index bf2f4bc89193..6cdfd64bc56b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -105,7 +105,6 @@ xfs_inobt_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; union xfs_btree_rec *rec; int error; uint64_t realfree; @@ -116,7 +115,7 @@ xfs_inobt_get_rec( xfs_inobt_btrec_to_irec(mp, rec, irec); - if (!xfs_verify_agino(mp, agno, irec->ir_startino)) + if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) goto out_bad_rec; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || irec->ir_count > XFS_INODES_PER_CHUNK) @@ -137,7 +136,8 @@ xfs_inobt_get_rec( out_bad_rec: xfs_warn(mp, "%s Inode BTree record corruption in AG %d detected!", - cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", agno); + cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", + cur->bc_ag.pag->pag_agno); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, @@ -1610,7 +1610,7 @@ xfs_dialloc_good_ag( return false; if (!pag->pagi_init) { - error = xfs_ialloc_pagi_init(mp, tp, pag->pag_agno); + error = xfs_ialloc_read_agi(pag, tp, NULL); if (error) return false; } @@ -1621,7 +1621,7 @@ xfs_dialloc_good_ag( return false; if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, flags); + error = xfs_alloc_read_agf(pag, tp, flags, NULL); if (error) return false; } @@ -1679,7 +1679,7 @@ xfs_dialloc_try_ag( * Then read in the AGI buffer and recheck with the AGI buffer * lock held. */ - error = xfs_ialloc_read_agi(pag->pag_mount, *tpp, pag->pag_agno, &agbp); + error = xfs_ialloc_read_agi(pag, *tpp, &agbp); if (error) return error; @@ -2169,7 +2169,7 @@ xfs_difree( /* * Get the allocation group header. */ - error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); + error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) { xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", __func__, error); @@ -2215,7 +2215,7 @@ xfs_imap_lookup( int error; int i; - error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); + error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", @@ -2571,47 +2571,48 @@ const struct xfs_buf_ops xfs_agi_buf_ops = { */ int xfs_read_agi( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp) /* allocation group hdr buf */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf **agibpp) { + struct xfs_mount *mp = pag->pag_mount; int error; - trace_xfs_read_agi(mp, agno); + trace_xfs_read_agi(pag->pag_mount, pag->pag_agno); - ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); + XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops); if (error) return error; if (tp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF); + xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF); - xfs_buf_set_ref(*bpp, XFS_AGI_REF); + xfs_buf_set_ref(*agibpp, XFS_AGI_REF); return 0; } +/* + * Read in the agi and initialise the per-ag data. If the caller supplies a + * @agibpp, return the locked AGI buffer to them, otherwise release it. + */ int xfs_ialloc_read_agi( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp) /* allocation group hdr buf */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf **agibpp) { - struct xfs_agi *agi; /* allocation group header */ - struct xfs_perag *pag; /* per allocation group data */ + struct xfs_buf *agibp; + struct xfs_agi *agi; int error; - trace_xfs_ialloc_read_agi(mp, agno); + trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno); - error = xfs_read_agi(mp, tp, agno, bpp); + error = xfs_read_agi(pag, tp, &agibp); if (error) return error; - agi = (*bpp)->b_addr; - pag = (*bpp)->b_pag; + agi = agibp->b_addr; if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); @@ -2623,27 +2624,11 @@ xfs_ialloc_read_agi( * we are in the middle of a forced shutdown. */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - xfs_is_shutdown(mp)); - return 0; -} - -/* - * Read in the agi to initialise the per-ag data in the mount structure - */ -int -xfs_ialloc_pagi_init( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno) /* allocation group number */ -{ - struct xfs_buf *bp = NULL; - int error; - - error = xfs_ialloc_read_agi(mp, tp, agno, &bp); - if (error) - return error; - if (bp) - xfs_trans_brelse(tp, bp); + xfs_is_shutdown(pag->pag_mount)); + if (agibpp) + *agibpp = agibp; + else + xfs_trans_brelse(tp, agibp); return 0; } @@ -2912,8 +2897,7 @@ xfs_ialloc_calc_rootino( * allocation group, or very odd geometries created by old mkfs * versions on very small filesystems. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) + if (xfs_ag_contains_log(mp, 0)) first_bno += mp->m_sb.sb_logblocks; /* diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index a7705b6a1fd3..9bbbca6ac4ed 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -62,25 +62,10 @@ xfs_ialloc_log_agi( struct xfs_buf *bp, /* allocation group header buffer */ uint32_t fields); /* bitmask of fields to log */ -/* - * Read in the allocation group header (inode allocation section) - */ -int /* error */ -xfs_ialloc_read_agi( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp); /* allocation group hdr buf */ - -/* - * Read in the allocation group header to initialise the per-ag data - * in the mount structure - */ -int -xfs_ialloc_pagi_init( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno); /* allocation group number */ +int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf **agibpp); +int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf **agibpp); /* * Lookup a record by ino in the btree given by cur. @@ -102,8 +87,6 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen); -int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_buf **bpp); union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index b2ad2fdc40f5..8c83e265770c 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -683,10 +683,10 @@ xfs_inobt_rec_check_count( static xfs_extlen_t xfs_inobt_max_size( - struct xfs_mount *mp, - xfs_agnumber_t agno) + struct xfs_perag *pag) { - xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno); + struct xfs_mount *mp = pag->pag_mount; + xfs_agblock_t agblocks = pag->block_count; /* Bail out if we're uninitialized, which can happen in mkfs. */ if (M_IGEO(mp)->inobt_mxr[0] == 0) @@ -697,8 +697,7 @@ xfs_inobt_max_size( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) + if (xfs_ag_contains_log(mp, pag->pag_agno)) agblocks -= mp->m_sb.sb_logblocks; return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, @@ -722,7 +721,7 @@ xfs_inobt_cur( ASSERT(*agi_bpp == NULL); ASSERT(*curpp == NULL); - error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, agi_bpp); + error = xfs_ialloc_read_agi(pag, tp, agi_bpp); if (error) return error; @@ -757,16 +756,15 @@ xfs_inobt_count_blocks( /* Read finobt block count from AGI header. */ static int xfs_finobt_read_blocks( - struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, xfs_extlen_t *tree_blocks) { struct xfs_buf *agbp; struct xfs_agi *agi; int error; - error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); + error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) return error; @@ -794,14 +792,14 @@ xfs_finobt_calc_reserves( return 0; if (xfs_has_inobtcounts(mp)) - error = xfs_finobt_read_blocks(mp, tp, pag, &tree_len); + error = xfs_finobt_read_blocks(pag, tp, &tree_len); else error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO, &tree_len); if (error) return error; - *ask += xfs_inobt_max_size(mp, pag->pag_agno); + *ask += xfs_inobt_max_size(pag); *used += tree_len; return 0; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 3b1b63f9d886..758aacd8166b 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -10,6 +10,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_ag.h" #include "xfs_inode.h" #include "xfs_errortag.h" #include "xfs_error.h" @@ -41,14 +42,12 @@ xfs_inode_buf_verify( bool readahead) { struct xfs_mount *mp = bp->b_mount; - xfs_agnumber_t agno; int i; int ni; /* * Validate the magic number and version of every inode in the buffer */ - agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; for (i = 0; i < ni; i++) { struct xfs_dinode *dip; @@ -59,7 +58,7 @@ xfs_inode_buf_verify( unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && - xfs_verify_agino_or_null(mp, agno, unlinked_ino); + xfs_verify_agino_or_null(bp->b_pag, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { @@ -178,7 +177,6 @@ xfs_inode_from_disk( xfs_failaddr_t fa; ASSERT(ip->i_cowfp == NULL); - ASSERT(ip->i_afp == NULL); fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from); if (fa) { @@ -230,7 +228,8 @@ xfs_inode_from_disk( ip->i_nblocks = be64_to_cpu(from->di_nblocks); ip->i_extsize = be32_to_cpu(from->di_extsize); ip->i_forkoff = from->di_forkoff; - ip->i_diflags = be16_to_cpu(from->di_flags); + ip->i_diflags = be16_to_cpu(from->di_flags); + ip->i_next_unlinked = be32_to_cpu(from->di_next_unlinked); if (from->di_dmevmask || from->di_dmstate) xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS); @@ -286,7 +285,7 @@ xfs_inode_to_disk_iext_counters( { if (xfs_inode_has_large_extent_counts(ip)) { to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df)); - to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp)); + to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_af)); /* * We might be upgrading the inode to use larger extent counters * than was previously used. Hence zero the unused field. @@ -294,7 +293,7 @@ xfs_inode_to_disk_iext_counters( to->di_nrext64_pad = cpu_to_be16(0); } else { to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); - to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); + to->di_anextents = cpu_to_be16(xfs_ifork_nextents(&ip->i_af)); } } @@ -326,7 +325,7 @@ xfs_inode_to_disk( to->di_nblocks = cpu_to_be64(ip->i_nblocks); to->di_extsize = cpu_to_be32(ip->i_extsize); to->di_forkoff = ip->i_forkoff; - to->di_aformat = xfs_ifork_format(ip->i_afp); + to->di_aformat = xfs_ifork_format(&ip->i_af); to->di_flags = cpu_to_be16(ip->i_diflags); if (xfs_has_v3inodes(ip->i_mount)) { diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 1a4cdf550f6d..9327a4f39206 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -35,7 +35,7 @@ xfs_init_local_fork( const void *data, int64_t size) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); int mem_size = size; bool zero_terminate; @@ -102,7 +102,7 @@ xfs_iformat_extents( int whichfork) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); int state = xfs_bmap_fork_to_state(whichfork); xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork); int size = nex * sizeof(xfs_bmbt_rec_t); @@ -173,7 +173,7 @@ xfs_iformat_btree( int size; int level; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); size = XFS_BMAP_BROOT_SPACE(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); @@ -276,17 +276,23 @@ xfs_dfork_attr_shortform_size( return be16_to_cpu(atp->hdr.totsize); } -struct xfs_ifork * -xfs_ifork_alloc( +void +xfs_ifork_init_attr( + struct xfs_inode *ip, enum xfs_dinode_fmt format, xfs_extnum_t nextents) { - struct xfs_ifork *ifp; + ip->i_af.if_format = format; + ip->i_af.if_nextents = nextents; +} - ifp = kmem_cache_zalloc(xfs_ifork_cache, GFP_NOFS | __GFP_NOFAIL); - ifp->if_format = format; - ifp->if_nextents = nextents; - return ifp; +void +xfs_ifork_zap_attr( + struct xfs_inode *ip) +{ + xfs_idestroy_fork(&ip->i_af); + memset(&ip->i_af, 0, sizeof(struct xfs_ifork)); + ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS; } int @@ -301,9 +307,9 @@ xfs_iformat_attr_fork( * Initialize the extent count early, as the per-format routines may * depend on it. */ - ip->i_afp = xfs_ifork_alloc(dip->di_aformat, naextents); + xfs_ifork_init_attr(ip, dip->di_aformat, naextents); - switch (ip->i_afp->if_format) { + switch (ip->i_af.if_format) { case XFS_DINODE_FMT_LOCAL: error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, xfs_dfork_attr_shortform_size(dip)); @@ -323,10 +329,8 @@ xfs_iformat_attr_fork( break; } - if (error) { - kmem_cache_free(xfs_ifork_cache, ip->i_afp); - ip->i_afp = NULL; - } + if (error) + xfs_ifork_zap_attr(ip); return error; } @@ -370,7 +374,7 @@ xfs_iroot_realloc( return; } - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); if (rec_diff > 0) { /* * If there wasn't any memory allocated before, just @@ -400,7 +404,7 @@ xfs_iroot_realloc( (int)new_size); ifp->if_broot_bytes = (int)new_size; ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); + xfs_inode_fork_size(ip, whichfork)); memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t)); return; } @@ -454,7 +458,7 @@ xfs_iroot_realloc( ifp->if_broot_bytes = (int)new_size; if (ifp->if_broot) ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); + xfs_inode_fork_size(ip, whichfork)); return; } @@ -480,11 +484,11 @@ xfs_idata_realloc( int64_t byte_diff, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); int64_t new_size = ifp->if_bytes + byte_diff; ASSERT(new_size >= 0); - ASSERT(new_size <= XFS_IFORK_SIZE(ip, whichfork)); + ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork)); if (byte_diff == 0) return; @@ -539,7 +543,7 @@ xfs_iextents_copy( int whichfork) { int state = xfs_bmap_fork_to_state(whichfork); - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_iext_cursor icur; struct xfs_bmbt_irec rec; int64_t copied = 0; @@ -591,7 +595,7 @@ xfs_iflush_fork( if (!iip) return; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); /* * This can happen if we gave up in iformat in an error path, * for the attribute fork. @@ -607,7 +611,7 @@ xfs_iflush_fork( if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(ifp->if_u1.if_data != NULL); - ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); + ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork)); memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); } break; @@ -626,7 +630,7 @@ xfs_iflush_fork( (ifp->if_broot_bytes > 0)) { ASSERT(ifp->if_broot != NULL); ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); + xfs_inode_fork_size(ip, whichfork)); xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, XFS_DFORK_SIZE(dip, mp, whichfork)); @@ -656,7 +660,7 @@ xfs_iext_state_to_fork( if (state & BMAP_COWFORK) return ip->i_cowfp; else if (state & BMAP_ATTRFORK) - return ip->i_afp; + return &ip->i_af; return &ip->i_df; } @@ -707,18 +711,17 @@ int xfs_ifork_verify_local_attr( struct xfs_inode *ip) { - struct xfs_ifork *ifp = ip->i_afp; + struct xfs_ifork *ifp = &ip->i_af; xfs_failaddr_t fa; - if (!ifp) + if (!xfs_inode_has_attr_fork(ip)) fa = __this_address; else fa = xfs_attr_shortform_verify(ip); if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", - ifp ? ifp->if_u1.if_data : NULL, - ifp ? ifp->if_bytes : 0, fa); + ifp->if_u1.if_data, ifp->if_bytes, fa); return -EFSCORRUPTED; } @@ -731,7 +734,7 @@ xfs_iext_count_may_overflow( int whichfork, int nr_to_add) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); uint64_t max_exts; uint64_t nr_exts; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 4f68c1f20beb..d3943d6ad0b9 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -77,28 +77,8 @@ struct xfs_ifork { /* * Fork handling. */ - -#define XFS_IFORK_Q(ip) ((ip)->i_forkoff != 0) -#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_forkoff << 3)) - -#define XFS_IFORK_PTR(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - &(ip)->i_df : \ - ((w) == XFS_ATTR_FORK ? \ - (ip)->i_afp : \ - (ip)->i_cowfp)) -#define XFS_IFORK_DSIZE(ip) \ - (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount)) -#define XFS_IFORK_ASIZE(ip) \ - (XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0) -#define XFS_IFORK_SIZE(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - XFS_IFORK_DSIZE(ip) : \ - ((w) == XFS_ATTR_FORK ? \ - XFS_IFORK_ASIZE(ip) : \ - 0)) #define XFS_IFORK_MAXEXT(ip, w) \ - (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) + (xfs_inode_fork_size(ip, w) / sizeof(xfs_bmbt_rec_t)) static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp) { @@ -179,8 +159,9 @@ xfs_dfork_nextents( return 0; } -struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format, - xfs_extnum_t nextents); +void xfs_ifork_zap_attr(struct xfs_inode *ip); +void xfs_ifork_init_attr(struct xfs_inode *ip, enum xfs_dinode_fmt format, + xfs_extnum_t nextents); struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 97e9e6020596..64b910caafaa 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -111,7 +111,7 @@ xfs_refcount_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; xfs_agblock_t realstart; @@ -121,8 +121,6 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - - agno = cur->bc_ag.pag->pag_agno; if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) goto out_bad_rec; @@ -137,22 +135,23 @@ xfs_refcount_get_rec( } /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(mp, agno, realstart)) + if (!xfs_verify_agbno(pag, realstart)) goto out_bad_rec; if (realstart > realstart + irec->rc_blockcount) goto out_bad_rec; - if (!xfs_verify_agbno(mp, agno, realstart + irec->rc_blockcount - 1)) + if (!xfs_verify_agbno(pag, realstart + irec->rc_blockcount - 1)) goto out_bad_rec; if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) goto out_bad_rec; - trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec); return 0; out_bad_rec: xfs_warn(mp, - "Refcount BTree record corruption in AG %d detected!", agno); + "Refcount BTree record corruption in AG %d detected!", + pag->pag_agno); xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); @@ -1177,8 +1176,8 @@ xfs_refcount_finish_one( *pcur = NULL; } if (rcur == NULL) { - error = xfs_alloc_read_agf(tp->t_mountp, tp, pag->pag_agno, - XFS_ALLOC_FLAG_FREEING, &agbp); + error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING, + &agbp); if (error) goto out_drop; @@ -1710,7 +1709,7 @@ xfs_refcount_recover_cow_leftovers( if (error) return error; - error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) goto out_trans; cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index d14c1720b0fb..316c1ec0c3c2 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -493,7 +493,7 @@ xfs_refcountbt_calc_reserves( if (!xfs_has_reflink(mp)) return 0; - error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; @@ -507,8 +507,7 @@ xfs_refcountbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno) + if (xfs_ag_contains_log(mp, pag->pag_agno)) agblocks -= mp->m_sb.sb_logblocks; *ask += xfs_refcountbt_max_size(mp, agblocks); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 2845019d31da..094dfc897ebc 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -215,7 +215,7 @@ xfs_rmap_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; @@ -235,12 +235,12 @@ xfs_rmap_get_rec( goto out_bad_rec; } else { /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(mp, agno, irec->rm_startblock)) + if (!xfs_verify_agbno(pag, irec->rm_startblock)) goto out_bad_rec; if (irec->rm_startblock > irec->rm_startblock + irec->rm_blockcount) goto out_bad_rec; - if (!xfs_verify_agbno(mp, agno, + if (!xfs_verify_agbno(pag, irec->rm_startblock + irec->rm_blockcount - 1)) goto out_bad_rec; } @@ -254,7 +254,7 @@ xfs_rmap_get_rec( out_bad_rec: xfs_warn(mp, "Reverse Mapping BTree record corruption in AG %d detected!", - agno); + pag->pag_agno); xfs_warn(mp, "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 69e104d0277f..7f83f62e51e0 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -90,7 +90,7 @@ xfs_rmapbt_alloc_block( xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, + error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; @@ -129,7 +129,7 @@ xfs_rmapbt_free_block( bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1); if (error) return error; @@ -652,7 +652,7 @@ xfs_rmapbt_calc_reserves( if (!xfs_has_rmapbt(mp)) return 0; - error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; @@ -666,8 +666,7 @@ xfs_rmapbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno) + if (xfs_ag_contains_log(mp, pag->pag_agno)) agblocks -= mp->m_sb.sb_logblocks; /* Reserve 1% of the AG or enough for 1 block per record. */ diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 8b9bd178a487..bdc777b9ec4a 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -204,7 +204,7 @@ xfs_failaddr_t xfs_symlink_shortform_verify( struct xfs_inode *ip) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); char *sfp = (char *)ifp->if_u1.if_data; int size = ifp->if_bytes; char *endp = sfp + size; diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index e810d23f2d97..5c2765934732 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -13,25 +13,13 @@ #include "xfs_mount.h" #include "xfs_ag.h" -/* Find the size of the AG, in blocks. */ -inline xfs_agblock_t -xfs_ag_block_count( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - ASSERT(agno < mp->m_sb.sb_agcount); - - if (agno < mp->m_sb.sb_agcount - 1) - return mp->m_sb.sb_agblocks; - return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks); -} /* * Verify that an AG block number pointer neither points outside the AG * nor points at static metadata. */ -inline bool -xfs_verify_agbno( +static inline bool +xfs_verify_agno_agbno( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno) @@ -59,7 +47,7 @@ xfs_verify_fsbno( if (agno >= mp->m_sb.sb_agcount) return false; - return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); + return xfs_verify_agno_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); } /* @@ -85,40 +73,12 @@ xfs_verify_fsbext( XFS_FSB_TO_AGNO(mp, fsbno + len - 1); } -/* Calculate the first and last possible inode number in an AG. */ -inline void -xfs_agino_range( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agino_t *first, - xfs_agino_t *last) -{ - xfs_agblock_t bno; - xfs_agblock_t eoag; - - eoag = xfs_ag_block_count(mp, agno); - - /* - * Calculate the first inode, which will be in the first - * cluster-aligned block after the AGFL. - */ - bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align); - *first = XFS_AGB_TO_AGINO(mp, bno); - - /* - * Calculate the last inode, which will be at the end of the - * last (aligned) cluster that can be allocated in the AG. - */ - bno = round_down(eoag, M_IGEO(mp)->cluster_align); - *last = XFS_AGB_TO_AGINO(mp, bno) - 1; -} - /* * Verify that an AG inode number pointer neither points outside the AG * nor points at static metadata. */ -inline bool -xfs_verify_agino( +static inline bool +xfs_verify_agno_agino( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino) @@ -131,19 +91,6 @@ xfs_verify_agino( } /* - * Verify that an AG inode number pointer neither points outside the AG - * nor points at static metadata, or is NULLAGINO. - */ -bool -xfs_verify_agino_or_null( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agino_t agino) -{ - return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino); -} - -/* * Verify that an FS inode number pointer neither points outside the * filesystem nor points at static AG metadata. */ @@ -159,7 +106,7 @@ xfs_verify_ino( return false; if (XFS_AGINO_TO_INO(mp, agno, agino) != ino) return false; - return xfs_verify_agino(mp, agno, agino); + return xfs_verify_agno_agino(mp, agno, agino); } /* Is this an internal inode number? */ @@ -229,12 +176,8 @@ xfs_icount_range( /* root, rtbitmap, rtsum all live in the first chunk */ *min = XFS_INODES_PER_CHUNK; - for_each_perag(mp, agno, pag) { - xfs_agino_t first, last; - - xfs_agino_range(mp, agno, &first, &last); - nr_inos += last - first + 1; - } + for_each_perag(mp, agno, pag) + nr_inos += pag->agino_max - pag->agino_min + 1; *max = nr_inos; } diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 373f64a492a4..a6b7d98cf68f 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -179,19 +179,10 @@ enum xfs_ag_resv_type { */ struct xfs_mount; -xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); -bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno); bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno, xfs_fsblock_t len); -void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t *first, xfs_agino_t *last); -bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t agino); -bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t agino); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 90aebfe9dc5f..b7b838bd4ba4 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -541,16 +541,16 @@ xchk_agf( /* Check the AG length */ eoag = be32_to_cpu(agf->agf_length); - if (eoag != xfs_ag_block_count(mp, agno)) + if (eoag != pag->block_count) xchk_block_set_corrupt(sc, sc->sa.agf_bp); /* Check the AGF btree roots and levels */ agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); @@ -563,7 +563,7 @@ xchk_agf( if (xfs_has_rmapbt(mp)) { agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); @@ -573,7 +573,7 @@ xchk_agf( if (xfs_has_reflink(mp)) { agbno = be32_to_cpu(agf->agf_refcount_root); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); level = be32_to_cpu(agf->agf_refcount_level); @@ -639,9 +639,8 @@ xchk_agfl_block( { struct xchk_agfl_info *sai = priv; struct xfs_scrub *sc = sai->sc; - xfs_agnumber_t agno = sc->sa.pag->pag_agno; - if (xfs_verify_agbno(mp, agno, agbno) && + if (xfs_verify_agbno(sc->sa.pag, agbno) && sai->nr_entries < sai->sz_entries) sai->entries[sai->nr_entries++] = agbno; else @@ -871,12 +870,12 @@ xchk_agi( /* Check the AG length */ eoag = be32_to_cpu(agi->agi_length); - if (eoag != xfs_ag_block_count(mp, agno)) + if (eoag != pag->block_count) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check btree roots and levels */ agbno = be32_to_cpu(agi->agi_root); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); level = be32_to_cpu(agi->agi_level); @@ -885,7 +884,7 @@ xchk_agi( if (xfs_has_finobt(mp)) { agbno = be32_to_cpu(agi->agi_free_root); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); level = be32_to_cpu(agi->agi_free_level); @@ -902,17 +901,17 @@ xchk_agi( /* Check inode pointers */ agino = be32_to_cpu(agi->agi_newino); - if (!xfs_verify_agino_or_null(mp, agno, agino)) + if (!xfs_verify_agino_or_null(pag, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); agino = be32_to_cpu(agi->agi_dirino); - if (!xfs_verify_agino_or_null(mp, agno, agino)) + if (!xfs_verify_agino_or_null(pag, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check unlinked inode buckets */ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { agino = be32_to_cpu(agi->agi_unlinked[i]); - if (!xfs_verify_agino_or_null(mp, agno, agino)) + if (!xfs_verify_agino_or_null(pag, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 6da7f2ca77de..1b0b4e243f77 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -106,7 +106,7 @@ xrep_agf_check_agfl_block( { struct xfs_scrub *sc = priv; - if (!xfs_verify_agbno(mp, sc->sa.pag->pag_agno, agbno)) + if (!xfs_verify_agbno(sc->sa.pag, agbno)) return -EFSCORRUPTED; return 0; } @@ -130,10 +130,7 @@ xrep_check_btree_root( struct xfs_scrub *sc, struct xrep_find_ag_btree *fab) { - struct xfs_mount *mp = sc->mp; - xfs_agnumber_t agno = sc->sm->sm_agno; - - return xfs_verify_agbno(mp, agno, fab->root) && + return xfs_verify_agbno(sc->sa.pag, fab->root) && fab->height <= fab->maxlevels; } @@ -201,8 +198,7 @@ xrep_agf_init_header( agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno); - agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp, - sc->sa.pag->pag_agno)); + agf->agf_length = cpu_to_be32(sc->sa.pag->block_count); agf->agf_flfirst = old_agf->agf_flfirst; agf->agf_fllast = old_agf->agf_fllast; agf->agf_flcount = old_agf->agf_flcount; @@ -405,7 +401,7 @@ xrep_agf( * btrees rooted in the AGF. If the AGFL contents are obviously bad * then we'll bail out. */ - error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.pag->pag_agno, &agfl_bp); + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); if (error) return error; @@ -666,8 +662,7 @@ xrep_agfl( * nothing wrong with the AGF, but all the AG header repair functions * have this chicken-and-egg problem. */ - error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0, - &agf_bp); + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); if (error) return error; @@ -742,8 +737,7 @@ xrep_agi_find_btrees( int error; /* Read the AGF. */ - error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0, - &agf_bp); + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); if (error) return error; @@ -782,8 +776,7 @@ xrep_agi_init_header( agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno); - agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp, - sc->sa.pag->pag_agno)); + agi->agi_length = cpu_to_be32(sc->sa.pag->block_count); agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_has_crc(mp)) diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 87518e1292f8..ab427b4d7fe0 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -93,8 +93,7 @@ xchk_allocbt_rec( struct xchk_btree *bs, const union xfs_btree_rec *rec) { - struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = bs->cur->bc_ag.pag; xfs_agblock_t bno; xfs_extlen_t len; @@ -102,8 +101,8 @@ xchk_allocbt_rec( len = be32_to_cpu(rec->alloc.ar_blockcount); if (bno + len <= bno || - !xfs_verify_agbno(mp, agno, bno) || - !xfs_verify_agbno(mp, agno, bno + len - 1)) + !xfs_verify_agbno(pag, bno) || + !xfs_verify_agbno(pag, bno + len - 1)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); xchk_allocbt_xref(bs->sc, bno, len); diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 285995ba3947..f0b9cb6506fd 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -377,7 +377,7 @@ xchk_bmapbt_rec( struct xfs_inode *ip = bs->cur->bc_ino.ip; struct xfs_buf *bp = NULL; struct xfs_btree_block *block; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, info->whichfork); uint64_t owner; int i; @@ -426,7 +426,7 @@ xchk_bmap_btree( struct xchk_bmap_info *info) { struct xfs_owner_info oinfo; - struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = sc->ip; struct xfs_btree_cur *cur; @@ -478,7 +478,7 @@ xchk_bmap_check_rmap( return 0; /* Now look up the bmbt record. */ - ifp = XFS_IFORK_PTR(sc->ip, sbcri->whichfork); + ifp = xfs_ifork_ptr(sc->ip, sbcri->whichfork); if (!ifp) { xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); @@ -540,7 +540,7 @@ xchk_bmap_check_ag_rmaps( struct xfs_buf *agf; int error; - error = xfs_alloc_read_agf(sc->mp, sc->tp, pag->pag_agno, 0, &agf); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf); if (error) return error; @@ -563,7 +563,7 @@ xchk_bmap_check_rmaps( struct xfs_scrub *sc, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); struct xfs_perag *pag; xfs_agnumber_t agno; bool zero_size; @@ -578,7 +578,7 @@ xchk_bmap_check_rmaps( if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK) return 0; - ASSERT(XFS_IFORK_PTR(sc->ip, whichfork) != NULL); + ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL); /* * Only do this for complex maps that are in btree format, or for @@ -624,7 +624,7 @@ xchk_bmap( struct xchk_bmap_info info = { NULL }; struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = sc->ip; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t endoff; struct xfs_iext_cursor icur; int error = 0; @@ -689,7 +689,7 @@ xchk_bmap( /* Scrub extent records. */ info.lastoff = 0; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 39dd46f038fe..2f4519590dc1 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -462,7 +462,7 @@ xchk_btree_check_iroot_minrecs( */ if (bs->cur->bc_btnum == XFS_BTNUM_BMAP && bs->cur->bc_ino.whichfork == XFS_DATA_FORK && - XFS_IFORK_Q(bs->sc->ip)) + xfs_inode_has_attr_fork(bs->sc->ip)) return false; return true; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 97b54ac3075f..9bbbf20f401b 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -416,15 +416,15 @@ xchk_ag_read_headers( if (!sa->pag) return -ENOENT; - error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp); + error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) return error; - error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp); + error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) return error; - error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp); + error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) return error; diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index b962cfbbd92b..84fe3d33d699 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -482,7 +482,7 @@ xchk_da_btree( int error; /* Skip short format data structures; no btree to scan. */ - if (!xfs_ifork_has_extents(XFS_IFORK_PTR(sc->ip, whichfork))) + if (!xfs_ifork_has_extents(xfs_ifork_ptr(sc->ip, whichfork))) return 0; /* Set up initial da state. */ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 38897adde7b5..5abb5fdb71d9 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -667,7 +667,7 @@ xchk_directory_blocks( { struct xfs_bmbt_irec got; struct xfs_da_args args; - struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; xfs_fileoff_t leaf_lblk; xfs_fileoff_t free_lblk; diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 48a6cbdf95d0..6a6f8fe7f87c 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -78,10 +78,10 @@ xchk_fscount_warmup( continue; /* Lock both AG headers. */ - error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp); + error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); if (error) break; - error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp); if (error) break; diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 2e61df3bca83..aa65ec88a0c0 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -8,6 +8,8 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_btree.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_ag.h" #include "xfs_health.h" #include "scrub/scrub.h" diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 00848ee542fb..e1026e07bf94 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -104,13 +104,13 @@ xchk_iallocbt_chunk( xfs_extlen_t len) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = bs->cur->bc_ag.pag; xfs_agblock_t bno; bno = XFS_AGINO_TO_AGBNO(mp, agino); if (bno + len <= bno || - !xfs_verify_agbno(mp, agno, bno) || - !xfs_verify_agbno(mp, agno, bno + len - 1)) + !xfs_verify_agbno(pag, bno) || + !xfs_verify_agbno(pag, bno + len - 1)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len); @@ -421,10 +421,10 @@ xchk_iallocbt_rec( const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_perag *pag = bs->cur->bc_ag.pag; struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agino_t agino; xfs_extlen_t len; int holecount; @@ -446,8 +446,8 @@ xchk_iallocbt_rec( agino = irec.ir_startino; /* Record has to be properly aligned within the AG. */ - if (!xfs_verify_agino(mp, agno, agino) || - !xfs_verify_agino(mp, agno, agino + XFS_INODES_PER_CHUNK - 1)) { + if (!xfs_verify_agino(pag, agino) || + !xfs_verify_agino(pag, agino + XFS_INODES_PER_CHUNK - 1)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); goto out; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 3c7506c7553c..21b4c9006859 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -185,7 +185,7 @@ xchk_quota_data_fork( /* Check for data fork problems that apply only to quota files. */ max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; - ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) break; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 2744eecdbaf0..c68b767dc08f 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -13,6 +13,8 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_ag.h" /* @@ -332,9 +334,8 @@ xchk_refcountbt_rec( struct xchk_btree *bs, const union xfs_btree_rec *rec) { - struct xfs_mount *mp = bs->cur->bc_mp; xfs_agblock_t *cow_blocks = bs->private; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = bs->cur->bc_ag.pag; xfs_agblock_t bno; xfs_extlen_t len; xfs_nlink_t refcount; @@ -354,8 +355,8 @@ xchk_refcountbt_rec( /* Check the extent. */ bno &= ~XFS_REFC_COW_START; if (bno + len <= bno || - !xfs_verify_agbno(mp, agno, bno) || - !xfs_verify_agbno(mp, agno, bno + len - 1)) + !xfs_verify_agbno(pag, bno) || + !xfs_verify_agbno(pag, bno + len - 1)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); if (refcount == 0) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1e7b6b209ee8..c18bd039fce9 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -199,7 +199,7 @@ xrep_calc_ag_resblks( icount = pag->pagi_count; } else { /* Try to get the actual counters from disk. */ - error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp); + error = xfs_ialloc_read_agi(pag, NULL, &bp); if (!error) { icount = pag->pagi_count; xfs_buf_relse(bp); @@ -207,9 +207,9 @@ xrep_calc_ag_resblks( } /* Now grab the block counters from the AGF. */ - error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp); + error = xfs_alloc_read_agf(pag, NULL, 0, &bp); if (error) { - aglen = xfs_ag_block_count(mp, sm->sm_agno); + aglen = pag->block_count; freelen = aglen; usedlen = aglen; } else { @@ -220,25 +220,22 @@ xrep_calc_ag_resblks( usedlen = aglen - freelen; xfs_buf_relse(bp); } - xfs_perag_put(pag); /* If the icount is impossible, make some worst-case assumptions. */ if (icount == NULLAGINO || - !xfs_verify_agino(mp, sm->sm_agno, icount)) { - xfs_agino_t first, last; - - xfs_agino_range(mp, sm->sm_agno, &first, &last); - icount = last - first + 1; + !xfs_verify_agino(pag, icount)) { + icount = pag->agino_max - pag->agino_min + 1; } /* If the block counts are impossible, make worst-case assumptions. */ if (aglen == NULLAGBLOCK || - aglen != xfs_ag_block_count(mp, sm->sm_agno) || + aglen != pag->block_count || freelen >= aglen) { - aglen = xfs_ag_block_count(mp, sm->sm_agno); + aglen = pag->block_count; freelen = aglen; usedlen = aglen; } + xfs_perag_put(pag); trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen, freelen, usedlen); @@ -300,13 +297,13 @@ xrep_alloc_ag_block( switch (resv) { case XFS_AG_RESV_AGFL: case XFS_AG_RESV_RMAPBT: - error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1); + error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp, + sc->sa.agf_bp, &bno, 1); if (error) return error; if (bno == NULLAGBLOCK) return -ENOSPC; - xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, - 1, false); + xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false); *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno); if (resv == XFS_AG_RESV_RMAPBT) xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno); @@ -457,16 +454,19 @@ xrep_invalidate_blocks( * assume it's owned by someone else. */ for_each_xbitmap_block(fsbno, bmr, n, bitmap) { + int error; + /* Skip AG headers and post-EOFS blocks */ if (!xfs_verify_fsbno(sc->mp, fsbno)) continue; - bp = xfs_buf_incore(sc->mp->m_ddev_targp, + error = xfs_buf_incore(sc->mp->m_ddev_targp, XFS_FSB_TO_DADDR(sc->mp, fsbno), - XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK); - if (bp) { - xfs_trans_bjoin(sc->tp, bp); - xfs_trans_binval(sc->tp, bp); - } + XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp); + if (error) + continue; + + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); } return 0; @@ -516,8 +516,8 @@ xrep_put_freelist( return error; /* Put the block on the AGFL. */ - error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp, - agbno, 0); + error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, + sc->sa.agfl_bp, agbno, 0); if (error) return error; xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, @@ -536,13 +536,12 @@ xrep_reap_block( { struct xfs_btree_cur *cur; struct xfs_buf *agf_bp = NULL; - xfs_agnumber_t agno; xfs_agblock_t agbno; bool has_other_rmap; int error; - agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); /* * If we are repairing per-inode metadata, we need to read in the AGF @@ -550,7 +549,7 @@ xrep_reap_block( * the AGF buffer that the setup functions already grabbed. */ if (sc->ip) { - error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp); + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); if (error) return error; } else { diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 8dae0345c7df..229826b2e1c0 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -92,7 +92,7 @@ xchk_rmapbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_rmap_irec irec; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = bs->cur->bc_ag.pag; bool non_inode; bool is_unwritten; bool is_bmbt; @@ -121,8 +121,8 @@ xchk_rmapbt_rec( * Otherwise we must point somewhere past the static metadata * but before the end of the FS. Run the regular check. */ - if (!xfs_verify_agbno(mp, agno, irec.rm_startblock) || - !xfs_verify_agbno(mp, agno, irec.rm_startblock + + if (!xfs_verify_agbno(pag, irec.rm_startblock) || + !xfs_verify_agbno(pag, irec.rm_startblock + irec.rm_blockcount - 1)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); } diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 599ee277bba2..75311f8daeeb 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -41,7 +41,7 @@ xchk_symlink( if (!S_ISLNK(VFS_I(ip)->i_mode)) return -ENOENT; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); len = ip->i_disk_size; /* Plausible size? */ @@ -52,8 +52,8 @@ xchk_symlink( /* Inline symlink? */ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { - if (len > XFS_IFORK_DSIZE(ip) || - len > strnlen(ifp->if_u1.if_data, XFS_IFORK_DSIZE(ip))) + if (len > xfs_inode_data_fork_size(ip) || + len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip))) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); goto out; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8ec38b25187b..5d1a995b15f8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -570,7 +570,7 @@ const struct address_space_operations xfs_address_space_operations = { .invalidate_folio = iomap_invalidate_folio, .bmap = xfs_vm_bmap, .direct_IO = noop_direct_IO, - .migratepage = iomap_migrate_page, + .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = xfs_iomap_swapfile_activate, diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 27265771f247..5db87b34fb6e 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -158,6 +158,7 @@ xfs_attr3_node_inactive( } child_fsb = be32_to_cpu(ichdr.btree[0].before); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + bp = NULL; /* * If this is the node level just above the leaves, simply loop @@ -211,12 +212,8 @@ xfs_attr3_node_inactive( &child_bp); if (error) return error; - error = bp->b_error; - if (error) { - xfs_trans_brelse(*trans, child_bp); - return error; - } xfs_trans_binval(*trans, child_bp); + child_bp = NULL; /* * If we're not done, re-read the parent to get the next @@ -233,6 +230,7 @@ xfs_attr3_node_inactive( bp->b_addr); child_fsb = be32_to_cpu(phdr.btree[i + 1].before); xfs_trans_brelse(*trans, bp); + bp = NULL; } /* * Atomically commit the whole invalidate stuff. @@ -338,7 +336,7 @@ xfs_attr_inactive( ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); xfs_ilock(dp, lock_mode); - if (!XFS_IFORK_Q(dp)) + if (!xfs_inode_has_attr_fork(dp)) goto out_destroy_fork; xfs_iunlock(dp, lock_mode); @@ -351,7 +349,7 @@ xfs_attr_inactive( lock_mode = XFS_ILOCK_EXCL; xfs_ilock(dp, lock_mode); - if (!XFS_IFORK_Q(dp)) + if (!xfs_inode_has_attr_fork(dp)) goto out_cancel; /* @@ -362,12 +360,11 @@ xfs_attr_inactive( /* * Invalidate and truncate the attribute fork extents. Make sure the - * fork actually has attributes as otherwise the invalidation has no + * fork actually has xattr blocks as otherwise the invalidation has no * blocks to read and returns an error. In this case, just do the fork * removal below. */ - if (xfs_inode_hasattr(dp) && - dp->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { + if (dp->i_af.if_nextents > 0) { error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out_cancel; @@ -388,11 +385,7 @@ out_cancel: xfs_trans_cancel(trans); out_destroy_fork: /* kill the in-core attr fork before we drop the inode lock */ - if (dp->i_afp) { - xfs_idestroy_fork(dp->i_afp); - kmem_cache_free(xfs_ifork_cache, dp->i_afp); - dp->i_afp = NULL; - } + xfs_ifork_zap_attr(dp); if (lock_mode) xfs_iunlock(dp, lock_mode); return error; diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 135d44133477..5077a7ad5646 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -576,7 +576,7 @@ xfs_attri_item_recover( struct xfs_trans_res tres; struct xfs_attri_log_format *attrp; struct xfs_attri_log_nameval *nv = attrip->attri_nameval; - int error, ret = 0; + int error; int total; int local; struct xfs_attrd_log_item *done_item = NULL; @@ -655,29 +655,32 @@ xfs_attri_item_recover( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - ret = xfs_xattri_finish_update(attr, done_item); - if (ret == -EAGAIN) { - /* There's more work to do, so add it to this transaction */ + error = xfs_xattri_finish_update(attr, done_item); + if (error == -EAGAIN) { + /* + * There's more work to do, so add the intent item to this + * transaction so that we can continue it later. + */ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); - } else - error = ret; + error = xfs_defer_ops_capture_and_commit(tp, capture_list); + if (error) + goto out_unlock; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); + return 0; + } if (error) { xfs_trans_cancel(tp); goto out_unlock; } error = xfs_defer_ops_capture_and_commit(tp, capture_list); - out_unlock: - if (attr->xattri_leaf_bp) - xfs_buf_relse(attr->xattri_leaf_bp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); out: - if (ret != -EAGAIN) - xfs_attr_free_item(attr); + xfs_attr_free_item(attr); return error; } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 90a14e85e76d..99bbbe1a0e44 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -61,8 +61,7 @@ xfs_attr_shortform_list( int sbsize, nsbuf, count, i; int error = 0; - ASSERT(dp->i_afp != NULL); - sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) return 0; @@ -80,7 +79,7 @@ xfs_attr_shortform_list( */ if (context->bufsize == 0 || (XFS_ISRESET_CURSOR(cursor) && - (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { + (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { if (XFS_IS_CORRUPT(context->dp->i_mount, !xfs_attr_namecheck(sfe->nameval, @@ -121,7 +120,7 @@ xfs_attr_shortform_list( for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { if (unlikely( ((char *)sfe < (char *)sf) || - ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { + ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) { XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", XFS_ERRLEVEL_LOW, context->dp->i_mount, sfe, @@ -513,7 +512,7 @@ xfs_attr_list_ilocked( */ if (!xfs_inode_hasattr(dp)) return 0; - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) + if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_list(context); if (xfs_attr_is_leaf(dp)) return xfs_attr_leaf_list(context); diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index ae4345b37621..fe21c76f75b8 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -15,7 +15,7 @@ xfs_rw_bdev( sector_t sector, unsigned int count, char *data, - unsigned int op) + enum req_op op) { unsigned int is_vmalloc = is_vmalloc_addr(data); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 52be58372c63..04d0c2bff67c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -256,7 +256,7 @@ xfs_bmap_count_blocks( xfs_filblks_t *count) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; xfs_extlen_t btblocks = 0; int error; @@ -439,29 +439,28 @@ xfs_getbmap( whichfork = XFS_COW_FORK; else whichfork = XFS_DATA_FORK; - ifp = XFS_IFORK_PTR(ip, whichfork); xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { case XFS_ATTR_FORK: - if (!XFS_IFORK_Q(ip)) - goto out_unlock_iolock; + lock = xfs_ilock_attr_map_shared(ip); + if (!xfs_inode_has_attr_fork(ip)) + goto out_unlock_ilock; max_len = 1LL << 32; - lock = xfs_ilock_attr_map_shared(ip); break; case XFS_COW_FORK: + lock = XFS_ILOCK_SHARED; + xfs_ilock(ip, lock); + /* No CoW fork? Just return */ - if (!ifp) - goto out_unlock_iolock; + if (!xfs_ifork_ptr(ip, whichfork)) + goto out_unlock_ilock; if (xfs_get_cowextsz_hint(ip)) max_len = mp->m_super->s_maxbytes; else max_len = XFS_ISIZE(ip); - - lock = XFS_ILOCK_SHARED; - xfs_ilock(ip, lock); break; case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) && @@ -491,6 +490,8 @@ xfs_getbmap( break; } + ifp = xfs_ifork_ptr(ip, whichfork); + switch (ifp->if_format) { case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -686,6 +687,8 @@ xfs_can_free_eofblocks( * forever. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) + end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return false; @@ -1318,8 +1321,8 @@ xfs_swap_extents_check_format( * extent format... */ if (tifp->if_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_Q(ip) && - XFS_BMAP_BMDR_SPACE(tifp->if_broot) > XFS_IFORK_BOFF(ip)) + if (xfs_inode_has_attr_fork(ip) && + XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip)) return -EINVAL; if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return -EINVAL; @@ -1327,8 +1330,8 @@ xfs_swap_extents_check_format( /* Reciprocal target->temp btree format checks */ if (ifp->if_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_Q(tip) && - XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) + if (xfs_inode_has_attr_fork(tip) && + XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip)) return -EINVAL; if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return -EINVAL; @@ -1504,15 +1507,15 @@ xfs_swap_extent_forks( /* * Count the number of extended attribute blocks */ - if (XFS_IFORK_Q(ip) && ip->i_afp->if_nextents > 0 && - ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { + if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 && + ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk, &aforkblks); if (error) return error; } - if (XFS_IFORK_Q(tip) && tip->i_afp->if_nextents > 0 && - tip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { + if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 && + tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk, &taforkblks); if (error) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index bf4e60871068..dde346450952 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -5,6 +5,7 @@ */ #include "xfs.h" #include <linux/backing-dev.h> +#include <linux/dax.h> #include "xfs_shared.h" #include "xfs_format.h" @@ -21,7 +22,7 @@ #include "xfs_error.h" #include "xfs_ag.h" -static struct kmem_cache *xfs_buf_cache; +struct kmem_cache *xfs_buf_cache; /* * Locking orders @@ -295,6 +296,16 @@ xfs_buf_free_pages( } static void +xfs_buf_free_callback( + struct callback_head *cb) +{ + struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); + + xfs_buf_free_maps(bp); + kmem_cache_free(xfs_buf_cache, bp); +} + +static void xfs_buf_free( struct xfs_buf *bp) { @@ -307,8 +318,7 @@ xfs_buf_free( else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); - xfs_buf_free_maps(bp); - kmem_cache_free(xfs_buf_cache, bp); + call_rcu(&bp->b_rcu, xfs_buf_free_callback); } static int @@ -503,100 +513,45 @@ xfs_buf_hash_destroy( rhashtable_destroy(&pag->pag_buf_hash); } -/* - * Look up a buffer in the buffer cache and return it referenced and locked - * in @found_bp. - * - * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the - * cache. - * - * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return - * -EAGAIN if we fail to lock it. - * - * Return values are: - * -EFSCORRUPTED if have been supplied with an invalid address - * -EAGAIN on trylock failure - * -ENOENT if we fail to find a match and @new_bp was NULL - * 0, with @found_bp: - * - @new_bp if we inserted it into the cache - * - the buffer we found and locked. - */ static int -xfs_buf_find( +xfs_buf_map_verify( struct xfs_buftarg *btp, - struct xfs_buf_map *map, - int nmaps, - xfs_buf_flags_t flags, - struct xfs_buf *new_bp, - struct xfs_buf **found_bp) + struct xfs_buf_map *map) { - struct xfs_perag *pag; - struct xfs_buf *bp; - struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; xfs_daddr_t eofs; - int i; - - *found_bp = NULL; - - for (i = 0; i < nmaps; i++) - cmap.bm_len += map[i].bm_len; /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize)); - ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); + ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. */ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { + if (map->bm_bn < 0 || map->bm_bn >= eofs) { xfs_alert(btp->bt_mount, "%s: daddr 0x%llx out of range, EOFS 0x%llx", - __func__, cmap.bm_bn, eofs); + __func__, map->bm_bn, eofs); WARN_ON(1); return -EFSCORRUPTED; } - - pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); - - spin_lock(&pag->pag_buf_lock); - bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap, - xfs_buf_hash_params); - if (bp) { - atomic_inc(&bp->b_hold); - goto found; - } - - /* No match found */ - if (!new_bp) { - XFS_STATS_INC(btp->bt_mount, xb_miss_locked); - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); - return -ENOENT; - } - - /* the buffer keeps the perag reference until it is freed */ - new_bp->b_pag = pag; - rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head, - xfs_buf_hash_params); - spin_unlock(&pag->pag_buf_lock); - *found_bp = new_bp; return 0; +} -found: - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); - - if (!xfs_buf_trylock(bp)) { - if (flags & XBF_TRYLOCK) { - xfs_buf_rele(bp); - XFS_STATS_INC(btp->bt_mount, xb_busy_locked); +static int +xfs_buf_find_lock( + struct xfs_buf *bp, + xfs_buf_flags_t flags) +{ + if (flags & XBF_TRYLOCK) { + if (!xfs_buf_trylock(bp)) { + XFS_STATS_INC(bp->b_mount, xb_busy_locked); return -EAGAIN; } + } else { xfs_buf_lock(bp); - XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited); + XFS_STATS_INC(bp->b_mount, xb_get_locked_waited); } /* @@ -609,57 +564,59 @@ found: bp->b_flags &= _XBF_KMEM | _XBF_PAGES; bp->b_ops = NULL; } - - trace_xfs_buf_find(bp, flags, _RET_IP_); - XFS_STATS_INC(btp->bt_mount, xb_get_locked); - *found_bp = bp; return 0; } -struct xfs_buf * -xfs_buf_incore( - struct xfs_buftarg *target, - xfs_daddr_t blkno, - size_t numblks, - xfs_buf_flags_t flags) +static inline int +xfs_buf_lookup( + struct xfs_perag *pag, + struct xfs_buf_map *map, + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { - struct xfs_buf *bp; + struct xfs_buf *bp; int error; - DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - error = xfs_buf_find(target, &map, 1, flags, NULL, &bp); - if (error) - return NULL; - return bp; + rcu_read_lock(); + bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params); + if (!bp || !atomic_inc_not_zero(&bp->b_hold)) { + rcu_read_unlock(); + return -ENOENT; + } + rcu_read_unlock(); + + error = xfs_buf_find_lock(bp, flags); + if (error) { + xfs_buf_rele(bp); + return error; + } + + trace_xfs_buf_find(bp, flags, _RET_IP_); + *bpp = bp; + return 0; } /* - * Assembles a buffer covering the specified range. The code is optimised for - * cache hits, as metadata intensive workloads will see 3 orders of magnitude - * more hits than misses. + * Insert the new_bp into the hash table. This consumes the perag reference + * taken for the lookup regardless of the result of the insert. */ -int -xfs_buf_get_map( - struct xfs_buftarg *target, +static int +xfs_buf_find_insert( + struct xfs_buftarg *btp, + struct xfs_perag *pag, + struct xfs_buf_map *cmap, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp) { - struct xfs_buf *bp; struct xfs_buf *new_bp; + struct xfs_buf *bp; int error; - *bpp = NULL; - error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp); - if (!error) - goto found; - if (error != -ENOENT) - return error; - - error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp); + error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); if (error) - return error; + goto out_drop_pag; /* * For buffers that fit entirely within a single page, first attempt to @@ -674,18 +631,94 @@ xfs_buf_get_map( goto out_free_buf; } - error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); - if (error) + spin_lock(&pag->pag_buf_lock); + bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash, + &new_bp->b_rhash_head, xfs_buf_hash_params); + if (IS_ERR(bp)) { + error = PTR_ERR(bp); + spin_unlock(&pag->pag_buf_lock); goto out_free_buf; + } + if (bp) { + /* found an existing buffer */ + atomic_inc(&bp->b_hold); + spin_unlock(&pag->pag_buf_lock); + error = xfs_buf_find_lock(bp, flags); + if (error) + xfs_buf_rele(bp); + else + *bpp = bp; + goto out_free_buf; + } + + /* The new buffer keeps the perag reference until it is freed. */ + new_bp->b_pag = pag; + spin_unlock(&pag->pag_buf_lock); + *bpp = new_bp; + return 0; - if (bp != new_bp) - xfs_buf_free(new_bp); +out_free_buf: + xfs_buf_free(new_bp); +out_drop_pag: + xfs_perag_put(pag); + return error; +} -found: +/* + * Assembles a buffer covering the specified range. The code is optimised for + * cache hits, as metadata intensive workloads will see 3 orders of magnitude + * more hits than misses. + */ +int +xfs_buf_get_map( + struct xfs_buftarg *btp, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + struct xfs_buf **bpp) +{ + struct xfs_perag *pag; + struct xfs_buf *bp = NULL; + struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; + int error; + int i; + + for (i = 0; i < nmaps; i++) + cmap.bm_len += map[i].bm_len; + + error = xfs_buf_map_verify(btp, &cmap); + if (error) + return error; + + pag = xfs_perag_get(btp->bt_mount, + xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); + + error = xfs_buf_lookup(pag, &cmap, flags, &bp); + if (error && error != -ENOENT) + goto out_put_perag; + + /* cache hits always outnumber misses by at least 10:1 */ + if (unlikely(!bp)) { + XFS_STATS_INC(btp->bt_mount, xb_miss_locked); + + if (flags & XBF_INCORE) + goto out_put_perag; + + /* xfs_buf_find_insert() consumes the perag reference. */ + error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps, + flags, &bp); + if (error) + return error; + } else { + XFS_STATS_INC(btp->bt_mount, xb_get_locked); + xfs_perag_put(pag); + } + + /* We do not hold a perag reference anymore. */ if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { - xfs_warn_ratelimited(target->bt_mount, + xfs_warn_ratelimited(btp->bt_mount, "%s: failed to map %u pages", __func__, bp->b_page_count); xfs_buf_relse(bp); @@ -700,12 +733,13 @@ found: if (!(flags & XBF_READ)) xfs_buf_ioerror(bp, 0); - XFS_STATS_INC(target->bt_mount, xb_get); + XFS_STATS_INC(btp->bt_mount, xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); *bpp = bp; return 0; -out_free_buf: - xfs_buf_free(new_bp); + +out_put_perag: + xfs_perag_put(pag); return error; } @@ -1416,7 +1450,7 @@ xfs_buf_ioapply_map( int map, int *buf_offset, int *count, - int op) + blk_opf_t op) { int page_index; unsigned int total_nr_pages = bp->b_page_count; @@ -1493,7 +1527,7 @@ _xfs_buf_ioapply( struct xfs_buf *bp) { struct blk_plug plug; - int op; + blk_opf_t op; int offset; int size; int i; @@ -1911,7 +1945,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); - fs_put_dax(btp->bt_daxdev); + fs_put_dax(btp->bt_daxdev, btp->bt_mount); kmem_free(btp); } @@ -1958,13 +1992,18 @@ xfs_alloc_buftarg( struct block_device *bdev) { xfs_buftarg_t *btp; + const struct dax_holder_operations *ops = NULL; +#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) + ops = &xfs_dax_holder_operations; +#endif btp = kmem_zalloc(sizeof(*btp), KM_NOFS); btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off); + btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, + mp, ops); /* * Buffer IO error rate limiting. Limit it to no more than 10 messages @@ -1986,7 +2025,8 @@ xfs_alloc_buftarg( btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - if (register_shrinker(&btp->bt_shrinker)) + if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s", + mp->m_super->s_id)) goto error_pcpu; return btp; @@ -2275,29 +2315,6 @@ xfs_buf_delwri_pushbuf( return error; } -int __init -xfs_buf_init(void) -{ - xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, - SLAB_HWCACHE_ALIGN | - SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD, - NULL); - if (!xfs_buf_cache) - goto out; - - return 0; - - out: - return -ENOMEM; -} - -void -xfs_buf_terminate(void) -{ - kmem_cache_destroy(xfs_buf_cache); -} - void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 1ee3056ff9cf..549c60942208 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -15,6 +15,8 @@ #include <linux/uio.h> #include <linux/list_lru.h> +extern struct kmem_cache *xfs_buf_cache; + /* * Base types */ @@ -42,9 +44,11 @@ struct xfs_buf; #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ /* flags used only as arguments to access routines */ +#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ #define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ #define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ + typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ @@ -63,6 +67,7 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ /* The following interface flags should never be set */ \ + { XBF_INCORE, "INCORE" }, \ { XBF_TRYLOCK, "TRYLOCK" }, \ { XBF_UNMAPPED, "UNMAPPED" } @@ -193,13 +198,10 @@ struct xfs_buf { int b_last_error; const struct xfs_buf_ops *b_ops; + struct rcu_head b_rcu; }; /* Finding and Reading Buffers */ -struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target, - xfs_daddr_t blkno, size_t numblks, - xfs_buf_flags_t flags); - int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, @@ -210,6 +212,19 @@ void xfs_buf_readahead_map(struct xfs_buftarg *target, const struct xfs_buf_ops *ops); static inline int +xfs_buf_incore( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags, + struct xfs_buf **bpp) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + + return xfs_buf_get_map(target, &map, 1, XBF_INCORE | flags, bpp); +} + +static inline int xfs_buf_get( struct xfs_buftarg *target, xfs_daddr_t blkno, @@ -294,10 +309,6 @@ extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); -/* Buffer Daemon Setup Routines */ -extern int xfs_buf_init(void); -extern void xfs_buf_terminate(void); - static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp) { return bp->b_maps[0].bm_bn; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index a7174a5b3203..e295fc8062d8 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -248,7 +248,7 @@ xfs_dir2_leaf_readbuf( struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; struct xfs_da_geometry *geo = args->geo; - struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); struct xfs_bmbt_irec map; struct blk_plug plug; xfs_dir2_off_t new_off; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index c6fe3f6ebb6b..bfc829c07f03 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -45,7 +45,7 @@ xfs_trim_extents( */ xfs_log_force(mp, XFS_LOG_SYNC); - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, NULL, 0, &agbp); if (error) goto out_put_perag; agf = agbp->b_addr; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 5a6c3c3c4de2..8fb90da89787 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -549,7 +549,7 @@ xfs_dquot_check_type( * at the same time. The non-user quota file can be switched between * group and project quota uses depending on the mount options, which * means that we can encounter the other type when we try to load quota - * defaults. Quotacheck will soon reset the the entire quota file + * defaults. Quotacheck will soon reset the entire quota file * (including the root dquot) anyway, but don't log scary corruption * reports to dmesg. */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 765be054dffe..27ccfcd82f04 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -11,6 +11,7 @@ #include "xfs_bit.h" #include "xfs_shared.h" #include "xfs_mount.h" +#include "xfs_ag.h" #include "xfs_defer.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -187,12 +188,12 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) { xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; uint i; - uint len = sizeof(xfs_efi_log_format_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); - uint len32 = sizeof(xfs_efi_log_format_32_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t); - uint len64 = sizeof(xfs_efi_log_format_64_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t); + uint len = sizeof(xfs_efi_log_format_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); + uint len32 = sizeof(xfs_efi_log_format_32_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t); + uint len64 = sizeof(xfs_efi_log_format_64_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t); if (buf->i_len == len) { memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); @@ -551,6 +552,7 @@ xfs_agfl_free_finish_item( xfs_agnumber_t agno; xfs_agblock_t agbno; uint next_extent; + struct xfs_perag *pag; free = container_of(item, struct xfs_extent_free_item, xefi_list); ASSERT(free->xefi_blockcount == 1); @@ -560,9 +562,11 @@ xfs_agfl_free_finish_item( trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (!error) error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo); + xfs_perag_put(pag); /* * Mark the transaction dirty, even on error. This ensures the diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5a171c0b244b..aa7e458ab169 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -25,6 +25,7 @@ #include "xfs_iomap.h" #include "xfs_reflink.h" +#include <linux/dax.h> #include <linux/falloc.h> #include <linux/backing-dev.h> #include <linux/mman.h> @@ -410,7 +411,7 @@ restart: spin_unlock(&ip->i_flags_lock); out: - return file_modified(file); + return kiocb_modified(iocb); } static int @@ -669,7 +670,7 @@ xfs_file_dax_write( pos = iocb->ki_pos; trace_xfs_file_dax_write(iocb, from); - ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops); + ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); @@ -700,12 +701,11 @@ xfs_file_buffered_write( bool cleared_space = false; unsigned int iolock; - if (iocb->ki_flags & IOCB_NOWAIT) - return -EOPNOTSUPP; - write_retry: iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + return ret; ret = xfs_file_write_checks(iocb, from, &iolock); if (ret) @@ -807,7 +807,7 @@ xfs_wait_dax_page( xfs_ilock(ip, XFS_MMAPLOCK_EXCL); } -static int +int xfs_break_dax_layouts( struct inode *inode, bool *retry) @@ -1165,7 +1165,7 @@ xfs_file_open( { if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; return generic_file_open(inode, file); } @@ -1254,6 +1254,31 @@ xfs_file_llseek( return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); } +#ifdef CONFIG_FS_DAX +static int +xfs_dax_fault( + struct vm_fault *vmf, + enum page_entry_size pe_size, + bool write_fault, + pfn_t *pfn) +{ + return dax_iomap_fault(vmf, pe_size, pfn, NULL, + (write_fault && !vmf->cow_page) ? + &xfs_dax_write_iomap_ops : + &xfs_read_iomap_ops); +} +#else +static int +xfs_dax_fault( + struct vm_fault *vmf, + enum page_entry_size pe_size, + bool write_fault, + pfn_t *pfn) +{ + return 0; +} +#endif + /* * Locking for serialisation of IO during page faults. This results in a lock * ordering of: @@ -1285,10 +1310,7 @@ __xfs_filemap_fault( pfn_t pfn; xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, - (write_fault && !vmf->cow_page) ? - &xfs_direct_write_iomap_ops : - &xfs_read_iomap_ops); + ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index be9bcf8a1f99..34b21a29c39b 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -126,7 +126,7 @@ xfs_filestream_pick_ag( pag = xfs_perag_get(mp, ag); if (!pag->pagf_init) { - err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); + err = xfs_alloc_read_agf(pag, NULL, trylock, NULL); if (err) { if (err != -EAGAIN) { xfs_perag_put(pag); @@ -181,7 +181,7 @@ next_ag: if (ag != startag) continue; - /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */ + /* Allow sleeping in xfs_alloc_read_agf() on the 2nd pass. */ if (trylock != 0) { trylock = 0; continue; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index bb23199f65c3..d8337274c74d 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -642,8 +642,7 @@ __xfs_getfsmap_datadev( info->agf_bp = NULL; } - error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, - &info->agf_bp); + error = xfs_alloc_read_agf(pag, tp, 0, &info->agf_bp); if (error) break; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index d4a77c53f94b..13851c0d640b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -41,6 +41,7 @@ xfs_resizefs_init_new_ags( xfs_agnumber_t oagcount, xfs_agnumber_t nagcount, xfs_rfsblock_t delta, + struct xfs_perag *last_pag, bool *lastag_extended) { struct xfs_mount *mp = tp->t_mountp; @@ -73,7 +74,7 @@ xfs_resizefs_init_new_ags( if (delta) { *lastag_extended = true; - error = xfs_ag_extend_space(mp, tp, id, delta); + error = xfs_ag_extend_space(last_pag, tp, delta); } return error; } @@ -96,6 +97,7 @@ xfs_growfs_data_private( xfs_agnumber_t oagcount; struct xfs_trans *tp; struct aghdr_init_data id = {}; + struct xfs_perag *last_pag; nb = in->newblocks; error = xfs_sb_validate_fsb_count(&mp->m_sb, nb); @@ -128,10 +130,9 @@ xfs_growfs_data_private( return -EINVAL; oagcount = mp->m_sb.sb_agcount; - /* allocate the new per-ag structures */ if (nagcount > oagcount) { - error = xfs_initialize_perag(mp, nagcount, &nagimax); + error = xfs_initialize_perag(mp, nagcount, nb, &nagimax); if (error) return error; } else if (nagcount < oagcount) { @@ -145,15 +146,17 @@ xfs_growfs_data_private( if (error) return error; + last_pag = xfs_perag_get(mp, oagcount - 1); if (delta > 0) { error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, - delta, &lastag_extended); + delta, last_pag, &lastag_extended); } else { xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK, "EXPERIMENTAL online shrink feature in use. Use at your own risk!"); - error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta); + error = xfs_ag_shrink_space(last_pag, &tp, -delta); } + xfs_perag_put(last_pag); if (error) goto out_trans_cancel; @@ -528,6 +531,9 @@ xfs_do_force_shutdown( } else if (flags & SHUTDOWN_CORRUPT_INCORE) { tag = XFS_PTAG_SHUTDOWN_CORRUPT; why = "Corruption of in-memory data"; + } else if (flags & SHUTDOWN_CORRUPT_ONDISK) { + tag = XFS_PTAG_SHUTDOWN_CORRUPT; + why = "Corruption of on-disk metadata"; } else { tag = XFS_PTAG_SHUTDOWN_IOERROR; why = "Metadata I/O Error"; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 5269354b1b69..2bbe7916a998 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -98,8 +98,9 @@ xfs_inode_alloc( ip->i_ino = ino; ip->i_mount = mp; memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); - ip->i_afp = NULL; ip->i_cowfp = NULL; + memset(&ip->i_af, 0, sizeof(ip->i_af)); + ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS; memset(&ip->i_df, 0, sizeof(ip->i_df)); ip->i_flags = 0; ip->i_delayed_blks = 0; @@ -111,6 +112,8 @@ xfs_inode_alloc( INIT_WORK(&ip->i_ioend_work, xfs_end_io); INIT_LIST_HEAD(&ip->i_ioend_list); spin_lock_init(&ip->i_ioend_lock); + ip->i_next_unlinked = NULLAGINO; + ip->i_prev_unlinked = NULLAGINO; return ip; } @@ -130,10 +133,8 @@ xfs_inode_free_callback( break; } - if (ip->i_afp) { - xfs_idestroy_fork(ip->i_afp); - kmem_cache_free(xfs_ifork_cache, ip->i_afp); - } + xfs_ifork_zap_attr(ip); + if (ip->i_cowfp) { xfs_idestroy_fork(ip->i_cowfp); kmem_cache_free(xfs_ifork_cache, ip->i_cowfp); @@ -440,7 +441,7 @@ xfs_inodegc_queue_all( for_each_online_cpu(cpu) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) - queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); } } @@ -912,6 +913,7 @@ reclaim: ip->i_checked = 0; spin_unlock(&ip->i_flags_lock); + ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); @@ -1774,7 +1776,7 @@ xfs_check_delalloc( struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; @@ -1841,8 +1843,8 @@ void xfs_inodegc_worker( struct work_struct *work) { - struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc, - work); + struct xfs_inodegc *gc = container_of(to_delayed_work(work), + struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; @@ -1862,19 +1864,29 @@ xfs_inodegc_worker( } /* - * Force all currently queued inode inactivation work to run immediately and - * wait for the work to finish. + * Expedite all pending inodegc work to run immediately. This does not wait for + * completion of the work. */ void -xfs_inodegc_flush( +xfs_inodegc_push( struct xfs_mount *mp) { if (!xfs_is_inodegc_enabled(mp)) return; + trace_xfs_inodegc_push(mp, __return_address); + xfs_inodegc_queue_all(mp); +} +/* + * Force all currently queued inode inactivation work to run immediately and + * wait for the work to finish. + */ +void +xfs_inodegc_flush( + struct xfs_mount *mp) +{ + xfs_inodegc_push(mp); trace_xfs_inodegc_flush(mp, __return_address); - - xfs_inodegc_queue_all(mp); flush_workqueue(mp->m_inodegc_wq); } @@ -2014,6 +2026,7 @@ xfs_inodegc_queue( struct xfs_inodegc *gc; int items; unsigned int shrinker_hits; + unsigned long queue_delay = 1; trace_xfs_inode_set_need_inactive(ip); spin_lock(&ip->i_flags_lock); @@ -2025,19 +2038,26 @@ xfs_inodegc_queue( items = READ_ONCE(gc->items); WRITE_ONCE(gc->items, items + 1); shrinker_hits = READ_ONCE(gc->shrinker_hits); - put_cpu_ptr(gc); - if (!xfs_is_inodegc_enabled(mp)) + /* + * We queue the work while holding the current CPU so that the work + * is scheduled to run on this CPU. + */ + if (!xfs_is_inodegc_enabled(mp)) { + put_cpu_ptr(gc); return; - - if (xfs_inodegc_want_queue_work(ip, items)) { - trace_xfs_inodegc_queue(mp, __return_address); - queue_work(mp->m_inodegc_wq, &gc->work); } + if (xfs_inodegc_want_queue_work(ip, items)) + queue_delay = 0; + + trace_xfs_inodegc_queue(mp, __return_address); + mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay); + put_cpu_ptr(gc); + if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { trace_xfs_inodegc_throttle(mp, __return_address); - flush_work(&gc->work); + flush_delayed_work(&gc->work); } } @@ -2054,7 +2074,7 @@ xfs_inodegc_cpu_dead( unsigned int count = 0; dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); - cancel_work_sync(&dead_gc->work); + cancel_delayed_work_sync(&dead_gc->work); if (llist_empty(&dead_gc->list)) return; @@ -2073,12 +2093,12 @@ xfs_inodegc_cpu_dead( llist_add_batch(first, last, &gc->list); count += READ_ONCE(gc->items); WRITE_ONCE(gc->items, count); - put_cpu_ptr(gc); if (xfs_is_inodegc_enabled(mp)) { trace_xfs_inodegc_queue(mp, __return_address); - queue_work(mp->m_inodegc_wq, &gc->work); + mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0); } + put_cpu_ptr(gc); } /* @@ -2173,7 +2193,7 @@ xfs_inodegc_shrinker_scan( unsigned int h = READ_ONCE(gc->shrinker_hits); WRITE_ONCE(gc->shrinker_hits, h + 1); - queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); no_items = false; } } @@ -2201,5 +2221,5 @@ xfs_inodegc_register_shrinker( shrink->flags = SHRINKER_NONSLAB; shrink->batch = XFS_INODEGC_SHRINKER_BATCH; - return register_shrinker(shrink); + return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 2e4cfddf8b8e..6cd180721659 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp); void xfs_blockgc_start(struct xfs_mount *mp); void xfs_inodegc_worker(struct work_struct *work); +void xfs_inodegc_push(struct xfs_mount *mp); void xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 52d6f2c7d58b..28493c8e9bb2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -20,6 +20,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_inode_item.h" +#include "xfs_iunlink_item.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -125,13 +126,33 @@ xfs_ilock_attr_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_afp && xfs_need_iread_extents(ip->i_afp)) + if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); return lock_mode; } /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED, + * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values + * to set in lock_flags. + */ +static inline void +xfs_lock_flags_assert( + uint lock_flags) +{ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); + ASSERT(lock_flags != 0); +} + +/* * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 * multi-reader locks: invalidate_lock and the i_lock. This routine allows * various combinations of the locks to be obtained. @@ -168,18 +189,7 @@ xfs_ilock( { trace_xfs_ilock(ip, lock_flags, _RET_IP_); - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != - (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); + xfs_lock_flags_assert(lock_flags); if (lock_flags & XFS_IOLOCK_EXCL) { down_write_nested(&VFS_I(ip)->i_rwsem, @@ -222,18 +232,7 @@ xfs_ilock_nowait( { trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != - (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); + xfs_lock_flags_assert(lock_flags); if (lock_flags & XFS_IOLOCK_EXCL) { if (!down_write_trylock(&VFS_I(ip)->i_rwsem)) @@ -291,19 +290,7 @@ xfs_iunlock( xfs_inode_t *ip, uint lock_flags) { - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != - (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); - ASSERT(lock_flags != 0); + xfs_lock_flags_assert(lock_flags); if (lock_flags & XFS_IOLOCK_EXCL) up_write(&VFS_I(ip)->i_rwsem); @@ -379,8 +366,8 @@ xfs_isilocked( } if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { - return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem, - (lock_flags & XFS_IOLOCK_SHARED)); + return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock, + (lock_flags & XFS_MMAPLOCK_SHARED)); } if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) { @@ -649,7 +636,7 @@ xfs_ip2xflags( flags |= FS_XFLAG_COWEXTSIZE; } - if (XFS_IFORK_Q(ip)) + if (xfs_inode_has_attr_fork(ip)) flags |= FS_XFLAG_HASATTR; return flags; } @@ -907,7 +894,7 @@ xfs_init_new_inode( */ if (init_xattrs && xfs_has_attr(mp)) { ip->i_forkoff = xfs_default_attroffset(ip) >> 3; - ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0); + xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); } /* @@ -1307,8 +1294,8 @@ xfs_itruncate_clear_reflink_flags( if (!xfs_is_reflink_inode(ip)) return; - dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK); + dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK); + cfork = xfs_ifork_ptr(ip, XFS_COW_FORK); if (dfork->if_bytes == 0 && cfork->if_bytes == 0) ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; if (cfork->if_bytes == 0) @@ -1657,7 +1644,7 @@ xfs_inode_needs_inactive( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_ifork *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); /* * If the inode is already free, then there can be nothing @@ -1776,13 +1763,12 @@ xfs_inactive( * now. The code calls a routine that recursively deconstructs the * attribute fork. If also blows away the in-core attribute fork. */ - if (XFS_IFORK_Q(ip)) { + if (xfs_inode_has_attr_fork(ip)) { error = xfs_attr_inactive(ip); if (error) goto out; } - ASSERT(!ip->i_afp); ASSERT(ip->i_forkoff == 0); /* @@ -1815,195 +1801,69 @@ out: * because we must walk that list to find the inode that points to the inode * being removed from the unlinked hash bucket list. * - * What if we modelled the unlinked list as a collection of records capturing - * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd - * have a fast way to look up unlinked list predecessors, which avoids the - * slow list walk. That's exactly what we do here (in-core) with a per-AG - * rhashtable. - * - * Because this is a backref cache, we ignore operational failures since the - * iunlink code can fall back to the slow bucket walk. The only errors that - * should bubble out are for obviously incorrect situations. + * Hence we keep an in-memory double linked list to link each inode on an + * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer + * based lists would require having 64 list heads in the perag, one for each + * list. This is expensive in terms of memory (think millions of AGs) and cache + * misses on lookups. Instead, use the fact that inodes on the unlinked list + * must be referenced at the VFS level to keep them on the list and hence we + * have an existence guarantee for inodes on the unlinked list. * - * All users of the backref cache MUST hold the AGI buffer lock to serialize - * access or have otherwise provided for concurrency control. + * Given we have an existence guarantee, we can use lockless inode cache lookups + * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode + * for the double linked unlinked list, and we don't need any extra locking to + * keep the list safe as all manipulations are done under the AGI buffer lock. + * Keeping the list up to date does not require memory allocation, just finding + * the XFS inode and updating the next/prev unlinked list aginos. */ -/* Capture a "X.next_unlinked = Y" relationship. */ -struct xfs_iunlink { - struct rhash_head iu_rhash_head; - xfs_agino_t iu_agino; /* X */ - xfs_agino_t iu_next_unlinked; /* Y */ -}; - -/* Unlinked list predecessor lookup hashtable construction */ -static int -xfs_iunlink_obj_cmpfn( - struct rhashtable_compare_arg *arg, - const void *obj) -{ - const xfs_agino_t *key = arg->key; - const struct xfs_iunlink *iu = obj; - - if (iu->iu_next_unlinked != *key) - return 1; - return 0; -} - -static const struct rhashtable_params xfs_iunlink_hash_params = { - .min_size = XFS_AGI_UNLINKED_BUCKETS, - .key_len = sizeof(xfs_agino_t), - .key_offset = offsetof(struct xfs_iunlink, - iu_next_unlinked), - .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), - .automatic_shrinking = true, - .obj_cmpfn = xfs_iunlink_obj_cmpfn, -}; - /* - * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such - * relation is found. + * Find an inode on the unlinked list. This does not take references to the + * inode as we have existence guarantees by holding the AGI buffer lock and that + * only unlinked, referenced inodes can be on the unlinked inode list. If we + * don't find the inode in cache, then let the caller handle the situation. */ -static xfs_agino_t -xfs_iunlink_lookup_backref( +static struct xfs_inode * +xfs_iunlink_lookup( struct xfs_perag *pag, xfs_agino_t agino) { - struct xfs_iunlink *iu; - - iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, - xfs_iunlink_hash_params); - return iu ? iu->iu_agino : NULLAGINO; -} + struct xfs_inode *ip; -/* - * Take ownership of an iunlink cache entry and insert it into the hash table. - * If successful, the entry will be owned by the cache; if not, it is freed. - * Either way, the caller does not own @iu after this call. - */ -static int -xfs_iunlink_insert_backref( - struct xfs_perag *pag, - struct xfs_iunlink *iu) -{ - int error; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); - error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, - &iu->iu_rhash_head, xfs_iunlink_hash_params); /* - * Fail loudly if there already was an entry because that's a sign of - * corruption of in-memory data. Also fail loudly if we see an error - * code we didn't anticipate from the rhashtable code. Currently we - * only anticipate ENOMEM. + * Inode not in memory or in RCU freeing limbo should not happen. + * Warn about this and let the caller handle the failure. */ - if (error) { - WARN(error != -ENOMEM, "iunlink cache insert error %d", error); - kmem_free(iu); + if (WARN_ON_ONCE(!ip || !ip->i_ino)) { + rcu_read_unlock(); + return NULL; } - /* - * Absorb any runtime errors that aren't a result of corruption because - * this is a cache and we can always fall back to bucket list scanning. - */ - if (error != 0 && error != -EEXIST) - error = 0; - return error; + ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)); + rcu_read_unlock(); + return ip; } -/* Remember that @prev_agino.next_unlinked = @this_agino. */ +/* Update the prev pointer of the next agino. */ static int -xfs_iunlink_add_backref( +xfs_iunlink_update_backref( struct xfs_perag *pag, xfs_agino_t prev_agino, - xfs_agino_t this_agino) -{ - struct xfs_iunlink *iu; - - if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) - return 0; - - iu = kmem_zalloc(sizeof(*iu), KM_NOFS); - iu->iu_agino = prev_agino; - iu->iu_next_unlinked = this_agino; - - return xfs_iunlink_insert_backref(pag, iu); -} - -/* - * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. - * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there - * wasn't any such entry then we don't bother. - */ -static int -xfs_iunlink_change_backref( - struct xfs_perag *pag, - xfs_agino_t agino, - xfs_agino_t next_unlinked) + xfs_agino_t next_agino) { - struct xfs_iunlink *iu; - int error; - - /* Look up the old entry; if there wasn't one then exit. */ - iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, - xfs_iunlink_hash_params); - if (!iu) - return 0; - - /* - * Remove the entry. This shouldn't ever return an error, but if we - * couldn't remove the old entry we don't want to add it again to the - * hash table, and if the entry disappeared on us then someone's - * violated the locking rules and we need to fail loudly. Either way - * we cannot remove the inode because internal state is or would have - * been corrupt. - */ - error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, - &iu->iu_rhash_head, xfs_iunlink_hash_params); - if (error) - return error; + struct xfs_inode *ip; - /* If there is no new next entry just free our item and return. */ - if (next_unlinked == NULLAGINO) { - kmem_free(iu); + /* No update necessary if we are at the end of the list. */ + if (next_agino == NULLAGINO) return 0; - } - /* Update the entry and re-add it to the hash table. */ - iu->iu_next_unlinked = next_unlinked; - return xfs_iunlink_insert_backref(pag, iu); -} - -/* Set up the in-core predecessor structures. */ -int -xfs_iunlink_init( - struct xfs_perag *pag) -{ - return rhashtable_init(&pag->pagi_unlinked_hash, - &xfs_iunlink_hash_params); -} - -/* Free the in-core predecessor structures. */ -static void -xfs_iunlink_free_item( - void *ptr, - void *arg) -{ - struct xfs_iunlink *iu = ptr; - bool *freed_anything = arg; - - *freed_anything = true; - kmem_free(iu); -} - -void -xfs_iunlink_destroy( - struct xfs_perag *pag) -{ - bool freed_anything = false; - - rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, - xfs_iunlink_free_item, &freed_anything); - - ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount)); + ip = xfs_iunlink_lookup(pag, next_agino); + if (!ip) + return -EFSCORRUPTED; + ip->i_prev_unlinked = prev_agino; + return 0; } /* @@ -2022,7 +1882,7 @@ xfs_iunlink_update_bucket( xfs_agino_t old_value; int offset; - ASSERT(xfs_verify_agino_or_null(tp->t_mountp, pag->pag_agno, new_agino)); + ASSERT(xfs_verify_agino_or_null(pag, new_agino)); old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, @@ -2045,88 +1905,53 @@ xfs_iunlink_update_bucket( return 0; } -/* Set an on-disk inode's next_unlinked pointer. */ -STATIC void -xfs_iunlink_update_dinode( - struct xfs_trans *tp, - struct xfs_perag *pag, - xfs_agino_t agino, - struct xfs_buf *ibp, - struct xfs_dinode *dip, - struct xfs_imap *imap, - xfs_agino_t next_agino) -{ - struct xfs_mount *mp = tp->t_mountp; - int offset; - - ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)); - - trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, agino, - be32_to_cpu(dip->di_next_unlinked), next_agino); - - dip->di_next_unlinked = cpu_to_be32(next_agino); - offset = imap->im_boffset + - offsetof(struct xfs_dinode, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); -} - -/* Set an in-core inode's unlinked pointer and return the old value. */ -STATIC int -xfs_iunlink_update_inode( +static int +xfs_iunlink_insert_inode( struct xfs_trans *tp, - struct xfs_inode *ip, struct xfs_perag *pag, - xfs_agino_t next_agino, - xfs_agino_t *old_next_agino) + struct xfs_buf *agibp, + struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_dinode *dip; - struct xfs_buf *ibp; - xfs_agino_t old_value; + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t next_agino; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; - ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)); - - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp); - if (error) - return error; - dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); - - /* Make sure the old pointer isn't garbage. */ - old_value = be32_to_cpu(dip->di_next_unlinked); - if (!xfs_verify_agino_or_null(mp, pag->pag_agno, old_value)) { - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, - sizeof(*dip), __this_address); - error = -EFSCORRUPTED; - goto out; + /* + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the pointer isn't garbage and that this inode + * isn't already on the list. + */ + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (next_agino == agino || + !xfs_verify_agino_or_null(pag, next_agino)) { + xfs_buf_mark_corrupt(agibp); + return -EFSCORRUPTED; } /* - * Since we're updating a linked list, we should never find that the - * current pointer is the same as the new value, unless we're - * terminating the list. + * Update the prev pointer in the next inode to point back to this + * inode. */ - *old_next_agino = old_value; - if (old_value == next_agino) { - if (next_agino != NULLAGINO) { - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, - dip, sizeof(*dip), __this_address); - error = -EFSCORRUPTED; - } - goto out; + error = xfs_iunlink_update_backref(pag, agino, next_agino); + if (error) + return error; + + if (next_agino != NULLAGINO) { + /* + * There is already another inode in the bucket, so point this + * inode to the current head of the list. + */ + error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); + if (error) + return error; + ip->i_next_unlinked = next_agino; } - /* Ok, update the new pointer. */ - xfs_iunlink_update_dinode(tp, pag, XFS_INO_TO_AGINO(mp, ip->i_ino), - ibp, dip, &ip->i_imap, next_agino); - return 0; -out: - xfs_trans_brelse(tp, ibp); - return error; + /* Point the head of the list to point to this inode. */ + return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); } /* @@ -2143,11 +1968,7 @@ xfs_iunlink( { struct xfs_mount *mp = tp->t_mountp; struct xfs_perag *pag; - struct xfs_agi *agi; struct xfs_buf *agibp; - xfs_agino_t next_agino; - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; ASSERT(VFS_I(ip)->i_nlink == 0); @@ -2157,202 +1978,38 @@ xfs_iunlink( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp); + error = xfs_read_agi(pag, tp, &agibp); if (error) goto out; - agi = agibp->b_addr; - /* - * Get the index into the agi hash table for the list this inode will - * go on. Make sure the pointer isn't garbage and that this inode - * isn't already on the list. - */ - next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - if (next_agino == agino || - !xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)) { - xfs_buf_mark_corrupt(agibp); - error = -EFSCORRUPTED; - goto out; - } - - if (next_agino != NULLAGINO) { - xfs_agino_t old_agino; - - /* - * There is already another inode in the bucket, so point this - * inode to the current head of the list. - */ - error = xfs_iunlink_update_inode(tp, ip, pag, next_agino, - &old_agino); - if (error) - goto out; - ASSERT(old_agino == NULLAGINO); - - /* - * agino has been unlinked, add a backref from the next inode - * back to agino. - */ - error = xfs_iunlink_add_backref(pag, agino, next_agino); - if (error) - goto out; - } - - /* Point the head of the list to point to this inode. */ - error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); + error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); out: xfs_perag_put(pag); return error; } -/* Return the imap, dinode pointer, and buffer for an inode. */ -STATIC int -xfs_iunlink_map_ino( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agino_t agino, - struct xfs_imap *imap, - struct xfs_dinode **dipp, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = tp->t_mountp; - int error; - - imap->im_blkno = 0; - error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap returned error %d.", - __func__, error); - return error; - } - - error = xfs_imap_to_bp(mp, tp, imap, bpp); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } - - *dipp = xfs_buf_offset(*bpp, imap->im_boffset); - return 0; -} - -/* - * Walk the unlinked chain from @head_agino until we find the inode that - * points to @target_agino. Return the inode number, map, dinode pointer, - * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. - * - * @tp, @pag, @head_agino, and @target_agino are input parameters. - * @agino, @imap, @dipp, and @bpp are all output parameters. - * - * Do not call this function if @target_agino is the head of the list. - */ -STATIC int -xfs_iunlink_map_prev( - struct xfs_trans *tp, - struct xfs_perag *pag, - xfs_agino_t head_agino, - xfs_agino_t target_agino, - xfs_agino_t *agino, - struct xfs_imap *imap, - struct xfs_dinode **dipp, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = tp->t_mountp; - xfs_agino_t next_agino; - int error; - - ASSERT(head_agino != target_agino); - *bpp = NULL; - - /* See if our backref cache can find it faster. */ - *agino = xfs_iunlink_lookup_backref(pag, target_agino); - if (*agino != NULLAGINO) { - error = xfs_iunlink_map_ino(tp, pag->pag_agno, *agino, imap, - dipp, bpp); - if (error) - return error; - - if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino) - return 0; - - /* - * If we get here the cache contents were corrupt, so drop the - * buffer and fall back to walking the bucket list. - */ - xfs_trans_brelse(tp, *bpp); - *bpp = NULL; - WARN_ON_ONCE(1); - } - - trace_xfs_iunlink_map_prev_fallback(mp, pag->pag_agno); - - /* Otherwise, walk the entire bucket until we find it. */ - next_agino = head_agino; - while (next_agino != target_agino) { - xfs_agino_t unlinked_agino; - - if (*bpp) - xfs_trans_brelse(tp, *bpp); - - *agino = next_agino; - error = xfs_iunlink_map_ino(tp, pag->pag_agno, next_agino, imap, - dipp, bpp); - if (error) - return error; - - unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked); - /* - * Make sure this pointer is valid and isn't an obvious - * infinite loop. - */ - if (!xfs_verify_agino(mp, pag->pag_agno, unlinked_agino) || - next_agino == unlinked_agino) { - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, mp, - *dipp, sizeof(**dipp)); - error = -EFSCORRUPTED; - return error; - } - next_agino = unlinked_agino; - } - - return 0; -} - -/* - * Pull the on-disk inode from the AGI unlinked list. - */ -STATIC int -xfs_iunlink_remove( +static int +xfs_iunlink_remove_inode( struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_buf *agibp, struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi; - struct xfs_buf *agibp; - struct xfs_buf *last_ibp; - struct xfs_dinode *last_dip = NULL; + struct xfs_agi *agi = agibp->b_addr; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - xfs_agino_t next_agino; xfs_agino_t head_agino; short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; trace_xfs_iunlink_remove(ip); - /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp); - if (error) - return error; - agi = agibp->b_addr; - /* * Get the index into the agi hash table for the list this inode will * go on. Make sure the head pointer isn't garbage. */ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - if (!xfs_verify_agino(mp, pag->pag_agno, head_agino)) { + if (!xfs_verify_agino(pag, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); return -EFSCORRUPTED; @@ -2363,52 +2020,60 @@ xfs_iunlink_remove( * the old pointer value so that we can update whatever was previous * to us in the list to point to whatever was next in the list. */ - error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO, &next_agino); + error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); if (error) return error; /* - * If there was a backref pointing from the next inode back to this - * one, remove it because we've removed this inode from the list. - * - * Later, if this inode was in the middle of the list we'll update - * this inode's backref to point from the next inode. + * Update the prev pointer in the next inode to point back to previous + * inode in the chain. */ - if (next_agino != NULLAGINO) { - error = xfs_iunlink_change_backref(pag, next_agino, NULLAGINO); - if (error) - return error; - } + error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, + ip->i_next_unlinked); + if (error) + return error; if (head_agino != agino) { - struct xfs_imap imap; - xfs_agino_t prev_agino; + struct xfs_inode *prev_ip; - /* We need to search the list for the inode being freed. */ - error = xfs_iunlink_map_prev(tp, pag, head_agino, agino, - &prev_agino, &imap, &last_dip, &last_ibp); - if (error) - return error; - - /* Point the previous inode on the list to the next inode. */ - xfs_iunlink_update_dinode(tp, pag, prev_agino, last_ibp, - last_dip, &imap, next_agino); + prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); + if (!prev_ip) + return -EFSCORRUPTED; - /* - * Now we deal with the backref for this inode. If this inode - * pointed at a real inode, change the backref that pointed to - * us to point to our old next. If this inode was the end of - * the list, delete the backref that pointed to us. Note that - * change_backref takes care of deleting the backref if - * next_agino is NULLAGINO. - */ - return xfs_iunlink_change_backref(agibp->b_pag, agino, - next_agino); + error = xfs_iunlink_log_inode(tp, prev_ip, pag, + ip->i_next_unlinked); + prev_ip->i_next_unlinked = ip->i_next_unlinked; + } else { + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, + ip->i_next_unlinked); } - /* Point the head of the list to the next unlinked inode. */ - return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, - next_agino); + ip->i_next_unlinked = NULLAGINO; + ip->i_prev_unlinked = NULLAGINO; + return error; +} + +/* + * Pull the on-disk inode from the AGI unlinked list. + */ +STATIC int +xfs_iunlink_remove( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + struct xfs_buf *agibp; + int error; + + trace_xfs_iunlink_remove(ip); + + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(pag, tp, &agibp); + if (error) + return error; + + return xfs_iunlink_remove_inode(tp, pag, agibp, ip); } /* @@ -3046,10 +2711,12 @@ out_trans_abort: static int xfs_rename_alloc_whiteout( struct user_namespace *mnt_userns, + struct xfs_name *src_name, struct xfs_inode *dp, struct xfs_inode **wip) { struct xfs_inode *tmpfile; + struct qstr name; int error; error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE, @@ -3057,6 +2724,15 @@ xfs_rename_alloc_whiteout( if (error) return error; + name.name = src_name->name; + name.len = src_name->len; + error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name); + if (error) { + xfs_finish_inode_setup(tmpfile); + xfs_irele(tmpfile); + return error; + } + /* * Prepare the tmpfile inode as if it were created through the VFS. * Complete the inode setup and flag it as linkable. nlink is already @@ -3107,7 +2783,8 @@ xfs_rename( * appropriately. */ if (flags & RENAME_WHITEOUT) { - error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip); + error = xfs_rename_alloc_whiteout(mnt_userns, src_name, + target_dp, &wip); if (error) return error; @@ -3243,11 +2920,13 @@ retry: if (inodes[i] == wip || (inodes[i] == target_ip && (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { - struct xfs_buf *bp; - xfs_agnumber_t agno; + struct xfs_perag *pag; + struct xfs_buf *bp; - agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino); - error = xfs_read_agi(mp, tp, agno, &bp); + pag = xfs_perag_get(mp, + XFS_INO_TO_AGNO(mp, inodes[i]->i_ino)); + error = xfs_read_agi(pag, tp, &bp); + xfs_perag_put(pag); if (error) goto out_trans_cancel; } @@ -3466,13 +3145,13 @@ xfs_iflush( goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > + if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %llu, " "total extents = %llu nblocks = %lld, ptr "PTR_FMT, __func__, ip->i_ino, - ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), + ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af), ip->i_nblocks, ip); goto flush_out; } @@ -3502,7 +3181,8 @@ xfs_iflush( if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && xfs_ifork_verify_local_data(ip)) goto flush_out; - if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL && + if (xfs_inode_has_attr_fork(ip) && + ip->i_af.if_format == XFS_DINODE_FMT_LOCAL && xfs_ifork_verify_local_attr(ip)) goto flush_out; @@ -3520,7 +3200,7 @@ xfs_iflush( } xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); - if (XFS_IFORK_Q(ip)) + if (xfs_inode_has_attr_fork(ip)) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); /* @@ -3767,6 +3447,50 @@ retry: return 0; } +static int +xfs_mmaplock_two_inodes_and_break_dax_layout( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + int error; + bool retry; + struct page *page; + + if (ip1->i_ino > ip2->i_ino) + swap(ip1, ip2); + +again: + retry = false; + /* Lock the first inode */ + xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); + error = xfs_break_dax_layouts(VFS_I(ip1), &retry); + if (error || retry) { + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + if (error == 0 && retry) + goto again; + return error; + } + + if (ip1 == ip2) + return 0; + + /* Nested lock the second inode */ + xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1)); + /* + * We cannot use xfs_break_dax_layouts() directly here because it may + * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable + * for this nested lock case. + */ + page = dax_layout_busy_page(VFS_I(ip2)->i_mapping); + if (page && page_ref_count(page) != 1) { + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + goto again; + } + + return 0; +} + /* * Lock two inodes so that userspace cannot initiate I/O via file syscalls or * mmap activity. @@ -3781,8 +3505,19 @@ xfs_ilock2_io_mmap( ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); if (ret) return ret; - filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping, - VFS_I(ip2)->i_mapping); + + if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) { + ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2); + if (ret) { + inode_unlock(VFS_I(ip2)); + if (ip1 != ip2) + inode_unlock(VFS_I(ip1)); + return ret; + } + } else + filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); + return 0; } @@ -3792,8 +3527,14 @@ xfs_iunlock2_io_mmap( struct xfs_inode *ip1, struct xfs_inode *ip2) { - filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping, - VFS_I(ip2)->i_mapping); + if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) { + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + if (ip1 != ip2) + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + } else + filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); + inode_unlock(VFS_I(ip2)); if (ip1 != ip2) inode_unlock(VFS_I(ip1)); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7be6f8e705ab..fa780f08dc89 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -33,9 +33,9 @@ typedef struct xfs_inode { struct xfs_imap i_imap; /* location for xfs_imap() */ /* Extent information. */ - struct xfs_ifork *i_afp; /* attribute fork pointer */ struct xfs_ifork *i_cowfp; /* copy on write extents */ struct xfs_ifork i_df; /* data fork */ + struct xfs_ifork i_af; /* attribute fork */ /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ @@ -68,6 +68,10 @@ typedef struct xfs_inode { uint64_t i_diflags2; /* XFS_DIFLAG2_... */ struct timespec64 i_crtime; /* time created */ + /* unlinked list pointers */ + xfs_agino_t i_next_unlinked; + xfs_agino_t i_prev_unlinked; + /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ @@ -77,6 +81,66 @@ typedef struct xfs_inode { struct list_head i_ioend_list; } xfs_inode_t; +static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip) +{ + return ip->i_forkoff > 0; +} + +static inline struct xfs_ifork * +xfs_ifork_ptr( + struct xfs_inode *ip, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + return &ip->i_df; + case XFS_ATTR_FORK: + if (!xfs_inode_has_attr_fork(ip)) + return NULL; + return &ip->i_af; + case XFS_COW_FORK: + return ip->i_cowfp; + default: + ASSERT(0); + return NULL; + } +} + +static inline unsigned int xfs_inode_fork_boff(struct xfs_inode *ip) +{ + return ip->i_forkoff << 3; +} + +static inline unsigned int xfs_inode_data_fork_size(struct xfs_inode *ip) +{ + if (xfs_inode_has_attr_fork(ip)) + return xfs_inode_fork_boff(ip); + + return XFS_LITINO(ip->i_mount); +} + +static inline unsigned int xfs_inode_attr_fork_size(struct xfs_inode *ip) +{ + if (xfs_inode_has_attr_fork(ip)) + return XFS_LITINO(ip->i_mount) - xfs_inode_fork_boff(ip); + return 0; +} + +static inline unsigned int +xfs_inode_fork_size( + struct xfs_inode *ip, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + return xfs_inode_data_fork_size(ip); + case XFS_ATTR_FORK: + return xfs_inode_attr_fork_size(ip); + default: + return 0; + } +} + /* Convert from vfs inode to xfs inode */ static inline struct xfs_inode *XFS_I(struct inode *inode) { @@ -467,6 +531,7 @@ xfs_itruncate_extents( } /* from xfs_file.c */ +int xfs_break_dax_layouts(struct inode *inode, bool *retry); int xfs_break_layouts(struct inode *inode, uint *iolock, enum layout_break_reason reason); @@ -505,9 +570,6 @@ extern struct kmem_cache *xfs_inode_cache; bool xfs_inode_needs_inactive(struct xfs_inode *ip); -int xfs_iunlink_init(struct xfs_perag *pag); -void xfs_iunlink_destroy(struct xfs_perag *pag); - void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 721def0639fd..6e19ece916bf 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -57,7 +57,7 @@ xfs_inode_item_data_fork_size( ip->i_df.if_nextents > 0 && ip->i_df.if_bytes > 0) { /* worst case, doesn't subtract delalloc extents */ - *nbytes += XFS_IFORK_DSIZE(ip); + *nbytes += xfs_inode_data_fork_size(ip); *nvecs += 1; } break; @@ -92,27 +92,27 @@ xfs_inode_item_attr_fork_size( { struct xfs_inode *ip = iip->ili_inode; - switch (ip->i_afp->if_format) { + switch (ip->i_af.if_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_AEXT) && - ip->i_afp->if_nextents > 0 && - ip->i_afp->if_bytes > 0) { + ip->i_af.if_nextents > 0 && + ip->i_af.if_bytes > 0) { /* worst case, doesn't subtract unused space */ - *nbytes += XFS_IFORK_ASIZE(ip); + *nbytes += xfs_inode_attr_fork_size(ip); *nvecs += 1; } break; case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_ABROOT) && - ip->i_afp->if_broot_bytes > 0) { - *nbytes += ip->i_afp->if_broot_bytes; + ip->i_af.if_broot_bytes > 0) { + *nbytes += ip->i_af.if_broot_bytes; *nvecs += 1; } break; case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_ADATA) && - ip->i_afp->if_bytes > 0) { - *nbytes += xlog_calc_iovec_len(ip->i_afp->if_bytes); + ip->i_af.if_bytes > 0) { + *nbytes += xlog_calc_iovec_len(ip->i_af.if_bytes); *nvecs += 1; } break; @@ -143,7 +143,7 @@ xfs_inode_item_size( xfs_log_dinode_size(ip->i_mount); xfs_inode_item_data_fork_size(iip, nvecs, nbytes); - if (XFS_IFORK_Q(ip)) + if (xfs_inode_has_attr_fork(ip)) xfs_inode_item_attr_fork_size(iip, nvecs, nbytes); } @@ -237,18 +237,18 @@ xfs_inode_item_format_attr_fork( struct xfs_inode *ip = iip->ili_inode; size_t data_bytes; - switch (ip->i_afp->if_format) { + switch (ip->i_af.if_format) { case XFS_DINODE_FMT_EXTENTS: iip->ili_fields &= ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); if ((iip->ili_fields & XFS_ILOG_AEXT) && - ip->i_afp->if_nextents > 0 && - ip->i_afp->if_bytes > 0) { + ip->i_af.if_nextents > 0 && + ip->i_af.if_bytes > 0) { struct xfs_bmbt_rec *p; - ASSERT(xfs_iext_count(ip->i_afp) == - ip->i_afp->if_nextents); + ASSERT(xfs_iext_count(&ip->i_af) == + ip->i_af.if_nextents); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); @@ -265,13 +265,13 @@ xfs_inode_item_format_attr_fork( ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); if ((iip->ili_fields & XFS_ILOG_ABROOT) && - ip->i_afp->if_broot_bytes > 0) { - ASSERT(ip->i_afp->if_broot != NULL); + ip->i_af.if_broot_bytes > 0) { + ASSERT(ip->i_af.if_broot != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT, - ip->i_afp->if_broot, - ip->i_afp->if_broot_bytes); - ilf->ilf_asize = ip->i_afp->if_broot_bytes; + ip->i_af.if_broot, + ip->i_af.if_broot_bytes); + ilf->ilf_asize = ip->i_af.if_broot_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ABROOT; @@ -282,12 +282,12 @@ xfs_inode_item_format_attr_fork( ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); if ((iip->ili_fields & XFS_ILOG_ADATA) && - ip->i_afp->if_bytes > 0) { - ASSERT(ip->i_afp->if_u1.if_data != NULL); + ip->i_af.if_bytes > 0) { + ASSERT(ip->i_af.if_u1.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, - ip->i_afp->if_u1.if_data, - ip->i_afp->if_bytes); - ilf->ilf_asize = (unsigned)ip->i_afp->if_bytes; + ip->i_af.if_u1.if_data, + ip->i_af.if_bytes); + ilf->ilf_asize = (unsigned)ip->i_af.if_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ADATA; @@ -355,11 +355,11 @@ xfs_inode_to_log_dinode_iext_counters( { if (xfs_inode_has_large_extent_counts(ip)) { to->di_big_nextents = xfs_ifork_nextents(&ip->i_df); - to->di_big_anextents = xfs_ifork_nextents(ip->i_afp); + to->di_big_anextents = xfs_ifork_nextents(&ip->i_af); to->di_nrext64_pad = 0; } else { to->di_nextents = xfs_ifork_nextents(&ip->i_df); - to->di_anextents = xfs_ifork_nextents(ip->i_afp); + to->di_anextents = xfs_ifork_nextents(&ip->i_af); } } @@ -390,7 +390,7 @@ xfs_inode_to_log_dinode( to->di_nblocks = ip->i_nblocks; to->di_extsize = ip->i_extsize; to->di_forkoff = ip->i_forkoff; - to->di_aformat = xfs_ifork_format(ip->i_afp); + to->di_aformat = xfs_ifork_format(&ip->i_af); to->di_flags = ip->i_diflags; xfs_copy_dm_fields_to_log_dinode(ip, to); @@ -480,7 +480,7 @@ xfs_inode_item_format( xfs_inode_item_format_core(ip, lv, &vecp); xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); - if (XFS_IFORK_Q(ip)) { + if (xfs_inode_has_attr_fork(ip)) { xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp); } else { iip->ili_fields &= diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0d67ff8a8961..1f783e979629 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -955,6 +955,7 @@ xfs_ioc_ag_geometry( struct xfs_mount *mp, void __user *arg) { + struct xfs_perag *pag; struct xfs_ag_geometry ageo; int error; @@ -965,7 +966,12 @@ xfs_ioc_ag_geometry( if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved))) return -EINVAL; - error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); + pag = xfs_perag_get(mp, ageo.ag_number); + if (!pag) + return -EINVAL; + + error = xfs_ag_get_geometry(pag, &ageo); + xfs_perag_put(pag); if (error) return error; @@ -985,7 +991,7 @@ xfs_fill_fsxattr( struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); fileattr_fill_xflags(fa, xfs_ip2xflags(ip)); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5a393259a3a3..07da03976ec1 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -159,7 +159,7 @@ xfs_iomap_eof_align_last_fsb( struct xfs_inode *ip, xfs_fileoff_t end_fsb) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); xfs_extlen_t extsz = xfs_get_extsz_hint(ip); xfs_extlen_t align = xfs_eof_alignment(ip); struct xfs_bmbt_irec irec; @@ -370,7 +370,7 @@ xfs_iomap_prealloc_size( struct xfs_iext_cursor ncur = *icur; struct xfs_bmbt_irec prev, got; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); int64_t freesp; xfs_fsblock_t qblocks; @@ -664,7 +664,7 @@ xfs_ilock_for_iomap( unsigned flags, unsigned *lockmode) { - unsigned mode = XFS_ILOCK_SHARED; + unsigned int mode = *lockmode; bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); /* @@ -742,7 +742,7 @@ xfs_direct_write_iomap_begin( int nimaps = 1, error = 0; bool shared = false; u16 iomap_flags = 0; - unsigned lockmode; + unsigned int lockmode = XFS_ILOCK_SHARED; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); @@ -773,7 +773,8 @@ xfs_direct_write_iomap_begin( /* may drop and re-acquire the ilock */ error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared, - &lockmode, flags & IOMAP_DIRECT); + &lockmode, + (flags & IOMAP_DIRECT) || IS_DAX(inode)); if (error) goto out_unlock; if (shared) @@ -868,6 +869,33 @@ const struct iomap_ops xfs_direct_write_iomap_ops = { }; static int +xfs_dax_write_iomap_end( + struct inode *inode, + loff_t pos, + loff_t length, + ssize_t written, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + + if (!xfs_is_cow_inode(ip)) + return 0; + + if (!written) { + xfs_reflink_cancel_cow_range(ip, pos, length, true); + return 0; + } + + return xfs_reflink_end_cow(ip, pos, written); +} + +const struct iomap_ops xfs_dax_write_iomap_ops = { + .iomap_begin = xfs_direct_write_iomap_begin, + .iomap_end = xfs_dax_write_iomap_end, +}; + +static int xfs_buffered_write_iomap_begin( struct inode *inode, loff_t offset, @@ -886,6 +914,7 @@ xfs_buffered_write_iomap_begin( bool eof = false, cow_eof = false, shared = false; int allocfork = XFS_DATA_FORK; int error = 0; + unsigned int lockmode = XFS_ILOCK_EXCL; if (xfs_is_shutdown(mp)) return -EIO; @@ -897,7 +926,9 @@ xfs_buffered_write_iomap_begin( ASSERT(!XFS_IS_REALTIME_INODE(ip)); - xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { @@ -1172,7 +1203,7 @@ xfs_read_iomap_begin( xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); int nimaps = 1, error = 0; bool shared = false; - unsigned lockmode; + unsigned int lockmode = XFS_ILOCK_SHARED; ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); @@ -1307,12 +1338,12 @@ xfs_xattr_iomap_begin( lockmode = xfs_ilock_attr_map_shared(ip); /* if there are no attribute fork or extents, return ENOENT */ - if (!XFS_IFORK_Q(ip) || !ip->i_afp->if_nextents) { + if (!xfs_inode_has_attr_fork(ip) || !ip->i_af.if_nextents) { error = -ENOENT; goto out_unlock; } - ASSERT(ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL); + ASSERT(ip->i_af.if_format != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index e88dc162c785..c782e8c0479c 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -51,5 +51,6 @@ extern const struct iomap_ops xfs_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; +extern const struct iomap_ops xfs_dax_write_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 29f5b8b8aca6..45518b8c613c 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -75,9 +75,8 @@ xfs_initxattrs( * these attrs can be journalled at inode creation time (along with the * inode, of course, such that log replay can't cause these to be lost). */ - -STATIC int -xfs_init_security( +int +xfs_inode_init_security( struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -122,7 +121,7 @@ xfs_cleanup_inode( /* Oh, the horror. * If we can't add the ACL or we fail in - * xfs_init_security we must back out. + * xfs_inode_init_security we must back out. * ENOSPC can hit here, among other things. */ xfs_dentry_to_name(&teardown, dentry); @@ -208,7 +207,7 @@ xfs_generic_create( inode = VFS_I(ip); - error = xfs_init_security(inode, dir, &dentry->d_name); + error = xfs_inode_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; @@ -424,7 +423,7 @@ xfs_vn_symlink( inode = VFS_I(cip); - error = xfs_init_security(inode, dir, &dentry->d_name); + error = xfs_inode_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; @@ -667,13 +666,15 @@ xfs_setattr_nonsize( uint qflags = 0; if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = iattr->ia_uid; + uid = from_vfsuid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsuid); qflags |= XFS_QMOPT_UQUOTA; } else { uid = inode->i_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = iattr->ia_gid; + gid = from_vfsgid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsgid); qflags |= XFS_QMOPT_GQUOTA; } else { gid = inode->i_gid; @@ -704,13 +705,13 @@ xfs_setattr_nonsize( * didn't have the inode locked, inode's dquot(s) would have changed * also. */ - if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) && - !uid_eq(inode->i_uid, iattr->ia_uid)) { + if (XFS_IS_UQUOTA_ON(mp) && + i_uid_needs_update(mnt_userns, iattr, inode)) { ASSERT(udqp); old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) && - !gid_eq(inode->i_gid, iattr->ia_gid)) { + if (XFS_IS_GQUOTA_ON(mp) && + i_gid_needs_update(mnt_userns, iattr, inode)) { ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); ASSERT(gdqp); old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); @@ -1280,7 +1281,7 @@ xfs_setup_inode( * If there is no attribute fork no ACL can exist on this inode, * and it can't have any file capabilities attached to it either. */ - if (!XFS_IFORK_Q(ip)) { + if (!xfs_inode_has_attr_fork(ip)) { inode_has_no_xattr(inode); cache_no_acl(inode); } diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 278949056048..cb5fc68c9ea0 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -17,4 +17,7 @@ extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); int xfs_vn_setattr_size(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *vap); +int xfs_inode_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); + #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f74c9fff72bb..36312b00b164 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -111,8 +111,8 @@ xfs_bulkstat_one_int( buf->bs_extents64 = nextents; xfs_bulkstat_health(ip, buf); - buf->bs_aextents = xfs_ifork_nextents(ip->i_afp); - buf->bs_forkoff = XFS_IFORK_BOFF(ip); + buf->bs_aextents = xfs_ifork_nextents(&ip->i_af); + buf->bs_forkoff = xfs_inode_fork_boff(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; if (xfs_has_v3inodes(mp)) { diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c new file mode 100644 index 000000000000..43005ce8bd48 --- /dev/null +++ b/fs/xfs/xfs_iunlink_item.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020-2022, Red Hat, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_ag.h" +#include "xfs_iunlink_item.h" +#include "xfs_trace.h" +#include "xfs_error.h" + +struct kmem_cache *xfs_iunlink_cache; + +static inline struct xfs_iunlink_item *IUL_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_iunlink_item, item); +} + +static void +xfs_iunlink_item_release( + struct xfs_log_item *lip) +{ + struct xfs_iunlink_item *iup = IUL_ITEM(lip); + + xfs_perag_put(iup->pag); + kmem_cache_free(xfs_iunlink_cache, IUL_ITEM(lip)); +} + + +static uint64_t +xfs_iunlink_item_sort( + struct xfs_log_item *lip) +{ + return IUL_ITEM(lip)->ip->i_ino; +} + +/* + * Look up the inode cluster buffer and log the on-disk unlinked inode change + * we need to make. + */ +static int +xfs_iunlink_log_dinode( + struct xfs_trans *tp, + struct xfs_iunlink_item *iup) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip = iup->ip; + struct xfs_dinode *dip; + struct xfs_buf *ibp; + int offset; + int error; + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp); + if (error) + return error; + /* + * Don't log the unlinked field on stale buffers as this may be the + * transaction that frees the inode cluster and relogging the buffer + * here will incorrectly remove the stale state. + */ + if (ibp->b_flags & XBF_STALE) + goto out; + + dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); + + /* Make sure the old pointer isn't garbage. */ + if (be32_to_cpu(dip->di_next_unlinked) != iup->old_agino) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); + error = -EFSCORRUPTED; + goto out; + } + + trace_xfs_iunlink_update_dinode(mp, iup->pag->pag_agno, + XFS_INO_TO_AGINO(mp, ip->i_ino), + be32_to_cpu(dip->di_next_unlinked), iup->next_agino); + + dip->di_next_unlinked = cpu_to_be32(iup->next_agino); + offset = ip->i_imap.im_boffset + + offsetof(struct xfs_dinode, di_next_unlinked); + + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +out: + xfs_trans_brelse(tp, ibp); + return error; +} + +/* + * On precommit, we grab the inode cluster buffer for the inode number we were + * passed, then update the next unlinked field for that inode in the buffer and + * log the buffer. This ensures that the inode cluster buffer was logged in the + * correct order w.r.t. other inode cluster buffers. We can then remove the + * iunlink item from the transaction and release it as it is has now served it's + * purpose. + */ +static int +xfs_iunlink_item_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_iunlink_item *iup = IUL_ITEM(lip); + int error; + + error = xfs_iunlink_log_dinode(tp, iup); + list_del(&lip->li_trans); + xfs_iunlink_item_release(lip); + return error; +} + +static const struct xfs_item_ops xfs_iunlink_item_ops = { + .iop_release = xfs_iunlink_item_release, + .iop_sort = xfs_iunlink_item_sort, + .iop_precommit = xfs_iunlink_item_precommit, +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + * + * Inode extents can only reside within an AG. Hence specify the starting + * block for the inode chunk by offset within an AG as well as the + * length of the allocated extent. + * + * This joins the item to the transaction and marks it dirty so + * that we don't need a separate call to do this, nor does the + * caller need to know anything about the iunlink item. + */ +int +xfs_iunlink_log_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_perag *pag, + xfs_agino_t next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_iunlink_item *iup; + + ASSERT(xfs_verify_agino_or_null(pag, next_agino)); + ASSERT(xfs_verify_agino_or_null(pag, ip->i_next_unlinked)); + + /* + * Since we're updating a linked list, we should never find that the + * current pointer is the same as the new value, unless we're + * terminating the list. + */ + if (ip->i_next_unlinked == next_agino) { + if (next_agino != NULLAGINO) + return -EFSCORRUPTED; + return 0; + } + + iup = kmem_cache_zalloc(xfs_iunlink_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(mp, &iup->item, XFS_LI_IUNLINK, + &xfs_iunlink_item_ops); + + iup->ip = ip; + iup->next_agino = next_agino; + iup->old_agino = ip->i_next_unlinked; + + atomic_inc(&pag->pag_ref); + iup->pag = pag; + + xfs_trans_add_item(tp, &iup->item); + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &iup->item.li_flags); + return 0; +} + diff --git a/fs/xfs/xfs_iunlink_item.h b/fs/xfs/xfs_iunlink_item.h new file mode 100644 index 000000000000..c793cdcaccde --- /dev/null +++ b/fs/xfs/xfs_iunlink_item.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020-2022, Red Hat, Inc. + * All Rights Reserved. + */ +#ifndef XFS_IUNLINK_ITEM_H +#define XFS_IUNLINK_ITEM_H 1 + +struct xfs_trans; +struct xfs_inode; +struct xfs_perag; + +/* in memory log item structure */ +struct xfs_iunlink_item { + struct xfs_log_item item; + struct xfs_inode *ip; + struct xfs_perag *pag; + xfs_agino_t next_agino; + xfs_agino_t old_agino; +}; + +extern struct kmem_cache *xfs_iunlink_cache; + +int xfs_iunlink_log_inode(struct xfs_trans *tp, struct xfs_inode *ip, + struct xfs_perag *pag, xfs_agino_t next_agino); + +#endif /* XFS_IUNLINK_ITEM_H */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index cb9105d667db..f9878021e7d0 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -196,7 +196,7 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) } int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, - char *data, unsigned int op); + char *data, enum req_op op); #define ASSERT_ALWAYS(expr) \ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 1e972f884a81..4b1c0a9c6368 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -57,7 +57,8 @@ xlog_grant_push_ail( STATIC void xlog_sync( struct xlog *log, - struct xlog_in_core *iclog); + struct xlog_in_core *iclog, + struct xlog_ticket *ticket); #if defined(DEBUG) STATIC void xlog_verify_grant_tail( @@ -567,7 +568,8 @@ xlog_state_shutdown_callbacks( int xlog_state_release_iclog( struct xlog *log, - struct xlog_in_core *iclog) + struct xlog_in_core *iclog, + struct xlog_ticket *ticket) { xfs_lsn_t tail_lsn; bool last_ref; @@ -614,7 +616,7 @@ xlog_state_release_iclog( trace_xlog_iclog_syncing(iclog, _RET_IP_); spin_unlock(&log->l_icloglock); - xlog_sync(log, iclog); + xlog_sync(log, iclog, ticket); spin_lock(&log->l_icloglock); return 0; } @@ -881,7 +883,7 @@ xlog_force_iclog( iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; if (iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); - return xlog_state_release_iclog(iclog->ic_log, iclog); + return xlog_state_release_iclog(iclog->ic_log, iclog, NULL); } /* @@ -944,6 +946,8 @@ xlog_write_unmount_record( .lv_niovecs = 1, .lv_iovecp = ®, }; + LIST_HEAD(lv_chain); + list_add(&vec.lv_list, &lv_chain); BUILD_BUG_ON((sizeof(struct xlog_op_header) + sizeof(struct xfs_unmount_log_format)) != @@ -952,7 +956,7 @@ xlog_write_unmount_record( /* account for space used by record data */ ticket->t_curr_res -= sizeof(unmount_rec); - return xlog_write(log, NULL, &vec, ticket, reg.i_len); + return xlog_write(log, NULL, &lv_chain, ticket, reg.i_len); } /* @@ -2000,7 +2004,7 @@ xlog_calc_iclog_size( } /* - * Flush out the in-core log (iclog) to the on-disk log in an asynchronous + * Flush out the in-core log (iclog) to the on-disk log in an asynchronous * fashion. Previously, we should have moved the current iclog * ptr in the log to point to the next available iclog. This allows further * write to continue while this code syncs out an iclog ready to go. @@ -2025,7 +2029,8 @@ xlog_calc_iclog_size( STATIC void xlog_sync( struct xlog *log, - struct xlog_in_core *iclog) + struct xlog_in_core *iclog, + struct xlog_ticket *ticket) { unsigned int count; /* byte count of bwrite */ unsigned int roundoff; /* roundoff to BB or stripe */ @@ -2037,12 +2042,20 @@ xlog_sync( count = xlog_calc_iclog_size(log, iclog, &roundoff); - /* move grant heads by roundoff in sync */ - xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); - xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); + /* + * If we have a ticket, account for the roundoff via the ticket + * reservation to avoid touching the hot grant heads needlessly. + * Otherwise, we have to move grant heads directly. + */ + if (ticket) { + ticket->t_curr_res -= roundoff; + } else { + xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); + xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); + } /* put cycle number in every block */ - xlog_pack_data(log, iclog, roundoff); + xlog_pack_data(log, iclog, roundoff); /* real byte length */ size = iclog->ic_offset; @@ -2092,8 +2105,6 @@ xlog_dealloc_log( xlog_in_core_t *iclog, *next_iclog; int i; - xlog_cil_destroy(log); - /* * Cycle all the iclogbuf locks to make sure all log IO completion * is done before we tear down these buffers. @@ -2105,6 +2116,13 @@ xlog_dealloc_log( iclog = iclog->ic_next; } + /* + * Destroy the CIL after waiting for iclog IO completion because an + * iclog EIO error will try to shut down the log, which accesses the + * CIL to wake up the waiters. + */ + xlog_cil_destroy(log); + iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { next_iclog = iclog->ic_next; @@ -2270,7 +2288,7 @@ xlog_write_get_more_iclog_space( spin_lock(&log->l_icloglock); ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC); xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - error = xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog, ticket); spin_unlock(&log->l_icloglock); if (error) return error; @@ -2466,13 +2484,13 @@ int xlog_write( struct xlog *log, struct xfs_cil_ctx *ctx, - struct xfs_log_vec *log_vector, + struct list_head *lv_chain, struct xlog_ticket *ticket, uint32_t len) { struct xlog_in_core *iclog = NULL; - struct xfs_log_vec *lv = log_vector; + struct xfs_log_vec *lv; uint32_t record_cnt = 0; uint32_t data_cnt = 0; int error = 0; @@ -2500,7 +2518,7 @@ xlog_write( if (ctx) xlog_cil_set_ctx_write_state(ctx, iclog); - while (lv) { + list_for_each_entry(lv, lv_chain, lv_list) { /* * If the entire log vec does not fit in the iclog, punt it to * the partial copy loop which can handle this case. @@ -2521,7 +2539,6 @@ xlog_write( xlog_write_full(lv, ticket, iclog, &log_offset, &len, &record_cnt, &data_cnt); } - lv = lv->lv_next; } ASSERT(len == 0); @@ -2533,7 +2550,7 @@ xlog_write( */ spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, 0); - error = xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog, ticket); spin_unlock(&log->l_icloglock); return error; @@ -2953,7 +2970,7 @@ restart: * reference to the iclog. */ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) - error = xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog, ticket); spin_unlock(&log->l_icloglock); if (error) return error; @@ -3401,7 +3418,8 @@ xfs_log_ticket_get( static int xlog_calc_unit_res( struct xlog *log, - int unit_bytes) + int unit_bytes, + int *niclogs) { int iclog_space; uint num_headers; @@ -3481,6 +3499,8 @@ xlog_calc_unit_res( /* roundoff padding for transaction data and one for commit record */ unit_bytes += 2 * log->l_iclog_roundoff; + if (niclogs) + *niclogs = num_headers; return unit_bytes; } @@ -3489,7 +3509,7 @@ xfs_log_calc_unit_res( struct xfs_mount *mp, int unit_bytes) { - return xlog_calc_unit_res(mp->m_log, unit_bytes); + return xlog_calc_unit_res(mp->m_log, unit_bytes, NULL); } /* @@ -3507,7 +3527,7 @@ xlog_ticket_alloc( tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL); - unit_res = xlog_calc_unit_res(log, unit_bytes); + unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs); atomic_set(&tic->t_ref, 1); tic->t_task = current; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index f3ce046a7d45..2728886c2963 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -9,7 +9,8 @@ struct xfs_cil_ctx; struct xfs_log_vec { - struct xfs_log_vec *lv_next; /* next lv in build list */ + struct list_head lv_list; /* CIL lv chain ptrs */ + uint32_t lv_order_id; /* chain ordering info */ int lv_niovecs; /* number of iovecs in lv */ struct xfs_log_iovec *lv_iovecp; /* iovec array */ struct xfs_log_item *lv_item; /* owner */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index db6cb7800251..eccbfb99e894 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -44,9 +44,20 @@ xlog_cil_ticket_alloc( * transaction overhead reservation from the first transaction commit. */ tic->t_curr_res = 0; + tic->t_iclog_hdrs = 0; return tic; } +static inline void +xlog_cil_set_iclog_hdr_count(struct xfs_cil *cil) +{ + struct xlog *log = cil->xc_log; + + atomic_set(&cil->xc_iclog_hdrs, + (XLOG_CIL_BLOCKING_SPACE_LIMIT(log) / + (log->l_iclog_size - log->l_iclog_hsize))); +} + /* * Check if the current log item was first committed in this sequence. * We can't rely on just the log item being in the CIL, we have to check @@ -61,7 +72,7 @@ xlog_item_in_current_chkpt( struct xfs_cil *cil, struct xfs_log_item *lip) { - if (list_empty(&lip->li_cil)) + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) return false; /* @@ -93,15 +104,88 @@ xlog_cil_ctx_alloc(void) ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents); + INIT_LIST_HEAD(&ctx->log_items); + INIT_LIST_HEAD(&ctx->lv_chain); INIT_WORK(&ctx->push_work, xlog_cil_push_work); return ctx; } +/* + * Aggregate the CIL per cpu structures into global counts, lists, etc and + * clear the percpu state ready for the next context to use. This is called + * from the push code with the context lock held exclusively, hence nothing else + * will be accessing or modifying the per-cpu counters. + */ +static void +xlog_cil_push_pcp_aggregate( + struct xfs_cil *cil, + struct xfs_cil_ctx *ctx) +{ + struct xlog_cil_pcp *cilpcp; + int cpu; + + for_each_online_cpu(cpu) { + cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); + + ctx->ticket->t_curr_res += cilpcp->space_reserved; + cilpcp->space_reserved = 0; + + if (!list_empty(&cilpcp->busy_extents)) { + list_splice_init(&cilpcp->busy_extents, + &ctx->busy_extents); + } + if (!list_empty(&cilpcp->log_items)) + list_splice_init(&cilpcp->log_items, &ctx->log_items); + + /* + * We're in the middle of switching cil contexts. Reset the + * counter we use to detect when the current context is nearing + * full. + */ + cilpcp->space_used = 0; + } +} + +/* + * Aggregate the CIL per-cpu space used counters into the global atomic value. + * This is called when the per-cpu counter aggregation will first pass the soft + * limit threshold so we can switch to atomic counter aggregation for accurate + * detection of hard limit traversal. + */ +static void +xlog_cil_insert_pcp_aggregate( + struct xfs_cil *cil, + struct xfs_cil_ctx *ctx) +{ + struct xlog_cil_pcp *cilpcp; + int cpu; + int count = 0; + + /* Trigger atomic updates then aggregate only for the first caller */ + if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) + return; + + for_each_online_cpu(cpu) { + int old, prev; + + cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); + do { + old = cilpcp->space_used; + prev = cmpxchg(&cilpcp->space_used, old, 0); + } while (old != prev); + count += old; + } + atomic_add(count, &ctx->space_used); +} + static void xlog_cil_ctx_switch( struct xfs_cil *cil, struct xfs_cil_ctx *ctx) { + xlog_cil_set_iclog_hdr_count(cil); + set_bit(XLOG_CIL_EMPTY, &cil->xc_flags); + set_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags); ctx->sequence = ++cil->xc_current_sequence; ctx->cil = cil; cil->xc_ctx = ctx; @@ -123,6 +207,7 @@ xlog_cil_init_post_recovery( { log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); log->l_cilp->xc_ctx->sequence = 1; + xlog_cil_set_iclog_hdr_count(log->l_cilp); } static inline int @@ -254,6 +339,7 @@ xlog_cil_alloc_shadow_bufs( memset(lv, 0, xlog_cil_iovec_space(niovecs)); + INIT_LIST_HEAD(&lv->lv_list); lv->lv_item = lip; lv->lv_size = buf_size; if (ordered) @@ -269,7 +355,6 @@ xlog_cil_alloc_shadow_bufs( else lv->lv_buf_len = 0; lv->lv_bytes = 0; - lv->lv_next = NULL; } /* Ensure the lv is set up according to ->iop_size */ @@ -396,7 +481,6 @@ xlog_cil_insert_format_items( if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) { /* same or smaller, optimise common overwrite case */ lv = lip->li_lv; - lv->lv_next = NULL; if (ordered) goto insert; @@ -434,6 +518,23 @@ insert: } /* + * The use of lockless waitqueue_active() requires that the caller has + * serialised itself against the wakeup call in xlog_cil_push_work(). That + * can be done by either holding the push lock or the context lock. + */ +static inline bool +xlog_cil_over_hard_limit( + struct xlog *log, + int32_t space_used) +{ + if (waitqueue_active(&log->l_cilp->xc_push_wait)) + return true; + if (space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) + return true; + return false; +} + +/* * Insert the log items into the CIL and calculate the difference in space * consumed by the item. Add the space to the checkpoint ticket and calculate * if the change requires additional log metadata. If it does, take that space @@ -450,8 +551,10 @@ xlog_cil_insert_items( struct xfs_cil_ctx *ctx = cil->xc_ctx; struct xfs_log_item *lip; int len = 0; - int iclog_space; int iovhdr_res = 0, split_res = 0, ctx_res = 0; + int space_used; + int order; + struct xlog_cil_pcp *cilpcp; ASSERT(tp); @@ -461,93 +564,135 @@ xlog_cil_insert_items( */ xlog_cil_insert_format_items(log, tp, &len); - spin_lock(&cil->xc_cil_lock); + /* + * Subtract the space released by intent cancelation from the space we + * consumed so that we remove it from the CIL space and add it back to + * the current transaction reservation context. + */ + len -= released_space; - /* attach the transaction to the CIL if it has any busy extents */ - if (!list_empty(&tp->t_busy)) - list_splice_init(&tp->t_busy, &ctx->busy_extents); + /* + * Grab the per-cpu pointer for the CIL before we start any accounting. + * That ensures that we are running with pre-emption disabled and so we + * can't be scheduled away between split sample/update operations that + * are done without outside locking to serialise them. + */ + cilpcp = get_cpu_ptr(cil->xc_pcp); /* - * Now transfer enough transaction reservation to the context ticket - * for the checkpoint. The context ticket is special - the unit - * reservation has to grow as well as the current reservation as we - * steal from tickets so we can correctly determine the space used - * during the transaction commit. + * We need to take the CIL checkpoint unit reservation on the first + * commit into the CIL. Test the XLOG_CIL_EMPTY bit first so we don't + * unnecessarily do an atomic op in the fast path here. We can clear the + * XLOG_CIL_EMPTY bit as we are under the xc_ctx_lock here and that + * needs to be held exclusively to reset the XLOG_CIL_EMPTY bit. */ - if (ctx->ticket->t_curr_res == 0) { + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) && + test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) ctx_res = ctx->ticket->t_unit_res; - ctx->ticket->t_curr_res = ctx_res; - tp->t_ticket->t_curr_res -= ctx_res; - } - /* do we need space for more log record headers? */ - iclog_space = log->l_iclog_size - log->l_iclog_hsize; - if (len > 0 && (ctx->space_used / iclog_space != - (ctx->space_used + len) / iclog_space)) { - split_res = (len + iclog_space - 1) / iclog_space; - /* need to take into account split region headers, too */ - split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header); - ctx->ticket->t_unit_res += split_res; - ctx->ticket->t_curr_res += split_res; - tp->t_ticket->t_curr_res -= split_res; - ASSERT(tp->t_ticket->t_curr_res >= len); + /* + * Check if we need to steal iclog headers. atomic_read() is not a + * locked atomic operation, so we can check the value before we do any + * real atomic ops in the fast path. If we've already taken the CIL unit + * reservation from this commit, we've already got one iclog header + * space reserved so we have to account for that otherwise we risk + * overrunning the reservation on this ticket. + * + * If the CIL is already at the hard limit, we might need more header + * space that originally reserved. So steal more header space from every + * commit that occurs once we are over the hard limit to ensure the CIL + * push won't run out of reservation space. + * + * This can steal more than we need, but that's OK. + * + * The cil->xc_ctx_lock provides the serialisation necessary for safely + * calling xlog_cil_over_hard_limit() in this context. + */ + space_used = atomic_read(&ctx->space_used) + cilpcp->space_used + len; + if (atomic_read(&cil->xc_iclog_hdrs) > 0 || + xlog_cil_over_hard_limit(log, space_used)) { + split_res = log->l_iclog_hsize + + sizeof(struct xlog_op_header); + if (ctx_res) + ctx_res += split_res * (tp->t_ticket->t_iclog_hdrs - 1); + else + ctx_res = split_res * tp->t_ticket->t_iclog_hdrs; + atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs); } - tp->t_ticket->t_curr_res -= len; - tp->t_ticket->t_curr_res += released_space; - ctx->space_used += len; - ctx->space_used -= released_space; + cilpcp->space_reserved += ctx_res; /* - * If we've overrun the reservation, dump the tx details before we move - * the log items. Shutdown is imminent... + * Accurately account when over the soft limit, otherwise fold the + * percpu count into the global count if over the per-cpu threshold. */ - if (WARN_ON(tp->t_ticket->t_curr_res < 0)) { - xfs_warn(log->l_mp, "Transaction log reservation overrun:"); - xfs_warn(log->l_mp, - " log items: %d bytes (iov hdrs: %d bytes)", - len, iovhdr_res); - xfs_warn(log->l_mp, " split region headers: %d bytes", - split_res); - xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res); - xlog_print_trans(tp); + if (!test_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) { + atomic_add(len, &ctx->space_used); + } else if (cilpcp->space_used + len > + (XLOG_CIL_SPACE_LIMIT(log) / num_online_cpus())) { + space_used = atomic_add_return(cilpcp->space_used + len, + &ctx->space_used); + cilpcp->space_used = 0; + + /* + * If we just transitioned over the soft limit, we need to + * transition to the global atomic counter. + */ + if (space_used >= XLOG_CIL_SPACE_LIMIT(log)) + xlog_cil_insert_pcp_aggregate(cil, ctx); + } else { + cilpcp->space_used += len; } + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) + list_splice_init(&tp->t_busy, &cilpcp->busy_extents); /* - * Now (re-)position everything modified at the tail of the CIL. + * Now update the order of everything modified in the transaction + * and insert items into the CIL if they aren't already there. * We do this here so we only need to take the CIL lock once during * the transaction commit. */ + order = atomic_inc_return(&ctx->order_id); list_for_each_entry(lip, &tp->t_items, li_trans) { - /* Skip items which aren't dirty in this transaction. */ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; - /* - * Only move the item if it isn't already at the tail. This is - * to prevent a transient list_empty() state when reinserting - * an item that is already the only item in the CIL. - */ - if (!list_is_last(&lip->li_cil, &cil->xc_cil)) - list_move_tail(&lip->li_cil, &cil->xc_cil); + lip->li_order_id = order; + if (!list_empty(&lip->li_cil)) + continue; + list_add_tail(&lip->li_cil, &cilpcp->log_items); } + put_cpu_ptr(cilpcp); - spin_unlock(&cil->xc_cil_lock); - - if (tp->t_ticket->t_curr_res < 0) + /* + * If we've overrun the reservation, dump the tx details before we move + * the log items. Shutdown is imminent... + */ + tp->t_ticket->t_curr_res -= ctx_res + len; + if (WARN_ON(tp->t_ticket->t_curr_res < 0)) { + xfs_warn(log->l_mp, "Transaction log reservation overrun:"); + xfs_warn(log->l_mp, + " log items: %d bytes (iov hdrs: %d bytes)", + len, iovhdr_res); + xfs_warn(log->l_mp, " split region headers: %d bytes", + split_res); + xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res); + xlog_print_trans(tp); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); + } } static void xlog_cil_free_logvec( - struct xfs_log_vec *log_vector) + struct list_head *lv_chain) { struct xfs_log_vec *lv; - for (lv = log_vector; lv; ) { - struct xfs_log_vec *next = lv->lv_next; + while (!list_empty(lv_chain)) { + lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list); + list_del_init(&lv->lv_list); kmem_free(lv); - lv = next; } } @@ -647,7 +792,7 @@ xlog_cil_committed( spin_unlock(&ctx->cil->xc_push_lock); } - xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain, ctx->start_lsn, abort); xfs_extent_busy_sort(&ctx->busy_extents); @@ -658,7 +803,7 @@ xlog_cil_committed( list_del(&ctx->committing); spin_unlock(&ctx->cil->xc_push_lock); - xlog_cil_free_logvec(ctx->lv_chain); + xlog_cil_free_logvec(&ctx->lv_chain); if (!list_empty(&ctx->busy_extents)) xlog_discard_busy_extents(mp, ctx); @@ -817,7 +962,6 @@ restart: static int xlog_cil_write_chain( struct xfs_cil_ctx *ctx, - struct xfs_log_vec *chain, uint32_t chain_len) { struct xlog *log = ctx->cil->xc_log; @@ -826,7 +970,7 @@ xlog_cil_write_chain( error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD); if (error) return error; - return xlog_write(log, ctx, chain, ctx->ticket, chain_len); + return xlog_write(log, ctx, &ctx->lv_chain, ctx->ticket, chain_len); } /* @@ -855,6 +999,8 @@ xlog_cil_write_commit_record( .lv_iovecp = ®, }; int error; + LIST_HEAD(lv_chain); + list_add(&vec.lv_list, &lv_chain); if (xlog_is_shutdown(log)) return -EIO; @@ -865,7 +1011,7 @@ xlog_cil_write_commit_record( /* account for space used by record data */ ctx->ticket->t_curr_res -= reg.i_len; - error = xlog_write(log, ctx, &vec, ctx->ticket, reg.i_len); + error = xlog_write(log, ctx, &lv_chain, ctx->ticket, reg.i_len); if (error) xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; @@ -931,12 +1077,31 @@ xlog_cil_build_trans_hdr( lvhdr->lv_niovecs = 2; lvhdr->lv_iovecp = &hdr->lhdr[0]; lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len; - lvhdr->lv_next = ctx->lv_chain; tic->t_curr_res -= lvhdr->lv_bytes; } /* + * CIL item reordering compare function. We want to order in ascending ID order, + * but we want to leave items with the same ID in the order they were added to + * the list. This is important for operations like reflink where we log 4 order + * dependent intents in a single transaction when we overwrite an existing + * shared extent with a new shared extent. i.e. BUI(unmap), CUI(drop), + * CUI (inc), BUI(remap)... + */ +static int +xlog_cil_order_cmp( + void *priv, + const struct list_head *a, + const struct list_head *b) +{ + struct xfs_log_vec *l1 = container_of(a, struct xfs_log_vec, lv_list); + struct xfs_log_vec *l2 = container_of(b, struct xfs_log_vec, lv_list); + + return l1->lv_order_id > l2->lv_order_id; +} + +/* * Pull all the log vectors off the items in the CIL, and remove the items from * the CIL. We don't need the CIL lock here because it's only needed on the * transaction commit side which is currently locked out by the flush lock. @@ -947,18 +1112,16 @@ xlog_cil_build_trans_hdr( */ static void xlog_cil_build_lv_chain( - struct xfs_cil *cil, struct xfs_cil_ctx *ctx, struct list_head *whiteouts, uint32_t *num_iovecs, uint32_t *num_bytes) { - struct xfs_log_vec *lv = NULL; - - while (!list_empty(&cil->xc_cil)) { + while (!list_empty(&ctx->log_items)) { struct xfs_log_item *item; + struct xfs_log_vec *lv; - item = list_first_entry(&cil->xc_cil, + item = list_first_entry(&ctx->log_items, struct xfs_log_item, li_cil); if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) { @@ -967,18 +1130,18 @@ xlog_cil_build_lv_chain( continue; } - list_del_init(&item->li_cil); - if (!ctx->lv_chain) - ctx->lv_chain = item->li_lv; - else - lv->lv_next = item->li_lv; lv = item->li_lv; - item->li_lv = NULL; - *num_iovecs += lv->lv_niovecs; + lv->lv_order_id = item->li_order_id; /* we don't write ordered log vectors */ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) *num_bytes += lv->lv_bytes; + *num_iovecs += lv->lv_niovecs; + list_add_tail(&lv->lv_list, &ctx->lv_chain); + + list_del_init(&item->li_cil); + item->li_order_id = 0; + item->li_lv = NULL; } } @@ -1022,10 +1185,11 @@ xlog_cil_push_work( int num_bytes = 0; int error = 0; struct xlog_cil_trans_hdr thdr; - struct xfs_log_vec lvhdr = { NULL }; + struct xfs_log_vec lvhdr = {}; xfs_csn_t push_seq; bool push_commit_stable; LIST_HEAD (whiteouts); + struct xlog_ticket *ticket; new_ctx = xlog_cil_ctx_alloc(); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -1049,12 +1213,14 @@ xlog_cil_push_work( if (waitqueue_active(&cil->xc_push_wait)) wake_up_all(&cil->xc_push_wait); + xlog_cil_push_pcp_aggregate(cil, ctx); + /* * Check if we've anything to push. If there is nothing, then we don't * move on to a new sequence number and so we have to be able to push * this sequence again later. */ - if (list_empty(&cil->xc_cil)) { + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) { cil->xc_push_seq = 0; spin_unlock(&cil->xc_push_lock); goto out_skip; @@ -1094,7 +1260,7 @@ xlog_cil_push_work( list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); - xlog_cil_build_lv_chain(cil, ctx, &whiteouts, &num_iovecs, &num_bytes); + xlog_cil_build_lv_chain(ctx, &whiteouts, &num_iovecs, &num_bytes); /* * Switch the contexts so we can drop the context lock and move out @@ -1127,14 +1293,30 @@ xlog_cil_push_work( up_write(&cil->xc_ctx_lock); /* + * Sort the log vector chain before we add the transaction headers. + * This ensures we always have the transaction headers at the start + * of the chain. + */ + list_sort(NULL, &ctx->lv_chain, xlog_cil_order_cmp); + + /* * Build a checkpoint transaction header and write it to the log to * begin the transaction. We need to account for the space used by the * transaction header here as it is not accounted for in xlog_write(). + * Add the lvhdr to the head of the lv chain we pass to xlog_write() so + * it gets written into the iclog first. */ xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs); num_bytes += lvhdr.lv_bytes; + list_add(&lvhdr.lv_list, &ctx->lv_chain); - error = xlog_cil_write_chain(ctx, &lvhdr, num_bytes); + /* + * Take the lvhdr back off the lv_chain immediately after calling + * xlog_cil_write_chain() as it should not be passed to log IO + * completion. + */ + error = xlog_cil_write_chain(ctx, num_bytes); + list_del(&lvhdr.lv_list); if (error) goto out_abort_free_ticket; @@ -1142,7 +1324,14 @@ xlog_cil_push_work( if (error) goto out_abort_free_ticket; - xfs_log_ticket_ungrant(log, ctx->ticket); + /* + * Grab the ticket from the ctx so we can ungrant it after releasing the + * commit_iclog. The ctx may be freed by the time we return from + * releasing the commit_iclog (i.e. checkpoint has been completed and + * callback run) so we can't reference the ctx after the call to + * xlog_state_release_iclog(). + */ + ticket = ctx->ticket; /* * If the checkpoint spans multiple iclogs, wait for all previous iclogs @@ -1192,12 +1381,14 @@ xlog_cil_push_work( if (push_commit_stable && ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(log, ctx->commit_iclog, 0); - xlog_state_release_iclog(log, ctx->commit_iclog); + ticket = ctx->ticket; + xlog_state_release_iclog(log, ctx->commit_iclog, ticket); /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); xlog_cil_cleanup_whiteouts(&whiteouts); + xfs_log_ticket_ungrant(log, ticket); return; out_skip: @@ -1207,17 +1398,19 @@ out_skip: return; out_abort_free_ticket: - xfs_log_ticket_ungrant(log, ctx->ticket); ASSERT(xlog_is_shutdown(log)); xlog_cil_cleanup_whiteouts(&whiteouts); if (!ctx->commit_iclog) { + xfs_log_ticket_ungrant(log, ctx->ticket); xlog_cil_committed(ctx); return; } spin_lock(&log->l_icloglock); - xlog_state_release_iclog(log, ctx->commit_iclog); + ticket = ctx->ticket; + xlog_state_release_iclog(log, ctx->commit_iclog, ticket); /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); + xfs_log_ticket_ungrant(log, ticket); } /* @@ -1232,18 +1425,27 @@ xlog_cil_push_background( struct xlog *log) __releases(cil->xc_ctx_lock) { struct xfs_cil *cil = log->l_cilp; + int space_used = atomic_read(&cil->xc_ctx->space_used); /* * The cil won't be empty because we are called while holding the - * context lock so whatever we added to the CIL will still be there + * context lock so whatever we added to the CIL will still be there. */ - ASSERT(!list_empty(&cil->xc_cil)); + ASSERT(!test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); /* - * Don't do a background push if we haven't used up all the - * space available yet. + * We are done if: + * - we haven't used up all the space available yet; or + * - we've already queued up a push; and + * - we're not over the hard limit; and + * - nothing has been over the hard limit. + * + * If so, we don't need to take the push lock as there's nothing to do. */ - if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) { + if (space_used < XLOG_CIL_SPACE_LIMIT(log) || + (cil->xc_push_seq == cil->xc_current_sequence && + space_used < XLOG_CIL_BLOCKING_SPACE_LIMIT(log) && + !waitqueue_active(&cil->xc_push_wait))) { up_read(&cil->xc_ctx_lock); return; } @@ -1270,12 +1472,11 @@ xlog_cil_push_background( * dipping back down under the hard limit. * * The ctx->xc_push_lock provides the serialisation necessary for safely - * using the lockless waitqueue_active() check in this context. + * calling xlog_cil_over_hard_limit() in this context. */ - if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) || - waitqueue_active(&cil->xc_push_wait)) { + if (xlog_cil_over_hard_limit(log, space_used)) { trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); - ASSERT(cil->xc_ctx->space_used < log->l_logsize); + ASSERT(space_used < log->l_logsize); xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock); return; } @@ -1334,7 +1535,8 @@ xlog_cil_push_now( * If the CIL is empty or we've already pushed the sequence then * there's no more work that we need to do. */ - if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) || + push_seq <= cil->xc_push_seq) { spin_unlock(&cil->xc_push_lock); return; } @@ -1352,7 +1554,7 @@ xlog_cil_empty( bool empty = false; spin_lock(&cil->xc_push_lock); - if (list_empty(&cil->xc_cil)) + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) empty = true; spin_unlock(&cil->xc_push_lock); return empty; @@ -1483,7 +1685,7 @@ xlog_cil_flush( * If the CIL is empty, make sure that any previous checkpoint that may * still be in an active iclog is pushed to stable storage. */ - if (list_empty(&log->l_cilp->xc_cil)) + if (test_bit(XLOG_CIL_EMPTY, &log->l_cilp->xc_flags)) xfs_log_force(log->l_mp, 0); } @@ -1568,7 +1770,7 @@ restart: * we would have found the context on the committing list. */ if (sequence == cil->xc_current_sequence && - !list_empty(&cil->xc_cil)) { + !test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) { spin_unlock(&cil->xc_push_lock); goto restart; } @@ -1589,14 +1791,48 @@ out_shutdown: } /* + * Move dead percpu state to the relevant CIL context structures. + * + * We have to lock the CIL context here to ensure that nothing is modifying + * the percpu state, either addition or removal. Both of these are done under + * the CIL context lock, so grabbing that exclusively here will ensure we can + * safely drain the cilpcp for the CPU that is dying. + */ +void +xlog_cil_pcp_dead( + struct xlog *log, + unsigned int cpu) +{ + struct xfs_cil *cil = log->l_cilp; + struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); + struct xfs_cil_ctx *ctx; + + down_write(&cil->xc_ctx_lock); + ctx = cil->xc_ctx; + if (ctx->ticket) + ctx->ticket->t_curr_res += cilpcp->space_reserved; + cilpcp->space_reserved = 0; + + if (!list_empty(&cilpcp->log_items)) + list_splice_init(&cilpcp->log_items, &ctx->log_items); + if (!list_empty(&cilpcp->busy_extents)) + list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents); + atomic_add(cilpcp->space_used, &ctx->space_used); + cilpcp->space_used = 0; + up_write(&cil->xc_ctx_lock); +} + +/* * Perform initial CIL structure initialisation. */ int xlog_cil_init( - struct xlog *log) + struct xlog *log) { - struct xfs_cil *cil; - struct xfs_cil_ctx *ctx; + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + struct xlog_cil_pcp *cilpcp; + int cpu; cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL); if (!cil) @@ -1611,22 +1847,31 @@ xlog_cil_init( if (!cil->xc_push_wq) goto out_destroy_cil; - INIT_LIST_HEAD(&cil->xc_cil); + cil->xc_log = log; + cil->xc_pcp = alloc_percpu(struct xlog_cil_pcp); + if (!cil->xc_pcp) + goto out_destroy_wq; + + for_each_possible_cpu(cpu) { + cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); + INIT_LIST_HEAD(&cilpcp->busy_extents); + INIT_LIST_HEAD(&cilpcp->log_items); + } + INIT_LIST_HEAD(&cil->xc_committing); - spin_lock_init(&cil->xc_cil_lock); spin_lock_init(&cil->xc_push_lock); init_waitqueue_head(&cil->xc_push_wait); init_rwsem(&cil->xc_ctx_lock); init_waitqueue_head(&cil->xc_start_wait); init_waitqueue_head(&cil->xc_commit_wait); - cil->xc_log = log; log->l_cilp = cil; ctx = xlog_cil_ctx_alloc(); xlog_cil_ctx_switch(cil, ctx); - return 0; +out_destroy_wq: + destroy_workqueue(cil->xc_push_wq); out_destroy_cil: kmem_free(cil); return -ENOMEM; @@ -1636,14 +1881,17 @@ void xlog_cil_destroy( struct xlog *log) { - if (log->l_cilp->xc_ctx) { - if (log->l_cilp->xc_ctx->ticket) - xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); - kmem_free(log->l_cilp->xc_ctx); + struct xfs_cil *cil = log->l_cilp; + + if (cil->xc_ctx) { + if (cil->xc_ctx->ticket) + xfs_log_ticket_put(cil->xc_ctx->ticket); + kmem_free(cil->xc_ctx); } - ASSERT(list_empty(&log->l_cilp->xc_cil)); - destroy_workqueue(log->l_cilp->xc_push_wq); - kmem_free(log->l_cilp); + ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); + free_percpu(cil->xc_pcp); + destroy_workqueue(cil->xc_push_wq); + kmem_free(cil); } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 686c01eb3661..1bd2963e8fbd 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -143,15 +143,16 @@ enum xlog_iclog_state { #define XLOG_COVER_OPS 5 typedef struct xlog_ticket { - struct list_head t_queue; /* reserve/write queue */ - struct task_struct *t_task; /* task that owns this ticket */ - xlog_tid_t t_tid; /* transaction identifier : 4 */ - atomic_t t_ref; /* ticket reference count : 4 */ - int t_curr_res; /* current reservation in bytes : 4 */ - int t_unit_res; /* unit reservation in bytes : 4 */ - char t_ocnt; /* original count : 1 */ - char t_cnt; /* current count : 1 */ - uint8_t t_flags; /* properties of reservation : 1 */ + struct list_head t_queue; /* reserve/write queue */ + struct task_struct *t_task; /* task that owns this ticket */ + xlog_tid_t t_tid; /* transaction identifier */ + atomic_t t_ref; /* ticket reference count */ + int t_curr_res; /* current reservation */ + int t_unit_res; /* unit reservation */ + char t_ocnt; /* original unit count */ + char t_cnt; /* current unit count */ + uint8_t t_flags; /* properties of reservation */ + int t_iclog_hdrs; /* iclog hdrs in t_curr_res */ } xlog_ticket_t; /* @@ -221,13 +222,25 @@ struct xfs_cil_ctx { xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ - int space_used; /* aggregate size of regions */ + atomic_t space_used; /* aggregate size of regions */ struct list_head busy_extents; /* busy extents in chkpt */ - struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + struct list_head log_items; /* log items in chkpt */ + struct list_head lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ struct work_struct discard_endio_work; struct work_struct push_work; + atomic_t order_id; +}; + +/* + * Per-cpu CIL tracking items + */ +struct xlog_cil_pcp { + int32_t space_used; + uint32_t space_reserved; + struct list_head busy_extents; + struct list_head log_items; }; /* @@ -248,8 +261,8 @@ struct xfs_cil_ctx { */ struct xfs_cil { struct xlog *xc_log; - struct list_head xc_cil; - spinlock_t xc_cil_lock; + unsigned long xc_flags; + atomic_t xc_iclog_hdrs; struct workqueue_struct *xc_push_wq; struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; @@ -263,8 +276,17 @@ struct xfs_cil { wait_queue_head_t xc_start_wait; xfs_csn_t xc_current_sequence; wait_queue_head_t xc_push_wait; /* background push throttle */ + + void __percpu *xc_pcp; /* percpu CIL structures */ +#ifdef CONFIG_HOTPLUG_CPU + struct list_head xc_pcp_list; +#endif } ____cacheline_aligned_in_smp; +/* xc_flags bit values */ +#define XLOG_CIL_EMPTY 1 +#define XLOG_CIL_PCP_SPACE 2 + /* * The amount of log space we allow the CIL to aggregate is difficult to size. * Whatever we choose, we have to make sure we can get a reservation for the @@ -486,14 +508,15 @@ struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx, - struct xfs_log_vec *log_vector, struct xlog_ticket *tic, + struct list_head *lv_chain, struct xlog_ticket *tic, uint32_t len); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, int eventual_size); -int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); +int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, + struct xlog_ticket *ticket); /* * When we crack an atomic LSN, we sample it first so that the value will not @@ -682,4 +705,9 @@ xlog_kvmalloc( return p; } +/* + * CIL CPU dead notifier + */ +void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu); + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5f7e4e6e33ce..17e923b9c5fa 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -122,7 +122,7 @@ xlog_do_io( xfs_daddr_t blk_no, unsigned int nbblks, char *data, - unsigned int op) + enum req_op op) { int error; @@ -2629,21 +2629,21 @@ xlog_recover_cancel_intents( */ STATIC void xlog_recover_clear_agi_bucket( - xfs_mount_t *mp, - xfs_agnumber_t agno, - int bucket) + struct xfs_perag *pag, + int bucket) { - xfs_trans_t *tp; - xfs_agi_t *agi; - struct xfs_buf *agibp; - int offset; - int error; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_trans *tp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + int offset; + int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp); if (error) goto out_error; - error = xfs_read_agi(mp, tp, agno, &agibp); + error = xfs_read_agi(pag, tp, &agibp); if (error) goto out_abort; @@ -2662,60 +2662,62 @@ xlog_recover_clear_agi_bucket( out_abort: xfs_trans_cancel(tp); out_error: - xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); + xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, + pag->pag_agno); return; } -STATIC xfs_agino_t -xlog_recover_process_one_iunlink( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agino_t agino, - int bucket) +static int +xlog_recover_iunlink_bucket( + struct xfs_perag *pag, + struct xfs_agi *agi, + int bucket) { - struct xfs_buf *ibp; - struct xfs_dinode *dip; - struct xfs_inode *ip; - xfs_ino_t ino; - int error; - - ino = XFS_AGINO_TO_INO(mp, agno, agino); - error = xfs_iget(mp, NULL, ino, 0, 0, &ip); - if (error) - goto fail; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *prev_ip = NULL; + struct xfs_inode *ip; + xfs_agino_t prev_agino, agino; + int error = 0; - /* - * Get the on disk inode to find the next inode in the bucket. - */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &ibp); - if (error) - goto fail_iput; - dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); + agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (agino != NULLAGINO) { + error = xfs_iget(mp, NULL, + XFS_AGINO_TO_INO(mp, pag->pag_agno, agino), + 0, 0, &ip); + if (error) + break; - xfs_iflags_clear(ip, XFS_IRECOVERY); - ASSERT(VFS_I(ip)->i_nlink == 0); - ASSERT(VFS_I(ip)->i_mode != 0); + ASSERT(VFS_I(ip)->i_nlink == 0); + ASSERT(VFS_I(ip)->i_mode != 0); + xfs_iflags_clear(ip, XFS_IRECOVERY); + agino = ip->i_next_unlinked; - /* setup for the next pass */ - agino = be32_to_cpu(dip->di_next_unlinked); - xfs_buf_relse(ibp); + if (prev_ip) { + ip->i_prev_unlinked = prev_agino; + xfs_irele(prev_ip); - xfs_irele(ip); - return agino; + /* + * Ensure the inode is removed from the unlinked list + * before we continue so that it won't race with + * building the in-memory list here. This could be + * serialised with the agibp lock, but that just + * serialises via lockstepping and it's much simpler + * just to flush the inodegc queue and wait for it to + * complete. + */ + xfs_inodegc_flush(mp); + } - fail_iput: - xfs_irele(ip); - fail: - /* - * We can't read in the inode this bucket points to, or this inode - * is messed up. Just ditch this bucket of inodes. We will lose - * some inodes and space, but at least we won't hang. - * - * Call xlog_recover_clear_agi_bucket() to perform a transaction to - * clear the inode pointer in the bucket. - */ - xlog_recover_clear_agi_bucket(mp, agno, bucket); - return NULLAGINO; + prev_agino = agino; + prev_ip = ip; + } + + if (prev_ip) { + ip->i_prev_unlinked = prev_agino; + xfs_irele(prev_ip); + } + xfs_inodegc_flush(mp); + return error; } /* @@ -2741,59 +2743,70 @@ xlog_recover_process_one_iunlink( * scheduled on this CPU to ensure other scheduled work can run without undue * latency. */ -STATIC void -xlog_recover_process_iunlinks( - struct xlog *log) +static void +xlog_recover_iunlink_ag( + struct xfs_perag *pag) { - struct xfs_mount *mp = log->l_mp; - struct xfs_perag *pag; - xfs_agnumber_t agno; struct xfs_agi *agi; struct xfs_buf *agibp; - xfs_agino_t agino; int bucket; int error; - for_each_perag(mp, agno, pag) { - error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp); + error = xfs_read_agi(pag, NULL, &agibp); + if (error) { + /* + * AGI is b0rked. Don't process it. + * + * We should probably mark the filesystem as corrupt after we've + * recovered all the ag's we can.... + */ + return; + } + + /* + * Unlock the buffer so that it can be acquired in the normal course of + * the transaction to truncate and free each inode. Because we are not + * racing with anyone else here for the AGI buffer, we don't even need + * to hold it locked to read the initial unlinked bucket entries out of + * the buffer. We keep buffer reference though, so that it stays pinned + * in memory while we need the buffer. + */ + agi = agibp->b_addr; + xfs_buf_unlock(agibp); + + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { + error = xlog_recover_iunlink_bucket(pag, agi, bucket); if (error) { /* - * AGI is b0rked. Don't process it. - * - * We should probably mark the filesystem as corrupt - * after we've recovered all the ag's we can.... + * Bucket is unrecoverable, so only a repair scan can + * free the remaining unlinked inodes. Just empty the + * bucket and remaining inodes on it unreferenced and + * unfreeable. */ - continue; + xfs_inodegc_flush(pag->pag_mount); + xlog_recover_clear_agi_bucket(pag, bucket); } - /* - * Unlock the buffer so that it can be acquired in the normal - * course of the transaction to truncate and free each inode. - * Because we are not racing with anyone else here for the AGI - * buffer, we don't even need to hold it locked to read the - * initial unlinked bucket entries out of the buffer. We keep - * buffer reference though, so that it stays pinned in memory - * while we need the buffer. - */ - agi = agibp->b_addr; - xfs_buf_unlock(agibp); - - for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { - agino = be32_to_cpu(agi->agi_unlinked[bucket]); - while (agino != NULLAGINO) { - agino = xlog_recover_process_one_iunlink(mp, - pag->pag_agno, agino, bucket); - cond_resched(); - } - } - xfs_buf_rele(agibp); } + xfs_buf_rele(agibp); +} + +static void +xlog_recover_process_iunlinks( + struct xlog *log) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for_each_perag(log->l_mp, agno, pag) + xlog_recover_iunlink_ag(pag); + /* * Flush the pending unlinked inodes to ensure that the inactivations * are fully completed on disk and the incore inodes can be reclaimed * before we signal that recovery is complete. */ - xfs_inodegc_flush(mp); + xfs_inodegc_flush(log->l_mp); } STATIC void @@ -3313,7 +3326,8 @@ xlog_do_recover( /* re-initialise in-core superblock and geometry structures */ mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); - error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks, + &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); return error; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index daa8d29c46b4..f10c88cee116 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -778,7 +778,8 @@ xfs_mountfs( /* * Allocate and initialize the per-ag data. */ - error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks, + &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed per-ag init: %d", error); goto out_free_dir; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index ba5d42abf66e..8aca2cc173ac 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -61,7 +61,7 @@ struct xfs_error_cfg { */ struct xfs_inodegc { struct llist_head list; - struct work_struct work; + struct delayed_work work; /* approximate count of inodes in the list */ unsigned int items; @@ -454,6 +454,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname, #define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */ #define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */ #define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */ +#define SHUTDOWN_CORRUPT_ONDISK (1u << 4) /* corrupt metadata on device */ #define XFS_SHUTDOWN_STRINGS \ { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \ diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c new file mode 100644 index 000000000000..69d9c83ea4b2 --- /dev/null +++ b/fs/xfs/xfs_notify_failure.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Fujitsu. All Rights Reserved. + */ + +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_alloc.h" +#include "xfs_bit.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_trans.h" +#include "xfs_ag.h" + +#include <linux/mm.h> +#include <linux/dax.h> + +struct failure_info { + xfs_agblock_t startblock; + xfs_extlen_t blockcount; + int mf_flags; +}; + +static pgoff_t +xfs_failure_pgoff( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rec, + const struct failure_info *notify) +{ + loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset); + + if (notify->startblock > rec->rm_startblock) + pos += XFS_FSB_TO_B(mp, + notify->startblock - rec->rm_startblock); + return pos >> PAGE_SHIFT; +} + +static unsigned long +xfs_failure_pgcnt( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rec, + const struct failure_info *notify) +{ + xfs_agblock_t end_rec; + xfs_agblock_t end_notify; + xfs_agblock_t start_cross; + xfs_agblock_t end_cross; + + start_cross = max(rec->rm_startblock, notify->startblock); + + end_rec = rec->rm_startblock + rec->rm_blockcount; + end_notify = notify->startblock + notify->blockcount; + end_cross = min(end_rec, end_notify); + + return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT; +} + +static int +xfs_dax_failure_fn( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *data) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip; + struct failure_info *notify = data; + int error = 0; + + if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || + (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } + + /* Get files that incore, filter out others that are not in use. */ + error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE, + 0, &ip); + /* Continue the rmap query if the inode isn't incore */ + if (error == -ENODATA) + return 0; + if (error) + return error; + + error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, + xfs_failure_pgoff(mp, rec, notify), + xfs_failure_pgcnt(mp, rec, notify), + notify->mf_flags); + xfs_irele(ip); + return error; +} + +static int +xfs_dax_notify_ddev_failure( + struct xfs_mount *mp, + xfs_daddr_t daddr, + xfs_daddr_t bblen, + int mf_flags) +{ + struct xfs_trans *tp = NULL; + struct xfs_btree_cur *cur = NULL; + struct xfs_buf *agf_bp = NULL; + int error = 0; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); + xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen); + xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + for (; agno <= end_agno; agno++) { + struct xfs_rmap_irec ri_low = { }; + struct xfs_rmap_irec ri_high; + struct failure_info notify; + struct xfs_agf *agf; + xfs_agblock_t agend; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); + if (error) { + xfs_perag_put(pag); + break; + } + + cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); + + /* + * Set the rmap range from ri_low to ri_high, which represents + * a [start, end] where we looking for the files or metadata. + */ + memset(&ri_high, 0xFF, sizeof(ri_high)); + ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno); + if (agno == end_agno) + ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); + + agf = agf_bp->b_addr; + agend = min(be32_to_cpu(agf->agf_length), + ri_high.rm_startblock); + notify.startblock = ri_low.rm_startblock; + notify.blockcount = agend - ri_low.rm_startblock; + + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, + xfs_dax_failure_fn, ¬ify); + xfs_btree_del_cursor(cur, error); + xfs_trans_brelse(tp, agf_bp); + xfs_perag_put(pag); + if (error) + break; + + fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0); + } + + xfs_trans_cancel(tp); + return error; +} + +static int +xfs_dax_notify_failure( + struct dax_device *dax_dev, + u64 offset, + u64 len, + int mf_flags) +{ + struct xfs_mount *mp = dax_holder(dax_dev); + u64 ddev_start; + u64 ddev_end; + + if (!(mp->m_sb.sb_flags & SB_BORN)) { + xfs_warn(mp, "filesystem is not ready for notify_failure()!"); + return -EIO; + } + + if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) { + xfs_warn(mp, + "notify_failure() not supported on realtime device!"); + return -EOPNOTSUPP; + } + + if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && + mp->m_logdev_targp != mp->m_ddev_targp) { + xfs_err(mp, "ondisk log corrupt, shutting down fs!"); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } + + if (!xfs_has_rmapbt(mp)) { + xfs_warn(mp, "notify_failure() needs rmapbt enabled!"); + return -EOPNOTSUPP; + } + + ddev_start = mp->m_ddev_targp->bt_dax_part_off; + ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; + + /* Ignore the range out of filesystem area */ + if (offset + len < ddev_start) + return -ENXIO; + if (offset > ddev_end) + return -ENXIO; + + /* Calculate the real range when it touches the boundary */ + if (offset > ddev_start) + offset -= ddev_start; + else { + len -= ddev_start - offset; + offset = 0; + } + if (offset + len > ddev_end) + len -= ddev_end - offset; + + return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len), + mf_flags); +} + +const struct dax_holder_operations xfs_dax_holder_operations = { + .notify_failure = xfs_dax_notify_failure, +}; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index abf08bbf34a9..fbff7924ff3f 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -677,7 +677,8 @@ xfs_qm_init_quotainfo( qinf->qi_shrinker.seeks = DEFAULT_SEEKS; qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; - error = register_shrinker(&qinf->qi_shrinker); + error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s", + mp->m_super->s_id); if (error) goto out_free_inos; @@ -1154,7 +1155,7 @@ xfs_qm_dqusage_adjust( ASSERT(ip->i_delayed_blks == 0); if (XFS_IS_REALTIME_INODE(ip)) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); if (error) @@ -1229,12 +1230,11 @@ xfs_qm_flush_one( */ if (!xfs_dqflock_nowait(dqp)) { /* buf is pinned in-core by delwri list */ - bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0); - if (!bp) { - error = -EINVAL; + error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) goto out_unlock; - } + xfs_buf_unlock(bp); xfs_buf_delwri_pushbuf(bp, buffer_list); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 74ac9ca9e119..392cb39cc10c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -454,9 +454,12 @@ xfs_qm_scall_getquota( struct xfs_dquot *dqp; int error; - /* Flush inodegc work at the start of a quota reporting scan. */ + /* + * Expedite pending inodegc work at the start of a quota reporting + * scan but don't block waiting for it to complete. + */ if (id == 0) - xfs_inodegc_flush(mp); + xfs_inodegc_push(mp); /* * Try to get the dquot. We don't want it allocated on disk, so don't @@ -498,7 +501,7 @@ xfs_qm_scall_getquota_next( /* Flush inodegc work at the start of a quota reporting scan. */ if (*id == 0) - xfs_inodegc_flush(mp); + xfs_inodegc_push(mp); error = xfs_qm_dqget_next(mp, *id, type, &dqp); if (error) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e7a7c00d93be..e17a84e8b527 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -125,11 +125,10 @@ * shared blocks. If there are no shared extents, fbno and flen will * be set to NULLAGBLOCK and 0, respectively. */ -int +static int xfs_reflink_find_shared( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, @@ -140,11 +139,11 @@ xfs_reflink_find_shared( struct xfs_btree_cur *cur; int error; - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agbp->b_pag); + cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag); error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, find_end_of_shared); @@ -171,7 +170,8 @@ xfs_reflink_trim_around_shared( struct xfs_bmbt_irec *irec, bool *shared) { - xfs_agnumber_t agno; + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; xfs_agblock_t agbno; xfs_extlen_t aglen; xfs_agblock_t fbno; @@ -186,12 +186,13 @@ xfs_reflink_trim_around_shared( trace_xfs_reflink_trim_around_shared(ip, irec); - agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); - agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); + agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); aglen = irec->br_blockcount; - error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno, - aglen, &fbno, &flen, true); + error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen, + true); + xfs_perag_put(pag); if (error) return error; @@ -452,7 +453,7 @@ xfs_reflink_cancel_cow_blocks( xfs_fileoff_t end_fsb, bool cancel_real) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; int error = 0; @@ -593,7 +594,7 @@ xfs_reflink_end_cow_extent( struct xfs_bmbt_irec got, del, data; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); unsigned int resblks; int nmaps; int error; @@ -1363,12 +1364,16 @@ xfs_reflink_remap_prep( if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) goto out_unlock; - /* Don't share DAX file data for now. */ - if (IS_DAX(inode_in) || IS_DAX(inode_out)) + /* Don't share DAX file data with non-DAX file. */ + if (IS_DAX(inode_in) != IS_DAX(inode_out)) goto out_unlock; - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, - len, remap_flags); + if (!IS_DAX(inode_in)) + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags); + else + ret = dax_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, &xfs_read_iomap_ops); if (ret || *len == 0) goto out_unlock; @@ -1420,16 +1425,11 @@ xfs_reflink_inode_has_shared_extents( struct xfs_bmbt_irec got; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - xfs_extlen_t aglen; - xfs_agblock_t rbno; - xfs_extlen_t rlen; struct xfs_iext_cursor icur; bool found; int error; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); if (error) return error; @@ -1437,17 +1437,25 @@ xfs_reflink_inode_has_shared_extents( *has_shared = false; found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got); while (found) { + struct xfs_perag *pag; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t rbno; + xfs_extlen_t rlen; + if (isnullstartblock(got.br_startblock) || got.br_state != XFS_EXT_NORM) goto next; - agno = XFS_FSB_TO_AGNO(mp, got.br_startblock); + + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock)); agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock); aglen = got.br_blockcount; - - error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen, + error = xfs_reflink_find_shared(pag, tp, agbno, aglen, &rbno, &rlen, false); + xfs_perag_put(pag); if (error) return error; + /* Is there still a shared block here? */ if (rbno != NULLAGBLOCK) { *has_shared = true; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index bea65f2fe657..65c5dfe17ecf 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -16,9 +16,6 @@ static inline bool xfs_is_cow_inode(struct xfs_inode *ip) return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); } -extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, - xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ed18160e6181..9ac59814bbb6 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -40,6 +40,7 @@ #include "xfs_defer.h" #include "xfs_attr_item.h" #include "xfs_xattr.h" +#include "xfs_iunlink_item.h" #include <linux/magic.h> #include <linux/fs_context.h> @@ -350,8 +351,10 @@ xfs_setup_dax_always( goto disable_dax; } - if (xfs_has_reflink(mp)) { - xfs_alert(mp, "DAX and reflink cannot be used together!"); + if (xfs_has_reflink(mp) && + bdev_is_partition(mp->m_ddev_targp->bt_bdev)) { + xfs_alert(mp, + "DAX and reflink cannot work with multi-partitions!"); return -EINVAL; } @@ -797,8 +800,11 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree; - /* Wait for whatever inactivations are in progress. */ - xfs_inodegc_flush(mp); + /* + * Expedite background inodegc but don't wait. We do not want to block + * here waiting hours for a billion extent file to be truncated. + */ + xfs_inodegc_push(mp); statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -1074,7 +1080,7 @@ xfs_inodegc_init_percpu( gc = per_cpu_ptr(mp->m_inodegc, cpu); init_llist_head(&gc->list); gc->items = 0; - INIT_WORK(&gc->work, xfs_inodegc_worker); + INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); } return 0; } @@ -1963,11 +1969,19 @@ xfs_init_caches(void) { int error; + xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, + SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD, + NULL); + if (!xfs_buf_cache) + goto out; + xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket", sizeof(struct xlog_ticket), 0, 0, NULL); if (!xfs_log_ticket_cache) - goto out; + goto out_destroy_buf_cache; error = xfs_btree_init_cur_caches(); if (error) @@ -2093,8 +2107,16 @@ xfs_init_caches(void) if (!xfs_attri_cache) goto out_destroy_attrd_cache; + xfs_iunlink_cache = kmem_cache_create("xfs_iul_item", + sizeof(struct xfs_iunlink_item), + 0, 0, NULL); + if (!xfs_iunlink_cache) + goto out_destroy_attri_cache; + return 0; + out_destroy_attri_cache: + kmem_cache_destroy(xfs_attri_cache); out_destroy_attrd_cache: kmem_cache_destroy(xfs_attrd_cache); out_destroy_bui_cache: @@ -2133,6 +2155,8 @@ xfs_init_caches(void) xfs_btree_destroy_cur_caches(); out_destroy_log_ticket_cache: kmem_cache_destroy(xfs_log_ticket_cache); + out_destroy_buf_cache: + kmem_cache_destroy(xfs_buf_cache); out: return -ENOMEM; } @@ -2145,6 +2169,7 @@ xfs_destroy_caches(void) * destroy caches. */ rcu_barrier(); + kmem_cache_destroy(xfs_iunlink_cache); kmem_cache_destroy(xfs_attri_cache); kmem_cache_destroy(xfs_attrd_cache); kmem_cache_destroy(xfs_bui_cache); @@ -2165,6 +2190,7 @@ xfs_destroy_caches(void) xfs_defer_destroy_item_caches(); xfs_btree_destroy_cur_caches(); kmem_cache_destroy(xfs_log_ticket_cache); + kmem_cache_destroy(xfs_buf_cache); } STATIC int __init @@ -2210,6 +2236,7 @@ xfs_cpu_dead( list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) { spin_unlock(&xfs_mount_list_lock); xfs_inodegc_cpu_dead(mp, cpu); + xlog_cil_pcp_dead(mp->m_log, cpu); spin_lock(&xfs_mount_list_lock); } spin_unlock(&xfs_mount_list_lock); @@ -2269,13 +2296,9 @@ init_xfs_fs(void) if (error) goto out_destroy_wq; - error = xfs_buf_init(); - if (error) - goto out_mru_cache_uninit; - error = xfs_init_procfs(); if (error) - goto out_buf_terminate; + goto out_mru_cache_uninit; error = xfs_sysctl_register(); if (error) @@ -2332,8 +2355,6 @@ init_xfs_fs(void) xfs_sysctl_unregister(); out_cleanup_procfs: xfs_cleanup_procfs(); - out_buf_terminate: - xfs_buf_terminate(); out_mru_cache_uninit: xfs_mru_cache_uninit(); out_destroy_wq: @@ -2359,7 +2380,6 @@ exit_xfs_fs(void) kset_unregister(xfs_kset); xfs_sysctl_unregister(); xfs_cleanup_procfs(); - xfs_buf_terminate(); xfs_mru_cache_uninit(); xfs_destroy_workqueues(); xfs_destroy_caches(); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 3cd5a51bace1..364e2c2648a8 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -92,6 +92,7 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, extern const struct export_operations xfs_export_operations; extern const struct quotactl_ops xfs_quotactl_operations; +extern const struct dax_holder_operations xfs_dax_holder_operations; extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 4145ba872547..8389f3ef88ef 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -256,7 +256,7 @@ xfs_symlink( /* * If the symlink will fit into the inode, write it inline. */ - if (pathlen <= XFS_IFORK_DSIZE(ip)) { + if (pathlen <= xfs_inode_data_fork_size(ip)) { xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); ip->i_disk_size = pathlen; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d32026585c1b..f9057af6e0c8 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name, \ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \ TP_ARGS(mp, caller_ip)) DEFINE_FS_EVENT(xfs_inodegc_flush); +DEFINE_FS_EVENT(xfs_inodegc_push); DEFINE_FS_EVENT(xfs_inodegc_start); DEFINE_FS_EVENT(xfs_inodegc_stop); DEFINE_FS_EVENT(xfs_inodegc_queue); @@ -2170,7 +2171,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->format = ip->i_df.if_format; __entry->nex = ip->i_df.if_nextents; __entry->broot_size = ip->i_df.if_broot_bytes; - __entry->fork_off = XFS_IFORK_BOFF(ip); + __entry->fork_off = xfs_inode_fork_boff(ip); ), TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, " "broot size %d, forkoff 0x%x", @@ -3671,7 +3672,6 @@ DEFINE_EVENT(xfs_ag_inode_class, name, \ TP_ARGS(ip)) DEFINE_AGINODE_EVENT(xfs_iunlink); DEFINE_AGINODE_EVENT(xfs_iunlink_remove); -DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback); DECLARE_EVENT_CLASS(xfs_fs_corrupt_class, TP_PROTO(struct xfs_mount *mp, unsigned int flags), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 82cf0189c0db..7bd16fbff534 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -760,7 +760,7 @@ xfs_log_item_batch_insert( void xfs_trans_committed_bulk( struct xfs_ail *ailp, - struct xfs_log_vec *log_vector, + struct list_head *lv_chain, xfs_lsn_t commit_lsn, bool aborted) { @@ -775,7 +775,7 @@ xfs_trans_committed_bulk( spin_unlock(&ailp->ail_lock); /* unpin all the log items */ - for (lv = log_vector; lv; lv = lv->lv_next ) { + list_for_each_entry(lv, lv_chain, lv_list) { struct xfs_log_item *lip = lv->lv_item; xfs_lsn_t item_lsn; @@ -845,6 +845,90 @@ xfs_trans_committed_bulk( } /* + * Sort transaction items prior to running precommit operations. This will + * attempt to order the items such that they will always be locked in the same + * order. Items that have no sort function are moved to the end of the list + * and so are locked last. + * + * This may need refinement as different types of objects add sort functions. + * + * Function is more complex than it needs to be because we are comparing 64 bit + * values and the function only returns 32 bit values. + */ +static int +xfs_trans_precommit_sort( + void *unused_arg, + const struct list_head *a, + const struct list_head *b) +{ + struct xfs_log_item *lia = container_of(a, + struct xfs_log_item, li_trans); + struct xfs_log_item *lib = container_of(b, + struct xfs_log_item, li_trans); + int64_t diff; + + /* + * If both items are non-sortable, leave them alone. If only one is + * sortable, move the non-sortable item towards the end of the list. + */ + if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort) + return 0; + if (!lia->li_ops->iop_sort) + return 1; + if (!lib->li_ops->iop_sort) + return -1; + + diff = lia->li_ops->iop_sort(lia) - lib->li_ops->iop_sort(lib); + if (diff < 0) + return -1; + if (diff > 0) + return 1; + return 0; +} + +/* + * Run transaction precommit functions. + * + * If there is an error in any of the callouts, then stop immediately and + * trigger a shutdown to abort the transaction. There is no recovery possible + * from errors at this point as the transaction is dirty.... + */ +static int +xfs_trans_run_precommits( + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_log_item *lip, *n; + int error = 0; + + /* + * Sort the item list to avoid ABBA deadlocks with other transactions + * running precommit operations that lock multiple shared items such as + * inode cluster buffers. + */ + list_sort(NULL, &tp->t_items, xfs_trans_precommit_sort); + + /* + * Precommit operations can remove the log item from the transaction + * if the log item exists purely to delay modifications until they + * can be ordered against other operations. Hence we have to use + * list_for_each_entry_safe() here. + */ + list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) { + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) + continue; + if (lip->li_ops->iop_precommit) { + error = lip->li_ops->iop_precommit(tp, lip); + if (error) + break; + } + } + if (error) + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; +} + +/* * Commit the given transaction to the log. * * XFS disk error handling mechanism is not based on a typical @@ -869,6 +953,13 @@ __xfs_trans_commit( trace_xfs_trans_commit(tp, _RET_IP_); + error = xfs_trans_run_precommits(tp); + if (error) { + if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) + xfs_defer_cancel(tp); + goto out_unreserve; + } + /* * Finish deferred items on final commit. Only permanent transactions * should ever have deferred ops. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9561f193e7e1..55819785941c 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -45,6 +45,7 @@ struct xfs_log_item { struct xfs_log_vec *li_lv; /* active log vector */ struct xfs_log_vec *li_lv_shadow; /* standby vector */ xfs_csn_t li_seq; /* CIL commit seq */ + uint32_t li_order_id; /* CIL commit order */ }; /* @@ -71,10 +72,12 @@ struct xfs_item_ops { void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *); void (*iop_pin)(struct xfs_log_item *); void (*iop_unpin)(struct xfs_log_item *, int remove); - uint (*iop_push)(struct xfs_log_item *, struct list_head *); + uint64_t (*iop_sort)(struct xfs_log_item *lip); + int (*iop_precommit)(struct xfs_trans *tp, struct xfs_log_item *lip); void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq); - void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); + uint (*iop_push)(struct xfs_log_item *, struct list_head *); + void (*iop_release)(struct xfs_log_item *); int (*iop_recover)(struct xfs_log_item *lip, struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index f0d79a9050ba..d5400150358e 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -19,7 +19,8 @@ void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); -void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, +void xfs_trans_committed_bulk(struct xfs_ail *ailp, + struct list_head *lv_chain, xfs_lsn_t commit_lsn, bool aborted); /* * AIL traversal cursor. diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 053299758deb..511bb9fa3750 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -60,8 +60,7 @@ static void zonefs_account_active(struct inode *inode) } } -static inline int zonefs_zone_mgmt(struct inode *inode, - enum req_opf op) +static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op) { struct zonefs_inode_info *zi = ZONEFS_I(inode); int ret; @@ -271,7 +270,7 @@ static const struct address_space_operations zonefs_file_aops = { .dirty_folio = filemap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, - .migratepage = iomap_migrate_page, + .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .direct_IO = noop_direct_IO, @@ -525,7 +524,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) { struct zonefs_inode_info *zi = ZONEFS_I(inode); loff_t old_isize; - enum req_opf op; + enum req_op op; int ret = 0; /* @@ -616,7 +615,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns, !uid_eq(iattr->ia_uid, inode->i_uid)) || ((iattr->ia_valid & ATTR_GID) && !gid_eq(iattr->ia_gid, inode->i_gid))) { - ret = dquot_transfer(inode, iattr); + ret = dquot_transfer(mnt_userns, inode, iattr); if (ret) return ret; } @@ -780,7 +779,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); bio->bi_iter.bi_sector = zi->i_zsector; bio->bi_ioprio = iocb->ki_ioprio; - if (iocb->ki_flags & IOCB_DSYNC) + if (iocb_is_dsync(iocb)) bio->bi_opf |= REQ_FUA; ret = bio_iov_iter_get_pages(bio, from); @@ -1394,7 +1393,7 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, { struct super_block *sb = parent->i_sb; - inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1; + inode->i_ino = bdev_nr_zones(sb->s_bdev) + type + 1; inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555); inode->i_op = &zonefs_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -1540,7 +1539,7 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd, /* * The first zone contains the super block: skip it. */ - end = zd->zones + blkdev_nr_zones(sb->s_bdev->bd_disk); + end = zd->zones + bdev_nr_zones(sb->s_bdev); for (zone = &zd->zones[1]; zone < end; zone = next) { next = zone + 1; @@ -1635,8 +1634,8 @@ static int zonefs_get_zone_info(struct zonefs_zone_data *zd) struct block_device *bdev = zd->sb->s_bdev; int ret; - zd->zones = kvcalloc(blkdev_nr_zones(bdev->bd_disk), - sizeof(struct blk_zone), GFP_KERNEL); + zd->zones = kvcalloc(bdev_nr_zones(bdev), sizeof(struct blk_zone), + GFP_KERNEL); if (!zd->zones) return -ENOMEM; @@ -1648,9 +1647,9 @@ static int zonefs_get_zone_info(struct zonefs_zone_data *zd) return ret; } - if (ret != blkdev_nr_zones(bdev->bd_disk)) { + if (ret != bdev_nr_zones(bdev)) { zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n", - ret, blkdev_nr_zones(bdev->bd_disk)); + ret, bdev_nr_zones(bdev)); return -EIO; } @@ -1687,11 +1686,11 @@ static int zonefs_read_super(struct super_block *sb) if (ret) goto free_page; - super = kmap(page); + super = page_address(page); ret = -EINVAL; if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC) - goto unmap; + goto free_page; stored_crc = le32_to_cpu(super->s_crc); super->s_crc = 0; @@ -1699,14 +1698,14 @@ static int zonefs_read_super(struct super_block *sb) if (crc != stored_crc) { zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)", crc, stored_crc); - goto unmap; + goto free_page; } sbi->s_features = le64_to_cpu(super->s_features); if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) { zonefs_err(sb, "Unknown features set 0x%llx\n", sbi->s_features); - goto unmap; + goto free_page; } if (sbi->s_features & ZONEFS_F_UID) { @@ -1714,7 +1713,7 @@ static int zonefs_read_super(struct super_block *sb) le32_to_cpu(super->s_uid)); if (!uid_valid(sbi->s_uid)) { zonefs_err(sb, "Invalid UID feature\n"); - goto unmap; + goto free_page; } } @@ -1723,7 +1722,7 @@ static int zonefs_read_super(struct super_block *sb) le32_to_cpu(super->s_gid)); if (!gid_valid(sbi->s_gid)) { zonefs_err(sb, "Invalid GID feature\n"); - goto unmap; + goto free_page; } } @@ -1732,14 +1731,12 @@ static int zonefs_read_super(struct super_block *sb) if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) { zonefs_err(sb, "Reserved area is being used\n"); - goto unmap; + goto free_page; } import_uuid(&sbi->s_uuid, super->s_uuid); ret = 0; -unmap: - kunmap(page); free_page: __free_page(page); @@ -1816,8 +1813,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) if (ret) goto cleanup; - zonefs_info(sb, "Mounting %u zones", - blkdev_nr_zones(sb->s_bdev->bd_disk)); + zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev)); if (!sbi->s_max_wro_seq_files && !sbi->s_max_active_seq_files && @@ -1833,7 +1829,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) if (!inode) goto cleanup; - inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk); + inode->i_ino = bdev_nr_zones(sb->s_bdev); inode->i_mode = S_IFDIR | 0555; inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode); inode->i_op = &zonefs_dir_inode_operations; diff --git a/fs/zonefs/trace.h b/fs/zonefs/trace.h index f369d7d50303..42edcfd393ed 100644 --- a/fs/zonefs/trace.h +++ b/fs/zonefs/trace.h @@ -20,12 +20,12 @@ #define show_dev(dev) MAJOR(dev), MINOR(dev) TRACE_EVENT(zonefs_zone_mgmt, - TP_PROTO(struct inode *inode, enum req_opf op), + TP_PROTO(struct inode *inode, enum req_op op), TP_ARGS(inode, op), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(int, op) + __field(enum req_op, op) __field(sector_t, sector) __field(sector_t, nr_sectors) ), |