aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorArnaldo Carvalho de Melo2020-11-30 08:56:55 -0300
committerArnaldo Carvalho de Melo2020-11-30 08:56:55 -0300
commit1f195e557d137be004894d2016357013331ec3d0 (patch)
treeac74e64b08349fc569e9db2edcf32c4bf84b0c9b /fs
parentfd4ebb457c9ca90d10a74aeb85d54e27b08d5e76 (diff)
parentb65054597872ce3aefbc6a666385eabdf9e288da (diff)
Merge remote-tracking branch 'torvalds/master' into perf/core
To pick up fixes. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/dir.c1
-rw-r--r--fs/afs/inode.c8
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/afs/write.c5
-rw-r--r--fs/aio.c2
-rw-r--r--fs/btrfs/block-rsv.c3
-rw-r--r--fs/btrfs/ctree.h5
-rw-r--r--fs/btrfs/dev-replace.c26
-rw-r--r--fs/btrfs/file.c57
-rw-r--r--fs/btrfs/inode.c58
-rw-r--r--fs/btrfs/ioctl.c10
-rw-r--r--fs/btrfs/qgroup.c100
-rw-r--r--fs/btrfs/ref-verify.c1
-rw-r--r--fs/btrfs/relocation.c4
-rw-r--r--fs/btrfs/scrub.c5
-rw-r--r--fs/btrfs/tests/inode-tests.c12
-rw-r--r--fs/btrfs/tree-checker.c3
-rw-r--r--fs/btrfs/volumes.c34
-rw-r--r--fs/ceph/caps.c2
-rw-r--r--fs/ceph/mds_client.c50
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/ceph/quota.c2
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/cifs/cifsacl.c1
-rw-r--r--fs/cifs/smb2ops.c88
-rw-r--r--fs/crypto/inline_crypt.c2
-rw-r--r--fs/crypto/keysetup.c4
-rw-r--r--fs/efivarfs/inode.c2
-rw-r--r--fs/erofs/inode.c21
-rw-r--r--fs/erofs/zdata.c7
-rw-r--r--fs/ext4/ext4.h75
-rw-r--r--fs/ext4/extents.c7
-rw-r--r--fs/ext4/fast_commit.c174
-rw-r--r--fs/ext4/fast_commit.h6
-rw-r--r--fs/ext4/file.c6
-rw-r--r--fs/ext4/fsmap.c2
-rw-r--r--fs/ext4/fsync.c2
-rw-r--r--fs/ext4/inline.c1
-rw-r--r--fs/ext4/inode.c19
-rw-r--r--fs/ext4/mballoc.c6
-rw-r--r--fs/ext4/namei.c61
-rw-r--r--fs/ext4/super.c62
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/bmap.c8
-rw-r--r--fs/gfs2/glock.c3
-rw-r--r--fs/gfs2/glops.c69
-rw-r--r--fs/gfs2/glops.h1
-rw-r--r--fs/gfs2/inode.c3
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/gfs2/lops.c31
-rw-r--r--fs/gfs2/lops.h2
-rw-r--r--fs/gfs2/ops_fstype.c14
-rw-r--r--fs/gfs2/recovery.c2
-rw-r--r--fs/gfs2/rgrp.c15
-rw-r--r--fs/gfs2/super.c1
-rw-r--r--fs/io-wq.c4
-rw-r--r--fs/io_uring.c280
-rw-r--r--fs/iomap/buffered-io.c30
-rw-r--r--fs/jbd2/checkpoint.c2
-rw-r--r--fs/jbd2/commit.c11
-rw-r--r--fs/jbd2/journal.c172
-rw-r--r--fs/jbd2/recovery.c6
-rw-r--r--fs/jbd2/transaction.c35
-rw-r--r--fs/libfs.c6
-rw-r--r--fs/nfs/dir.c15
-rw-r--r--fs/nfs/nfs42xattr.c2
-rw-r--r--fs/nfs/nfs42xdr.c4
-rw-r--r--fs/nfs/nfsroot.c6
-rw-r--r--fs/nfsd/nfs3proc.c6
-rw-r--r--fs/nfsd/nfs3xdr.c1
-rw-r--r--fs/nfsd/nfs4proc.c3
-rw-r--r--fs/notify/fsnotify.c12
-rw-r--r--fs/ocfs2/journal.c2
-rw-r--r--fs/ocfs2/super.c1
-rw-r--r--fs/proc/cpuinfo.c2
-rw-r--r--fs/proc/generic.c4
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/self.c7
-rw-r--r--fs/proc/stat.c2
-rw-r--r--fs/seq_file.c45
-rw-r--r--fs/super.c49
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c1
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c8
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h2
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c2
-rw-r--r--fs/xfs/scrub/bmap.c10
-rw-r--r--fs/xfs/scrub/btree.c45
-rw-r--r--fs/xfs/scrub/dir.c21
-rw-r--r--fs/xfs/scrub/inode.c3
-rw-r--r--fs/xfs/scrub/refcount.c8
-rw-r--r--fs/xfs/xfs_aops.c20
-rw-r--r--fs/xfs/xfs_iomap.c29
-rw-r--r--fs/xfs/xfs_iops.c10
-rw-r--r--fs/xfs/xfs_iwalk.c27
-rw-r--r--fs/xfs/xfs_mount.c11
-rw-r--r--fs/xfs/xfs_pnfs.c2
-rw-r--r--fs/xfs/xfs_reflink.c3
97 files changed, 1228 insertions, 774 deletions
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 1bb5b9d7f0a2..9068d5578a26 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -823,6 +823,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
vp->cb_break_before = afs_calc_vnode_cb_break(vnode);
vp->vnode = vnode;
vp->put_vnode = true;
+ vp->speculative = true; /* vnode not locked */
}
}
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0fe8844b4bee..b0d7b892090d 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -294,6 +294,13 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
op->flags &= ~AFS_OPERATION_DIR_CONFLICT;
}
} else if (vp->scb.have_status) {
+ if (vp->dv_before + vp->dv_delta != vp->scb.status.data_version &&
+ vp->speculative)
+ /* Ignore the result of a speculative bulk status fetch
+ * if it splits around a modification op, thereby
+ * appearing to regress the data version.
+ */
+ goto out;
afs_apply_status(op, vp);
if (vp->scb.have_cb)
afs_apply_callback(op, vp);
@@ -305,6 +312,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
}
}
+out:
write_sequnlock(&vnode->cb_lock);
if (vp->scb.have_status)
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 14d5d75f4b6e..0d150a29e39e 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -755,6 +755,7 @@ struct afs_vnode_param {
bool update_ctime:1; /* Need to update the ctime */
bool set_size:1; /* Must update i_size */
bool op_unlinked:1; /* True if file was unlinked by op */
+ bool speculative:1; /* T if speculative status fetch (no vnode lock) */
};
/*
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 50371207f327..c9195fc67fd8 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -169,11 +169,14 @@ int afs_write_end(struct file *file, struct address_space *mapping,
unsigned int f, from = pos & (PAGE_SIZE - 1);
unsigned int t, to = from + copied;
loff_t i_size, maybe_i_size;
- int ret;
+ int ret = 0;
_enter("{%llx:%llu},{%lx}",
vnode->fid.vid, vnode->fid.vnode, page->index);
+ if (copied == 0)
+ goto out;
+
maybe_i_size = pos + copied;
i_size = i_size_read(&vnode->vfs_inode);
diff --git a/fs/aio.c b/fs/aio.c
index c45c20d87538..6a21d8919409 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1572,7 +1572,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
* we return to userspace.
*/
if (S_ISREG(file_inode(file)->i_mode)) {
- __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
+ sb_start_write(file_inode(file)->i_sb);
__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
}
req->ki_flags |= IOCB_WRITE;
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 7e1549a84fcc..bc920afe23bf 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -511,7 +511,8 @@ again:
/*DEFAULT_RATELIMIT_BURST*/ 1);
if (__ratelimit(&_rs))
WARN(1, KERN_DEBUG
- "BTRFS: block rsv returned %d\n", ret);
+ "BTRFS: block rsv %d returned %d\n",
+ block_rsv->type, ret);
}
try_reserve:
ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0378933d163c..0b29bdb25105 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -878,7 +878,10 @@ struct btrfs_fs_info {
*/
struct ulist *qgroup_ulist;
- /* protect user change for quota operations */
+ /*
+ * Protect user change for quota operations. If a transaction is needed,
+ * it must be started before locking this lock.
+ */
struct mutex qgroup_ioctl_lock;
/* list of dirty qgroups to be written at next commit */
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5b9e3f3ace22..10638537b9ef 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -91,6 +91,17 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
if (ret) {
no_valid_dev_replace_entry_found:
+ /*
+ * We don't have a replace item or it's corrupted. If there is
+ * a replace target, fail the mount.
+ */
+ if (btrfs_find_device(fs_info->fs_devices,
+ BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
+ btrfs_err(fs_info,
+ "found replace target device without a valid replace item");
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = 0;
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
@@ -143,8 +154,19 @@ no_valid_dev_replace_entry_found:
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
- dev_replace->srcdev = NULL;
- dev_replace->tgtdev = NULL;
+ /*
+ * We don't have an active replace item but if there is a
+ * replace target, fail the mount.
+ */
+ if (btrfs_find_device(fs_info->fs_devices,
+ BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
+ btrfs_err(fs_info,
+ "replace devid present without an active replace item");
+ ret = -EUCLEAN;
+ } else {
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ }
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 87355a38a654..4373da7bcc0d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -452,46 +452,6 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
}
}
-static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
- const u64 start,
- const u64 len,
- struct extent_state **cached_state)
-{
- u64 search_start = start;
- const u64 end = start + len - 1;
-
- while (search_start < end) {
- const u64 search_len = end - search_start + 1;
- struct extent_map *em;
- u64 em_len;
- int ret = 0;
-
- em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- if (em->block_start != EXTENT_MAP_HOLE)
- goto next;
-
- em_len = em->len;
- if (em->start < search_start)
- em_len -= search_start - em->start;
- if (em_len > search_len)
- em_len = search_len;
-
- ret = set_extent_bit(&inode->io_tree, search_start,
- search_start + em_len - 1,
- EXTENT_DELALLOC_NEW,
- NULL, cached_state, GFP_NOFS);
-next:
- search_start = extent_map_end(em);
- free_extent_map(em);
- if (ret)
- return ret;
- }
- return 0;
-}
-
/*
* after copy_from_user, pages need to be dirtied and we need to make
* sure holes are created between the current EOF and the start of
@@ -528,23 +488,6 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, cached);
- if (!btrfs_is_free_space_inode(inode)) {
- if (start_pos >= isize &&
- !(inode->flags & BTRFS_INODE_PREALLOC)) {
- /*
- * There can't be any extents following eof in this case
- * so just set the delalloc new bit for the range
- * directly.
- */
- extra_bits |= EXTENT_DELALLOC_NEW;
- } else {
- err = btrfs_find_new_delalloc_bytes(inode, start_pos,
- num_bytes, cached);
- if (err)
- return err;
- }
- }
-
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
if (err)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index da58c58ef9aa..7e8d8169779d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2253,11 +2253,69 @@ static int add_pending_csums(struct btrfs_trans_handle *trans,
return 0;
}
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+ const u64 start,
+ const u64 len,
+ struct extent_state **cached_state)
+{
+ u64 search_start = start;
+ const u64 end = start + len - 1;
+
+ while (search_start < end) {
+ const u64 search_len = end - search_start + 1;
+ struct extent_map *em;
+ u64 em_len;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ if (em->block_start != EXTENT_MAP_HOLE)
+ goto next;
+
+ em_len = em->len;
+ if (em->start < search_start)
+ em_len -= search_start - em->start;
+ if (em_len > search_len)
+ em_len = search_len;
+
+ ret = set_extent_bit(&inode->io_tree, search_start,
+ search_start + em_len - 1,
+ EXTENT_DELALLOC_NEW,
+ NULL, cached_state, GFP_NOFS);
+next:
+ search_start = extent_map_end(em);
+ free_extent_map(em);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state)
{
WARN_ON(PAGE_ALIGNED(end));
+
+ if (start >= i_size_read(&inode->vfs_inode) &&
+ !(inode->flags & BTRFS_INODE_PREALLOC)) {
+ /*
+ * There can't be any extents following eof in this case so just
+ * set the delalloc new bit for the range directly.
+ */
+ extra_bits |= EXTENT_DELALLOC_NEW;
+ } else {
+ int ret;
+
+ ret = btrfs_find_new_delalloc_bytes(inode, start,
+ end + 1 - start,
+ cached_state);
+ if (ret)
+ return ret;
+ }
+
return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
cached_state);
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ab408a23ba32..69a384145dc6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1274,6 +1274,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
u64 page_start;
u64 page_end;
u64 page_cnt;
+ u64 start = (u64)start_index << PAGE_SHIFT;
int ret;
int i;
int i_done;
@@ -1290,8 +1291,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- start_index << PAGE_SHIFT,
- page_cnt << PAGE_SHIFT);
+ start, page_cnt << PAGE_SHIFT);
if (ret)
return ret;
i_done = 0;
@@ -1380,8 +1380,7 @@ again:
btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start_index << PAGE_SHIFT,
- (page_cnt - i_done) << PAGE_SHIFT, true);
+ start, (page_cnt - i_done) << PAGE_SHIFT, true);
}
@@ -1408,8 +1407,7 @@ out:
put_page(pages[i]);
}
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start_index << PAGE_SHIFT,
- page_cnt << PAGE_SHIFT, true);
+ start, page_cnt << PAGE_SHIFT, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return ret;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c54ea6586632..87bd37b70738 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -11,6 +11,7 @@
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "transaction.h"
@@ -497,13 +498,13 @@ next2:
break;
}
out:
+ btrfs_free_path(path);
fs_info->qgroup_flags |= flags;
if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
ret >= 0)
ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
- btrfs_free_path(path);
if (ret < 0) {
ulist_free(fs_info->qgroup_ulist);
@@ -936,6 +937,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
struct btrfs_key found_key;
struct btrfs_qgroup *qgroup = NULL;
struct btrfs_trans_handle *trans = NULL;
+ struct ulist *ulist = NULL;
int ret = 0;
int slot;
@@ -943,8 +945,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
if (fs_info->quota_root)
goto out;
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
- if (!fs_info->qgroup_ulist) {
+ ulist = ulist_alloc(GFP_KERNEL);
+ if (!ulist) {
ret = -ENOMEM;
goto out;
}
@@ -952,6 +954,22 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
ret = btrfs_sysfs_add_qgroups(fs_info);
if (ret < 0)
goto out;
+
+ /*
+ * Unlock qgroup_ioctl_lock before starting the transaction. This is to
+ * avoid lock acquisition inversion problems (reported by lockdep) between
+ * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
+ * start a transaction.
+ * After we started the transaction lock qgroup_ioctl_lock again and
+ * check if someone else created the quota root in the meanwhile. If so,
+ * just return success and release the transaction handle.
+ *
+ * Also we don't need to worry about someone else calling
+ * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
+ * that function returns 0 (success) when the sysfs entries already exist.
+ */
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
/*
* 1 for quota root item
* 1 for BTRFS_QGROUP_STATUS item
@@ -961,12 +979,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
* would be a lot of overkill.
*/
trans = btrfs_start_transaction(tree_root, 2);
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
+ if (fs_info->quota_root)
+ goto out;
+
+ fs_info->qgroup_ulist = ulist;
+ ulist = NULL;
+
/*
* initially create the quota tree
*/
@@ -1124,11 +1150,14 @@ out:
if (ret) {
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
- if (trans)
- btrfs_end_transaction(trans);
btrfs_sysfs_del_qgroups(fs_info);
}
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (ret && trans)
+ btrfs_end_transaction(trans);
+ else if (trans)
+ ret = btrfs_end_transaction(trans);
+ ulist_free(ulist);
return ret;
}
@@ -1141,19 +1170,29 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
* 1 For the root item
*
* We should also reserve enough items for the quota tree deletion in
* btrfs_clean_quota_tree but this is not done.
+ *
+ * Also, we must always start a transaction without holding the mutex
+ * qgroup_ioctl_lock, see btrfs_quota_enable().
*/
trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ trans = NULL;
goto out;
}
+ if (!fs_info->quota_root)
+ goto out;
+
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
btrfs_qgroup_wait_for_completion(fs_info, false);
spin_lock(&fs_info->qgroup_lock);
@@ -1167,13 +1206,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
ret = btrfs_clean_quota_tree(trans, quota_root);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto end_trans;
+ goto out;
}
ret = btrfs_del_root(trans, &quota_root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto end_trans;
+ goto out;
}
list_del(&quota_root->dirty_list);
@@ -1185,10 +1224,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_put_root(quota_root);
-end_trans:
- ret = btrfs_end_transaction(trans);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (ret && trans)
+ btrfs_end_transaction(trans);
+ else if (trans)
+ ret = btrfs_end_transaction(trans);
+
return ret;
}
@@ -1324,13 +1366,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
struct ulist *tmp;
+ unsigned int nofs_flag;
int ret = 0;
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
+ /* We hold a transaction handle open, must do a NOFS allocation. */
+ nofs_flag = memalloc_nofs_save();
tmp = ulist_alloc(GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!tmp)
return -ENOMEM;
@@ -1387,10 +1433,14 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
struct btrfs_qgroup_list *list;
struct ulist *tmp;
bool found = false;
+ unsigned int nofs_flag;
int ret = 0;
int ret2;
+ /* We hold a transaction handle open, must do a NOFS allocation. */
+ nofs_flag = memalloc_nofs_save();
tmp = ulist_alloc(GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!tmp)
return -ENOMEM;
@@ -3435,24 +3485,20 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode,
{
struct rb_node *node;
struct rb_node *next;
- struct ulist_node *entry = NULL;
+ struct ulist_node *entry;
int ret = 0;
node = reserved->range_changed.root.rb_node;
+ if (!node)
+ return 0;
while (node) {
entry = rb_entry(node, struct ulist_node, rb_node);
if (entry->val < start)
node = node->rb_right;
- else if (entry)
- node = node->rb_left;
else
- break;
+ node = node->rb_left;
}
- /* Empty changeset */
- if (!entry)
- return 0;
-
if (entry->val > start && rb_prev(&entry->rb_node))
entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
rb_node);
@@ -3516,6 +3562,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
int ret;
+ bool can_commit = true;
/*
* We don't want to run flush again and again, so if there is a running
@@ -3527,6 +3574,20 @@ static int try_flush_qgroup(struct btrfs_root *root)
return 0;
}
+ /*
+ * If current process holds a transaction, we shouldn't flush, as we
+ * assume all space reservation happens before a transaction handle is
+ * held.
+ *
+ * But there are cases like btrfs_delayed_item_reserve_metadata() where
+ * we try to reserve space with one transction handle already held.
+ * In that case we can't commit transaction, but at least try to end it
+ * and hope the started data writes can free some space.
+ */
+ if (current->journal_info &&
+ current->journal_info != BTRFS_SEND_TRANS_STUB)
+ can_commit = false;
+
ret = btrfs_start_delalloc_snapshot(root);
if (ret < 0)
goto out;
@@ -3538,7 +3599,10 @@ static int try_flush_qgroup(struct btrfs_root *root)
goto out;
}
- ret = btrfs_commit_transaction(trans);
+ if (can_commit)
+ ret = btrfs_commit_transaction(trans);
+ else
+ ret = btrfs_end_transaction(trans);
out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
wake_up(&root->qgroup_flush_wait);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 7f03dbe5b609..78693d3dd15b 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -860,6 +860,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"dropping a ref for a root that doesn't have a ref on the block");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ kfree(ref);
kfree(ra);
goto out_unlock;
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 3602806d71bd..9ba92d86da0b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1648,6 +1648,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root_item *root_item;
struct btrfs_path *path;
struct extent_buffer *leaf;
+ int reserve_level;
int level;
int max_level;
int replaced = 0;
@@ -1696,7 +1697,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
* Thus the needed metadata size is at most root_level * nodesize,
* and * 2 since we have two trees to COW.
*/
- min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2;
+ reserve_level = max_t(int, 1, btrfs_root_level(root_item));
+ min_reserved = fs_info->nodesize * reserve_level * 2;
memset(&next_key, 0, sizeof(next_key));
while (1) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index cf63f1e27a27..e71e7586e9eb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3866,8 +3866,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (!is_dev_replace && !readonly &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
- rcu_str_deref(dev->name));
+ btrfs_err_in_rcu(fs_info,
+ "scrub on devid %llu: filesystem on %s is not writable",
+ devid, rcu_str_deref(dev->name));
ret = -EROFS;
goto out;
}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index e6719f7db386..04022069761d 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -983,7 +983,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1050,7 +1051,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1082,7 +1084,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/* Empty */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, 0, 0, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1097,7 +1100,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
out:
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, 0, 0, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 8784b74f5232..ea2bb4cb5890 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1068,6 +1068,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
"invalid root item size, have %u expect %zu or %u",
btrfs_item_size_nr(leaf, slot), sizeof(ri),
btrfs_legacy_root_item_size());
+ return -EUCLEAN;
}
/*
@@ -1423,6 +1424,7 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
"invalid item size, have %u expect aligned to %zu for key type %u",
btrfs_item_size_nr(leaf, slot),
sizeof(*dref), key->type);
+ return -EUCLEAN;
}
if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
generic_err(leaf, slot,
@@ -1451,6 +1453,7 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
extent_err(leaf, slot,
"invalid extent data backref offset, have %llu expect aligned to %u",
offset, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
}
}
return 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b1e48078c318..78637665166e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -940,7 +940,13 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (device->bdev != path_bdev) {
bdput(path_bdev);
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_warn_in_rcu(device->fs_info,
+ /*
+ * device->fs_info may not be reliable here, so
+ * pass in a NULL instead. This avoids a
+ * possible use-after-free when the fs_info and
+ * fs_info->sb are already torn down.
+ */
+ btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
current->comm,
@@ -1056,22 +1062,13 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
continue;
}
- if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
- /*
- * In the first step, keep the device which has
- * the correct fsid and the devid that is used
- * for the dev_replace procedure.
- * In the second step, the dev_replace state is
- * read from the device tree and it is known
- * whether the procedure is really active or
- * not, which means whether this device is
- * used or whether it should be removed.
- */
- if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state)) {
- continue;
- }
- }
+ /*
+ * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
+ * in btrfs_init_dev_replace() so just continue.
+ */
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+ continue;
+
if (device->bdev) {
blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
@@ -1080,9 +1077,6 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state))
- fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 5027bbdca419..ded4229c314a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4074,7 +4074,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
vino.snap, inode);
mutex_lock(&session->s_mutex);
- session->s_seq++;
+ inc_session_sequence(session);
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
(unsigned)seq);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 08f1c0c31dc2..8f1d7500a7ec 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4231,7 +4231,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
dname.len, dname.name);
mutex_lock(&session->s_mutex);
- session->s_seq++;
+ inc_session_sequence(session);
if (!inode) {
dout("handle_lease no inode %llx\n", vino.ino);
@@ -4385,29 +4385,49 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
bool check_session_state(struct ceph_mds_session *s)
{
- if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
- dout("resending session close request for mds%d\n",
- s->s_mds);
- request_close_session(s);
- return false;
- }
- if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
- if (s->s_state == CEPH_MDS_SESSION_OPEN) {
+ switch (s->s_state) {
+ case CEPH_MDS_SESSION_OPEN:
+ if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
s->s_state = CEPH_MDS_SESSION_HUNG;
pr_info("mds%d hung\n", s->s_mds);
}
- }
- if (s->s_state == CEPH_MDS_SESSION_NEW ||
- s->s_state == CEPH_MDS_SESSION_RESTARTING ||
- s->s_state == CEPH_MDS_SESSION_CLOSED ||
- s->s_state == CEPH_MDS_SESSION_REJECTED)
- /* this mds is failed or recovering, just wait */
+ break;
+ case CEPH_MDS_SESSION_CLOSING:
+ /* Should never reach this when we're unmounting */
+ WARN_ON_ONCE(true);
+ fallthrough;
+ case CEPH_MDS_SESSION_NEW:
+ case CEPH_MDS_SESSION_RESTARTING:
+ case CEPH_MDS_SESSION_CLOSED:
+ case CEPH_MDS_SESSION_REJECTED:
return false;
+ }
return true;
}
/*
+ * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply,
+ * then we need to retransmit that request.
+ */
+void inc_session_sequence(struct ceph_mds_session *s)
+{
+ lockdep_assert_held(&s->s_mutex);
+
+ s->s_seq++;
+
+ if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+ int ret;
+
+ dout("resending session close request for mds%d\n", s->s_mds);
+ ret = request_close_session(s);
+ if (ret < 0)
+ pr_err("unable to close session to mds%d: %d\n",
+ s->s_mds, ret);
+ }
+}
+
+/*
* delayed work -- periodically trim expired leases, renew caps with mds
*/
static void schedule_delayed(struct ceph_mds_client *mdsc)
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index cbf8af437140..f5adbebcb38e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -480,6 +480,7 @@ struct ceph_mds_client {
extern const char *ceph_mds_op_name(int op);
extern bool check_session_state(struct ceph_mds_session *s);
+void inc_session_sequence(struct ceph_mds_session *s);
extern struct ceph_mds_session *
__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 83cb4f26b689..9b785f11e95a 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -53,7 +53,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
/* increment msg sequence number */
mutex_lock(&session->s_mutex);
- session->s_seq++;
+ inc_session_sequence(session);
mutex_unlock(&session->s_mutex);
/* lookup inode */
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 0da39c16dab4..b611f829cb61 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -873,7 +873,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
ceph_snap_op_name(op), split, trace_len);
mutex_lock(&session->s_mutex);
- session->s_seq++;
+ inc_session_sequence(session);
mutex_unlock(&session->s_mutex);
down_write(&mdsc->snap_rwsem);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 23b21e943652..ef4784e72b1d 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -1266,6 +1266,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
} else if (mode_from_special_sid) {
rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr, true);
+ kfree(pntsd);
} else {
/* get approximated mode from ACL */
rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr, false);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 504766cb6c19..dab94f67c988 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -264,7 +264,7 @@ smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
}
static struct mid_q_entry *
-smb2_find_mid(struct TCP_Server_Info *server, char *buf)
+__smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue)
{
struct mid_q_entry *mid;
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
@@ -281,6 +281,10 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)
(mid->mid_state == MID_REQUEST_SUBMITTED) &&
(mid->command == shdr->Command)) {
kref_get(&mid->refcount);
+ if (dequeue) {
+ list_del_init(&mid->qhead);
+ mid->mid_flags |= MID_DELETED;
+ }
spin_unlock(&GlobalMid_Lock);
return mid;
}
@@ -289,6 +293,18 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)
return NULL;
}
+static struct mid_q_entry *
+smb2_find_mid(struct TCP_Server_Info *server, char *buf)
+{
+ return __smb2_find_mid(server, buf, false);
+}
+
+static struct mid_q_entry *
+smb2_find_dequeue_mid(struct TCP_Server_Info *server, char *buf)
+{
+ return __smb2_find_mid(server, buf, true);
+}
+
static void
smb2_dump_detail(void *buf, struct TCP_Server_Info *server)
{
@@ -4356,7 +4372,8 @@ init_read_bvec(struct page **pages, unsigned int npages, unsigned int data_size,
static int
handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
char *buf, unsigned int buf_len, struct page **pages,
- unsigned int npages, unsigned int page_data_size)
+ unsigned int npages, unsigned int page_data_size,
+ bool is_offloaded)
{
unsigned int data_offset;
unsigned int data_len;
@@ -4378,7 +4395,8 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
if (server->ops->is_session_expired &&
server->ops->is_session_expired(buf)) {
- cifs_reconnect(server);
+ if (!is_offloaded)
+ cifs_reconnect(server);
return -1;
}
@@ -4402,7 +4420,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
cifs_dbg(FYI, "%s: server returned error %d\n",
__func__, rdata->result);
/* normal error on read response */
- dequeue_mid(mid, false);
+ if (is_offloaded)
+ mid->mid_state = MID_RESPONSE_RECEIVED;
+ else
+ dequeue_mid(mid, false);
return 0;
}
@@ -4426,7 +4447,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n",
__func__, data_offset);
rdata->result = -EIO;
- dequeue_mid(mid, rdata->result);
+ if (is_offloaded)
+ mid->mid_state = MID_RESPONSE_MALFORMED;
+ else
+ dequeue_mid(mid, rdata->result);
return 0;
}
@@ -4442,21 +4466,30 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
cifs_dbg(FYI, "%s: data offset (%u) beyond 1st page of response\n",
__func__, data_offset);
rdata->result = -EIO;
- dequeue_mid(mid, rdata->result);
+ if (is_offloaded)
+ mid->mid_state = MID_RESPONSE_MALFORMED;
+ else
+ dequeue_mid(mid, rdata->result);
return 0;
}
if (data_len > page_data_size - pad_len) {
/* data_len is corrupt -- discard frame */
rdata->result = -EIO;
- dequeue_mid(mid, rdata->result);
+ if (is_offloaded)
+ mid->mid_state = MID_RESPONSE_MALFORMED;
+ else
+ dequeue_mid(mid, rdata->result);
return 0;
}
rdata->result = init_read_bvec(pages, npages, page_data_size,
cur_off, &bvec);
if (rdata->result != 0) {
- dequeue_mid(mid, rdata->result);
+ if (is_offloaded)
+ mid->mid_state = MID_RESPONSE_MALFORMED;
+ else
+ dequeue_mid(mid, rdata->result);
return 0;
}
@@ -4471,7 +4504,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
/* read response payload cannot be in both buf and pages */
WARN_ONCE(1, "buf can not contain only a part of read data");
rdata->result = -EIO;
- dequeue_mid(mid, rdata->result);
+ if (is_offloaded)
+ mid->mid_state = MID_RESPONSE_MALFORMED;
+ else
+ dequeue_mid(mid, rdata->result);
return 0;
}
@@ -4482,7 +4518,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
if (length < 0)
return length;
- dequeue_mid(mid, false);
+ if (is_offloaded)
+ mid->mid_state = MID_RESPONSE_RECEIVED;
+ else
+ dequeue_mid(mid, false);
return length;
}
@@ -4511,15 +4550,34 @@ static void smb2_decrypt_offload(struct work_struct *work)
}
dw->server->lstrp = jiffies;
- mid = smb2_find_mid(dw->server, dw->buf);
+ mid = smb2_find_dequeue_mid(dw->server, dw->buf);
if (mid == NULL)
cifs_dbg(FYI, "mid not found\n");
else {
mid->decrypted = true;
rc = handle_read_data(dw->server, mid, dw->buf,
dw->server->vals->read_rsp_size,
- dw->ppages, dw->npages, dw->len);
- mid->callback(mid);
+ dw->ppages, dw->npages, dw->len,
+ true);
+ if (rc >= 0) {
+#ifdef CONFIG_CIFS_STATS2
+ mid->when_received = jiffies;
+#endif
+ mid->callback(mid);
+ } else {
+ spin_lock(&GlobalMid_Lock);
+ if (dw->server->tcpStatus == CifsNeedReconnect) {
+ mid->mid_state = MID_RETRY_NEEDED;
+ spin_unlock(&GlobalMid_Lock);
+ mid->callback(mid);
+ } else {
+ mid->mid_state = MID_REQUEST_SUBMITTED;
+ mid->mid_flags &= ~(MID_DELETED);
+ list_add_tail(&mid->qhead,
+ &dw->server->pending_mid_q);
+ spin_unlock(&GlobalMid_Lock);
+ }
+ }
cifs_mid_q_entry_release(mid);
}
@@ -4622,7 +4680,7 @@ non_offloaded_decrypt:
(*mid)->decrypted = true;
rc = handle_read_data(server, *mid, buf,
server->vals->read_rsp_size,
- pages, npages, len);
+ pages, npages, len, false);
}
free_pages:
@@ -4765,7 +4823,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)
char *buf = server->large_buf ? server->bigbuf : server->smallbuf;
return handle_read_data(server, mid, buf, server->pdu_size,
- NULL, 0, 0);
+ NULL, 0, 0, false);
}
static int
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 89bffa82ed74..c57bebfa48fe 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -74,7 +74,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
int i;
/* The file must need contents encryption, not filenames encryption */
- if (!fscrypt_needs_contents_encryption(inode))
+ if (!S_ISREG(inode->i_mode))
return 0;
/* The crypto mode must have a blk-crypto counterpart */
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index d3c3e5d9b41f..d595abb8ef90 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -269,9 +269,7 @@ unlock:
* New inodes may not have an inode number assigned yet.
* Hashing their inode number is delayed until later.
*/
- if (ci->ci_inode->i_ino == 0)
- WARN_ON(!(ci->ci_inode->i_state & I_CREATING));
- else
+ if (ci->ci_inode->i_ino)
fscrypt_hash_inode_number(ci, mk);
return 0;
}
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 96c0c86f3fff..0297ad95eb5c 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -7,6 +7,7 @@
#include <linux/efi.h>
#include <linux/fs.h>
#include <linux/ctype.h>
+#include <linux/kmemleak.h>
#include <linux/slab.h>
#include <linux/uuid.h>
@@ -103,6 +104,7 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
var->var.VariableName[i] = '\0';
inode->i_private = var;
+ kmemleak_ignore(var);
err = efivar_entry_add(var, &efivarfs_list);
if (err)
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 139d0bed42f8..3e21c0e8adae 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -107,11 +107,9 @@ static struct page *erofs_read_inode(struct inode *inode,
i_gid_write(inode, le32_to_cpu(die->i_gid));
set_nlink(inode, le32_to_cpu(die->i_nlink));
- /* ns timestamp */
- inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
- le64_to_cpu(die->i_ctime);
- inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
- le32_to_cpu(die->i_ctime_nsec);
+ /* extended inode has its own timestamp */
+ inode->i_ctime.tv_sec = le64_to_cpu(die->i_ctime);
+ inode->i_ctime.tv_nsec = le32_to_cpu(die->i_ctime_nsec);
inode->i_size = le64_to_cpu(die->i_size);
@@ -149,11 +147,9 @@ static struct page *erofs_read_inode(struct inode *inode,
i_gid_write(inode, le16_to_cpu(dic->i_gid));
set_nlink(inode, le16_to_cpu(dic->i_nlink));
- /* use build time to derive all file time */
- inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
- sbi->build_time;
- inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
- sbi->build_time_nsec;
+ /* use build time for compact inodes */
+ inode->i_ctime.tv_sec = sbi->build_time;
+ inode->i_ctime.tv_nsec = sbi->build_time_nsec;
inode->i_size = le32_to_cpu(dic->i_size);
if (erofs_inode_is_data_compressed(vi->datalayout))
@@ -167,6 +163,11 @@ static struct page *erofs_read_inode(struct inode *inode,
goto err_out;
}
+ inode->i_mtime.tv_sec = inode->i_ctime.tv_sec;
+ inode->i_atime.tv_sec = inode->i_ctime.tv_sec;
+ inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
+ inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
+
if (!nblks)
/* measure inode.i_blocks as generic filesystems */
inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 50912a5420b4..86fd3bf62af6 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1078,8 +1078,11 @@ out_allocpage:
cond_resched();
goto repeat;
}
- set_page_private(page, (unsigned long)pcl);
- SetPagePrivate(page);
+
+ if (tocache) {
+ set_page_private(page, (unsigned long)pcl);
+ SetPagePrivate(page);
+ }
out: /* the only exit (for tracing and debugging) */
return page;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 45fcdbf538d1..65ecaf96d0a4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1028,9 +1028,6 @@ struct ext4_inode_info {
* protected by sbi->s_fc_lock.
*/
- /* Fast commit subtid when this inode was committed */
- unsigned int i_fc_committed_subtid;
-
/* Start of lblk range that needs to be committed in this fast commit */
ext4_lblk_t i_fc_lblk_start;
@@ -1234,13 +1231,13 @@ struct ext4_inode_info {
blocks */
#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
file systems */
-#define EXT4_MOUNT2_DAX_NEVER 0x00000008 /* Do not allow Direct Access */
-#define EXT4_MOUNT2_DAX_INODE 0x00000010 /* For printing options only */
-
#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly
specified journal checksum */
#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
+#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
+#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
+
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
@@ -1422,16 +1419,6 @@ struct ext4_super_block {
#ifdef __KERNEL__
-/*
- * run-time mount flags
- */
-#define EXT4_MF_MNTDIR_SAMPLED 0x0001
-#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
-#define EXT4_MF_FC_INELIGIBLE 0x0004 /* Fast commit ineligible */
-#define EXT4_MF_FC_COMMITTING 0x0008 /* File system underoing a fast
- * commit.
- */
-
#ifdef CONFIG_FS_ENCRYPTION
#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL)
#else
@@ -1466,7 +1453,7 @@ struct ext4_sb_info {
struct buffer_head * __rcu *s_group_desc;
unsigned int s_mount_opt;
unsigned int s_mount_opt2;
- unsigned int s_mount_flags;
+ unsigned long s_mount_flags;
unsigned int s_def_mount_opt;
ext4_fsblk_t s_sb_block;
atomic64_t s_resv_clusters;
@@ -1695,6 +1682,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
})
/*
+ * run-time mount flags
+ */
+enum {
+ EXT4_MF_MNTDIR_SAMPLED,
+ EXT4_MF_FS_ABORTED, /* Fatal error detected */
+ EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */
+ EXT4_MF_FC_COMMITTING /* File system underoing a fast
+ * commit.
+ */
+};
+
+static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
+{
+ set_bit(bit, &EXT4_SB(sb)->s_mount_flags);
+}
+
+static inline void ext4_clear_mount_flag(struct super_block *sb, int bit)
+{
+ clear_bit(bit, &EXT4_SB(sb)->s_mount_flags);
+}
+
+static inline int ext4_test_mount_flag(struct super_block *sb, int bit)
+{
+ return test_bit(bit, &EXT4_SB(sb)->s_mount_flags);
+}
+
+
+/*
* Simulate_fail codes
*/
#define EXT4_SIM_BBITMAP_EIO 1
@@ -1863,6 +1878,13 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200
+/*
+ * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes
+ * incompatible only if fast commit blocks are present in the FS. Since we
+ * clear the journal (and thus the fast commit blocks), we don't mark FS as
+ * incompatible. We also have a JBD2 incompat feature, which gets set when
+ * there are fast commit blocks present in the journal.
+ */
#define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400
#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800
@@ -2673,7 +2695,8 @@ void ext4_insert_dentry(struct inode *inode,
struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
- if (!ext4_has_feature_dir_index(inode->i_sb)) {
+ if (!ext4_has_feature_dir_index(inode->i_sb) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
/* ext4_iget() should have caught this... */
WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
@@ -2731,12 +2754,16 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
int ext4_fc_info_show(struct seq_file *seq, void *v);
void ext4_fc_init(struct super_block *sb, journal_t *journal);
void ext4_fc_init_inode(struct inode *inode);
-void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,
+void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end);
-void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry);
-void ext4_fc_track_link(struct inode *inode, struct dentry *dentry);
-void ext4_fc_track_create(struct inode *inode, struct dentry *dentry);
-void ext4_fc_track_inode(struct inode *inode);
+void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode,
+ struct dentry *dentry);
+void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
+ struct dentry *dentry);
+void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
+void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
+void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
+void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
void ext4_fc_start_ineligible(struct super_block *sb, int reason);
void ext4_fc_stop_ineligible(struct super_block *sb);
@@ -3452,7 +3479,7 @@ extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
extern int ext4_ci_compare(const struct inode *parent,
const struct qstr *fname,
const struct qstr *entry, bool quick);
-extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
struct inode *inode);
extern int __ext4_link(struct inode *dir, struct inode *inode,
struct dentry *dentry);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 57cfa28919a9..17d7096b3212 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3724,7 +3724,6 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
out:
ext4_ext_show_leaf(inode, path);
- ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1);
return err;
}
@@ -3796,7 +3795,6 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
if (*allocated > map->m_len)
*allocated = map->m_len;
map->m_len = *allocated;
- ext4_fc_track_range(inode, ee_block, ee_block + ee_len - 1);
return 0;
}
@@ -4329,7 +4327,6 @@ got_allocated_blocks:
map->m_len = ar.len;
allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
- ext4_fc_track_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1);
out:
ext4_ext_drop_refs(path);
kfree(path);
@@ -4602,7 +4599,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret))
goto out_handle;
- ext4_fc_track_range(inode, offset >> inode->i_sb->s_blocksize_bits,
+ ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits,
(offset + len - 1) >> inode->i_sb->s_blocksize_bits);
/* Zero out partial block at the edges of the range */
ret = ext4_zero_partial_blocks(handle, inode, offset, len);
@@ -4651,8 +4648,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
- ext4_fc_track_range(inode, offset >> blkbits,
- (offset + len - 1) >> blkbits);
ext4_fc_start_update(inode);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 8d43058386c3..f2033e13a273 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -83,7 +83,7 @@
*
* Atomicity of commits
* --------------------
- * In order to gaurantee atomicity during the commit operation, fast commit
+ * In order to guarantee atomicity during the commit operation, fast commit
* uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
* tag contains CRC of the contents and TID of the transaction after which
* this fast commit should be applied. Recovery code replays fast commit
@@ -152,7 +152,31 @@ void ext4_fc_init_inode(struct inode *inode)
INIT_LIST_HEAD(&ei->i_fc_list);
init_waitqueue_head(&ei->i_fc_wait);
atomic_set(&ei->i_fc_updates, 0);
- ei->i_fc_committed_subtid = 0;
+}
+
+/* This function must be called with sbi->s_fc_lock held. */
+static void ext4_fc_wait_committing_inode(struct inode *inode)
+__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
+{
+ wait_queue_head_t *wq;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+#endif
+ lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
}
/*
@@ -176,22 +200,7 @@ restart:
goto out;
if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- wait_queue_head_t *wq;
-#if (BITS_PER_LONG < 64)
- DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
-#else
- DEFINE_WAIT_BIT(wait, &ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
-#endif
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- schedule();
- finish_wait(wq, &wait.wq_entry);
+ ext4_fc_wait_committing_inode(inode);
goto restart;
}
out:
@@ -234,26 +243,10 @@ restart:
}
if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- wait_queue_head_t *wq;
-#if (BITS_PER_LONG < 64)
- DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
-#else
- DEFINE_WAIT_BIT(wait, &ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
-#endif
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- schedule();
- finish_wait(wq, &wait.wq_entry);
+ ext4_fc_wait_committing_inode(inode);
goto restart;
}
- if (!list_empty(&ei->i_fc_list))
- list_del_init(&ei->i_fc_list);
+ list_del_init(&ei->i_fc_list);
spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
@@ -269,7 +262,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return;
- sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
+ ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
@@ -302,14 +295,14 @@ void ext4_fc_stop_ineligible(struct super_block *sb)
(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return;
- EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
+ ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}
static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
- return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
- atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
+ return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
+ atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
}
/*
@@ -323,13 +316,14 @@ static inline int ext4_fc_is_ineligible(struct super_block *sb)
* If enqueue is set, this function enqueues the inode in fast commit list.
*/
static int ext4_fc_track_template(
- struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool),
+ handle_t *handle, struct inode *inode,
+ int (*__fc_track_fn)(struct inode *, void *, bool),
void *args, int enqueue)
{
- tid_t running_txn_tid;
bool update = false;
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ tid_t tid = 0;
int ret;
if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
@@ -339,15 +333,13 @@ static int ext4_fc_track_template(
if (ext4_fc_is_ineligible(inode->i_sb))
return -EINVAL;
- running_txn_tid = sbi->s_journal ?
- sbi->s_journal->j_commit_sequence + 1 : 0;
-
+ tid = handle->h_transaction->t_tid;
mutex_lock(&ei->i_fc_lock);
- if (running_txn_tid == ei->i_sync_tid) {
+ if (tid == ei->i_sync_tid) {
update = true;
} else {
ext4_fc_reset_inode(inode);
- ei->i_sync_tid = running_txn_tid;
+ ei->i_sync_tid = tid;
}
ret = __fc_track_fn(inode, args, update);
mutex_unlock(&ei->i_fc_lock);
@@ -358,7 +350,7 @@ static int ext4_fc_track_template(
spin_lock(&sbi->s_fc_lock);
if (list_empty(&EXT4_I(inode)->i_fc_list))
list_add_tail(&EXT4_I(inode)->i_fc_list,
- (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
+ (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
&sbi->s_fc_q[FC_Q_STAGING] :
&sbi->s_fc_q[FC_Q_MAIN]);
spin_unlock(&sbi->s_fc_lock);
@@ -384,7 +376,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
mutex_unlock(&ei->i_fc_lock);
node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
if (!node) {
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM);
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -397,7 +389,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
if (!node->fcd_name.name) {
kmem_cache_free(ext4_fc_dentry_cachep, node);
ext4_fc_mark_ineligible(inode->i_sb,
- EXT4_FC_REASON_MEM);
+ EXT4_FC_REASON_NOMEM);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -411,7 +403,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
node->fcd_name.len = dentry->d_name.len;
spin_lock(&sbi->s_fc_lock);
- if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
list_add_tail(&node->fcd_list,
&sbi->s_fc_dentry_q[FC_Q_STAGING]);
else
@@ -422,7 +414,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
return 0;
}
-void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry)
+void __ext4_fc_track_unlink(handle_t *handle,
+ struct inode *inode, struct dentry *dentry)
{
struct __track_dentry_update_args args;
int ret;
@@ -430,12 +423,18 @@ void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry)
args.dentry = dentry;
args.op = EXT4_FC_TAG_UNLINK;
- ret = ext4_fc_track_template(inode, __track_dentry_update,
+ ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
trace_ext4_fc_track_unlink(inode, dentry, ret);
}
-void ext4_fc_track_link(struct inode *inode, struct dentry *dentry)
+void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
+{
+ __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
+}
+
+void __ext4_fc_track_link(handle_t *handle,
+ struct inode *inode, struct dentry *dentry)
{
struct __track_dentry_update_args args;
int ret;
@@ -443,20 +442,26 @@ void ext4_fc_track_link(struct inode *inode, struct dentry *dentry)
args.dentry = dentry;
args.op = EXT4_FC_TAG_LINK;
- ret = ext4_fc_track_template(inode, __track_dentry_update,
+ ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
trace_ext4_fc_track_link(inode, dentry, ret);
}
-void ext4_fc_track_create(struct inode *inode, struct dentry *dentry)
+void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
+{
+ __ext4_fc_track_link(handle, d_inode(dentry), dentry);
+}
+
+void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
struct __track_dentry_update_args args;
+ struct inode *inode = d_inode(dentry);
int ret;
args.dentry = dentry;
args.op = EXT4_FC_TAG_CREAT;
- ret = ext4_fc_track_template(inode, __track_dentry_update,
+ ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
trace_ext4_fc_track_create(inode, dentry, ret);
}
@@ -472,14 +477,20 @@ static int __track_inode(struct inode *inode, void *arg, bool update)
return 0;
}
-void ext4_fc_track_inode(struct inode *inode)
+void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
int ret;
if (S_ISDIR(inode->i_mode))
return;
- ret = ext4_fc_track_template(inode, __track_inode, NULL, 1);
+ if (ext4_should_journal_data(inode)) {
+ ext4_fc_mark_ineligible(inode->i_sb,
+ EXT4_FC_REASON_INODE_JOURNAL_DATA);
+ return;
+ }
+
+ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
trace_ext4_fc_track_inode(inode, ret);
}
@@ -515,7 +526,7 @@ static int __track_range(struct inode *inode, void *arg, bool update)
return 0;
}
-void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,
+void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end)
{
struct __track_range_args args;
@@ -527,7 +538,7 @@ void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,
args.start = start;
args.end = end;
- ret = ext4_fc_track_template(inode, __track_range, &args, 1);
+ ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
trace_ext4_fc_track_range(inode, start, end, ret);
}
@@ -537,10 +548,11 @@ static void ext4_fc_submit_bh(struct super_block *sb)
int write_flags = REQ_SYNC;
struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
+ /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
if (test_opt(sb, BARRIER))
write_flags |= REQ_FUA | REQ_PREFLUSH;
lock_buffer(bh);
- clear_buffer_dirty(bh);
+ set_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = ext4_end_buffer_io_sync;
submit_bh(REQ_OP_WRITE, write_flags, bh);
@@ -846,7 +858,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
int ret = 0;
spin_lock(&sbi->s_fc_lock);
- sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
+ ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
@@ -900,6 +912,8 @@ static int ext4_fc_wait_inode_data_all(journal_t *journal)
/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
+__acquires(&sbi->s_fc_lock)
+__releases(&sbi->s_fc_lock)
{
struct super_block *sb = (struct super_block *)(journal->j_private);
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -996,6 +1010,13 @@ static int ext4_fc_perform_commit(journal_t *journal)
if (ret)
return ret;
+ /*
+ * If file system device is different from journal device, issue a cache
+ * flush before we start writing fast commit blocks.
+ */
+ if (journal->j_fs_dev != journal->j_dev)
+ blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
+
blk_start_plug(&plug);
if (sbi->s_fc_bytes == 0) {
/*
@@ -1031,8 +1052,6 @@ static int ext4_fc_perform_commit(journal_t *journal)
if (ret)
goto out;
spin_lock(&sbi->s_fc_lock);
- EXT4_I(inode)->i_fc_committed_subtid =
- atomic_read(&sbi->s_fc_subtid);
}
spin_unlock(&sbi->s_fc_lock);
@@ -1131,7 +1150,7 @@ out:
"Fast commit ended with blks = %d, reason = %d, subtid - %d",
nblks, reason, subtid);
if (reason == EXT4_FC_REASON_FC_FAILED)
- return jbd2_fc_end_commit_fallback(journal, commit_tid);
+ return jbd2_fc_end_commit_fallback(journal);
if (reason == EXT4_FC_REASON_FC_START_FAILED ||
reason == EXT4_FC_REASON_INELIGIBLE)
return jbd2_complete_transaction(journal, commit_tid);
@@ -1190,8 +1209,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
&sbi->s_fc_q[FC_Q_STAGING]);
- sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
- sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
if (full)
sbi->s_fc_bytes = 0;
@@ -1263,7 +1282,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
return 0;
}
- ret = __ext4_unlink(old_parent, &entry, inode);
+ ret = __ext4_unlink(NULL, old_parent, &entry, inode);
/* -ENOENT ok coz it might not exist anymore. */
if (ret == -ENOENT)
ret = 0;
@@ -2079,8 +2098,6 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
- int num_fc_blocks;
-
/*
* We set replay callback even if fast commit disabled because we may
* could still have fast commit blocks that need to be replayed even if
@@ -2090,21 +2107,9 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal)
if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
return;
journal->j_fc_cleanup_callback = ext4_fc_cleanup;
- if (!buffer_uptodate(journal->j_sb_buffer)
- && ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO,
- true)) {
- ext4_msg(sb, KERN_ERR, "I/O error on journal");
- return;
- }
- num_fc_blocks = be32_to_cpu(journal->j_superblock->s_num_fc_blks);
- if (jbd2_fc_init(journal, num_fc_blocks ? num_fc_blocks :
- EXT4_NUM_FC_BLKS)) {
- pr_warn("Error while enabling fast commits, turning off.");
- ext4_clear_feature_fast_commit(sb);
- }
}
-const char *fc_ineligible_reasons[] = {
+static const char *fc_ineligible_reasons[] = {
"Extended attributes changed",
"Cross rename",
"Journal flag changed",
@@ -2113,6 +2118,7 @@ const char *fc_ineligible_reasons[] = {
"Resize",
"Dir renamed",
"Falloc range op",
+ "Data journalling",
"FC Commit Failed"
};
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index 06907d485989..3a6e5a1fa1b8 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -3,9 +3,6 @@
#ifndef __FAST_COMMIT_H__
#define __FAST_COMMIT_H__
-/* Number of blocks in journal area to allocate for fast commits */
-#define EXT4_NUM_FC_BLKS 256
-
/* Fast commit tags */
#define EXT4_FC_TAG_ADD_RANGE 0x0001
#define EXT4_FC_TAG_DEL_RANGE 0x0002
@@ -100,11 +97,12 @@ enum {
EXT4_FC_REASON_XATTR = 0,
EXT4_FC_REASON_CROSS_RENAME,
EXT4_FC_REASON_JOURNAL_FLAG_CHANGE,
- EXT4_FC_REASON_MEM,
+ EXT4_FC_REASON_NOMEM,
EXT4_FC_REASON_SWAP_BOOT,
EXT4_FC_REASON_RESIZE,
EXT4_FC_REASON_RENAME_DIR,
EXT4_FC_REASON_FALLOC_RANGE,
+ EXT4_FC_REASON_INODE_JOURNAL_DATA,
EXT4_FC_COMMIT_FAILED,
EXT4_FC_REASON_MAX
};
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d85412d12e3a..3ed8c048fb12 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -761,7 +761,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
if (!daxdev_mapping_supported(vma, dax_dev))
return -EOPNOTSUPP;
- ext4_fc_start_update(inode);
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
@@ -769,7 +768,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
- ext4_fc_stop_update(inode);
return 0;
}
@@ -782,13 +780,13 @@ static int ext4_sample_last_mounted(struct super_block *sb,
handle_t *handle;
int err;
- if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED))
+ if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
return 0;
if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
return 0;
- sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
+ ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
/*
* Sample where the filesystem has been mounted and
* store it in the superblock for sysadmin convenience
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index b232c2767534..4c2a9fe30067 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -280,7 +280,7 @@ static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys,
/* Fabricate an rmap entry for the external log device. */
irec.fmr_physical = journal->j_blk_offset;
- irec.fmr_length = journal->j_maxlen;
+ irec.fmr_length = journal->j_total_len;
irec.fmr_owner = EXT4_FMR_OWN_LOG;
irec.fmr_flags = 0;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 81a545fd14a3..a42ca95840f2 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -143,7 +143,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (sb_rdonly(inode->i_sb)) {
/* Make sure that we read updated s_mount_flags value */
smp_rmb();
- if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))
ret = -EROFS;
goto out;
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index caa51473207d..b41512d1badc 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1880,6 +1880,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
ext4_write_lock_xattr(inode, &no_expand);
if (!ext4_has_inline_data(inode)) {
+ ext4_write_unlock_xattr(inode, &no_expand);
*has_inline = 0;
ext4_journal_stop(handle);
return 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b96a18679a27..0d8385aea898 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -327,6 +327,8 @@ stop_handle:
ext4_xattr_inode_array_free(ea_inode_array);
return;
no_delete:
+ if (!list_empty(&EXT4_I(inode)->i_fc_list))
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
}
@@ -730,7 +732,7 @@ out_sem:
if (ret)
return ret;
}
- ext4_fc_track_range(inode, map->m_lblk,
+ ext4_fc_track_range(handle, inode, map->m_lblk,
map->m_lblk + map->m_len - 1);
}
@@ -2440,7 +2442,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
struct super_block *sb = inode->i_sb;
if (ext4_forced_shutdown(EXT4_SB(sb)) ||
- EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
goto invalidate_dirty_pages;
/*
* Let the uper layers retry transient errors.
@@ -2674,7 +2676,7 @@ static int ext4_writepages(struct address_space *mapping,
* the stack trace.
*/
if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
- sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+ ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {
ret = -EROFS;
goto out_writepages;
}
@@ -3310,8 +3312,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
EXT4_I(inode)->i_datasync_tid))
return false;
if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
- return atomic_read(&EXT4_SB(inode->i_sb)->s_fc_subtid) <
- EXT4_I(inode)->i_fc_committed_subtid;
+ return !list_empty(&EXT4_I(inode)->i_fc_list);
return true;
}
@@ -4109,7 +4110,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
up_write(&EXT4_I(inode)->i_data_sem);
}
- ext4_fc_track_range(inode, first_block, stop_block);
+ ext4_fc_track_range(handle, inode, first_block, stop_block);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
@@ -5442,14 +5443,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (shrink)
- ext4_fc_track_range(inode,
+ ext4_fc_track_range(handle, inode,
(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
inode->i_sb->s_blocksize_bits,
(oldsize > 0 ? oldsize - 1 : 0) >>
inode->i_sb->s_blocksize_bits);
else
ext4_fc_track_range(
- inode,
+ handle, inode,
(oldsize > 0 ? oldsize - 1 : oldsize) >>
inode->i_sb->s_blocksize_bits,
(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
@@ -5699,7 +5700,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
put_bh(iloc->bh);
return -EIO;
}
- ext4_fc_track_inode(inode);
+ ext4_fc_track_inode(handle, inode);
if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 85abbfb98cbe..24af9ed5c3e5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4477,7 +4477,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb)
{
ext4_group_t i, ngroups;
- if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
return;
ngroups = ext4_get_groups_count(sb);
@@ -4508,7 +4508,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
- if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
return;
mb_debug(sb, "Can't allocate:"
@@ -5167,7 +5167,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
struct super_block *sb = ar->inode->i_sb;
ext4_group_t group;
ext4_grpblk_t blkoff;
- int i;
+ int i = sb->s_blocksize;
ext4_fsblk_t goal, block;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f458d1d81d96..33509266f5a0 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2606,7 +2606,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
handle_t *handle;
- struct inode *inode, *inode_save;
+ struct inode *inode;
int err, credits, retries = 0;
err = dquot_initialize(dir);
@@ -2624,11 +2624,9 @@ retry:
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
- inode_save = inode;
- ihold(inode_save);
err = ext4_add_nondir(handle, dentry, &inode);
- ext4_fc_track_create(inode_save, dentry);
- iput(inode_save);
+ if (!err)
+ ext4_fc_track_create(handle, dentry);
}
if (handle)
ext4_journal_stop(handle);
@@ -2643,7 +2641,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
handle_t *handle;
- struct inode *inode, *inode_save;
+ struct inode *inode;
int err, credits, retries = 0;
err = dquot_initialize(dir);
@@ -2660,12 +2658,9 @@ retry:
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ext4_special_inode_operations;
- inode_save = inode;
- ihold(inode_save);
err = ext4_add_nondir(handle, dentry, &inode);
if (!err)
- ext4_fc_track_create(inode_save, dentry);
- iput(inode_save);
+ ext4_fc_track_create(handle, dentry);
}
if (handle)
ext4_journal_stop(handle);
@@ -2829,7 +2824,6 @@ out_clear_inode:
iput(inode);
goto out_retry;
}
- ext4_fc_track_create(inode, dentry);
ext4_inc_count(dir);
ext4_update_dx_flag(dir);
@@ -2837,6 +2831,7 @@ out_clear_inode:
if (err)
goto out_clear_inode;
d_instantiate_new(dentry, inode);
+ ext4_fc_track_create(handle, dentry);
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
@@ -3171,7 +3166,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
goto end_rmdir;
ext4_dec_count(dir);
ext4_update_dx_flag(dir);
- ext4_fc_track_unlink(inode, dentry);
+ ext4_fc_track_unlink(handle, dentry);
retval = ext4_mark_inode_dirty(handle, dir);
#ifdef CONFIG_UNICODE
@@ -3192,13 +3187,12 @@ end_rmdir:
return retval;
}
-int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
struct inode *inode)
{
int retval = -ENOENT;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
- handle_t *handle = NULL;
int skip_remove_dentry = 0;
bh = ext4_find_entry(dir, d_name, &de, NULL);
@@ -3217,14 +3211,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
skip_remove_dentry = 1;
else
- goto out_bh;
- }
-
- handle = ext4_journal_start(dir, EXT4_HT_DIR,
- EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle)) {
- retval = PTR_ERR(handle);
- goto out_bh;
+ goto out;
}
if (IS_DIRSYNC(dir))
@@ -3233,12 +3220,12 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
if (!skip_remove_dentry) {
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
- goto out_handle;
+ goto out;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
- goto out_handle;
+ goto out;
} else {
retval = 0;
}
@@ -3252,15 +3239,14 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
inode->i_ctime = current_time(inode);
retval = ext4_mark_inode_dirty(handle, inode);
-out_handle:
- ext4_journal_stop(handle);
-out_bh:
+out:
brelse(bh);
return retval;
}
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
+ handle_t *handle;
int retval;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
@@ -3278,9 +3264,16 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (retval)
goto out_trace;
- retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry));
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ goto out_trace;
+ }
+
+ retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry));
if (!retval)
- ext4_fc_track_unlink(d_inode(dentry), dentry);
+ ext4_fc_track_unlink(handle, dentry);
#ifdef CONFIG_UNICODE
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
@@ -3291,6 +3284,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_CASEFOLDED(dir))
d_invalidate(dentry);
#endif
+ if (handle)
+ ext4_journal_stop(handle);
out_trace:
trace_ext4_unlink_exit(dentry, retval);
@@ -3447,7 +3442,6 @@ retry:
err = ext4_add_entry(handle, dentry, inode);
if (!err) {
- ext4_fc_track_link(inode, dentry);
err = ext4_mark_inode_dirty(handle, inode);
/* this can happen only for tmpfile being
* linked the first time
@@ -3455,6 +3449,7 @@ retry:
if (inode->i_nlink == 1)
ext4_orphan_del(handle, inode);
d_instantiate(dentry, inode);
+ ext4_fc_track_link(handle, dentry);
} else {
drop_nlink(inode);
iput(inode);
@@ -3915,9 +3910,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
EXT4_FC_REASON_RENAME_DIR);
} else {
if (new.inode)
- ext4_fc_track_unlink(new.inode, new.dentry);
- ext4_fc_track_link(old.inode, new.dentry);
- ext4_fc_track_unlink(old.inode, old.dentry);
+ ext4_fc_track_unlink(handle, new.dentry);
+ __ext4_fc_track_link(handle, old.inode, new.dentry);
+ __ext4_fc_track_unlink(handle, old.inode, old.dentry);
}
if (new.inode) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ef4734b40e2a..94472044f4c1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -289,18 +289,7 @@ void ext4_superblock_csum_set(struct super_block *sb)
if (!ext4_has_metadata_csum(sb))
return;
- /*
- * Locking the superblock prevents the scenario
- * where:
- * 1) a first thread pauses during checksum calculation.
- * 2) a second thread updates the superblock, recalculates
- * the checksum, and updates s_checksum
- * 3) the first thread resumes and finishes its checksum calculation
- * and updates s_checksum with a potentially stale or torn value.
- */
- lock_buffer(EXT4_SB(sb)->s_sbh);
es->s_checksum = ext4_superblock_csum(sb, es);
- unlock_buffer(EXT4_SB(sb)->s_sbh);
}
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
@@ -686,7 +675,7 @@ static void ext4_handle_error(struct super_block *sb)
if (!test_opt(sb, ERRORS_CONT)) {
journal_t *journal = EXT4_SB(sb)->s_journal;
- EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
if (journal)
jbd2_journal_abort(journal, -EIO);
}
@@ -904,7 +893,7 @@ void __ext4_abort(struct super_block *sb, const char *function,
va_end(args);
if (sb_rdonly(sb) == 0) {
- EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
if (EXT4_SB(sb)->s_journal)
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
@@ -1716,11 +1705,10 @@ enum {
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
- Opt_prefetch_block_bitmaps, Opt_no_fc,
+ Opt_prefetch_block_bitmaps,
#ifdef CONFIG_EXT4_DEBUG
- Opt_fc_debug_max_replay,
+ Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
- Opt_fc_debug_force
};
static const match_table_t tokens = {
@@ -1807,9 +1795,8 @@ static const match_table_t tokens = {
{Opt_init_itable, "init_itable=%u"},
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
- {Opt_no_fc, "no_fc"},
- {Opt_fc_debug_force, "fc_debug_force"},
#ifdef CONFIG_EXT4_DEBUG
+ {Opt_fc_debug_force, "fc_debug_force"},
{Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"},
#endif
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
@@ -2027,8 +2014,8 @@ static const struct mount_opts {
{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
MOPT_CLEAR | MOPT_Q},
- {Opt_usrjquota, 0, MOPT_Q},
- {Opt_grpjquota, 0, MOPT_Q},
+ {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING},
+ {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING},
{Opt_offusrjquota, 0, MOPT_Q},
{Opt_offgrpjquota, 0, MOPT_Q},
{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
@@ -2039,11 +2026,9 @@ static const struct mount_opts {
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
{Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
MOPT_SET},
- {Opt_no_fc, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
- MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY},
+#ifdef CONFIG_EXT4_DEBUG
{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
-#ifdef CONFIG_EXT4_DEBUG
{Opt_fc_debug_max_replay, 0, MOPT_GTE0},
#endif
{Opt_err, 0, 0}
@@ -2153,7 +2138,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
return 1;
case Opt_abort:
- sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
return 1;
case Opt_i_version:
sb->s_flags |= SB_I_VERSION;
@@ -2653,10 +2638,6 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
} else if (test_opt2(sb, DAX_INODE)) {
SEQ_OPTS_PUTS("dax=inode");
}
-
- if (test_opt2(sb, JOURNAL_FAST_COMMIT))
- SEQ_OPTS_PUTS("fast_commit");
-
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -3976,7 +3957,7 @@ int ext4_calculate_overhead(struct super_block *sb)
* loaded or not
*/
if (sbi->s_journal && !sbi->s_journal_bdev)
- overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
+ overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
j_inode = ext4_get_journal_inode(sb, j_inum);
@@ -4340,9 +4321,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#endif
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n");
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n");
/* can't mount with both data=journal and dioread_nolock. */
clear_opt(sb, DIOREAD_NOLOCK);
+ clear_opt2(sb, JOURNAL_FAST_COMMIT);
if (test_opt2(sb, EXPLICIT_DELALLOC)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"both data=journal and delalloc");
@@ -4777,8 +4759,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
sbi->s_fc_bytes = 0;
- sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
- sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
spin_lock_init(&sbi->s_fc_lock);
memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
sbi->s_fc_replay_state.fc_regions = NULL;
@@ -4857,6 +4839,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount_wq;
}
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+ !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
+ ext4_msg(sb, KERN_ERR,
+ "Failed to set fast commit journal feature");
+ goto failed_mount_wq;
+ }
+
/* We have now updated the journal if required, so we can
* validate the data journaling mode. */
switch (test_opt(sb, DATA_FLAGS)) {
@@ -5872,7 +5862,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
@@ -5886,7 +5876,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
- if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
err = -EROFS;
goto restore_opts;
}
@@ -6560,10 +6550,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
brelse(bh);
out:
if (inode->i_size < off + len) {
- ext4_fc_track_range(inode,
- (inode->i_size > 0 ? inode->i_size - 1 : 0)
- >> inode->i_sb->s_blocksize_bits,
- (off + len) >> inode->i_sb->s_blocksize_bits);
i_size_write(inode, off + len);
EXT4_I(inode)->i_disksize = inode->i_size;
err2 = ext4_mark_inode_dirty(handle, inode);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 9cd2ecad07db..cc4f987687f3 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -77,7 +77,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
if (error)
return error;
if (!buffer_mapped(bh_result))
- return -EIO;
+ return -ENODATA;
return 0;
}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 8dff9cbd0a87..62d9081d1e26 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1301,12 +1301,8 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
ret = gfs2_iomap_get(inode, pos, length, flags, &iomap, &mp);
- if (!ret && iomap.type == IOMAP_HOLE) {
- if (create)
- ret = gfs2_iomap_alloc(inode, &iomap, &mp);
- else
- ret = -ENODATA;
- }
+ if (create && !ret && iomap.type == IOMAP_HOLE)
+ ret = gfs2_iomap_alloc(inode, &iomap, &mp);
release_metapath(&mp);
if (ret)
goto out;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 5441c17562c5..d98a2e5dab9f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1078,7 +1078,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
out_free:
kfree(gl->gl_lksb.sb_lvbptr);
kmem_cache_free(cachep, gl);
- atomic_dec(&sdp->sd_glock_disposal);
+ if (atomic_dec_and_test(&sdp->sd_glock_disposal))
+ wake_up(&sdp->sd_glock_wait);
out:
return ret;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index aa3f5236befb..67f2921ae8d4 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -165,6 +165,31 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
}
/**
+ * gfs2_rgrp_metasync - sync out the metadata of a resource group
+ * @gl: the glock protecting the resource group
+ *
+ */
+
+static int gfs2_rgrp_metasync(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+ struct address_space *metamapping = &sdp->sd_aspace;
+ struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
+ const unsigned bsize = sdp->sd_sb.sb_bsize;
+ loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK;
+ loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1;
+ int error;
+
+ filemap_fdatawrite_range(metamapping, start, end);
+ error = filemap_fdatawait_range(metamapping, start, end);
+ WARN_ON_ONCE(error && !gfs2_withdrawn(sdp));
+ mapping_set_error(metamapping, error);
+ if (error)
+ gfs2_io_error(sdp);
+ return error;
+}
+
+/**
* rgrp_go_sync - sync out the metadata for this glock
* @gl: the glock
*
@@ -176,11 +201,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
static int rgrp_go_sync(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct address_space *mapping = &sdp->sd_aspace;
struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
- const unsigned bsize = sdp->sd_sb.sb_bsize;
- loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK;
- loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1;
int error;
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -189,10 +210,7 @@ static int rgrp_go_sync(struct gfs2_glock *gl)
gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_RGRP_GO_SYNC);
- filemap_fdatawrite_range(mapping, start, end);
- error = filemap_fdatawait_range(mapping, start, end);
- WARN_ON_ONCE(error && !gfs2_withdrawn(sdp));
- mapping_set_error(mapping, error);
+ error = gfs2_rgrp_metasync(gl);
if (!error)
error = gfs2_ail_empty_gl(gl);
gfs2_free_clones(rgd);
@@ -266,7 +284,24 @@ static void gfs2_clear_glop_pending(struct gfs2_inode *ip)
}
/**
- * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
+ * gfs2_inode_metasync - sync out the metadata of an inode
+ * @gl: the glock protecting the inode
+ *
+ */
+int gfs2_inode_metasync(struct gfs2_glock *gl)
+{
+ struct address_space *metamapping = gfs2_glock2aspace(gl);
+ int error;
+
+ filemap_fdatawrite(metamapping);
+ error = filemap_fdatawait(metamapping);
+ if (error)
+ gfs2_io_error(gl->gl_name.ln_sbd);
+ return error;
+}
+
+/**
+ * inode_go_sync - Sync the dirty metadata of an inode
* @gl: the glock protecting the inode
*
*/
@@ -297,8 +332,7 @@ static int inode_go_sync(struct gfs2_glock *gl)
error = filemap_fdatawait(mapping);
mapping_set_error(mapping, error);
}
- ret = filemap_fdatawait(metamapping);
- mapping_set_error(metamapping, ret);
+ ret = gfs2_inode_metasync(gl);
if (!error)
error = ret;
gfs2_ail_empty_gl(gl);
@@ -537,7 +571,18 @@ static int freeze_go_sync(struct gfs2_glock *gl)
int error = 0;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- if (gl->gl_req == LM_ST_EXCLUSIVE && !gfs2_withdrawn(sdp)) {
+ /*
+ * We need to check gl_state == LM_ST_SHARED here and not gl_req ==
+ * LM_ST_EXCLUSIVE. That's because when any node does a freeze,
+ * all the nodes should have the freeze glock in SH mode and they all
+ * call do_xmote: One for EX and the others for UN. They ALL must
+ * freeze locally, and they ALL must queue freeze work. The freeze_work
+ * calls freeze_func, which tries to reacquire the freeze glock in SH,
+ * effectively waiting for the thaw on the node who holds it in EX.
+ * Once thawed, the work func acquires the freeze glock in
+ * SH and everybody goes back to thawed.
+ */
+ if (gl->gl_state == LM_ST_SHARED && !gfs2_withdrawn(sdp)) {
atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
error = freeze_super(sdp->sd_vfs);
if (error) {
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index 2dd192e85618..695898afcaf1 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -22,6 +22,7 @@ extern const struct gfs2_glock_operations gfs2_quota_glops;
extern const struct gfs2_glock_operations gfs2_journal_glops;
extern const struct gfs2_glock_operations *gfs2_glops_list[];
+extern int gfs2_inode_metasync(struct gfs2_glock *gl);
extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);
#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6774865f5b5b..077ccb1b3ccc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -180,7 +180,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (unlikely(error))
goto fail;
- gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl);
+ if (blktype != GFS2_BLKST_UNLINKED)
+ gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl);
glock_set_object(ip->i_iopen_gh.gh_gl, ip);
gfs2_glock_put(io_gl);
io_gl = NULL;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 9133b3178677..2e9314091c81 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -132,6 +132,8 @@ __acquires(&sdp->sd_ail_lock)
spin_unlock(&sdp->sd_ail_lock);
ret = generic_writepages(mapping, wbc);
spin_lock(&sdp->sd_ail_lock);
+ if (ret == -ENODATA) /* if a jdata write into a new hole */
+ ret = 0; /* ignore it */
if (ret || wbc->nr_to_write <= 0)
break;
return -EBUSY;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index ed69298dd824..3922b26264f5 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -22,6 +22,7 @@
#include "incore.h"
#include "inode.h"
#include "glock.h"
+#include "glops.h"
#include "log.h"
#include "lops.h"
#include "meta_io.h"
@@ -817,41 +818,19 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
return error;
}
-/**
- * gfs2_meta_sync - Sync all buffers associated with a glock
- * @gl: The glock
- *
- */
-
-void gfs2_meta_sync(struct gfs2_glock *gl)
-{
- struct address_space *mapping = gfs2_glock2aspace(gl);
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- int error;
-
- if (mapping == NULL)
- mapping = &sdp->sd_aspace;
-
- filemap_fdatawrite(mapping);
- error = filemap_fdatawait(mapping);
-
- if (error)
- gfs2_io_error(gl->gl_name.ln_sbd);
-}
-
static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
{
struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
if (error) {
- gfs2_meta_sync(ip->i_gl);
+ gfs2_inode_metasync(ip->i_gl);
return;
}
if (pass != 1)
return;
- gfs2_meta_sync(ip->i_gl);
+ gfs2_inode_metasync(ip->i_gl);
fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
@@ -1060,14 +1039,14 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
if (error) {
- gfs2_meta_sync(ip->i_gl);
+ gfs2_inode_metasync(ip->i_gl);
return;
}
if (pass != 1)
return;
/* data sync? */
- gfs2_meta_sync(ip->i_gl);
+ gfs2_inode_metasync(ip->i_gl);
fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 4a3d8aecdf82..fbdbb08dcec6 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -27,8 +27,6 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf);
extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, bool keep_cache);
-extern void gfs2_meta_sync(struct gfs2_glock *gl);
-
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
unsigned int limit;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7a7e3c10a9a9..61fce59cb4d3 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -633,8 +633,10 @@ static int init_statfs(struct gfs2_sbd *sdp)
if (IS_ERR(sdp->sd_statfs_inode)) {
error = PTR_ERR(sdp->sd_statfs_inode);
fs_err(sdp, "can't read in statfs inode: %d\n", error);
- goto fail;
+ goto out;
}
+ if (sdp->sd_args.ar_spectator)
+ goto out;
pn = gfs2_lookup_simple(master, "per_node");
if (IS_ERR(pn)) {
@@ -682,15 +684,17 @@ free_local:
iput(pn);
put_statfs:
iput(sdp->sd_statfs_inode);
-fail:
+out:
return error;
}
/* Uninitialize and free up memory used by the list of statfs inodes */
static void uninit_statfs(struct gfs2_sbd *sdp)
{
- gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
- free_local_statfs_inodes(sdp);
+ if (!sdp->sd_args.ar_spectator) {
+ gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+ free_local_statfs_inodes(sdp);
+ }
iput(sdp->sd_statfs_inode);
}
@@ -704,7 +708,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
if (undo) {
jindex = 0;
- goto fail_jinode_gh;
+ goto fail_statfs;
}
sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index b5cbe21efdfb..c26c68ebd29d 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -349,7 +349,7 @@ static int update_statfs_inode(struct gfs2_jdesc *jd,
mark_buffer_dirty(bh);
brelse(bh);
- gfs2_meta_sync(ip->i_gl);
+ gfs2_inode_metasync(ip->i_gl);
out:
return error;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index ee491bb9c1cc..f7addc6197ed 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -719,9 +719,9 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
}
gfs2_free_clones(rgd);
+ return_all_reservations(rgd);
kfree(rgd->rd_bits);
rgd->rd_bits = NULL;
- return_all_reservations(rgd);
kmem_cache_free(gfs2_rgrpd_cachep, rgd);
}
}
@@ -1370,6 +1370,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+ return -EROFS;
+
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
@@ -2526,13 +2529,13 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
rbm.rgd = rgd;
error = gfs2_rbm_from_block(&rbm, no_addr);
- if (WARN_ON_ONCE(error))
- goto fail;
-
- if (gfs2_testbit(&rbm, false) != type)
- error = -ESTALE;
+ if (!WARN_ON_ONCE(error)) {
+ if (gfs2_testbit(&rbm, false) != type)
+ error = -ESTALE;
+ }
gfs2_glock_dq_uninit(&rgd_gh);
+
fail:
return error;
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b285192bd6b3..b3d951ab8068 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -738,6 +738,7 @@ restart:
gfs2_jindex_free(sdp);
/* Take apart glock structures and buffer lists */
gfs2_gl_hash_clear(sdp);
+ truncate_inode_pages_final(&sdp->sd_aspace);
gfs2_delete_debugfs_file(sdp);
/* Unmount the locking protocol */
gfs2_lm_unmount(sdp);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 02894df7656d..b53c055bea6a 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -482,6 +482,10 @@ static void io_impersonate_work(struct io_worker *worker,
current->files = work->identity->files;
current->nsproxy = work->identity->nsproxy;
task_unlock(current);
+ if (!work->identity->files) {
+ /* failed grabbing files, ensure work gets cancelled */
+ work->flags |= IO_WQ_WORK_CANCEL;
+ }
}
if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
current->fs = work->identity->fs;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a7429c977eb3..1023f7b44cea 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -205,6 +205,7 @@ struct fixed_file_ref_node {
struct list_head file_list;
struct fixed_file_data *file_data;
struct llist_node llist;
+ bool done;
};
struct fixed_file_data {
@@ -478,6 +479,7 @@ struct io_sr_msg {
struct io_open {
struct file *file;
int dfd;
+ bool ignore_nonblock;
struct filename *filename;
struct open_how how;
unsigned long nofile;
@@ -995,20 +997,33 @@ static void io_sq_thread_drop_mm(void)
if (mm) {
kthread_unuse_mm(mm);
mmput(mm);
+ current->mm = NULL;
}
}
static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
{
- if (!current->mm) {
- if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
- !ctx->sqo_task->mm ||
- !mmget_not_zero(ctx->sqo_task->mm)))
- return -EFAULT;
- kthread_use_mm(ctx->sqo_task->mm);
+ struct mm_struct *mm;
+
+ if (current->mm)
+ return 0;
+
+ /* Should never happen */
+ if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL)))
+ return -EFAULT;
+
+ task_lock(ctx->sqo_task);
+ mm = ctx->sqo_task->mm;
+ if (unlikely(!mm || !mmget_not_zero(mm)))
+ mm = NULL;
+ task_unlock(ctx->sqo_task);
+
+ if (mm) {
+ kthread_use_mm(mm);
+ return 0;
}
- return 0;
+ return -EFAULT;
}
static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
@@ -1274,9 +1289,12 @@ static bool io_identity_cow(struct io_kiocb *req)
/* add one for this request */
refcount_inc(&id->count);
- /* drop old identity, assign new one. one ref for req, one for tctx */
- if (req->work.identity != tctx->identity &&
- refcount_sub_and_test(2, &req->work.identity->count))
+ /* drop tctx and req identity references, if needed */
+ if (tctx->identity != &tctx->__identity &&
+ refcount_dec_and_test(&tctx->identity->count))
+ kfree(tctx->identity);
+ if (req->work.identity != &tctx->__identity &&
+ refcount_dec_and_test(&req->work.identity->count))
kfree(req->work.identity);
req->work.identity = id;
@@ -1295,22 +1313,6 @@ static bool io_grab_identity(struct io_kiocb *req)
return false;
req->work.flags |= IO_WQ_WORK_FSIZE;
}
-
- if (!(req->work.flags & IO_WQ_WORK_FILES) &&
- (def->work_flags & IO_WQ_WORK_FILES) &&
- !(req->flags & REQ_F_NO_FILE_TABLE)) {
- if (id->files != current->files ||
- id->nsproxy != current->nsproxy)
- return false;
- atomic_inc(&id->files->count);
- get_nsproxy(id->nsproxy);
- req->flags |= REQ_F_INFLIGHT;
-
- spin_lock_irq(&ctx->inflight_lock);
- list_add(&req->inflight_entry, &ctx->inflight_list);
- spin_unlock_irq(&ctx->inflight_lock);
- req->work.flags |= IO_WQ_WORK_FILES;
- }
#ifdef CONFIG_BLK_CGROUP
if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
(def->work_flags & IO_WQ_WORK_BLKCG)) {
@@ -1352,6 +1354,21 @@ static bool io_grab_identity(struct io_kiocb *req)
}
spin_unlock(&current->fs->lock);
}
+ if (!(req->work.flags & IO_WQ_WORK_FILES) &&
+ (def->work_flags & IO_WQ_WORK_FILES) &&
+ !(req->flags & REQ_F_NO_FILE_TABLE)) {
+ if (id->files != current->files ||
+ id->nsproxy != current->nsproxy)
+ return false;
+ atomic_inc(&id->files->count);
+ get_nsproxy(id->nsproxy);
+ req->flags |= REQ_F_INFLIGHT;
+
+ spin_lock_irq(&ctx->inflight_lock);
+ list_add(&req->inflight_entry, &ctx->inflight_list);
+ spin_unlock_irq(&ctx->inflight_lock);
+ req->work.flags |= IO_WQ_WORK_FILES;
+ }
return true;
}
@@ -1577,14 +1594,29 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
}
}
-static inline bool io_match_files(struct io_kiocb *req,
- struct files_struct *files)
+static inline bool __io_match_files(struct io_kiocb *req,
+ struct files_struct *files)
+{
+ return ((req->flags & REQ_F_WORK_INITIALIZED) &&
+ (req->work.flags & IO_WQ_WORK_FILES)) &&
+ req->work.identity->files == files;
+}
+
+static bool io_match_files(struct io_kiocb *req,
+ struct files_struct *files)
{
+ struct io_kiocb *link;
+
if (!files)
return true;
- if ((req->flags & REQ_F_WORK_INITIALIZED) &&
- (req->work.flags & IO_WQ_WORK_FILES))
- return req->work.identity->files == files;
+ if (__io_match_files(req, files))
+ return true;
+ if (req->flags & REQ_F_LINK_HEAD) {
+ list_for_each_entry(link, &req->link_list, link_list) {
+ if (__io_match_files(link, files))
+ return true;
+ }
+ }
return false;
}
@@ -1668,7 +1700,8 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) {
+ } else if (ctx->cq_overflow_flushed ||
+ atomic_read(&req->task->io_uring->in_idle)) {
/*
* If we're in ring overflow flush mode, or in task cancel mode,
* then we cannot store the request for later flushing, we need
@@ -1838,7 +1871,7 @@ static void __io_free_req(struct io_kiocb *req)
io_dismantle_req(req);
percpu_counter_dec(&tctx->inflight);
- if (tctx->in_idle)
+ if (atomic_read(&tctx->in_idle))
wake_up(&tctx->wait);
put_task_struct(req->task);
@@ -2545,7 +2578,6 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
}
end_req:
req_set_fail_links(req);
- io_req_complete(req, ret);
return false;
}
#endif
@@ -3160,7 +3192,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
rw->free_iovec = iovec;
rw->bytes_done = 0;
/* can only be fixed buffers, no need to do anything */
- if (iter->type == ITER_BVEC)
+ if (iov_iter_is_bvec(iter))
return;
if (!iovec) {
unsigned iov_off = 0;
@@ -3515,8 +3547,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
* we return to userspace.
*/
if (req->flags & REQ_F_ISREG) {
- __sb_start_write(file_inode(req->file)->i_sb,
- SB_FREEZE_WRITE, true);
+ sb_start_write(file_inode(req->file)->i_sb);
__sb_writers_release(file_inode(req->file)->i_sb,
SB_FREEZE_WRITE);
}
@@ -3764,6 +3795,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return ret;
}
req->open.nofile = rlimit(RLIMIT_NOFILE);
+ req->open.ignore_nonblock = false;
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -3807,7 +3839,7 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)
struct file *file;
int ret;
- if (force_nonblock)
+ if (force_nonblock && !req->open.ignore_nonblock)
return -EAGAIN;
ret = build_open_flags(&req->open.how, &op);
@@ -3822,6 +3854,21 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)
if (IS_ERR(file)) {
put_unused_fd(ret);
ret = PTR_ERR(file);
+ /*
+ * A work-around to ensure that /proc/self works that way
+ * that it should - if we get -EOPNOTSUPP back, then assume
+ * that proc_self_get_link() failed us because we're in async
+ * context. We should be safe to retry this from the task
+ * itself with force_nonblock == false set, as it should not
+ * block on lookup. Would be nice to know this upfront and
+ * avoid the async dance, but doesn't seem feasible.
+ */
+ if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) {
+ req->open.ignore_nonblock = true;
+ refcount_inc(&req->refs);
+ io_req_task_queue(req);
+ return 0;
+ }
} else {
fsnotify_open(file);
fd_install(ret, file);
@@ -6926,9 +6973,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
return -ENXIO;
spin_lock(&data->lock);
- if (!list_empty(&data->ref_list))
- ref_node = list_first_entry(&data->ref_list,
- struct fixed_file_ref_node, node);
+ ref_node = data->node;
spin_unlock(&data->lock);
if (ref_node)
percpu_ref_kill(&ref_node->refs);
@@ -7277,10 +7322,6 @@ static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
kfree(pfile);
}
- spin_lock(&file_data->lock);
- list_del(&ref_node->node);
- spin_unlock(&file_data->lock);
-
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
percpu_ref_put(&file_data->refs);
@@ -7307,17 +7348,32 @@ static void io_file_put_work(struct work_struct *work)
static void io_file_data_ref_zero(struct percpu_ref *ref)
{
struct fixed_file_ref_node *ref_node;
+ struct fixed_file_data *data;
struct io_ring_ctx *ctx;
- bool first_add;
+ bool first_add = false;
int delay = HZ;
ref_node = container_of(ref, struct fixed_file_ref_node, refs);
- ctx = ref_node->file_data->ctx;
+ data = ref_node->file_data;
+ ctx = data->ctx;
+
+ spin_lock(&data->lock);
+ ref_node->done = true;
+
+ while (!list_empty(&data->ref_list)) {
+ ref_node = list_first_entry(&data->ref_list,
+ struct fixed_file_ref_node, node);
+ /* recycle ref nodes in order */
+ if (!ref_node->done)
+ break;
+ list_del(&ref_node->node);
+ first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist);
+ }
+ spin_unlock(&data->lock);
- if (percpu_ref_is_dying(&ctx->file_data->refs))
+ if (percpu_ref_is_dying(&data->refs))
delay = 0;
- first_add = llist_add(&ref_node->llist, &ctx->file_put_llist);
if (!delay)
mod_delayed_work(system_wq, &ctx->file_put_work, 0);
else if (first_add)
@@ -7341,6 +7397,7 @@ static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
INIT_LIST_HEAD(&ref_node->node);
INIT_LIST_HEAD(&ref_node->file_list);
ref_node->file_data = ctx->file_data;
+ ref_node->done = false;
return ref_node;
}
@@ -7436,7 +7493,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
file_data->node = ref_node;
spin_lock(&file_data->lock);
- list_add(&ref_node->node, &file_data->ref_list);
+ list_add_tail(&ref_node->node, &file_data->ref_list);
spin_unlock(&file_data->lock);
percpu_ref_get(&file_data->refs);
return ret;
@@ -7595,7 +7652,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (needs_switch) {
percpu_ref_kill(&data->node->refs);
spin_lock(&data->lock);
- list_add(&ref_node->node, &data->ref_list);
+ list_add_tail(&ref_node->node, &data->ref_list);
data->node = ref_node;
spin_unlock(&data->lock);
percpu_ref_get(&ctx->file_data->refs);
@@ -7695,7 +7752,8 @@ static int io_uring_alloc_task_context(struct task_struct *task)
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
tctx->last = NULL;
- tctx->in_idle = 0;
+ atomic_set(&tctx->in_idle, 0);
+ tctx->sqpoll = false;
io_init_identity(&tctx->__identity);
tctx->identity = &tctx->__identity;
task->io_uring = tctx;
@@ -8388,22 +8446,6 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
return false;
}
-static bool io_match_link_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- struct io_kiocb *link;
-
- if (io_match_files(req, files))
- return true;
- if (req->flags & REQ_F_LINK_HEAD) {
- list_for_each_entry(link, &req->link_list, link_list) {
- if (io_match_files(link, files))
- return true;
- }
- }
- return false;
-}
-
/*
* We're looking to cancel 'req' because it's holding on to our files, but
* 'req' could be a link to another request. See if it is, and cancel that
@@ -8453,7 +8495,21 @@ static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
{
- return io_match_link(container_of(work, struct io_kiocb, work), data);
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ bool ret;
+
+ if (req->flags & REQ_F_LINK_TIMEOUT) {
+ unsigned long flags;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ /* protect against races with linked timeouts */
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ ret = io_match_link(req, data);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else {
+ ret = io_match_link(req, data);
+ }
+ return ret;
}
static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
@@ -8479,6 +8535,7 @@ static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
}
static void io_cancel_defer_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
struct files_struct *files)
{
struct io_defer_entry *de = NULL;
@@ -8486,7 +8543,8 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
- if (io_match_link_files(de->req, files)) {
+ if (io_task_match(de->req, task) &&
+ io_match_files(de->req, files)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
@@ -8512,7 +8570,6 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
if (list_empty_careful(&ctx->inflight_list))
return false;
- io_cancel_defer_files(ctx, files);
/* cancel all at once, should be faster than doing it one by one*/
io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
@@ -8598,8 +8655,16 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
{
struct task_struct *task = current;
- if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data)
+ if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
task = ctx->sq_data->thread;
+ atomic_inc(&task->io_uring->in_idle);
+ io_sq_thread_park(ctx->sq_data);
+ }
+
+ if (files)
+ io_cancel_defer_files(ctx, NULL, files);
+ else
+ io_cancel_defer_files(ctx, task, NULL);
io_cqring_overflow_flush(ctx, true, task, files);
@@ -8607,12 +8672,23 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
io_run_task_work();
cond_resched();
}
+
+ if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
+ atomic_dec(&task->io_uring->in_idle);
+ /*
+ * If the files that are going away are the ones in the thread
+ * identity, clear them out.
+ */
+ if (task->io_uring->identity->files == files)
+ task->io_uring->identity->files = NULL;
+ io_sq_thread_unpark(ctx->sq_data);
+ }
}
/*
* Note that this task has used io_uring. We use it for cancelation purposes.
*/
-static int io_uring_add_task_file(struct file *file)
+static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
{
struct io_uring_task *tctx = current->io_uring;
@@ -8634,6 +8710,14 @@ static int io_uring_add_task_file(struct file *file)
tctx->last = file;
}
+ /*
+ * This is race safe in that the task itself is doing this, hence it
+ * cannot be going through the exit/cancel paths at the same time.
+ * This cannot be modified while exit/cancel is running.
+ */
+ if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL))
+ tctx->sqpoll = true;
+
return 0;
}
@@ -8675,7 +8759,7 @@ void __io_uring_files_cancel(struct files_struct *files)
unsigned long index;
/* make sure overflow events are dropped */
- tctx->in_idle = true;
+ atomic_inc(&tctx->in_idle);
xa_for_each(&tctx->xa, index, file) {
struct io_ring_ctx *ctx = file->private_data;
@@ -8684,6 +8768,35 @@ void __io_uring_files_cancel(struct files_struct *files)
if (files)
io_uring_del_task_file(file);
}
+
+ atomic_dec(&tctx->in_idle);
+}
+
+static s64 tctx_inflight(struct io_uring_task *tctx)
+{
+ unsigned long index;
+ struct file *file;
+ s64 inflight;
+
+ inflight = percpu_counter_sum(&tctx->inflight);
+ if (!tctx->sqpoll)
+ return inflight;
+
+ /*
+ * If we have SQPOLL rings, then we need to iterate and find them, and
+ * add the pending count for those.
+ */
+ xa_for_each(&tctx->xa, index, file) {
+ struct io_ring_ctx *ctx = file->private_data;
+
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ struct io_uring_task *__tctx = ctx->sqo_task->io_uring;
+
+ inflight += percpu_counter_sum(&__tctx->inflight);
+ }
+ }
+
+ return inflight;
}
/*
@@ -8697,11 +8810,11 @@ void __io_uring_task_cancel(void)
s64 inflight;
/* make sure overflow events are dropped */
- tctx->in_idle = true;
+ atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */
- inflight = percpu_counter_sum(&tctx->inflight);
+ inflight = tctx_inflight(tctx);
if (!inflight)
break;
__io_uring_files_cancel(NULL);
@@ -8712,13 +8825,13 @@ void __io_uring_task_cancel(void)
* If we've seen completions, retry. This avoids a race where
* a completion comes in before we did prepare_to_wait().
*/
- if (inflight != percpu_counter_sum(&tctx->inflight))
+ if (inflight != tctx_inflight(tctx))
continue;
schedule();
} while (1);
finish_wait(&tctx->wait, &wait);
- tctx->in_idle = false;
+ atomic_dec(&tctx->in_idle);
}
static int io_uring_flush(struct file *file, void *data)
@@ -8863,7 +8976,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_sqpoll_wait_sq(ctx);
submitted = to_submit;
} else if (to_submit) {
- ret = io_uring_add_task_file(f.file);
+ ret = io_uring_add_task_file(ctx, f.file);
if (unlikely(ret))
goto out;
mutex_lock(&ctx->uring_lock);
@@ -8900,7 +9013,8 @@ out_fput:
#ifdef CONFIG_PROC_FS
static int io_uring_show_cred(int id, void *p, void *data)
{
- const struct cred *cred = p;
+ struct io_identity *iod = p;
+ const struct cred *cred = iod->creds;
struct seq_file *m = data;
struct user_namespace *uns = seq_user_ns(m);
struct group_info *gi;
@@ -9092,7 +9206,7 @@ err_fd:
#if defined(CONFIG_UNIX)
ctx->ring_sock->file = file;
#endif
- if (unlikely(io_uring_add_task_file(file))) {
+ if (unlikely(io_uring_add_task_file(ctx, file))) {
file = ERR_PTR(-ENOMEM);
goto err_fd;
}
@@ -9137,7 +9251,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
* to a power-of-two, if it isn't already. We do NOT impose
* any cq vs sq ring sizing.
*/
- if (p->cq_entries < p->sq_entries)
+ if (!p->cq_entries)
return -EINVAL;
if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
if (!(p->flags & IORING_SETUP_CLAMP))
@@ -9145,6 +9259,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
p->cq_entries = IORING_MAX_CQ_ENTRIES;
}
p->cq_entries = roundup_pow_of_two(p->cq_entries);
+ if (p->cq_entries < p->sq_entries)
+ return -EINVAL;
} else {
p->cq_entries = 2 * p->sq_entries;
}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 8180061b9e16..10cc7979ce38 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1374,6 +1374,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
WARN_ON_ONCE(!PageLocked(page));
WARN_ON_ONCE(PageWriteback(page));
+ WARN_ON_ONCE(PageDirty(page));
/*
* We cannot cancel the ioend directly here on error. We may have
@@ -1382,33 +1383,22 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
* appropriately.
*/
if (unlikely(error)) {
+ /*
+ * Let the filesystem know what portion of the current page
+ * failed to map. If the page wasn't been added to ioend, it
+ * won't be affected by I/O completion and we must unlock it
+ * now.
+ */
+ if (wpc->ops->discard_page)
+ wpc->ops->discard_page(page, file_offset);
if (!count) {
- /*
- * If the current page hasn't been added to ioend, it
- * won't be affected by I/O completions and we must
- * discard and unlock it right here.
- */
- if (wpc->ops->discard_page)
- wpc->ops->discard_page(page);
ClearPageUptodate(page);
unlock_page(page);
goto done;
}
-
- /*
- * If the page was not fully cleaned, we need to ensure that the
- * higher layers come back to it correctly. That means we need
- * to keep the page dirty, and for WB_SYNC_ALL writeback we need
- * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
- * so another attempt to write this page in this writeback sweep
- * will be made.
- */
- set_page_writeback_keepwrite(page);
- } else {
- clear_page_dirty_for_io(page);
- set_page_writeback(page);
}
+ set_page_writeback(page);
unlock_page(page);
/*
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 263f02ad8ebf..472932b9e6bc 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -106,6 +106,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
* for a checkpoint to free up some space in the log.
*/
void __jbd2_log_wait_for_space(journal_t *journal)
+__acquires(&journal->j_state_lock)
+__releases(&journal->j_state_lock)
{
int nblocks, space_left;
/* assert_spin_locked(&journal->j_state_lock); */
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index fa688e163a80..b121d7d434c6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -450,6 +450,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
schedule();
write_lock(&journal->j_state_lock);
finish_wait(&journal->j_fc_wait, &wait);
+ /*
+ * TODO: by blocking fast commits here, we are increasing
+ * fsync() latency slightly. Strictly speaking, we don't need
+ * to block fast commits until the transaction enters T_FLUSH
+ * state. So an optimization is possible where we block new fast
+ * commits here and wait for existing ones to complete
+ * just before we enter T_FLUSH. That way, the existing fast
+ * commits and this full commit can proceed parallely.
+ */
}
write_unlock(&journal->j_state_lock);
@@ -801,7 +810,7 @@ start_journal_io:
if (first_block < journal->j_tail)
freed += journal->j_last - journal->j_first;
/* Update tail only if we free significant amount of space */
- if (freed < journal->j_maxlen / 4)
+ if (freed < jbd2_journal_get_max_txn_bufs(journal))
update_tail = 0;
}
J_ASSERT(commit_transaction->t_state == T_COMMIT);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0c7c42bd530f..188f79d76988 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -566,12 +566,14 @@ static int __jbd2_journal_force_commit(journal_t *journal)
}
/**
- * Force and wait upon a commit if the calling process is not within
- * transaction. This is used for forcing out undo-protected data which contains
- * bitmaps, when the fs is running out of space.
+ * jbd2_journal_force_commit_nested - Force and wait upon a commit if the
+ * calling process is not within transaction.
*
* @journal: journal to force
* Returns true if progress was made.
+ *
+ * This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
*/
int jbd2_journal_force_commit_nested(journal_t *journal)
{
@@ -582,7 +584,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
}
/**
- * int journal_force_commit() - force any uncommitted transactions
+ * jbd2_journal_force_commit() - force any uncommitted transactions
* @journal: journal to force
*
* Caller want unconditional commit. We can only force the running transaction
@@ -727,6 +729,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
*/
int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
{
+ if (unlikely(is_journal_aborted(journal)))
+ return -EIO;
/*
* Fast commits only allowed if at least one full commit has
* been processed.
@@ -734,10 +738,12 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
if (!journal->j_stats.ts_tid)
return -EINVAL;
- if (tid <= journal->j_commit_sequence)
+ write_lock(&journal->j_state_lock);
+ if (tid <= journal->j_commit_sequence) {
+ write_unlock(&journal->j_state_lock);
return -EALREADY;
+ }
- write_lock(&journal->j_state_lock);
if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
(journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) {
DEFINE_WAIT(wait);
@@ -777,13 +783,19 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
int jbd2_fc_end_commit(journal_t *journal)
{
- return __jbd2_fc_end_commit(journal, 0, 0);
+ return __jbd2_fc_end_commit(journal, 0, false);
}
EXPORT_SYMBOL(jbd2_fc_end_commit);
-int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid)
+int jbd2_fc_end_commit_fallback(journal_t *journal)
{
- return __jbd2_fc_end_commit(journal, tid, 1);
+ tid_t tid;
+
+ read_lock(&journal->j_state_lock);
+ tid = journal->j_running_transaction ?
+ journal->j_running_transaction->t_tid : 0;
+ read_unlock(&journal->j_state_lock);
+ return __jbd2_fc_end_commit(journal, tid, true);
}
EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);
@@ -865,7 +877,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
int fc_off;
*bh_out = NULL;
- write_lock(&journal->j_state_lock);
if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) {
fc_off = journal->j_fc_off;
@@ -874,7 +885,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
} else {
ret = -EINVAL;
}
- write_unlock(&journal->j_state_lock);
if (ret)
return ret;
@@ -887,11 +897,7 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
if (!bh)
return -ENOMEM;
- lock_buffer(bh);
- clear_buffer_uptodate(bh);
- set_buffer_dirty(bh);
- unlock_buffer(bh);
journal->j_fc_wbuf[fc_off] = bh;
*bh_out = bh;
@@ -909,9 +915,7 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
struct buffer_head *bh;
int i, j_fc_off;
- read_lock(&journal->j_state_lock);
j_fc_off = journal->j_fc_off;
- read_unlock(&journal->j_state_lock);
/*
* Wait in reverse order to minimize chances of us being woken up before
@@ -939,9 +943,7 @@ int jbd2_fc_release_bufs(journal_t *journal)
struct buffer_head *bh;
int i, j_fc_off;
- read_lock(&journal->j_state_lock);
j_fc_off = journal->j_fc_off;
- read_unlock(&journal->j_state_lock);
/*
* Wait in reverse order to minimize chances of us being woken up before
@@ -1348,23 +1350,16 @@ static journal_t *journal_init_common(struct block_device *bdev,
journal->j_dev = bdev;
journal->j_fs_dev = fs_dev;
journal->j_blk_offset = start;
- journal->j_maxlen = len;
+ journal->j_total_len = len;
/* We need enough buffers to write out full descriptor block. */
n = journal->j_blocksize / jbd2_min_tag_size();
journal->j_wbufsize = n;
+ journal->j_fc_wbuf = NULL;
journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
GFP_KERNEL);
if (!journal->j_wbuf)
goto err_cleanup;
- if (journal->j_fc_wbufsize > 0) {
- journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize,
- sizeof(struct buffer_head *),
- GFP_KERNEL);
- if (!journal->j_fc_wbuf)
- goto err_cleanup;
- }
-
bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
if (!bh) {
pr_err("%s: Cannot get buffer for journal superblock\n",
@@ -1378,23 +1373,11 @@ static journal_t *journal_init_common(struct block_device *bdev,
err_cleanup:
kfree(journal->j_wbuf);
- kfree(journal->j_fc_wbuf);
jbd2_journal_destroy_revoke(journal);
kfree(journal);
return NULL;
}
-int jbd2_fc_init(journal_t *journal, int num_fc_blks)
-{
- journal->j_fc_wbufsize = num_fc_blks;
- journal->j_fc_wbuf = kmalloc_array(journal->j_fc_wbufsize,
- sizeof(struct buffer_head *), GFP_KERNEL);
- if (!journal->j_fc_wbuf)
- return -ENOMEM;
- return 0;
-}
-EXPORT_SYMBOL(jbd2_fc_init);
-
/* jbd2_journal_init_dev and jbd2_journal_init_inode:
*
* Create a journal structure assigned some fixed set of disk blocks to
@@ -1512,16 +1495,7 @@ static int journal_reset(journal_t *journal)
}
journal->j_first = first;
-
- if (jbd2_has_feature_fast_commit(journal) &&
- journal->j_fc_wbufsize > 0) {
- journal->j_fc_last = last;
- journal->j_last = last - journal->j_fc_wbufsize;
- journal->j_fc_first = journal->j_last + 1;
- journal->j_fc_off = 0;
- } else {
- journal->j_last = last;
- }
+ journal->j_last = last;
journal->j_head = journal->j_first;
journal->j_tail = journal->j_first;
@@ -1531,7 +1505,14 @@ static int journal_reset(journal_t *journal)
journal->j_commit_sequence = journal->j_transaction_sequence - 1;
journal->j_commit_request = journal->j_commit_sequence;
- journal->j_max_transaction_buffers = journal->j_maxlen / 4;
+ journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal);
+
+ /*
+ * Now that journal recovery is done, turn fast commits off here. This
+ * way, if fast commit was enabled before the crash but if now FS has
+ * disabled it, we don't enable fast commits.
+ */
+ jbd2_clear_feature_fast_commit(journal);
/*
* As a special case, if the on-disk copy is already marked as needing
@@ -1792,15 +1773,15 @@ static int journal_get_superblock(journal_t *journal)
goto out;
}
- if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
- journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
- else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
+ if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len)
+ journal->j_total_len = be32_to_cpu(sb->s_maxlen);
+ else if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) {
printk(KERN_WARNING "JBD2: journal file too short\n");
goto out;
}
if (be32_to_cpu(sb->s_first) == 0 ||
- be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
+ be32_to_cpu(sb->s_first) >= journal->j_total_len) {
printk(KERN_WARNING
"JBD2: Invalid start block of journal: %u\n",
be32_to_cpu(sb->s_first));
@@ -1872,6 +1853,7 @@ static int load_superblock(journal_t *journal)
{
int err;
journal_superblock_t *sb;
+ int num_fc_blocks;
err = journal_get_superblock(journal);
if (err)
@@ -1883,15 +1865,17 @@ static int load_superblock(journal_t *journal)
journal->j_tail = be32_to_cpu(sb->s_start);
journal->j_first = be32_to_cpu(sb->s_first);
journal->j_errno = be32_to_cpu(sb->s_errno);
+ journal->j_last = be32_to_cpu(sb->s_maxlen);
- if (jbd2_has_feature_fast_commit(journal) &&
- journal->j_fc_wbufsize > 0) {
+ if (jbd2_has_feature_fast_commit(journal)) {
journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
- journal->j_last = journal->j_fc_last - journal->j_fc_wbufsize;
+ num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks);
+ if (!num_fc_blocks)
+ num_fc_blocks = JBD2_MIN_FC_BLOCKS;
+ if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS)
+ journal->j_last = journal->j_fc_last - num_fc_blocks;
journal->j_fc_first = journal->j_last + 1;
journal->j_fc_off = 0;
- } else {
- journal->j_last = be32_to_cpu(sb->s_maxlen);
}
return 0;
@@ -1899,7 +1883,7 @@ static int load_superblock(journal_t *journal)
/**
- * int jbd2_journal_load() - Read journal from disk.
+ * jbd2_journal_load() - Read journal from disk.
* @journal: Journal to act on.
*
* Given a journal_t structure which tells us which disk blocks contain
@@ -1954,9 +1938,6 @@ int jbd2_journal_load(journal_t *journal)
*/
journal->j_flags &= ~JBD2_ABORT;
- if (journal->j_fc_wbufsize > 0)
- jbd2_journal_set_features(journal, 0, 0,
- JBD2_FEATURE_INCOMPAT_FAST_COMMIT);
/* OK, we've finished with the dynamic journal bits:
* reinitialise the dynamic contents of the superblock in memory
* and reset them on disk. */
@@ -1972,7 +1953,7 @@ recovery_error:
}
/**
- * void jbd2_journal_destroy() - Release a journal_t structure.
+ * jbd2_journal_destroy() - Release a journal_t structure.
* @journal: Journal to act on.
*
* Release a journal_t structure once it is no longer in use by the
@@ -2040,8 +2021,7 @@ int jbd2_journal_destroy(journal_t *journal)
jbd2_journal_destroy_revoke(journal);
if (journal->j_chksum_driver)
crypto_free_shash(journal->j_chksum_driver);
- if (journal->j_fc_wbufsize > 0)
- kfree(journal->j_fc_wbuf);
+ kfree(journal->j_fc_wbuf);
kfree(journal->j_wbuf);
kfree(journal);
@@ -2050,7 +2030,7 @@ int jbd2_journal_destroy(journal_t *journal)
/**
- *int jbd2_journal_check_used_features() - Check if features specified are used.
+ * jbd2_journal_check_used_features() - Check if features specified are used.
* @journal: Journal to check.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -2085,7 +2065,7 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,
}
/**
- * int jbd2_journal_check_available_features() - Check feature set in journalling layer
+ * jbd2_journal_check_available_features() - Check feature set in journalling layer
* @journal: Journal to check.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -2116,8 +2096,39 @@ int jbd2_journal_check_available_features(journal_t *journal, unsigned long comp
return 0;
}
+static int
+jbd2_journal_initialize_fast_commit(journal_t *journal)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+ unsigned long long num_fc_blks;
+
+ num_fc_blks = be32_to_cpu(sb->s_num_fc_blks);
+ if (num_fc_blks == 0)
+ num_fc_blks = JBD2_MIN_FC_BLOCKS;
+ if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS)
+ return -ENOSPC;
+
+ /* Are we called twice? */
+ WARN_ON(journal->j_fc_wbuf != NULL);
+ journal->j_fc_wbuf = kmalloc_array(num_fc_blks,
+ sizeof(struct buffer_head *), GFP_KERNEL);
+ if (!journal->j_fc_wbuf)
+ return -ENOMEM;
+
+ journal->j_fc_wbufsize = num_fc_blks;
+ journal->j_fc_last = journal->j_last;
+ journal->j_last = journal->j_fc_last - num_fc_blks;
+ journal->j_fc_first = journal->j_last + 1;
+ journal->j_fc_off = 0;
+ journal->j_free = journal->j_last - journal->j_first;
+ journal->j_max_transaction_buffers =
+ jbd2_journal_get_max_txn_bufs(journal);
+
+ return 0;
+}
+
/**
- * int jbd2_journal_set_features() - Mark a given journal feature in the superblock
+ * jbd2_journal_set_features() - Mark a given journal feature in the superblock
* @journal: Journal to act on.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -2159,6 +2170,13 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
sb = journal->j_superblock;
+ if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) {
+ if (jbd2_journal_initialize_fast_commit(journal)) {
+ pr_err("JBD2: Cannot enable fast commits.\n");
+ return 0;
+ }
+ }
+
/* Load the checksum driver if necessary */
if ((journal->j_chksum_driver == NULL) &&
INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
@@ -2201,7 +2219,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
}
/*
- * jbd2_journal_clear_features () - Clear a given journal feature in the
+ * jbd2_journal_clear_features() - Clear a given journal feature in the
* superblock
* @journal: Journal to act on.
* @compat: bitmask of compatible features
@@ -2230,7 +2248,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
EXPORT_SYMBOL(jbd2_journal_clear_features);
/**
- * int jbd2_journal_flush () - Flush journal
+ * jbd2_journal_flush() - Flush journal
* @journal: Journal to act on.
*
* Flush all data for a given journal to disk and empty the journal.
@@ -2305,7 +2323,7 @@ out:
}
/**
- * int jbd2_journal_wipe() - Wipe journal contents
+ * jbd2_journal_wipe() - Wipe journal contents
* @journal: Journal to act on.
* @write: flag (see below)
*
@@ -2346,7 +2364,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
}
/**
- * void jbd2_journal_abort () - Shutdown the journal immediately.
+ * jbd2_journal_abort () - Shutdown the journal immediately.
* @journal: the journal to shutdown.
* @errno: an error number to record in the journal indicating
* the reason for the shutdown.
@@ -2437,7 +2455,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
}
/**
- * int jbd2_journal_errno () - returns the journal's error state.
+ * jbd2_journal_errno() - returns the journal's error state.
* @journal: journal to examine.
*
* This is the errno number set with jbd2_journal_abort(), the last
@@ -2461,7 +2479,7 @@ int jbd2_journal_errno(journal_t *journal)
}
/**
- * int jbd2_journal_clear_err () - clears the journal's error state
+ * jbd2_journal_clear_err() - clears the journal's error state
* @journal: journal to act on.
*
* An error must be cleared or acked to take a FS out of readonly
@@ -2481,7 +2499,7 @@ int jbd2_journal_clear_err(journal_t *journal)
}
/**
- * void jbd2_journal_ack_err() - Ack journal err.
+ * jbd2_journal_ack_err() - Ack journal err.
* @journal: journal to act on.
*
* An error must be cleared or acked to take a FS out of readonly
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index eb2606133cd8..dc0694fcfcd1 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -74,8 +74,8 @@ static int do_readahead(journal_t *journal, unsigned int start)
/* Do up to 128K of readahead */
max = start + (128 * 1024 / journal->j_blocksize);
- if (max > journal->j_maxlen)
- max = journal->j_maxlen;
+ if (max > journal->j_total_len)
+ max = journal->j_total_len;
/* Do the readahead itself. We'll submit MAXBUF buffer_heads at
* a time to the block device IO layer. */
@@ -134,7 +134,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
*bhp = NULL;
- if (offset >= journal->j_maxlen) {
+ if (offset >= journal->j_total_len) {
printk(KERN_ERR "JBD2: corrupted journal superblock\n");
return -EFSCORRUPTED;
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 43985738aa86..9396666b7314 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -195,8 +195,10 @@ static void wait_transaction_switching(journal_t *journal)
DEFINE_WAIT(wait);
if (WARN_ON(!journal->j_running_transaction ||
- journal->j_running_transaction->t_state != T_SWITCH))
+ journal->j_running_transaction->t_state != T_SWITCH)) {
+ read_unlock(&journal->j_state_lock);
return;
+ }
prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
read_unlock(&journal->j_state_lock);
@@ -517,7 +519,7 @@ EXPORT_SYMBOL(jbd2__journal_start);
/**
- * handle_t *jbd2_journal_start() - Obtain a new handle.
+ * jbd2_journal_start() - Obtain a new handle.
* @journal: Journal to start transaction on.
* @nblocks: number of block buffer we might modify
*
@@ -564,7 +566,7 @@ void jbd2_journal_free_reserved(handle_t *handle)
EXPORT_SYMBOL(jbd2_journal_free_reserved);
/**
- * int jbd2_journal_start_reserved() - start reserved handle
+ * jbd2_journal_start_reserved() - start reserved handle
* @handle: handle to start
* @type: for handle statistics
* @line_no: for handle statistics
@@ -618,7 +620,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
EXPORT_SYMBOL(jbd2_journal_start_reserved);
/**
- * int jbd2_journal_extend() - extend buffer credits.
+ * jbd2_journal_extend() - extend buffer credits.
* @handle: handle to 'extend'
* @nblocks: nr blocks to try to extend by.
* @revoke_records: number of revoke records to try to extend by.
@@ -743,7 +745,7 @@ static void stop_this_handle(handle_t *handle)
}
/**
- * int jbd2_journal_restart() - restart a handle .
+ * jbd2__journal_restart() - restart a handle .
* @handle: handle to restart
* @nblocks: nr credits requested
* @revoke_records: number of revoke record credits requested
@@ -813,7 +815,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
EXPORT_SYMBOL(jbd2_journal_restart);
/**
- * void jbd2_journal_lock_updates () - establish a transaction barrier.
+ * jbd2_journal_lock_updates () - establish a transaction barrier.
* @journal: Journal to establish a barrier on.
*
* This locks out any further updates from being started, and blocks
@@ -872,7 +874,7 @@ void jbd2_journal_lock_updates(journal_t *journal)
}
/**
- * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
+ * jbd2_journal_unlock_updates () - release barrier
* @journal: Journal to release the barrier on.
*
* Release a transaction barrier obtained with jbd2_journal_lock_updates().
@@ -1180,7 +1182,8 @@ out:
}
/**
- * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
+ * jbd2_journal_get_write_access() - notify intent to modify a buffer
+ * for metadata (not data) update.
* @handle: transaction to add buffer modifications to
* @bh: bh to be used for metadata writes
*
@@ -1224,7 +1227,7 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
* unlocked buffer beforehand. */
/**
- * int jbd2_journal_get_create_access () - notify intent to use newly created bh
+ * jbd2_journal_get_create_access () - notify intent to use newly created bh
* @handle: transaction to new buffer to
* @bh: new buffer.
*
@@ -1304,7 +1307,7 @@ out:
}
/**
- * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with
+ * jbd2_journal_get_undo_access() - Notify intent to modify metadata with
* non-rewindable consequences
* @handle: transaction
* @bh: buffer to undo
@@ -1381,7 +1384,7 @@ out:
}
/**
- * void jbd2_journal_set_triggers() - Add triggers for commit writeout
+ * jbd2_journal_set_triggers() - Add triggers for commit writeout
* @bh: buffer to trigger on
* @type: struct jbd2_buffer_trigger_type containing the trigger(s).
*
@@ -1423,7 +1426,7 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
}
/**
- * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
+ * jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
*
@@ -1591,7 +1594,7 @@ out:
}
/**
- * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
+ * jbd2_journal_forget() - bforget() for potentially-journaled buffers.
* @handle: transaction handle
* @bh: bh to 'forget'
*
@@ -1760,7 +1763,7 @@ drop:
}
/**
- * int jbd2_journal_stop() - complete a transaction
+ * jbd2_journal_stop() - complete a transaction
* @handle: transaction to complete.
*
* All done for a particular handle.
@@ -2078,7 +2081,7 @@ out:
}
/**
- * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
+ * jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
*
@@ -2409,7 +2412,7 @@ zap_buffer_unlocked:
}
/**
- * void jbd2_journal_invalidatepage()
+ * jbd2_journal_invalidatepage()
* @journal: journal to use for flush...
* @page: page to flush
* @offset: start of the range to invalidate
diff --git a/fs/libfs.c b/fs/libfs.c
index fc34361c1489..7124c2e8df2f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -959,7 +959,7 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
struct simple_attr *attr;
- u64 val;
+ unsigned long long val;
size_t size;
ssize_t ret;
@@ -977,7 +977,9 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
goto out;
attr->set_buf[size] = '\0';
- val = simple_strtoll(attr->set_buf, NULL, 0);
+ ret = kstrtoull(attr->set_buf, 0, &val);
+ if (ret)
+ goto out;
ret = attr->set(attr->data, val);
if (ret == 0)
ret = len; /* on success, claim we got the whole input */
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cb52db9a0cfb..4e011adaf967 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -955,7 +955,6 @@ out:
static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
{
- struct inode *inode = file_inode(filp);
struct nfs_open_dir_context *dir_ctx = filp->private_data;
dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n",
@@ -967,15 +966,15 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
case SEEK_SET:
if (offset < 0)
return -EINVAL;
- inode_lock(inode);
+ spin_lock(&filp->f_lock);
break;
case SEEK_CUR:
if (offset == 0)
return filp->f_pos;
- inode_lock(inode);
+ spin_lock(&filp->f_lock);
offset += filp->f_pos;
if (offset < 0) {
- inode_unlock(inode);
+ spin_unlock(&filp->f_lock);
return -EINVAL;
}
}
@@ -987,7 +986,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
dir_ctx->dir_cookie = 0;
dir_ctx->duped = 0;
}
- inode_unlock(inode);
+ spin_unlock(&filp->f_lock);
return offset;
}
@@ -998,13 +997,9 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
int datasync)
{
- struct inode *inode = file_inode(filp);
-
dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync);
- inode_lock(inode);
- nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
- inode_unlock(inode);
+ nfs_inc_stats(file_inode(filp), NFSIOS_VFSFSYNC);
return 0;
}
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index b51424ff8159..6c2ce799150f 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -1047,8 +1047,10 @@ out4:
void nfs4_xattr_cache_exit(void)
{
+ unregister_shrinker(&nfs4_xattr_large_entry_shrinker);
unregister_shrinker(&nfs4_xattr_entry_shrinker);
unregister_shrinker(&nfs4_xattr_cache_shrinker);
+ list_lru_destroy(&nfs4_xattr_large_entry_lru);
list_lru_destroy(&nfs4_xattr_entry_lru);
list_lru_destroy(&nfs4_xattr_cache_lru);
kmem_cache_destroy(nfs4_xattr_cache_cachep);
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 0dc31ad2362e..6e060a88f98c 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -196,7 +196,7 @@
1 + nfs4_xattr_name_maxsz + 1)
#define decode_setxattr_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
#define encode_listxattrs_maxsz (op_encode_hdr_maxsz + 2 + 1)
-#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1)
+#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + 1)
#define encode_removexattr_maxsz (op_encode_hdr_maxsz + 1 + \
nfs4_xattr_name_maxsz)
#define decode_removexattr_maxsz (op_decode_hdr_maxsz + \
@@ -531,7 +531,7 @@ static void encode_listxattrs(struct xdr_stream *xdr,
{
__be32 *p;
- encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz + 1, hdr);
+ encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz, hdr);
p = reserve_space(xdr, 12);
if (unlikely(!p))
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 8d3278805602..fa148308822c 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -88,7 +88,13 @@
#define NFS_ROOT "/tftpboot/%s"
/* Default NFSROOT mount options. */
+#if defined(CONFIG_NFS_V2)
#define NFS_DEF_OPTIONS "vers=2,tcp,rsize=4096,wsize=4096"
+#elif defined(CONFIG_NFS_V3)
+#define NFS_DEF_OPTIONS "vers=3,tcp,rsize=4096,wsize=4096"
+#else
+#define NFS_DEF_OPTIONS "vers=4,tcp,rsize=4096,wsize=4096"
+#endif
/* Parameters passed from the kernel command line */
static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = "";
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 14468613d150..a633044b0dc1 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -316,10 +316,6 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
fh_copy(&resp->dirfh, &argp->fh);
fh_init(&resp->fh, NFS3_FHSIZE);
- if (argp->ftype == 0 || argp->ftype >= NF3BAD) {
- resp->status = nfserr_inval;
- goto out;
- }
if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) {
rdev = MKDEV(argp->major, argp->minor);
if (MAJOR(rdev) != argp->major ||
@@ -328,7 +324,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
goto out;
}
} else if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) {
- resp->status = nfserr_inval;
+ resp->status = nfserr_badtype;
goto out;
}
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9c23b6acf234..2277f83da250 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -1114,6 +1114,7 @@ nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_pathconfres *resp = rqstp->rq_resp;
+ *p++ = resp->status;
*p++ = xdr_zero; /* no post_op_attr */
if (resp->status == 0) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ad2fa1a8e7ad..e83b21778816 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1299,7 +1299,7 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
struct nfsd_file *dst)
{
nfs42_ssc_close(src->nf_file);
- nfsd_file_put(src);
+ /* 'src' is freed by nfsd4_do_async_copy */
nfsd_file_put(dst);
mntput(ss_mnt);
}
@@ -1486,6 +1486,7 @@ do_callback:
cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
if (!cb_copy)
goto out;
+ refcount_set(&cb_copy->refcount, 1);
memcpy(&cb_copy->cp_res, &copy->cp_res, sizeof(copy->cp_res));
cb_copy->cp_clp = copy->cp_clp;
cb_copy->nfserr = copy->nfserr;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index a960ec3a569a..8d3ad5ef2925 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -178,6 +178,7 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
struct inode *inode = d_inode(dentry);
struct dentry *parent;
bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED;
+ bool parent_needed, parent_interested;
__u32 p_mask;
struct inode *p_inode = NULL;
struct name_snapshot name;
@@ -193,7 +194,8 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
return 0;
parent = NULL;
- if (!parent_watched && !fsnotify_event_needs_parent(inode, mnt, mask))
+ parent_needed = fsnotify_event_needs_parent(inode, mnt, mask);
+ if (!parent_watched && !parent_needed)
goto notify;
/* Does parent inode care about events on children? */
@@ -205,17 +207,17 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
/*
* Include parent/name in notification either if some notification
- * groups require parent info (!parent_watched case) or the parent is
- * interested in this event.
+ * groups require parent info or the parent is interested in this event.
*/
- if (!parent_watched || (mask & p_mask & ALL_FSNOTIFY_EVENTS)) {
+ parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS;
+ if (parent_needed || parent_interested) {
/* When notifying parent, child should be passed as data */
WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));
/* Notify both parent and child with child name info */
take_dentry_name_snapshot(&name, dentry);
file_name = &name.name;
- if (parent_watched)
+ if (parent_interested)
mask |= FS_EVENT_ON_CHILD;
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index b9a9d69dde7e..db52e843002a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -877,7 +877,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
goto done;
}
- trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen);
+ trace_ocfs2_journal_init_maxlen(j_journal->j_total_len);
*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
OCFS2_JOURNAL_DIRTY_FL);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 1d91dd1e8711..2febc76e9de7 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1713,6 +1713,7 @@ static void ocfs2_inode_init_once(void *data)
oi->ip_blkno = 0ULL;
oi->ip_clusters = 0;
+ oi->ip_next_orphan = NULL;
ocfs2_resv_init_once(&oi->ip_la_data_resv);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index d0989a443c77..419760fd77bd 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -19,7 +19,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file)
static const struct proc_ops cpuinfo_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = cpuinfo_open,
- .proc_read = seq_read,
+ .proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
};
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 2f9fa179194d..b84663252add 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -590,7 +590,7 @@ static int proc_seq_release(struct inode *inode, struct file *file)
static const struct proc_ops proc_seq_ops = {
/* not permanent -- can call into arbitrary seq_operations */
.proc_open = proc_seq_open,
- .proc_read = seq_read,
+ .proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = proc_seq_release,
};
@@ -621,7 +621,7 @@ static int proc_single_open(struct inode *inode, struct file *file)
static const struct proc_ops proc_single_ops = {
/* not permanent -- can call into arbitrary ->single_show */
.proc_open = proc_single_open,
- .proc_read = seq_read,
+ .proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = single_release,
};
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 58c075e2a452..bde6b6f69852 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -597,6 +597,7 @@ static const struct file_operations proc_iter_file_ops = {
.llseek = proc_reg_llseek,
.read_iter = proc_reg_read_iter,
.write = proc_reg_write,
+ .splice_read = generic_file_splice_read,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
.mmap = proc_reg_mmap,
@@ -622,6 +623,7 @@ static const struct file_operations proc_reg_file_ops_compat = {
static const struct file_operations proc_iter_file_ops_compat = {
.llseek = proc_reg_llseek,
.read_iter = proc_reg_read_iter,
+ .splice_read = generic_file_splice_read,
.write = proc_reg_write,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 72cd69bcaf4a..cc71ce3466dc 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -16,6 +16,13 @@ static const char *proc_self_get_link(struct dentry *dentry,
pid_t tgid = task_tgid_nr_ns(current, ns);
char *name;
+ /*
+ * Not currently supported. Once we can inherit all of struct pid,
+ * we can allow this.
+ */
+ if (current->flags & PF_KTHREAD)
+ return ERR_PTR(-EOPNOTSUPP);
+
if (!tgid)
return ERR_PTR(-ENOENT);
/* max length of unsigned int in decimal + NULL term */
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 46b3293015fe..4695b6de3151 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -226,7 +226,7 @@ static int stat_open(struct inode *inode, struct file *file)
static const struct proc_ops stat_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = stat_open,
- .proc_read = seq_read,
+ .proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = single_release,
};
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 31219c1db17d..3b20e21604e7 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/string_helpers.h>
+#include <linux/uio.h>
#include <linux/uaccess.h>
#include <asm/page.h>
@@ -146,7 +147,28 @@ Eoverflow:
*/
ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
- struct seq_file *m = file->private_data;
+ struct iovec iov = { .iov_base = buf, .iov_len = size};
+ struct kiocb kiocb;
+ struct iov_iter iter;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, file);
+ iov_iter_init(&iter, READ, &iov, 1, size);
+
+ kiocb.ki_pos = *ppos;
+ ret = seq_read_iter(&kiocb, &iter);
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+EXPORT_SYMBOL(seq_read);
+
+/*
+ * Ready-made ->f_op->read_iter()
+ */
+ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct seq_file *m = iocb->ki_filp->private_data;
+ size_t size = iov_iter_count(iter);
size_t copied = 0;
size_t n;
void *p;
@@ -158,14 +180,14 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
* if request is to read from zero offset, reset iterator to first
* record as it might have been already advanced by previous requests
*/
- if (*ppos == 0) {
+ if (iocb->ki_pos == 0) {
m->index = 0;
m->count = 0;
}
- /* Don't assume *ppos is where we left it */
- if (unlikely(*ppos != m->read_pos)) {
- while ((err = traverse(m, *ppos)) == -EAGAIN)
+ /* Don't assume ki_pos is where we left it */
+ if (unlikely(iocb->ki_pos != m->read_pos)) {
+ while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN)
;
if (err) {
/* With prejudice... */
@@ -174,7 +196,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
m->count = 0;
goto Done;
} else {
- m->read_pos = *ppos;
+ m->read_pos = iocb->ki_pos;
}
}
@@ -187,13 +209,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
/* if not empty - flush it first */
if (m->count) {
n = min(m->count, size);
- err = copy_to_user(buf, m->buf + m->from, n);
- if (err)
+ if (copy_to_iter(m->buf + m->from, n, iter) != n)
goto Efault;
m->count -= n;
m->from += n;
size -= n;
- buf += n;
copied += n;
if (!size)
goto Done;
@@ -254,8 +274,7 @@ Fill:
}
m->op->stop(m, p);
n = min(m->count, size);
- err = copy_to_user(buf, m->buf, n);
- if (err)
+ if (copy_to_iter(m->buf, n, iter) != n)
goto Efault;
copied += n;
m->count -= n;
@@ -264,7 +283,7 @@ Done:
if (!copied)
copied = err;
else {
- *ppos += copied;
+ iocb->ki_pos += copied;
m->read_pos += copied;
}
mutex_unlock(&m->lock);
@@ -276,7 +295,7 @@ Efault:
err = -EFAULT;
goto Done;
}
-EXPORT_SYMBOL(seq_read);
+EXPORT_SYMBOL(seq_read_iter);
/**
* seq_lseek - ->llseek() method for sequential files.
diff --git a/fs/super.c b/fs/super.c
index a51c2083cd6b..98bb0629ee10 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1631,55 +1631,6 @@ int super_setup_bdi(struct super_block *sb)
}
EXPORT_SYMBOL(super_setup_bdi);
-/*
- * This is an internal function, please use sb_end_{write,pagefault,intwrite}
- * instead.
- */
-void __sb_end_write(struct super_block *sb, int level)
-{
- percpu_up_read(sb->s_writers.rw_sem + level-1);
-}
-EXPORT_SYMBOL(__sb_end_write);
-
-/*
- * This is an internal function, please use sb_start_{write,pagefault,intwrite}
- * instead.
- */
-int __sb_start_write(struct super_block *sb, int level, bool wait)
-{
- bool force_trylock = false;
- int ret = 1;
-
-#ifdef CONFIG_LOCKDEP
- /*
- * We want lockdep to tell us about possible deadlocks with freezing
- * but it's it bit tricky to properly instrument it. Getting a freeze
- * protection works as getting a read lock but there are subtle
- * problems. XFS for example gets freeze protection on internal level
- * twice in some cases, which is OK only because we already hold a
- * freeze protection also on higher level. Due to these cases we have
- * to use wait == F (trylock mode) which must not fail.
- */
- if (wait) {
- int i;
-
- for (i = 0; i < level - 1; i++)
- if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
- force_trylock = true;
- break;
- }
- }
-#endif
- if (wait && !force_trylock)
- percpu_down_read(sb->s_writers.rw_sem + level-1);
- else
- ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
-
- WARN_ON(force_trylock && !ret);
- return ret;
-}
-EXPORT_SYMBOL(__sb_start_write);
-
/**
* sb_wait_write - wait until all writers to given file system finish
* @sb: the super for which we wait
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 852b536551b5..15640015be9d 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2467,6 +2467,7 @@ xfs_defer_agfl_block(
new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
new->xefi_blockcount = 1;
new->xefi_oinfo = *oinfo;
+ new->xefi_skip_discard = false;
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index bb128db220ac..d6ef69ab1c67 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -515,7 +515,7 @@ xfs_attr_copy_value(
*========================================================================*/
/*
- * Query whether the requested number of additional bytes of extended
+ * Query whether the total requested number of attr fork bytes of extended
* attribute space will be able to fit inline.
*
* Returns zero if not, else the di_forkoff fork offset to be used in the
@@ -535,6 +535,12 @@ xfs_attr_shortform_bytesfit(
int maxforkoff;
int offset;
+ /*
+ * Check if the new size could fit at all first:
+ */
+ if (bytes > XFS_LITINO(mp))
+ return 0;
+
/* rounded down */
offset = (XFS_LITINO(mp) - bytes) >> 3;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index e1bd484e5548..6747e97a7949 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -52,9 +52,9 @@ struct xfs_extent_free_item
{
xfs_fsblock_t xefi_startblock;/* starting fs block number */
xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
+ bool xefi_skip_discard;
struct list_head xefi_list;
struct xfs_owner_info xefi_oinfo; /* extent owner */
- bool xefi_skip_discard;
};
#define XFS_BMAP_MAX_NMAP 4
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 340c83f76c80..2668ebe02865 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -1514,7 +1514,7 @@ xfs_rmap_convert_shared(
* record for our insertion point. This will also give us the record for
* start block contiguity tests.
*/
- error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
+ error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, oldext,
&PREV, &i);
if (error)
goto done;
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 955302e7cdde..fed56d213a3f 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -113,6 +113,8 @@ xchk_bmap_get_rmap(
if (info->whichfork == XFS_ATTR_FORK)
rflags |= XFS_RMAP_ATTR_FORK;
+ if (irec->br_state == XFS_EXT_UNWRITTEN)
+ rflags |= XFS_RMAP_UNWRITTEN;
/*
* CoW staging extents are owned (on disk) by the refcountbt, so
@@ -216,13 +218,13 @@ xchk_bmap_xref_rmap(
* which doesn't track unwritten state.
*/
if (owner != XFS_RMAP_OWN_COW &&
- irec->br_state == XFS_EXT_UNWRITTEN &&
- !(rmap.rm_flags & XFS_RMAP_UNWRITTEN))
+ !!(irec->br_state == XFS_EXT_UNWRITTEN) !=
+ !!(rmap.rm_flags & XFS_RMAP_UNWRITTEN))
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
- if (info->whichfork == XFS_ATTR_FORK &&
- !(rmap.rm_flags & XFS_RMAP_ATTR_FORK))
+ if (!!(info->whichfork == XFS_ATTR_FORK) !=
+ !!(rmap.rm_flags & XFS_RMAP_ATTR_FORK))
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK)
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index f52a7b8256f9..debf392e0515 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -452,32 +452,41 @@ xchk_btree_check_minrecs(
int level,
struct xfs_btree_block *block)
{
- unsigned int numrecs;
- int ok_level;
-
- numrecs = be16_to_cpu(block->bb_numrecs);
+ struct xfs_btree_cur *cur = bs->cur;
+ unsigned int root_level = cur->bc_nlevels - 1;
+ unsigned int numrecs = be16_to_cpu(block->bb_numrecs);
/* More records than minrecs means the block is ok. */
- if (numrecs >= bs->cur->bc_ops->get_minrecs(bs->cur, level))
+ if (numrecs >= cur->bc_ops->get_minrecs(cur, level))
return;
/*
- * Certain btree blocks /can/ have fewer than minrecs records. Any
- * level greater than or equal to the level of the highest dedicated
- * btree block are allowed to violate this constraint.
- *
- * For a btree rooted in a block, the btree root can have fewer than
- * minrecs records. If the btree is rooted in an inode and does not
- * store records in the root, the direct children of the root and the
- * root itself can have fewer than minrecs records.
+ * For btrees rooted in the inode, it's possible that the root block
+ * contents spilled into a regular ondisk block because there wasn't
+ * enough space in the inode root. The number of records in that
+ * child block might be less than the standard minrecs, but that's ok
+ * provided that there's only one direct child of the root.
*/
- ok_level = bs->cur->bc_nlevels - 1;
- if (bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
- ok_level--;
- if (level >= ok_level)
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 2) {
+ struct xfs_btree_block *root_block;
+ struct xfs_buf *root_bp;
+ int root_maxrecs;
+
+ root_block = xfs_btree_get_block(cur, root_level, &root_bp);
+ root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level);
+ if (be16_to_cpu(root_block->bb_numrecs) != 1 ||
+ numrecs <= root_maxrecs)
+ xchk_btree_set_corrupt(bs->sc, cur, level);
return;
+ }
- xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+ /*
+ * Otherwise, only the root level is allowed to have fewer than minrecs
+ * records or keyptrs.
+ */
+ if (level < root_level)
+ xchk_btree_set_corrupt(bs->sc, cur, level);
}
/*
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 7c432997edad..b045e95c2ea7 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -558,14 +558,27 @@ xchk_directory_leaf1_bestfree(
/* Check all the bestfree entries. */
for (i = 0; i < bestcount; i++, bestp++) {
best = be16_to_cpu(*bestp);
- if (best == NULLDATAOFF)
- continue;
error = xfs_dir3_data_read(sc->tp, sc->ip,
- i * args->geo->fsbcount, 0, &dbp);
+ xfs_dir2_db_to_da(args->geo, i),
+ XFS_DABUF_MAP_HOLE_OK,
+ &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
break;
- xchk_directory_check_freesp(sc, lblk, dbp, best);
+
+ if (!dbp) {
+ if (best != NULLDATAOFF) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ lblk);
+ break;
+ }
+ continue;
+ }
+
+ if (best == NULLDATAOFF)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ else
+ xchk_directory_check_freesp(sc, lblk, dbp, best);
xfs_trans_brelse(sc->tp, dbp);
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
break;
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 3aa85b64de36..bb25ff1b770d 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -121,8 +121,7 @@ xchk_inode_flags(
goto bad;
/* rt flags require rt device */
- if ((flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT)) &&
- !mp->m_rtdev_targp)
+ if ((flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp)
goto bad;
/* new rt bitmap flag only valid for rbmino */
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index beaeb6fa3119..dd672e6bbc75 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -170,7 +170,6 @@ xchk_refcountbt_process_rmap_fragments(
*/
INIT_LIST_HEAD(&worklist);
rbno = NULLAGBLOCK;
- nr = 1;
/* Make sure the fragments actually /are/ in agbno order. */
bno = 0;
@@ -184,15 +183,14 @@ xchk_refcountbt_process_rmap_fragments(
* Find all the rmaps that start at or before the refc extent,
* and put them on the worklist.
*/
+ nr = 0;
list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
- if (frag->rm.rm_startblock > refchk->bno)
- goto done;
+ if (frag->rm.rm_startblock > refchk->bno || nr > target_nr)
+ break;
bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
if (bno < rbno)
rbno = bno;
list_move_tail(&frag->list, &worklist);
- if (nr == target_nr)
- break;
nr++;
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 55d126d4e096..4304c6416fbb 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -346,8 +346,8 @@ xfs_map_blocks(
ssize_t count = i_blocksize(inode);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
- xfs_fileoff_t cow_fsb = NULLFILEOFF;
- int whichfork = XFS_DATA_FORK;
+ xfs_fileoff_t cow_fsb;
+ int whichfork;
struct xfs_bmbt_irec imap;
struct xfs_iext_cursor icur;
int retries = 0;
@@ -381,6 +381,8 @@ xfs_map_blocks(
* landed in a hole and we skip the block.
*/
retry:
+ cow_fsb = NULLFILEOFF;
+ whichfork = XFS_DATA_FORK;
xfs_ilock(ip, XFS_ILOCK_SHARED);
ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
@@ -527,13 +529,15 @@ xfs_prepare_ioend(
*/
static void
xfs_discard_page(
- struct page *page)
+ struct page *page,
+ loff_t fileoff)
{
struct inode *inode = page->mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- loff_t offset = page_offset(page);
- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset);
+ unsigned int pageoff = offset_in_page(fileoff);
+ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, fileoff);
+ xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff);
int error;
if (XFS_FORCED_SHUTDOWN(mp))
@@ -541,14 +545,14 @@ xfs_discard_page(
xfs_alert_ratelimited(mp,
"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
- page, ip->i_ino, offset);
+ page, ip->i_ino, fileoff);
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- i_blocks_per_page(inode, page));
+ i_blocks_per_page(inode, page) - pageoff_fsb);
if (error && !XFS_FORCED_SHUTDOWN(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
- iomap_invalidatepage(page, 0, PAGE_SIZE);
+ iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 3abb8b9d6f4c..7b9ff824e82d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -706,6 +706,23 @@ relock:
return 0;
}
+/*
+ * Check that the imap we are going to return to the caller spans the entire
+ * range that the caller requested for the IO.
+ */
+static bool
+imap_spans_range(
+ struct xfs_bmbt_irec *imap,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ if (imap->br_startoff > offset_fsb)
+ return false;
+ if (imap->br_startoff + imap->br_blockcount < end_fsb)
+ return false;
+ return true;
+}
+
static int
xfs_direct_write_iomap_begin(
struct inode *inode,
@@ -766,6 +783,18 @@ xfs_direct_write_iomap_begin(
if (imap_needs_alloc(inode, flags, &imap, nimaps))
goto allocate_blocks;
+ /*
+ * NOWAIT IO needs to span the entire requested IO with a single map so
+ * that we avoid partial IO failures due to the rest of the IO range not
+ * covered by this map triggering an EAGAIN condition when it is
+ * subsequently mapped and aborting the IO.
+ */
+ if ((flags & IOMAP_NOWAIT) &&
+ !imap_spans_range(&imap, offset_fsb, end_fsb)) {
+ error = -EAGAIN;
+ goto out_unlock;
+ }
+
xfs_iunlock(ip, lockmode);
trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 5e165456da68..1414ab79eacf 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -911,6 +911,16 @@ xfs_setattr_size(
error = iomap_zero_range(inode, oldsize, newsize - oldsize,
&did_zeroing, &xfs_buffered_write_iomap_ops);
} else {
+ /*
+ * iomap won't detect a dirty page over an unwritten block (or a
+ * cow block over a hole) and subsequently skips zeroing the
+ * newly post-EOF portion of the page. Flush the new EOF to
+ * convert the block before the pagecache truncate.
+ */
+ error = filemap_write_and_wait_range(inode->i_mapping, newsize,
+ newsize);
+ if (error)
+ return error;
error = iomap_truncate_page(inode, newsize, &did_zeroing,
&xfs_buffered_write_iomap_ops);
}
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 233dcc8784db..2a45138831e3 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -55,6 +55,9 @@ struct xfs_iwalk_ag {
/* Where do we start the traversal? */
xfs_ino_t startino;
+ /* What was the last inode number we saw when iterating the inobt? */
+ xfs_ino_t lastino;
+
/* Array of inobt records we cache. */
struct xfs_inobt_rec_incore *recs;
@@ -301,6 +304,9 @@ xfs_iwalk_ag_start(
if (XFS_IS_CORRUPT(mp, *has_more != 1))
return -EFSCORRUPTED;
+ iwag->lastino = XFS_AGINO_TO_INO(mp, agno,
+ irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
+
/*
* If the LE lookup yielded an inobt record before the cursor position,
* skip it and see if there's another one after it.
@@ -347,15 +353,17 @@ xfs_iwalk_run_callbacks(
struct xfs_mount *mp = iwag->mp;
struct xfs_trans *tp = iwag->tp;
struct xfs_inobt_rec_incore *irec;
- xfs_agino_t restart;
+ xfs_agino_t next_agino;
int error;
+ next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;
+
ASSERT(iwag->nr_recs > 0);
/* Delete cursor but remember the last record we cached... */
xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
irec = &iwag->recs[iwag->nr_recs - 1];
- restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;
+ ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK);
error = xfs_iwalk_ag_recs(iwag);
if (error)
@@ -372,7 +380,7 @@ xfs_iwalk_run_callbacks(
if (error)
return error;
- return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
+ return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
}
/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
@@ -396,6 +404,7 @@ xfs_iwalk_ag(
while (!error && has_more) {
struct xfs_inobt_rec_incore *irec;
+ xfs_ino_t rec_fsino;
cond_resched();
if (xfs_pwork_want_abort(&iwag->pwork))
@@ -407,6 +416,15 @@ xfs_iwalk_ag(
if (error || !has_more)
break;
+ /* Make sure that we always move forward. */
+ rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino);
+ if (iwag->lastino != NULLFSINO &&
+ XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;
+
/* No allocated inodes in this chunk; skip it. */
if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
error = xfs_btree_increment(cur, 0, &has_more);
@@ -535,6 +553,7 @@ xfs_iwalk(
.trim_start = 1,
.skip_empty = 1,
.pwork = XFS_PWORK_SINGLE_THREADED,
+ .lastino = NULLFSINO,
};
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
int error;
@@ -623,6 +642,7 @@ xfs_iwalk_threaded(
iwag->data = data;
iwag->startino = startino;
iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
+ iwag->lastino = NULLFSINO;
xfs_pwork_queue(&pctl, &iwag->pwork);
startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
if (flags & XFS_INOBT_WALK_SAME_AG)
@@ -696,6 +716,7 @@ xfs_inobt_walk(
.startino = startino,
.sz_recs = xfs_inobt_walk_prefetch(inobt_records),
.pwork = XFS_PWORK_SINGLE_THREADED,
+ .lastino = NULLFSINO,
};
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
int error;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 150ee5cb8645..7110507a2b6b 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -194,20 +194,25 @@ xfs_initialize_perag(
}
pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
- if (!pag)
+ if (!pag) {
+ error = -ENOMEM;
goto out_unwind_new_pags;
+ }
pag->pag_agno = index;
pag->pag_mount = mp;
spin_lock_init(&pag->pag_ici_lock);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
- if (xfs_buf_hash_init(pag))
+
+ error = xfs_buf_hash_init(pag);
+ if (error)
goto out_free_pag;
init_waitqueue_head(&pag->pagb_wait);
spin_lock_init(&pag->pagb_lock);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
- if (radix_tree_preload(GFP_NOFS))
+ error = radix_tree_preload(GFP_NOFS);
+ if (error)
goto out_hash_destroy;
spin_lock(&mp->m_perag_lock);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index b101feb2aab4..f3082a957d5e 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -134,7 +134,7 @@ xfs_fs_map_blocks(
goto out_unlock;
error = invalidate_inode_pages2(inode->i_mapping);
if (WARN_ON_ONCE(error))
- return error;
+ goto out_unlock;
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 16098dc42add..6fa05fb78189 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1502,7 +1502,8 @@ xfs_reflink_unshare(
&xfs_buffered_write_iomap_ops);
if (error)
goto out;
- error = filemap_write_and_wait(inode->i_mapping);
+
+ error = filemap_write_and_wait_range(inode->i_mapping, offset, len);
if (error)
goto out;