author     Linus Torvalds    2022-05-24 18:52:35 -0700
committer  Linus Torvalds    2022-05-24 18:52:35 -0700
commit     bd1b7c1384ec15294ee45bf3add7b7036e146dad (patch)
tree       5b8efc004782d52f8697b2831bdcce9c9a884988
parent     3842007b1a33589d57f67eac479b132b77767514 (diff)
parent     0a05fafe9def0d9f0fbef3dfc8094925af9e3185 (diff)
Merge tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
 "Features:

   - subpage:
      - support for PAGE_SIZE > 4K (previously only 64K)
      - make it work with raid56

   - repair super block num_devices automatically if it does not match
     the number of device items

   - defrag can convert inline extents to regular extents; up to now
     inline files were skipped, but the setting of mount option
     max_inline could affect the decision logic

   - zoned:
      - minimal accepted zone size is explicitly set to 4MiB
      - make zone reclaim less aggressive and don't reclaim if there
        are enough free zones
      - add per-profile sysfs tunable of the reclaim threshold

   - allow automatic block group reclaim for non-zoned filesystems,
     with sysfs tunables

   - tree-checker: new check, compare extent buffer owner against
     owner rootid

  Performance:

   - avoid blocking on space reservation when doing nowait direct io
     writes (+7% throughput for reads and writes)

   - NOCOW write throughput improvement due to refined locking (+3%)

   - send: reduce pressure on the page cache by dropping extent pages
     right after they're processed

  Core:

   - convert all radix trees to xarray

   - add iterators for b-tree node items

   - support printk message index

   - use bulk page allocation for extent buffers

   - switch to bio_alloc API, use on-stack bios where convenient, other
     bio cleanups

   - use rw lock for block groups to favor concurrent reads

   - simplify workqueues, don't allocate high priority threads for all
     normal queues as we need only one

   - refactor scrub, process chunks based on their constraints and
     similarity

   - allocate direct io structures on stack and pass around only
     pointers, avoids allocation and reduces potential error handling

  Fixes:

   - fix count of reserved transaction items for various inode
     operations

   - fix deadlock between concurrent dio writes when low on free data
     space

   - fix a few cases when zones need to be finished

  VFS, iomap:

   - add helper to check if sb write has started (usable for
     assertions)

   - new helper iomap_dio_alloc_bio, export iomap_dio_bio_end_io"

* tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (173 commits)
  btrfs: zoned: introduce a minimal zone size 4M and reject mount
  btrfs: allow defrag to convert inline extents to regular extents
  btrfs: add "0x" prefix for unsupported optional features
  btrfs: do not account twice for inode ref when reserving metadata units
  btrfs: zoned: fix comparison of alloc_offset vs meta_write_pointer
  btrfs: send: avoid trashing the page cache
  btrfs: send: keep the current inode open while processing it
  btrfs: allocate the btrfs_dio_private as part of the iomap dio bio
  btrfs: move struct btrfs_dio_private to inode.c
  btrfs: remove the disk_bytenr in struct btrfs_dio_private
  btrfs: allocate dio_data on stack
  iomap: add per-iomap_iter private data
  iomap: allow the file system to provide a bio_set for direct I/O
  btrfs: add a btrfs_dio_rw wrapper
  btrfs: zoned: zone finish unused block group
  btrfs: zoned: properly finish block group on metadata write
  btrfs: zoned: finish block group when there are no more allocatable bytes left
  btrfs: zoned: consolidate zone finish functions
  btrfs: zoned: introduce btrfs_zoned_bg_is_full
  btrfs: improve error reporting in lookup_inline_extent_backref
  ...
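Of the core changes above, the new b-tree item iterator reshapes the most call sites in the diff below (see find_first_block_group() in block-group.c). A minimal sketch of the pattern, based on the btrfs_for_each_slot() macro added to ctree.h in this series; the starting key, the type check, and process_item() are illustrative only, not from this patch:

    struct btrfs_path *path = btrfs_alloc_path();
    struct btrfs_key key = { 0 };   /* start from the smallest key */
    struct btrfs_key found_key;
    int iter_ret;
    int ret = 0;

    if (!path)
            return -ENOMEM;

    btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
            /* path->nodes[0] / path->slots[0] point at found_key here. */
            if (found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY)
                    process_item(path, &found_key);   /* hypothetical helper */
    }
    /* After the loop, iter_ret is 1 when the leaves ran out, <0 on error. */
    if (iter_ret < 0)
            ret = iter_ret;
    btrfs_free_path(path);

The macro runs btrfs_search_slot() once, then advances slot by slot across leaves via btrfs_get_next_valid_item(), so the open-coded next-leaf loops (as removed from find_first_block_group() below) disappear.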
Diffstat
-rw-r--r--  fs/btrfs/acl.c                      39
-rw-r--r--  fs/btrfs/async-thread.c            122
-rw-r--r--  fs/btrfs/async-thread.h              7
-rw-r--r--  fs/btrfs/block-group.c             205
-rw-r--r--  fs/btrfs/block-group.h               7
-rw-r--r--  fs/btrfs/btrfs_inode.h              25
-rw-r--r--  fs/btrfs/check-integrity.c         172
-rw-r--r--  fs/btrfs/check-integrity.h           6
-rw-r--r--  fs/btrfs/compression.c              60
-rw-r--r--  fs/btrfs/compression.h               4
-rw-r--r--  fs/btrfs/ctree.c                   102
-rw-r--r--  fs/btrfs/ctree.h                   165
-rw-r--r--  fs/btrfs/delalloc-space.c            9
-rw-r--r--  fs/btrfs/delayed-inode.c            84
-rw-r--r--  fs/btrfs/delayed-ref.c               4
-rw-r--r--  fs/btrfs/delayed-ref.h               1
-rw-r--r--  fs/btrfs/dev-replace.c              52
-rw-r--r--  fs/btrfs/dir-item.c                 31
-rw-r--r--  fs/btrfs/disk-io.c                 310
-rw-r--r--  fs/btrfs/disk-io.h                  10
-rw-r--r--  fs/btrfs/extent-tree.c              61
-rw-r--r--  fs/btrfs/extent_io.c               619
-rw-r--r--  fs/btrfs/extent_io.h                47
-rw-r--r--  fs/btrfs/file.c                    286
-rw-r--r--  fs/btrfs/free-space-cache.c          9
-rw-r--r--  fs/btrfs/free-space-tree.c           2
-rw-r--r--  fs/btrfs/inode.c                  1850
-rw-r--r--  fs/btrfs/ioctl.c                   268
-rw-r--r--  fs/btrfs/props.c                    40
-rw-r--r--  fs/btrfs/props.h                     4
-rw-r--r--  fs/btrfs/qgroup.c                    7
-rw-r--r--  fs/btrfs/qgroup.h                   12
-rw-r--r--  fs/btrfs/raid56.c                  809
-rw-r--r--  fs/btrfs/raid56.h                    9
-rw-r--r--  fs/btrfs/reflink.c                  23
-rw-r--r--  fs/btrfs/relocation.c               19
-rw-r--r--  fs/btrfs/root-tree.c                 3
-rw-r--r--  fs/btrfs/scrub.c                  1889
-rw-r--r--  fs/btrfs/send.c                    400
-rw-r--r--  fs/btrfs/space-info.c               11
-rw-r--r--  fs/btrfs/space-info.h                8
-rw-r--r--  fs/btrfs/subpage.c                  55
-rw-r--r--  fs/btrfs/subpage.h                   2
-rw-r--r--  fs/btrfs/super.c                     9
-rw-r--r--  fs/btrfs/sysfs.c                    43
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c        24
-rw-r--r--  fs/btrfs/transaction.c             116
-rw-r--r--  fs/btrfs/tree-checker.c             55
-rw-r--r--  fs/btrfs/tree-checker.h              1
-rw-r--r--  fs/btrfs/tree-log.c                 11
-rw-r--r--  fs/btrfs/volumes.c                 127
-rw-r--r--  fs/btrfs/volumes.h                  42
-rw-r--r--  fs/btrfs/xattr.c                    40
-rw-r--r--  fs/btrfs/zoned.c                   217
-rw-r--r--  fs/btrfs/zoned.h                    23
-rw-r--r--  fs/btrfs/zstd.c                     14
-rw-r--r--  fs/erofs/data.c                      2
-rw-r--r--  fs/ext4/file.c                       4
-rw-r--r--  fs/f2fs/file.c                       4
-rw-r--r--  fs/gfs2/file.c                       4
-rw-r--r--  fs/iomap/direct-io.c                25
-rw-r--r--  fs/xfs/xfs_file.c                    6
-rw-r--r--  fs/zonefs/super.c                    4
-rw-r--r--  include/linux/fs.h                   5
-rw-r--r--  include/linux/iomap.h               16
-rw-r--r--  include/trace/events/btrfs.h        30
-rw-r--r--  include/uapi/linux/btrfs_tree.h     13
67 files changed, 4472 insertions, 4211 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0a0d0eccee4e..548d6a5477b4 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -55,9 +55,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
return acl;
}
-static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
- struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl, int type)
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type)
{
int ret, size = 0;
const char *name;
@@ -123,40 +122,8 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (ret)
return ret;
}
- ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type);
+ ret = __btrfs_set_acl(NULL, inode, acl, type);
if (ret)
inode->i_mode = old_mode;
return ret;
}
-
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
-{
- struct posix_acl *default_acl, *acl;
- int ret = 0;
-
- /* this happens with subvols */
- if (!dir)
- return 0;
-
- ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (ret)
- return ret;
-
- if (default_acl) {
- ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl,
- ACL_TYPE_DEFAULT);
- posix_acl_release(default_acl);
- }
-
- if (acl) {
- if (!ret)
- ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl,
- ACL_TYPE_ACCESS);
- posix_acl_release(acl);
- }
-
- if (!default_acl && !acl)
- cache_no_acl(inode);
- return ret;
-}
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 43c89952b7d2..aac240430efe 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -15,13 +15,12 @@
enum {
WORK_DONE_BIT,
WORK_ORDER_DONE_BIT,
- WORK_HIGH_PRIO_BIT,
};
#define NO_THRESHOLD (-1)
#define DFT_THRESHOLD (32)
-struct __btrfs_workqueue {
+struct btrfs_workqueue {
struct workqueue_struct *normal_wq;
/* File system this workqueue services */
@@ -48,12 +47,7 @@ struct __btrfs_workqueue {
spinlock_t thres_lock;
};
-struct btrfs_workqueue {
- struct __btrfs_workqueue *normal;
- struct __btrfs_workqueue *high;
-};
-
-struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq)
{
return wq->fs_info;
}
@@ -66,22 +60,22 @@ struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
{
/*
- * We could compare wq->normal->pending with num_online_cpus()
+ * We could compare wq->pending with num_online_cpus()
* to support "thresh == NO_THRESHOLD" case, but it requires
* moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
* postpone it until someone needs the support of that case.
*/
- if (wq->normal->thresh == NO_THRESHOLD)
+ if (wq->thresh == NO_THRESHOLD)
return false;
- return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
+ return atomic_read(&wq->pending) > wq->thresh * 2;
}
-static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
- unsigned int flags, int limit_active, int thresh)
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+ const char *name, unsigned int flags,
+ int limit_active, int thresh)
{
- struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
@@ -105,12 +99,8 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
ret->thresh = thresh;
}
- if (flags & WQ_HIGHPRI)
- ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
- ret->current_active, name);
- else
- ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
- ret->current_active, name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active,
+ name);
if (!ret->normal_wq) {
kfree(ret);
return NULL;
@@ -119,41 +109,7 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
INIT_LIST_HEAD(&ret->ordered_list);
spin_lock_init(&ret->list_lock);
spin_lock_init(&ret->thres_lock);
- trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
- return ret;
-}
-
-static inline void
-__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
-
-struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
- const char *name,
- unsigned int flags,
- int limit_active,
- int thresh)
-{
- struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
-
- if (!ret)
- return NULL;
-
- ret->normal = __btrfs_alloc_workqueue(fs_info, name,
- flags & ~WQ_HIGHPRI,
- limit_active, thresh);
- if (!ret->normal) {
- kfree(ret);
- return NULL;
- }
-
- if (flags & WQ_HIGHPRI) {
- ret->high = __btrfs_alloc_workqueue(fs_info, name, flags,
- limit_active, thresh);
- if (!ret->high) {
- __btrfs_destroy_workqueue(ret->normal);
- kfree(ret);
- return NULL;
- }
- }
+ trace_btrfs_workqueue_alloc(ret, name);
return ret;
}
@@ -162,7 +118,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
* This hook WILL be called in IRQ handler context,
* so workqueue_set_max_active MUST NOT be called in this hook
*/
-static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
+static inline void thresh_queue_hook(struct btrfs_workqueue *wq)
{
if (wq->thresh == NO_THRESHOLD)
return;
@@ -174,7 +130,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
* This hook is called in kthread context.
* So workqueue_set_max_active is called here.
*/
-static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
+static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
{
int new_current_active;
long pending;
@@ -217,7 +173,7 @@ out:
}
}
-static void run_ordered_work(struct __btrfs_workqueue *wq,
+static void run_ordered_work(struct btrfs_workqueue *wq,
struct btrfs_work *self)
{
struct list_head *list = &wq->ordered_list;
@@ -305,7 +261,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
{
struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
normal_work);
- struct __btrfs_workqueue *wq;
+ struct btrfs_workqueue *wq = work->wq;
int need_order = 0;
/*
@@ -318,7 +274,6 @@ static void btrfs_work_helper(struct work_struct *normal_work)
*/
if (work->ordered_func)
need_order = 1;
- wq = work->wq;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
@@ -350,8 +305,7 @@ void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
work->flags = 0;
}
-static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
- struct btrfs_work *work)
+void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work)
{
unsigned long flags;
@@ -366,54 +320,22 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
queue_work(wq->normal_wq, &work->normal_work);
}
-void btrfs_queue_work(struct btrfs_workqueue *wq,
- struct btrfs_work *work)
-{
- struct __btrfs_workqueue *dest_wq;
-
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
- dest_wq = wq->high;
- else
- dest_wq = wq->normal;
- __btrfs_queue_work(dest_wq, work);
-}
-
-static inline void
-__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
-{
- destroy_workqueue(wq->normal_wq);
- trace_btrfs_workqueue_destroy(wq);
- kfree(wq);
-}
-
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
if (!wq)
return;
- if (wq->high)
- __btrfs_destroy_workqueue(wq->high);
- __btrfs_destroy_workqueue(wq->normal);
+ destroy_workqueue(wq->normal_wq);
+ trace_btrfs_workqueue_destroy(wq);
kfree(wq);
}
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
{
- if (!wq)
- return;
- wq->normal->limit_active = limit_active;
- if (wq->high)
- wq->high->limit_active = limit_active;
-}
-
-void btrfs_set_work_high_priority(struct btrfs_work *work)
-{
- set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+ if (wq)
+ wq->limit_active = limit_active;
}
void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
{
- if (wq->high)
- flush_workqueue(wq->high->normal_wq);
-
- flush_workqueue(wq->normal->normal_wq);
+ flush_workqueue(wq->normal_wq);
}
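
With the WORK_HIGH_PRIO_BIT and the high/normal sub-queue pair gone, callers that need high-priority execution are expected to queue to a dedicated workqueue instead (the ctree.h hunk further down adds fs_info->hipri_workers for this). A hedged sketch of the new shape; the queue names, flags, and the async work fields are illustrative, not copied from this patch:

    /* Allocation: two independent queues instead of one with a sub-queue. */
    fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker",
                                             flags, max_active, 16);
    fs_info->hipri_workers = btrfs_alloc_workqueue(fs_info, "worker-high",
                                                   flags | WQ_HIGHPRI,
                                                   max_active, 16);

    /* Queueing: pick the queue directly; no btrfs_set_work_high_priority(). */
    if (sync)
            btrfs_queue_work(fs_info->hipri_workers, &async->work);
    else
            btrfs_queue_work(fs_info->workers, &async->work);
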
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 3204daa51b95..07960529b360 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -11,8 +11,6 @@
struct btrfs_fs_info;
struct btrfs_workqueue;
-/* Internal use only */
-struct __btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
typedef void (*btrfs_work_func_t)(struct work_struct *arg);
@@ -25,7 +23,7 @@ struct btrfs_work {
/* Don't touch things below */
struct work_struct normal_work;
struct list_head ordered_list;
- struct __btrfs_workqueue *wq;
+ struct btrfs_workqueue *wq;
unsigned long flags;
};
@@ -40,9 +38,8 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
-void btrfs_set_work_high_priority(struct btrfs_work *work);
struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
-struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 0dd6de994199..ede389f2602d 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -168,11 +168,12 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
struct rb_node **p;
struct rb_node *parent = NULL;
struct btrfs_block_group *cache;
+ bool leftmost = true;
ASSERT(block_group->length != 0);
- spin_lock(&info->block_group_cache_lock);
- p = &info->block_group_cache_tree.rb_node;
+ write_lock(&info->block_group_cache_lock);
+ p = &info->block_group_cache_tree.rb_root.rb_node;
while (*p) {
parent = *p;
@@ -181,20 +182,18 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
p = &(*p)->rb_left;
} else if (block_group->start > cache->start) {
p = &(*p)->rb_right;
+ leftmost = false;
} else {
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
return -EEXIST;
}
}
rb_link_node(&block_group->cache_node, parent, p);
- rb_insert_color(&block_group->cache_node,
- &info->block_group_cache_tree);
+ rb_insert_color_cached(&block_group->cache_node,
+ &info->block_group_cache_tree, leftmost);
- if (info->first_logical_byte > block_group->start)
- info->first_logical_byte = block_group->start;
-
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
return 0;
}
@@ -210,8 +209,8 @@ static struct btrfs_block_group *block_group_cache_tree_search(
struct rb_node *n;
u64 end, start;
- spin_lock(&info->block_group_cache_lock);
- n = info->block_group_cache_tree.rb_node;
+ read_lock(&info->block_group_cache_lock);
+ n = info->block_group_cache_tree.rb_root.rb_node;
while (n) {
cache = rb_entry(n, struct btrfs_block_group, cache_node);
@@ -233,12 +232,9 @@ static struct btrfs_block_group *block_group_cache_tree_search(
break;
}
}
- if (ret) {
+ if (ret)
btrfs_get_block_group(ret);
- if (bytenr == 0 && info->first_logical_byte > ret->start)
- info->first_logical_byte = ret->start;
- }
- spin_unlock(&info->block_group_cache_lock);
+ read_unlock(&info->block_group_cache_lock);
return ret;
}
@@ -267,15 +263,15 @@ struct btrfs_block_group *btrfs_next_block_group(
struct btrfs_fs_info *fs_info = cache->fs_info;
struct rb_node *node;
- spin_lock(&fs_info->block_group_cache_lock);
+ read_lock(&fs_info->block_group_cache_lock);
/* If our block group was removed, we need a full search. */
if (RB_EMPTY_NODE(&cache->cache_node)) {
const u64 next_bytenr = cache->start + cache->length;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
- cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
- return cache;
+ return btrfs_lookup_first_block_group(fs_info, next_bytenr);
}
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
@@ -284,46 +280,70 @@ struct btrfs_block_group *btrfs_next_block_group(
btrfs_get_block_group(cache);
} else
cache = NULL;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
return cache;
}
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/**
+ * Check if we can do a NOCOW write for a given extent.
+ *
+ * @fs_info: The filesystem information object.
+ * @bytenr: Logical start address of the extent.
+ *
+ * Check if we can do a NOCOW write for the given extent, and increment the
+ * number of NOCOW writers in the block group that contains the extent, as long
+ * as the block group exists and it's currently not in read-only mode.
+ *
+ * Returns: A non-NULL block group pointer if we can do a NOCOW write; the caller
+ * is responsible for calling btrfs_dec_nocow_writers() later.
+ *
+ * Or NULL if we cannot do a NOCOW write
+ */
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr)
{
struct btrfs_block_group *bg;
- bool ret = true;
+ bool can_nocow = true;
bg = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg)
- return false;
+ return NULL;
spin_lock(&bg->lock);
if (bg->ro)
- ret = false;
+ can_nocow = false;
else
atomic_inc(&bg->nocow_writers);
spin_unlock(&bg->lock);
- /* No put on block group, done by btrfs_dec_nocow_writers */
- if (!ret)
+ if (!can_nocow) {
btrfs_put_block_group(bg);
+ return NULL;
+ }
- return ret;
+ /* No put on block group, done by btrfs_dec_nocow_writers(). */
+ return bg;
}
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/**
+ * Decrement the number of NOCOW writers in a block group.
+ *
+ * @bg: The block group.
+ *
+ * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
+ * and on the block group returned by that call. Typically this is called after
+ * creating an ordered extent for a NOCOW write, to prevent races with scrub and
+ * relocation.
+ *
+ * After this call, the caller should not use the block group anymore. If it wants
+ * to use it, then it should get a reference on it before calling this function.
+ */
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
- struct btrfs_block_group *bg;
-
- bg = btrfs_lookup_block_group(fs_info, bytenr);
- ASSERT(bg);
if (atomic_dec_and_test(&bg->nocow_writers))
wake_up_var(&bg->nocow_writers);
- /*
- * Once for our lookup and once for the lookup done by a previous call
- * to btrfs_inc_nocow_writers()
- */
- btrfs_put_block_group(bg);
+
+ /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
btrfs_put_block_group(bg);
}
@@ -772,10 +792,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
cache->has_caching_ctl = 1;
spin_unlock(&cache->lock);
- spin_lock(&fs_info->block_group_cache_lock);
+ write_lock(&fs_info->block_group_cache_lock);
refcount_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
btrfs_get_block_group(cache);
@@ -957,17 +977,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- spin_lock(&fs_info->block_group_cache_lock);
- rb_erase(&block_group->cache_node,
- &fs_info->block_group_cache_tree);
+ write_lock(&fs_info->block_group_cache_lock);
+ rb_erase_cached(&block_group->cache_node,
+ &fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
/* Once for the block groups rbtree */
btrfs_put_block_group(block_group);
- if (fs_info->first_logical_byte == block_group->start)
- fs_info->first_logical_byte = (u64)-1;
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
/*
@@ -992,7 +1010,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
if (block_group->has_caching_ctl) {
- spin_lock(&fs_info->block_group_cache_lock);
+ write_lock(&fs_info->block_group_cache_lock);
if (!caching_ctl) {
struct btrfs_caching_control *ctl;
@@ -1006,7 +1024,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
if (caching_ctl)
list_del_init(&caching_ctl->list);
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
if (caching_ctl) {
/* Once for the caching bgs list and once for us. */
btrfs_put_caching_control(caching_ctl);
@@ -1367,6 +1385,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
+ ret = btrfs_zone_finish(block_group);
+ if (ret < 0) {
+ btrfs_dec_block_group_ro(block_group);
+ if (ret == -EAGAIN)
+ ret = 0;
+ goto next;
+ }
+
/*
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
@@ -1512,6 +1538,13 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
return bg1->used > bg2->used;
}
+static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_is_zoned(fs_info))
+ return btrfs_zoned_should_reclaim(fs_info);
+ return true;
+}
+
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
@@ -1522,6 +1555,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ if (!btrfs_should_reclaim(fs_info))
+ return;
+
sb_start_write(fs_info->sb);
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
@@ -1692,35 +1728,13 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_root *root = btrfs_block_group_root(fs_info);
int ret;
struct btrfs_key found_key;
- struct extent_buffer *leaf;
- int slot;
-
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- return ret;
-
- while (1) {
- slot = path->slots[0];
- leaf = path->nodes[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
- break;
- }
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ btrfs_for_each_slot(root, key, &found_key, path, ret) {
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- ret = read_bg_from_eb(fs_info, &found_key, path);
- break;
+ return read_bg_from_eb(fs_info, &found_key, path);
}
-
- path->slots[0]++;
}
-out:
return ret;
}
@@ -3220,6 +3234,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
return ret;
}
+static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
+ u64 bytes_freed)
+{
+ const struct btrfs_space_info *space_info = bg->space_info;
+ const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+ const u64 new_val = bg->used;
+ const u64 old_val = new_val + bytes_freed;
+ u64 thresh;
+
+ if (reclaim_thresh == 0)
+ return false;
+
+ thresh = div_factor_fine(bg->length, reclaim_thresh);
+
+ /*
+ * If we were below the threshold before, don't reclaim: we are likely a
+ * brand new block group and we don't want to relocate new block groups.
+ */
+ if (old_val < thresh)
+ return false;
+ if (new_val >= thresh)
+ return false;
+ return true;
+}
+
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc)
{
@@ -3242,6 +3281,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&info->delalloc_root_lock);
while (total) {
+ bool reclaim;
+
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache) {
ret = -ENOENT;
@@ -3287,6 +3328,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
cache->space_info, num_bytes);
cache->space_info->bytes_used -= num_bytes;
cache->space_info->disk_used -= num_bytes * factor;
+
+ reclaim = should_reclaim_block_group(cache, num_bytes);
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
@@ -3313,6 +3356,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
if (!alloc && old_val == 0) {
if (!btrfs_test_opt(info, DISCARD_ASYNC))
btrfs_mark_bg_unused(cache);
+ } else if (!alloc && reclaim) {
+ btrfs_mark_bg_to_reclaim(cache);
}
btrfs_put_block_group(cache);
@@ -3957,14 +4002,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
- spin_lock(&info->block_group_cache_lock);
+ write_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
}
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->unused_bgs)) {
@@ -3994,14 +4039,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->zone_active_bgs_lock);
- spin_lock(&info->block_group_cache_lock);
- while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+ write_lock(&info->block_group_cache_lock);
+ while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
cache_node);
- rb_erase(&block_group->cache_node,
- &info->block_group_cache_tree);
+ rb_erase_cached(&block_group->cache_node,
+ &info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
list_del(&block_group->list);
@@ -4024,9 +4069,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
ASSERT(block_group->swap_extents == 0);
btrfs_put_block_group(block_group);
- spin_lock(&info->block_group_cache_lock);
+ write_lock(&info->block_group_cache_lock);
}
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
btrfs_release_global_block_rsv(info);
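
The NOCOW helper rework above also changes the calling convention: btrfs_inc_nocow_writers() now returns the block group (or NULL when a NOCOW write is not possible), and btrfs_dec_nocow_writers() consumes that pointer instead of repeating the lookup. A minimal sketch of the intended pairing, with the surrounding caller logic elided:

    struct btrfs_block_group *bg;

    bg = btrfs_inc_nocow_writers(fs_info, disk_bytenr);
    if (!bg)
            return -EAGAIN;   /* e.g. fall back to a COW write */

    /* ... create the ordered extent for the NOCOW write ... */

    /* Drops both the nocow counter and the lookup reference. */
    btrfs_dec_nocow_writers(bg);
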
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index e8308f2ad07d..3ac668ace50a 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -212,6 +212,8 @@ struct btrfs_block_group {
u64 meta_write_pointer;
struct map_lookup *physical_map;
struct list_head active_bg_list;
+ struct work_struct zone_finish_work;
+ struct extent_buffer *last_eb;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -254,8 +256,9 @@ void btrfs_put_block_group(struct btrfs_block_group *cache);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr);
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 32131a5d321b..33811e896623 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -395,31 +395,6 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
return true;
}
-struct btrfs_dio_private {
- struct inode *inode;
-
- /*
- * Since DIO can use anonymous page, we cannot use page_offset() to
- * grab the file offset, thus need a dedicated member for file offset.
- */
- u64 file_offset;
- u64 disk_bytenr;
- /* Used for bio::bi_size */
- u32 bytes;
-
- /*
- * References to this structure. There is one reference per in-flight
- * bio plus one while we're still setting up.
- */
- refcount_t refs;
-
- /* dio_bio came from fs/direct-io.c */
- struct bio *dio_bio;
-
- /* Array of checksums */
- u8 csums[];
-};
-
/*
* btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
* separate u32s. These two functions convert between the two representations.
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index abac86a75840..5d20137b7b67 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1552,21 +1552,18 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
- for (i = 0; i < num_pages; i++) {
- block_ctx->pagev[i] = alloc_page(GFP_NOFS);
- if (!block_ctx->pagev[i])
- return -1;
- }
+ ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev);
+ if (ret)
+ return ret;
dev_bytenr = block_ctx->dev_bytenr;
for (i = 0; i < num_pages;) {
struct bio *bio;
unsigned int j;
- bio = btrfs_bio_alloc(num_pages - i);
- bio_set_dev(bio, block_ctx->dev->bdev);
+ bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
+ REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = dev_bytenr >> 9;
- bio->bi_opf = REQ_OP_READ;
for (j = i; j < num_pages; j++) {
ret = bio_add_page(bio, block_ctx->pagev[j],
@@ -2033,7 +2030,7 @@ continue_loop:
static void btrfsic_bio_end_io(struct bio *bp)
{
- struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+ struct btrfsic_block *block = bp->bi_private;
int iodone_w_error;
/* mutex is not held! This is not safe if IO is not yet completed
@@ -2635,100 +2632,93 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
&btrfsic_dev_state_hashtable);
}
-static void __btrfsic_submit_bio(struct bio *bio)
+static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
{
- struct btrfsic_dev_state *dev_state;
+ unsigned int segs = bio_segments(bio);
+ u64 dev_bytenr = 512 * bio->bi_iter.bi_sector;
+ u64 cur_bytenr = dev_bytenr;
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+ char **mapped_datav;
+ int bio_is_patched = 0;
+ int i = 0;
+
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info(
+"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, segs,
+ bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
- if (!btrfsic_is_initialized)
+ mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS);
+ if (!mapped_datav)
return;
- mutex_lock(&btrfsic_mutex);
- /* since btrfsic_submit_bio() is also called before
- * btrfsic_mount(), this might return NULL */
- dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
- if (NULL != dev_state &&
- (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
- int i = 0;
- u64 dev_bytenr;
- u64 cur_bytenr;
- struct bio_vec bvec;
- struct bvec_iter iter;
- int bio_is_patched;
- char **mapped_datav;
- unsigned int segs = bio_segments(bio);
-
- dev_bytenr = 512 * bio->bi_iter.bi_sector;
- bio_is_patched = 0;
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
- bio_op(bio), bio->bi_opf, segs,
- bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
-
- mapped_datav = kmalloc_array(segs,
- sizeof(*mapped_datav), GFP_NOFS);
- if (!mapped_datav)
- goto leave;
- cur_bytenr = dev_bytenr;
-
- bio_for_each_segment(bvec, bio, iter) {
- BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = page_address(bvec.bv_page);
- i++;
-
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
- pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
- i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
- cur_bytenr += bvec.bv_len;
- }
- btrfsic_process_written_block(dev_state, dev_bytenr,
- mapped_datav, segs,
- bio, &bio_is_patched,
- bio->bi_opf);
- kfree(mapped_datav);
- } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_bdev);
- if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
- if ((dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info(
-"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->bdev);
- } else {
- struct btrfsic_block *const block =
- &dev_state->dummy_block_for_bio_bh_flush;
+ bio_for_each_segment(bvec, bio, iter) {
+ BUG_ON(bvec.bv_len != PAGE_SIZE);
+ mapped_datav[i] = page_address(bvec.bv_page);
+ i++;
- block->is_iodone = 0;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = bio->bi_opf;
- block->orig_bio_private = bio->bi_private;
- block->orig_bio_end_io = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- }
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
+ pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
+ i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
+ cur_bytenr += bvec.bv_len;
}
-leave:
- mutex_unlock(&btrfsic_mutex);
+
+ btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs,
+ bio, &bio_is_patched, bio->bi_opf);
+ kfree(mapped_datav);
}
-void btrfsic_submit_bio(struct bio *bio)
+static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
{
- __btrfsic_submit_bio(bio);
- submit_bio(bio);
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, bio->bi_bdev);
+
+ if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = bio->bi_opf;
+ block->orig_bio_private = bio->bi_private;
+ block->orig_bio_end_io = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ } else if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE))) {
+ pr_info(
+"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
+ dev_state->bdev);
+ }
}
-int btrfsic_submit_bio_wait(struct bio *bio)
+void btrfsic_check_bio(struct bio *bio)
{
- __btrfsic_submit_bio(bio);
- return submit_bio_wait(bio);
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ /*
+ * We can be called before btrfsic_mount, so there might not be a
+ * dev_state.
+ */
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
+ mutex_lock(&btrfsic_mutex);
+ if (dev_state) {
+ if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio))
+ btrfsic_check_write_bio(bio, dev_state);
+ else if (bio->bi_opf & REQ_PREFLUSH)
+ btrfsic_check_flush_bio(bio, dev_state);
+ }
+ mutex_unlock(&btrfsic_mutex);
}
int btrfsic_mount(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index bcc730a06cb5..e4c8aed7996f 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -7,11 +7,9 @@
#define BTRFS_CHECK_INTEGRITY_H
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-void btrfsic_submit_bio(struct bio *bio);
-int btrfsic_submit_bio_wait(struct bio *bio);
+void btrfsic_check_bio(struct bio *bio);
#else
-#define btrfsic_submit_bio submit_bio
-#define btrfsic_submit_bio_wait submit_bio_wait
+static inline void btrfsic_check_bio(struct bio *bio) { }
#endif
int btrfsic_mount(struct btrfs_fs_info *fs_info,
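
With the wrappers removed, the integrity checker becomes purely advisory: call sites pair btrfsic_check_bio() with a plain submit. This follows directly from the old wrappers shown above, which were just __btrfsic_submit_bio() plus submit_bio()/submit_bio_wait(); the converted call sites (in other files of this series) look like:

    /* was: btrfsic_submit_bio(bio); */
    btrfsic_check_bio(bio);
    submit_bio(bio);

    /* was: ret = btrfsic_submit_bio_wait(bio); */
    btrfsic_check_bio(bio);
    ret = submit_bio_wait(bio);
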
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 19bf36d8ffea..f4564f32f6d9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -425,7 +425,6 @@ out:
}
static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
- struct compressed_bio *cb,
struct bio *bio, int mirror_num)
{
blk_status_t ret;
@@ -604,7 +603,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
goto finish_cb;
}
- ret = submit_compressed_bio(fs_info, cb, bio, 0);
+ ret = submit_compressed_bio(fs_info, bio, 0);
if (ret)
goto finish_cb;
bio = NULL;
@@ -802,15 +801,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* After the compressed pages are read, we copy the bytes into the
* bio we were passed and then call the bio end_io calls
*/
-blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
unsigned int compressed_len;
- unsigned int nr_pages;
- unsigned int pg_index;
struct bio *comp_bio = NULL;
const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 cur_disk_byte = disk_bytenr;
@@ -820,7 +817,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_start;
struct extent_map *em;
blk_status_t ret;
- int faili = 0;
+ int ret2;
+ int i;
u8 *sums;
em_tree = &BTRFS_I(inode)->extent_tree;
@@ -855,32 +853,26 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
em_len = em->len;
em_start = em->start;
- free_extent_map(em);
- em = NULL;
-
cb->len = bio->bi_iter.bi_size;
cb->compressed_len = compressed_len;
- cb->compress_type = extent_compress_type(bio_flags);
+ cb->compress_type = em->compress_type;
cb->orig_bio = bio;
- nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
- cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
- GFP_NOFS);
+ free_extent_map(em);
+ em = NULL;
+
+ cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
+ cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
if (!cb->compressed_pages) {
ret = BLK_STS_RESOURCE;
- goto fail1;
+ goto fail;
}
- for (pg_index = 0; pg_index < nr_pages; pg_index++) {
- cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
- if (!cb->compressed_pages[pg_index]) {
- faili = pg_index - 1;
- ret = BLK_STS_RESOURCE;
- goto fail2;
- }
+ ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
+ if (ret2) {
+ ret = BLK_STS_RESOURCE;
+ goto fail;
}
- faili = nr_pages - 1;
- cb->nr_pages = nr_pages;
add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -949,28 +941,29 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
fs_info->sectorsize);
sums += fs_info->csum_size * nr_sectors;
- ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
+ ret = submit_compressed_bio(fs_info, comp_bio, mirror_num);
if (ret)
goto finish_cb;
comp_bio = NULL;
}
}
- return BLK_STS_OK;
+ return;
-fail2:
- while (faili >= 0) {
- __free_page(cb->compressed_pages[faili]);
- faili--;
+fail:
+ if (cb->compressed_pages) {
+ for (i = 0; i < cb->nr_pages; i++) {
+ if (cb->compressed_pages[i])
+ __free_page(cb->compressed_pages[i]);
+ }
}
kfree(cb->compressed_pages);
-fail1:
kfree(cb);
out:
free_extent_map(em);
bio->bi_status = ret;
bio_endio(bio);
- return ret;
+ return;
finish_cb:
if (comp_bio) {
comp_bio->bi_status = ret;
@@ -978,7 +971,7 @@ finish_cb:
}
/* All bytes of @cb are submitted, endio will free @cb */
if (cur_disk_byte == disk_bytenr + compressed_len)
- return ret;
+ return;
wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
(disk_bytenr + compressed_len - cur_disk_byte) >>
@@ -990,7 +983,6 @@ finish_cb:
ASSERT(refcount_read(&cb->pending_sectors));
/* Now we are the only one referring @cb, can finish it safely. */
finish_compressed_bio_read(cb);
- return ret;
}
/*
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index ac5b20731d2a..2707404389a5 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -102,8 +102,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned int write_flags,
struct cgroup_subsys_state *blkcg_css,
bool writeback);
-blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
+void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0eecf98d0abb..6e556031a8f3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -16,6 +16,7 @@
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
+#include "tree-checker.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -342,7 +343,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
int level = btrfs_header_level(buf);
ret = btrfs_set_disk_extent_flags(trans, buf,
- new_flags, level, 0);
+ new_flags, level);
if (ret)
return ret;
}
@@ -1390,12 +1391,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
}
/*
- * helper function for btrfs_search_slot. The goal is to find a block
- * in cache without setting the path to blocking. If we find the block
- * we return zero and the path is unchanged.
+ * Helper function for btrfs_search_slot() and other functions that do a search
+ * on a btree. The goal is to find a tree block in the cache (the radix tree at
+ * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
+ * its pages from disk.
*
- * If we can't find the block, we set the path blocking and do some
- * reada. -EAGAIN is returned and the search must be repeated.
+ * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
+ * whole btree search, starting again from the current root node.
*/
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
@@ -1409,12 +1411,21 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
struct btrfs_key first_key;
int ret;
int parent_level;
+ bool unlock_up;
+ unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
blocknr = btrfs_node_blockptr(*eb_ret, slot);
gen = btrfs_node_ptr_generation(*eb_ret, slot);
parent_level = btrfs_header_level(*eb_ret);
btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
+ /*
+ * If we need to read an extent buffer from disk and we are holding locks
+ * on upper level nodes, we unlock all the upper nodes before reading the
+ * extent buffer, and then return -EAGAIN to the caller as it needs to
+ * restart the search. We don't release the lock on the current level
+ * because we need to walk this node to figure out which blocks to read.
+ */
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
if (p->reada == READA_FORWARD_ALWAYS)
@@ -1436,30 +1447,38 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
return 0;
}
+ if (unlock_up)
+ btrfs_unlock_up_safe(p, level + 1);
+
/* now we're allowed to do a blocking uptodate check */
- ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
if (ret) {
free_extent_buffer(tmp);
btrfs_release_path(p);
return -EIO;
}
- *eb_ret = tmp;
- return 0;
+ if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EUCLEAN;
+ }
+
+ if (unlock_up)
+ ret = -EAGAIN;
+
+ goto out;
}
- /*
- * reduce lock contention at high levels
- * of the btree by dropping locks before
- * we read. Don't release the lock on the current
- * level because we need to walk this node to figure
- * out which blocks to read.
- */
- btrfs_unlock_up_safe(p, level + 1);
+ if (unlock_up) {
+ btrfs_unlock_up_safe(p, level + 1);
+ ret = -EAGAIN;
+ } else {
+ ret = 0;
+ }
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
- ret = -EAGAIN;
tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
gen, parent_level - 1, &first_key);
if (IS_ERR(tmp)) {
@@ -1474,9 +1493,15 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
*/
if (!extent_buffer_uptodate(tmp))
ret = -EIO;
- free_extent_buffer(tmp);
- btrfs_release_path(p);
+out:
+ if (ret == 0) {
+ *eb_ret = tmp;
+ } else {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ }
+
return ret;
}
@@ -2279,6 +2304,43 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
return ret;
}
+/**
+ * Search for a valid slot for the given path.
+ *
+ * @root: The root node of the tree.
+ * @key: Will contain a valid item if found.
+ * @path: The starting point to validate the slot.
+ *
+ * Return: 0 if the item is valid
+ * 1 if not found
+ * <0 if error.
+ */
+int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ while (1) {
+ int ret;
+ const int slot = path->slots[0];
+ const struct extent_buffer *leaf = path->nodes[0];
+
+ /* This is where we start walking the path. */
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /*
+ * If we've reached the last slot in this leaf we need
+ * to go to the next leaf and reset the path.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ return ret;
+ continue;
+ }
+ /* Store the found, valid item in @key. */
+ btrfs_item_key_to_cpu(leaf, key, slot);
+ break;
+ }
+ return 0;
+}
+
/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node points to 'key'.
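
As the updated comment on read_block_for_search() above says, the function now returns -EAGAIN with the path unlocked whenever it had to drop upper-level locks before reading from disk, and the caller restarts the whole search. The retry shape at the btrfs_search_slot() call site is roughly the following (simplified; 'again' is the existing restart label in that function):

    err = read_block_for_search(root, p, &b, level, slot, key);
    if (err == -EAGAIN)
            goto again;   /* upper nodes were unlocked; redo from the root */
    if (err) {
            ret = err;
            goto done;
    }
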
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 077c95e9baa5..0e49b1a0c071 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -675,13 +675,13 @@ struct btrfs_fs_info {
rwlock_t global_root_lock;
struct rb_root global_root_tree;
- spinlock_t fs_roots_radix_lock;
- struct radix_tree_root fs_roots_radix;
+ /* The xarray that holds all the FS roots */
+ spinlock_t fs_roots_lock;
+ struct xarray fs_roots;
/* block group cache stuff */
- spinlock_t block_group_cache_lock;
- u64 first_logical_byte;
- struct rb_root block_group_cache_tree;
+ rwlock_t block_group_cache_lock;
+ struct rb_root_cached block_group_cache_tree;
/* keep track of unallocated space */
atomic64_t free_chunk_space;
@@ -848,12 +848,13 @@ struct btrfs_fs_info {
* two
*/
struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *hipri_workers;
struct btrfs_workqueue *delalloc_workers;
struct btrfs_workqueue *flush_workers;
struct btrfs_workqueue *endio_workers;
struct btrfs_workqueue *endio_meta_workers;
struct btrfs_workqueue *endio_raid56_workers;
- struct btrfs_workqueue *rmw_workers;
+ struct workqueue_struct *rmw_workers;
struct btrfs_workqueue *endio_meta_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
@@ -946,9 +947,9 @@ struct btrfs_fs_info {
* running.
*/
refcount_t scrub_workers_refcnt;
- struct btrfs_workqueue *scrub_workers;
- struct btrfs_workqueue *scrub_wr_completion_workers;
- struct btrfs_workqueue *scrub_parity_workers;
+ struct workqueue_struct *scrub_workers;
+ struct workqueue_struct *scrub_wr_completion_workers;
+ struct workqueue_struct *scrub_parity_workers;
struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
@@ -994,10 +995,10 @@ struct btrfs_fs_info {
struct btrfs_delayed_root *delayed_root;
- /* Extent buffer radix tree */
+ /* Extent buffer xarray */
spinlock_t buffer_lock;
/* Entries are eb->start / sectorsize */
- struct radix_tree_root buffer_radix;
+ struct xarray extent_buffers;
/* next backup root to be overwritten */
int backup_root_index;
@@ -1045,10 +1046,7 @@ struct btrfs_fs_info {
* Zone size > 0 when in ZONED mode, otherwise it's used for a check
* if the mode is enabled
*/
- union {
- u64 zone_size;
- u64 zoned;
- };
+ u64 zone_size;
struct mutex zoned_meta_io_lock;
spinlock_t treelog_bg_lock;
@@ -1121,7 +1119,8 @@ enum {
*/
BTRFS_ROOT_SHAREABLE,
BTRFS_ROOT_TRACK_DIRTY,
- BTRFS_ROOT_IN_RADIX,
+ /* The root is tracked in fs_info::fs_roots */
+ BTRFS_ROOT_REGISTERED,
BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
BTRFS_ROOT_DEFRAG_RUNNING,
BTRFS_ROOT_FORCE_COW,
@@ -1225,10 +1224,10 @@ struct btrfs_root {
struct rb_root inode_tree;
/*
- * radix tree that keeps track of delayed nodes of every inode,
- * protected by inode_lock
+ * Xarray that keeps track of delayed nodes of every inode, protected
+ * by inode_lock
*/
- struct radix_tree_root delayed_nodes_tree;
+ struct xarray delayed_nodes;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
@@ -2784,7 +2783,8 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr, bool strict);
+ u64 objectid, u64 offset, u64 bytenr, bool strict,
+ struct btrfs_path *path);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@@ -2811,8 +2811,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- struct extent_buffer *eb, u64 flags,
- int level, int is_data);
+ struct extent_buffer *eb, u64 flags, int level);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
@@ -2892,7 +2891,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
- u64 disk_num_bytes);
+ u64 disk_num_bytes, bool noflush);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end);
@@ -3039,6 +3038,35 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
struct btrfs_path *path);
+int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path);
+
+/*
+ * Search in @root for a given @key, and store the slot found in @found_key.
+ *
+ * @root: The root node of the tree.
+ * @key: The key we are looking for.
+ * @found_key: Will hold the found item.
+ * @path: Holds the current slot/leaf.
+ * @iter_ret: Contains the value returned from btrfs_search_slot or
+ * btrfs_get_next_valid_item, whichever was executed last.
+ *
+ * The @iter_ret is an output variable that will contain the return value of
+ * btrfs_search_slot, if it encountered an error, or the value returned from
+ * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid
+ * slot was found, 1 if there were no more leaves, and <0 if there was an error.
+ *
+ * It's recommended to use a separate variable for iter_ret and then use it to
+ * set the function return value so there's no confusion of the 0/1/errno
+ * values stemming from btrfs_search_slot.
+ */
+#define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \
+ for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \
+ (iter_ret) >= 0 && \
+ (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \
+ (path)->slots[0]++ \
+ )
+
static inline int btrfs_next_old_item(struct btrfs_root *root,
struct btrfs_path *p, u64 time_seq)
{
@@ -3190,7 +3218,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
/* file-item.c */
-struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
@@ -3224,8 +3251,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
-blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
+void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type);
unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 bio_offset, struct page *page,
u64 start, u64 end);
@@ -3255,10 +3282,28 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
-int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- struct user_namespace *mnt_userns);
+struct btrfs_new_inode_args {
+ /* Input */
+ struct inode *dir;
+ struct dentry *dentry;
+ struct inode *inode;
+ bool orphan;
+ bool subvol;
+
+ /*
+ * Output from btrfs_new_inode_prepare(), input to
+ * btrfs_create_new_inode().
+ */
+ struct posix_acl *default_acl;
+ struct posix_acl *acl;
+};
+int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
+ unsigned int *trans_num_items);
+int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args);
+void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
+struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
+ struct inode *dir);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
unsigned *bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
@@ -3269,7 +3314,6 @@ void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
-int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
struct inode *btrfs_alloc_inode(struct super_block *sb);
@@ -3314,9 +3358,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
+ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before);
+
extern const struct dentry_operations btrfs_dentry_operations;
-extern const struct iomap_ops btrfs_dio_iomap_ops;
-extern const struct iomap_dio_ops btrfs_dio_ops;
/* Inode locking type flags, by default the exclusive lock is taken */
#define BTRFS_ILOCK_SHARED (1U << 0)
@@ -3328,6 +3372,7 @@ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
void btrfs_update_inode_bytes(struct btrfs_inode *inode,
const u64 add_bytes,
const u64 del_bytes);
+void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3403,11 +3448,29 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
}
-#ifdef CONFIG_PRINTK
+#ifdef CONFIG_PRINTK_INDEX
+
+#define btrfs_printk(fs_info, fmt, args...) \
+do { \
+ printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \
+ _btrfs_printk(fs_info, fmt, ##args); \
+} while (0)
+
__printf(2, 3)
__cold
-void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
+#elif defined(CONFIG_PRINTK)
+
+#define btrfs_printk(fs_info, fmt, args...) \
+ _btrfs_printk(fs_info, fmt, ##args)
+
+__printf(2, 3)
+__cold
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
#else
+
#define btrfs_printk(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, fmt, ##args)
#endif
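The three-way #ifdef split above supports the new printk message index: with CONFIG_PRINTK_INDEX=y the macro layer records the full format string, prefix included, in the index, while the out-of-line _btrfs_printk() keeps doing the formatting in both configurations. The same pattern reduced to its essentials (all names below are illustrative, not part of the patch):

    #ifdef CONFIG_PRINTK_INDEX
    #define subsys_printk(ctx, fmt, args...)                        \
    do {                                                            \
            /* record "prefix: " + fmt in the printk index */       \
            printk_index_subsys_emit("prefix: ", NULL, fmt);        \
            _subsys_printk(ctx, fmt, ##args);                       \
    } while (0)
    #else
    #define subsys_printk(ctx, fmt, args...) _subsys_printk(ctx, fmt, ##args)
    #endif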
@@ -3658,12 +3721,25 @@ do { \
__LINE__, (errno)); \
} while (0)
+#ifdef CONFIG_PRINTK_INDEX
+
#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
-do { \
- __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args); \
+do { \
+ printk_index_subsys_emit( \
+ "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \
+ KERN_CRIT, fmt); \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args); \
} while (0)
+#else
+
+#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args)
+
+#endif
+
#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
&(fs_info)->fs_state)))
#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
@@ -3816,15 +3892,16 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir);
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type);
#else
#define btrfs_get_acl NULL
#define btrfs_set_acl NULL
-static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
+static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct posix_acl *acl,
+ int type)
{
- return 0;
+ return -EOPNOTSUPP;
}
#endif
@@ -3929,7 +4006,7 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
{
- return fs_info->zoned != 0;
+ return fs_info->zone_size > 0;
}
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index bd8267c4687d..36ab0859a263 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -289,7 +289,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
}
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
- u64 disk_num_bytes)
+ u64 disk_num_bytes, bool noflush)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -308,7 +308,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* If we have a transaction open (can happen if we call truncate_block
* from truncate), then we need FLUSH_LIMIT so we don't deadlock.
*/
- if (btrfs_is_free_space_inode(inode)) {
+ if (noflush || btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
} else {
if (current->journal_info)
@@ -333,7 +333,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
*/
calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
&meta_reserve, &qgroup_reserve);
- ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
+ noflush);
if (ret)
return ret;
ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
@@ -456,7 +457,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
- ret = btrfs_delalloc_reserve_metadata(inode, len, len);
+ ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
if (ret < 0) {
btrfs_free_reserved_data_space(inode, *reserved, start, len);
extent_changeset_free(*reserved);
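The new noflush argument forces BTRFS_RESERVE_NO_FLUSH, which is what lets nowait direct I/O writes reserve metadata without blocking on space flushing (the direct I/O throughput item in the merge summary). A hypothetical nowait caller might look like the following; the -EAGAIN mapping is an assumption for illustration, not taken from the patch:

    ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, true);
    if (ret < 0)
            return -EAGAIN;  /* punt to the blocking path instead of flushing here */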
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 748bf6b0d860..66779ab3ed4a 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -78,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
}
spin_lock(&root->inode_lock);
- node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+ node = xa_load(&root->delayed_nodes, ino);
if (node) {
if (btrfs_inode->delayed_node) {
@@ -90,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
/*
* It's possible that we're racing into the middle of removing
- * this node from the radix tree. In this case, the refcount
+ * this node from the xarray. In this case, the refcount
* was zero and it should never go back to one. Just return
- * NULL like it was never in the radix at all; our release
+ * NULL like it was never in the xarray at all; our release
* function is in the process of removing it.
*
* Some implementations of refcount_inc refuse to bump the
@@ -100,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* here, refcount_inc() may decide to just WARN_ONCE() instead
* of actually bumping the refcount.
*
- * If this node is properly in the radix, we want to bump the
+ * If this node is properly in the xarray, we want to bump the
* refcount twice, once for the inode and once for this get
* operation.
*/
@@ -128,36 +128,30 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
u64 ino = btrfs_ino(btrfs_inode);
int ret;
-again:
- node = btrfs_get_delayed_node(btrfs_inode);
- if (node)
- return node;
+ do {
+ node = btrfs_get_delayed_node(btrfs_inode);
+ if (node)
+ return node;
- node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
- if (!node)
- return ERR_PTR(-ENOMEM);
- btrfs_init_delayed_node(node, root, ino);
+ node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+ btrfs_init_delayed_node(node, root, ino);
- /* cached in the btrfs inode and can be accessed */
- refcount_set(&node->refs, 2);
+ /* Cached in the inode and can be accessed */
+ refcount_set(&node->refs, 2);
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- kmem_cache_free(delayed_node_cache, node);
- return ERR_PTR(ret);
- }
-
- spin_lock(&root->inode_lock);
- ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
- if (ret == -EEXIST) {
- spin_unlock(&root->inode_lock);
- kmem_cache_free(delayed_node_cache, node);
- radix_tree_preload_end();
- goto again;
- }
+ spin_lock(&root->inode_lock);
+ ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS);
+ if (ret) {
+ spin_unlock(&root->inode_lock);
+ kmem_cache_free(delayed_node_cache, node);
+ if (ret != -EBUSY)
+ return ERR_PTR(ret);
+ }
+ } while (ret);
btrfs_inode->delayed_node = node;
spin_unlock(&root->inode_lock);
- radix_tree_preload_end();
return node;
}
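Note what the XArray conversion buys here: xa_insert() allocates its internal nodes itself (GFP_NOFS above), so the radix_tree_preload()/radix_tree_preload_end() pair is gone, and -EBUSY replaces -EEXIST as the lost-the-race signal. The generic shape of the loop, with obj, alloc_obj() and free_obj() as hypothetical stand-ins:

    #include <linux/xarray.h>

    struct my_obj *obj;
    int err;

    do {
            obj = xa_load(&xa, index);
            if (obj)
                    return obj;               /* another thread inserted first */
            obj = alloc_obj();
            err = xa_insert(&xa, index, obj, GFP_NOFS);
            if (err) {
                    free_obj(obj);
                    if (err != -EBUSY)        /* -EBUSY: concurrent insert, retry */
                            return ERR_PTR(err);
            }
    } while (err);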
@@ -276,8 +270,7 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
- radix_tree_delete(&root->delayed_nodes_tree,
- delayed_node->inode_id);
+ xa_erase(&root->delayed_nodes, delayed_node->inode_id);
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
@@ -1870,34 +1863,35 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
- u64 inode_id = 0;
+ unsigned long index = 0;
+ struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_node *delayed_nodes[8];
- int i, n;
while (1) {
+ int n = 0;
+
spin_lock(&root->inode_lock);
- n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
- (void **)delayed_nodes, inode_id,
- ARRAY_SIZE(delayed_nodes));
- if (!n) {
+ if (xa_empty(&root->delayed_nodes)) {
spin_unlock(&root->inode_lock);
- break;
+ return;
}
- inode_id = delayed_nodes[n - 1]->inode_id + 1;
- for (i = 0; i < n; i++) {
+ xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) {
/*
* Don't increase refs in case the node is dead and
* about to be removed from the tree in the loop below
*/
- if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
- delayed_nodes[i] = NULL;
+ if (refcount_inc_not_zero(&delayed_node->refs)) {
+ delayed_nodes[n] = delayed_node;
+ n++;
+ }
+ if (n >= ARRAY_SIZE(delayed_nodes))
+ break;
}
+ index++;
spin_unlock(&root->inode_lock);
- for (i = 0; i < n; i++) {
- if (!delayed_nodes[i])
- continue;
+ for (int i = 0; i < n; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
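Since killing a delayed node can sleep, it cannot happen under root->inode_lock; the loop above pins up to eight references per pass while locked, then processes them unlocked and resumes from the saved index. The batched-drain pattern in isolation (a sketch; obj, batch and release() are hypothetical):

    while (1) {
            int n = 0;

            spin_lock(&lock);
            if (xa_empty(&xa)) {
                    spin_unlock(&lock);
                    return;
            }
            xa_for_each_start(&xa, index, obj, index) {
                    if (refcount_inc_not_zero(&obj->refs))
                            batch[n++] = obj;
                    if (n >= ARRAY_SIZE(batch))
                            break;
            }
            index++;
            spin_unlock(&lock);

            for (int i = 0; i < n; i++)
                    release(batch[i]);        /* may sleep, now safe */
    }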
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 4176df149d04..99f37fca2e96 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -930,7 +930,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
- BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
@@ -1103,8 +1102,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
return -ENOMEM;
init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
- BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
- false);
+ BTRFS_UPDATE_DELAYED_HEAD, false, false);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 91a3aabad150..d6304b690ec4 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -58,7 +58,6 @@ struct btrfs_delayed_extent_op {
u8 level;
bool update_key;
bool update_flags;
- bool is_data;
u64 flags_to_set;
};
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f26202621989..a7dd6ba25e99 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -474,6 +474,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_block_group *cache;
struct btrfs_trans_handle *trans;
+ int iter_ret = 0;
int ret = 0;
u64 chunk_offset;
@@ -524,29 +525,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto free_path;
- if (ret > 0) {
- if (path->slots[0] >=
- btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto free_path;
- if (ret > 0) {
- ret = 0;
- goto free_path;
- }
- } else {
- ret = 0;
- }
- }
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
-
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != src_dev->devid)
break;
@@ -557,30 +537,23 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
if (found_key.offset < key.offset)
break;
- dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
+ dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!cache)
- goto skip;
+ continue;
spin_lock(&cache->lock);
cache->to_copy = 1;
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
-
-skip:
- ret = btrfs_next_item(root, path);
- if (ret != 0) {
- if (ret > 0)
- ret = 0;
- break;
- }
}
+ if (iter_ret < 0)
+ ret = iter_ret;
-free_path:
btrfs_free_path(path);
unlock:
mutex_unlock(&fs_info->chunk_mutex);
@@ -881,6 +854,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *tgt_device;
struct btrfs_device *src_device;
struct btrfs_root *root = fs_info->tree_root;
@@ -930,12 +904,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
WARN_ON(ret);
/* Prevent write_all_supers() during the finishing procedure */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
/* Prevent new chunks being allocated on the source device */
mutex_lock(&fs_info->chunk_mutex);
if (!list_empty(&src_device->post_commit_list)) {
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
mutex_unlock(&fs_info->chunk_mutex);
} else {
break;
@@ -972,7 +946,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
@@ -1001,8 +975,8 @@ error:
btrfs_assign_next_active_device(src_device, tgt_device);
- list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
- fs_info->fs_devices->rw_devices++;
+ list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
+ fs_devices->rw_devices++;
up_write(&dev_replace->rwsem);
btrfs_rm_dev_replace_blocked(fs_info);
@@ -1025,7 +999,7 @@ error:
* belong to this filesystem.
*/
mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
/* replace the sysfs entry */
btrfs_sysfs_remove_device(src_device);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 3b532bab0755..72fb2c518a2b 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -325,36 +325,15 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
struct btrfs_path *path, u64 dirid,
const char *name, int name_len)
{
- struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
- u32 nritems;
int ret;
key.objectid = dirid;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- return ERR_PTR(ret);
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
-
- while (1) {
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_for_each_slot(root, &key, &key, path, ret) {
if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
break;
@@ -362,10 +341,12 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
name, name_len);
if (di)
return di;
-
- path->slots[0]++;
}
- return NULL;
+ /* Adjust return code if the key was not found in the next leaf. */
+ if (ret > 0)
+ ret = 0;
+
+ return ERR_PTR(ret);
}
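The function keeps returning a pointer, but the error paths are now folded into it: a negative iterator result becomes ERR_PTR(ret), while not-found adjusts ret to 0 and so returns ERR_PTR(0), which is NULL. Callers are expected to separate the two cases roughly like this (sketch of the caller-side pattern):

    di = btrfs_search_dir_index_item(root, path, dir_ino, name, name_len);
    if (IS_ERR_OR_NULL(di)) {
            ret = di ? PTR_ERR(di) : -ENOENT;   /* NULL means "not found" */
            goto out;
    }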
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 84795d831282..14f8a90df321 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -5,7 +5,6 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
-#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
@@ -374,9 +373,9 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
* @level: expected level, mandatory check
* @first_key: expected key of first slot, skip check if NULL
*/
-static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
- u64 parent_transid, int level,
- struct btrfs_key *first_key)
+int btrfs_read_extent_buffer(struct extent_buffer *eb,
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
@@ -486,7 +485,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
fs_info->nodesize);
- /* A dirty eb shouldn't disappear from buffer_radix */
+ /* A dirty eb shouldn't disappear from extent_buffers */
if (WARN_ON(!eb))
return -EUCLEAN;
@@ -519,7 +518,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
u64 found_start;
struct extent_buffer *eb;
- if (fs_info->sectorsize < PAGE_SIZE)
+ if (fs_info->nodesize < PAGE_SIZE)
return csum_dirty_subpage_buffers(fs_info, bvec);
eb = (struct extent_buffer *)page->private;
@@ -704,7 +703,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
ASSERT(page->private);
- if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return validate_subpage_buffer(page, start, end, mirror);
eb = (struct extent_buffer *)page->private;
@@ -850,8 +849,7 @@ static void run_one_async_free(struct btrfs_work *work)
}
blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 dio_file_offset,
+ int mirror_num, u64 dio_file_offset,
extent_submit_bio_start_t *submit_bio_start)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@@ -874,9 +872,9 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
async->status = 0;
if (op_is_sync(bio->bi_opf))
- btrfs_set_work_high_priority(&async->work);
-
- btrfs_queue_work(fs_info->workers, &async->work);
+ btrfs_queue_work(fs_info->hipri_workers, &async->work);
+ else
+ btrfs_queue_work(fs_info->workers, &async->work);
return 0;
}
@@ -920,8 +918,7 @@ static bool should_async_write(struct btrfs_fs_info *fs_info,
return true;
}
-blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
blk_status_t ret;
@@ -933,31 +930,25 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
*/
ret = btrfs_bio_wq_end_io(fs_info, bio,
BTRFS_WQ_ENDIO_METADATA);
- if (ret)
- goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ if (!ret)
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else if (!should_async_write(fs_info, BTRFS_I(inode))) {
ret = btree_csum_one_bio(bio);
- if (ret)
- goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ if (!ret)
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else {
/*
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
- 0, btree_submit_bio_start);
+ btree_submit_bio_start);
}
- if (ret)
- goto out_w_error;
- return 0;
-
-out_w_error:
- bio->bi_status = ret;
- bio_endio(bio);
- return ret;
+ if (ret) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ }
}
#ifdef CONFIG_MIGRATION
@@ -1118,12 +1109,15 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
if (IS_ERR(buf))
return buf;
- ret = btree_read_extent_buffer_pages(buf, parent_transid,
- level, first_key);
+ ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
if (ret) {
free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
+ if (btrfs_check_eb_owner(buf, owner_root)) {
+ free_extent_buffer_stale(buf);
+ return ERR_PTR(-EUCLEAN);
+ }
return buf;
}
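This hunk wires up the new tree-checker item from the merge summary: every tree block read via read_tree_block() now has its header owner compared against the root it was read for. Conceptually the check amounts to the sketch below; the real btrfs_check_eb_owner() in tree-checker.c additionally handles shared subvolume trees and reloc trees:

    static int check_eb_owner_sketch(const struct extent_buffer *eb, u64 root_owner)
    {
            if (btrfs_header_owner(eb) != root_owner)
                    return -EUCLEAN;   /* likely corrupted or crafted metadata */
            return 0;
    }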
@@ -1164,7 +1158,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
- INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+ xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
btrfs_init_root_block_rsv(root);
@@ -1216,9 +1210,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&root->leak_list);
- spin_lock(&fs_info->fs_roots_radix_lock);
+ spin_lock(&fs_info->fs_roots_lock);
list_add_tail(&root->leak_list, &fs_info->allocated_roots);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
#endif
}
@@ -1563,6 +1557,23 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
ret = -EIO;
goto fail;
}
+
+ /*
+ * For a real fs (not a test dummy) and excluding log/reloc trees, the
+ * root owner must match the owner recorded in its root node.
+ */
+ if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
+ root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ root->root_key.objectid != btrfs_header_owner(root->node)) {
+ btrfs_crit(fs_info,
+"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
+ root->root_key.objectid, root->node->start,
+ btrfs_header_owner(root->node),
+ root->root_key.objectid);
+ ret = -EUCLEAN;
+ goto fail;
+ }
root->commit_root = btrfs_root_node(root);
return root;
fail:
@@ -1648,12 +1659,11 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
{
struct btrfs_root *root;
- spin_lock(&fs_info->fs_roots_radix_lock);
- root = radix_tree_lookup(&fs_info->fs_roots_radix,
- (unsigned long)root_id);
+ spin_lock(&fs_info->fs_roots_lock);
+ root = xa_load(&fs_info->fs_roots, (unsigned long)root_id);
if (root)
root = btrfs_grab_root(root);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
return root;
}
@@ -1695,20 +1705,14 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
{
int ret;
- ret = radix_tree_preload(GFP_NOFS);
- if (ret)
- return ret;
-
- spin_lock(&fs_info->fs_roots_radix_lock);
- ret = radix_tree_insert(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- root);
+ spin_lock(&fs_info->fs_roots_lock);
+ ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid,
+ root, GFP_NOFS);
if (ret == 0) {
btrfs_grab_root(root);
- set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
+ set_bit(BTRFS_ROOT_REGISTERED, &root->state);
}
- spin_unlock(&fs_info->fs_roots_radix_lock);
- radix_tree_preload_end();
+ spin_unlock(&fs_info->fs_roots_lock);
return ret;
}
@@ -1964,7 +1968,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
static int cleaner_kthread(void *arg)
{
- struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg;
+ struct btrfs_fs_info *fs_info = arg;
int again;
while (1) {
@@ -2266,10 +2270,12 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
btrfs_destroy_workqueue(fs_info->fixup_workers);
btrfs_destroy_workqueue(fs_info->delalloc_workers);
+ btrfs_destroy_workqueue(fs_info->hipri_workers);
btrfs_destroy_workqueue(fs_info->workers);
btrfs_destroy_workqueue(fs_info->endio_workers);
btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
- btrfs_destroy_workqueue(fs_info->rmw_workers);
+ if (fs_info->rmw_workers)
+ destroy_workqueue(fs_info->rmw_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
@@ -2336,9 +2342,9 @@ void btrfs_put_root(struct btrfs_root *root)
btrfs_drew_lock_destroy(&root->snapshot_lock);
free_root_extent_buffers(root);
#ifdef CONFIG_BTRFS_DEBUG
- spin_lock(&root->fs_info->fs_roots_radix_lock);
+ spin_lock(&root->fs_info->fs_roots_lock);
list_del_init(&root->leak_list);
- spin_unlock(&root->fs_info->fs_roots_radix_lock);
+ spin_unlock(&root->fs_info->fs_roots_lock);
#endif
kfree(root);
}
@@ -2346,28 +2352,21 @@ void btrfs_put_root(struct btrfs_root *root)
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
- int ret;
- struct btrfs_root *gang[8];
- int i;
+ struct btrfs_root *root;
+ unsigned long index = 0;
while (!list_empty(&fs_info->dead_roots)) {
- gang[0] = list_entry(fs_info->dead_roots.next,
- struct btrfs_root, root_list);
- list_del(&gang[0]->root_list);
+ root = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&root->root_list);
- if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
- btrfs_drop_and_free_fs_root(fs_info, gang[0]);
- btrfs_put_root(gang[0]);
+ if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
+ btrfs_drop_and_free_fs_root(fs_info, root);
+ btrfs_put_root(root);
}
- while (1) {
- ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
- (void **)gang, 0,
- ARRAY_SIZE(gang));
- if (!ret)
- break;
- for (i = 0; i < ret; i++)
- btrfs_drop_and_free_fs_root(fs_info, gang[i]);
+ xa_for_each(&fs_info->fs_roots, index, root) {
+ btrfs_drop_and_free_fs_root(fs_info, root);
}
}
@@ -2444,7 +2443,9 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
fs_info->workers =
- btrfs_alloc_workqueue(fs_info, "worker",
+ btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
+ fs_info->hipri_workers =
+ btrfs_alloc_workqueue(fs_info, "worker-high",
flags | WQ_HIGHPRI, max_active, 16);
fs_info->delalloc_workers =
@@ -2476,8 +2477,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->endio_raid56_workers =
btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
max_active, 4);
- fs_info->rmw_workers =
- btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
+ fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
max_active, 2);
@@ -2492,8 +2492,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->discard_ctl.discard_workers =
alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
- if (!(fs_info->workers && fs_info->delalloc_workers &&
- fs_info->flush_workers &&
+ if (!(fs_info->workers && fs_info->hipri_workers &&
+ fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->endio_meta_write_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
@@ -2815,12 +2815,14 @@ static int validate_super(struct btrfs_fs_info *fs_info,
}
/*
- * For 4K page size, we only support 4K sector size.
- * For 64K page size, we support 64K and 4K sector sizes.
+ * We only support at most two sectorsizes: 4K and PAGE_SIZE.
+ *
+ * We can support a 16K sectorsize with a 64K page size without problems,
+ * but such a sectorsize/pagesize combination doesn't make much sense.
+ * 4K will be our future standard; PAGE_SIZE has been supported from the
+ * very beginning.
*/
- if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
- (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
- sectorsize != SZ_64K))) {
+ if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
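The rewritten predicate accepts exactly two sector sizes on any architecture, 4K and PAGE_SIZE, which is the validation side of the PAGE_SIZE > 4K subpage support. Restated as a positive test (an illustrative helper, not part of the patch):

    static bool sectorsize_supported(u64 sectorsize)
    {
            /* e.g. with 16K pages, 4K (subpage) and 16K both pass */
            return sectorsize <= PAGE_SIZE &&
                   (sectorsize == SZ_4K || sectorsize == PAGE_SIZE);
    }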
@@ -3132,8 +3134,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
- INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+ xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC);
+ xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -3141,7 +3143,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
- spin_lock_init(&fs_info->fs_roots_radix_lock);
+ spin_lock_init(&fs_info->fs_roots_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->super_lock);
@@ -3209,9 +3211,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
- spin_lock_init(&fs_info->block_group_cache_lock);
- fs_info->block_group_cache_tree = RB_ROOT;
- fs_info->first_logical_byte = (u64)-1;
+ rwlock_init(&fs_info->block_group_cache_lock);
+ fs_info->block_group_cache_tree = RB_ROOT_CACHED;
extent_io_tree_init(fs_info, &fs_info->excluded_extents,
IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
@@ -3295,7 +3296,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
static int btrfs_uuid_rescan_kthread(void *data)
{
- struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
+ struct btrfs_fs_info *fs_info = data;
int ret;
/*
@@ -3373,7 +3374,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
/*
* btrfs_find_orphan_roots() is responsible for finding all the dead
* roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
- * them into the fs_info->fs_roots_radix tree. This must be done before
+ * them into the fs_info->fs_roots xarray. This must be done before
* calling btrfs_orphan_cleanup() on the tree root. If we don't do it
* first, then btrfs_orphan_cleanup() will delete a dead root's orphan
* item before the root's tree is deleted - this means that if we unmount
@@ -3611,7 +3612,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
~BTRFS_FEATURE_INCOMPAT_SUPP;
if (features) {
btrfs_err(fs_info,
- "cannot mount because of unsupported optional features (%llx)",
+ "cannot mount because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_alloc;
@@ -3649,7 +3650,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
~BTRFS_FEATURE_COMPAT_RO_SUPP;
if (!sb_rdonly(sb) && features) {
btrfs_err(fs_info,
- "cannot mount read-write because of unsupported optional features (%llx)",
+ "cannot mount read-write because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_alloc;
@@ -3672,14 +3673,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
- if (btrfs_super_incompat_flags(fs_info->super_copy) &
- BTRFS_FEATURE_INCOMPAT_RAID56) {
- btrfs_err(fs_info,
- "RAID56 is not yet supported for sector size %u with page size %lu",
- sectorsize, PAGE_SIZE);
- err = -EINVAL;
- goto fail_alloc;
- }
subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
if (!subpage_info)
goto fail_alloc;
@@ -4157,7 +4150,8 @@ static int write_dev_supers(struct btrfs_device *device,
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
bio->bi_opf |= REQ_FUA;
- btrfsic_submit_bio(bio);
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
if (btrfs_advance_sb_log(device, i))
errors++;
@@ -4271,7 +4265,8 @@ static void write_dev_flush(struct btrfs_device *device)
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
- btrfsic_submit_bio(bio);
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}
@@ -4504,12 +4499,11 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
{
bool drop_ref = false;
- spin_lock(&fs_info->fs_roots_radix_lock);
- radix_tree_delete(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid);
- if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
+ spin_lock(&fs_info->fs_roots_lock);
+ xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid);
+ if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state))
drop_ref = true;
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
if (BTRFS_FS_ERROR(fs_info)) {
ASSERT(root->log_root == NULL);
@@ -4525,50 +4519,48 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
- u64 root_objectid = 0;
- struct btrfs_root *gang[8];
- int i = 0;
+ struct btrfs_root *roots[8];
+ unsigned long index = 0;
+ int i;
int err = 0;
- unsigned int ret = 0;
+ int grabbed;
while (1) {
- spin_lock(&fs_info->fs_roots_radix_lock);
- ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
- (void **)gang, root_objectid,
- ARRAY_SIZE(gang));
- if (!ret) {
- spin_unlock(&fs_info->fs_roots_radix_lock);
- break;
+ struct btrfs_root *root;
+
+ spin_lock(&fs_info->fs_roots_lock);
+ if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) {
+ spin_unlock(&fs_info->fs_roots_lock);
+ return err;
}
- root_objectid = gang[ret - 1]->root_key.objectid + 1;
- for (i = 0; i < ret; i++) {
- /* Avoid to grab roots in dead_roots */
- if (btrfs_root_refs(&gang[i]->root_item) == 0) {
- gang[i] = NULL;
- continue;
- }
- /* grab all the search result for later use */
- gang[i] = btrfs_grab_root(gang[i]);
+ grabbed = 0;
+ xa_for_each_start(&fs_info->fs_roots, index, root, index) {
+ /* Avoid grabbing roots in dead_roots */
+ if (btrfs_root_refs(&root->root_item) > 0)
+ roots[grabbed++] = btrfs_grab_root(root);
+ if (grabbed >= ARRAY_SIZE(roots))
+ break;
}
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
- for (i = 0; i < ret; i++) {
- if (!gang[i])
+ for (i = 0; i < grabbed; i++) {
+ if (!roots[i])
continue;
- root_objectid = gang[i]->root_key.objectid;
- err = btrfs_orphan_cleanup(gang[i]);
+ index = roots[i]->root_key.objectid;
+ err = btrfs_orphan_cleanup(roots[i]);
if (err)
- break;
- btrfs_put_root(gang[i]);
+ goto out;
+ btrfs_put_root(roots[i]);
}
- root_objectid++;
+ index++;
}
- /* release the uncleaned roots due to error */
- for (; i < ret; i++) {
- if (gang[i])
- btrfs_put_root(gang[i]);
+out:
+ /* Release the roots that remain uncleaned due to error */
+ for (; i < grabbed; i++) {
+ if (roots[i])
+ btrfs_put_root(roots[i]);
}
return err;
}
@@ -4863,13 +4855,6 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
__btrfs_btree_balance_dirty(fs_info, 0);
}
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
- struct btrfs_key *first_key)
-{
- return btree_read_extent_buffer_pages(buf, parent_transid,
- level, first_key);
-}
-
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
/* cleanup FS via transaction */
@@ -4885,31 +4870,28 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *gang[8];
- u64 root_objectid = 0;
- int ret;
-
- spin_lock(&fs_info->fs_roots_radix_lock);
- while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
- (void **)gang, root_objectid,
- ARRAY_SIZE(gang))) != 0) {
- int i;
+ unsigned long index = 0;
+ int grabbed = 0;
+ struct btrfs_root *roots[8];
- for (i = 0; i < ret; i++)
- gang[i] = btrfs_grab_root(gang[i]);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_lock(&fs_info->fs_roots_lock);
+ while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index,
+ ULONG_MAX, 8, XA_PRESENT))) {
+ for (int i = 0; i < grabbed; i++)
+ roots[i] = btrfs_grab_root(roots[i]);
+ spin_unlock(&fs_info->fs_roots_lock);
- for (i = 0; i < ret; i++) {
- if (!gang[i])
+ for (int i = 0; i < grabbed; i++) {
+ if (!roots[i])
continue;
- root_objectid = gang[i]->root_key.objectid;
- btrfs_free_log(NULL, gang[i]);
- btrfs_put_root(gang[i]);
+ index = roots[i]->root_key.objectid;
+ btrfs_free_log(NULL, roots[i]);
+ btrfs_put_root(roots[i]);
}
- root_objectid++;
- spin_lock(&fs_info->fs_roots_radix_lock);
+ index++;
+ spin_lock(&fs_info->fs_roots_lock);
}
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
btrfs_free_log_root_tree(NULL, fs_info);
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2e10514ecda8..4ee8c42c9f78 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,8 +87,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
-blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
+void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -120,13 +119,12 @@ void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
- struct btrfs_key *first_key);
+int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
+ int level, struct btrfs_key *first_key);
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata);
blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 dio_file_offset,
+ int mirror_num, u64 dio_file_offset,
extent_submit_bio_start_t *submit_bio_start);
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6260784e74b5..0867c5cd6e01 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -895,7 +895,13 @@ again:
err = -ENOENT;
while (1) {
if (ptr >= end) {
- WARN_ON(ptr > end);
+ if (ptr > end) {
+ err = -EUCLEAN;
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_crit(fs_info,
+"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
+ path->slots[0], root_objectid, owner, offset, parent);
+ }
break;
}
iref = (struct btrfs_extent_inline_ref *)ptr;
@@ -1577,12 +1583,12 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
u32 item_size;
int ret;
int err = 0;
- int metadata = !extent_op->is_data;
+ int metadata = 1;
if (TRANS_ABORTED(trans))
return 0;
- if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
metadata = 0;
path = btrfs_alloc_path();
@@ -2180,7 +2186,7 @@ out:
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags,
- int level, int is_data)
+ int level)
{
struct btrfs_delayed_extent_op *extent_op;
int ret;
@@ -2192,7 +2198,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_flags = true;
extent_op->update_key = false;
- extent_op->is_data = is_data ? true : false;
extent_op->level = level;
ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
@@ -2357,15 +2362,10 @@ out:
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
- u64 bytenr, bool strict)
+ u64 bytenr, bool strict, struct btrfs_path *path)
{
- struct btrfs_path *path;
int ret;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
do {
ret = check_committed_ref(root, path, objectid,
offset, bytenr, strict);
@@ -2376,7 +2376,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
} while (ret == -EAGAIN);
out:
- btrfs_free_path(path);
+ btrfs_release_path(path);
if (btrfs_is_data_reloc_root(root))
WARN_ON(ret > 0);
return ret;
@@ -2497,24 +2497,21 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
return ret;
}
-static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
+static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
{
- struct btrfs_block_group *cache;
- u64 bytenr;
-
- spin_lock(&fs_info->block_group_cache_lock);
- bytenr = fs_info->first_logical_byte;
- spin_unlock(&fs_info->block_group_cache_lock);
-
- if (bytenr < (u64)-1)
- return bytenr;
+ struct rb_node *leftmost;
+ u64 bytenr = 0;
- cache = btrfs_lookup_first_block_group(fs_info, search_start);
- if (!cache)
- return 0;
+ read_lock(&fs_info->block_group_cache_lock);
+ /* Get the block group with the lowest logical start address. */
+ leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
+ if (leftmost) {
+ struct btrfs_block_group *bg;
- bytenr = cache->start;
- btrfs_put_block_group(cache);
+ bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
+ bytenr = bg->start;
+ }
+ read_unlock(&fs_info->block_group_cache_lock);
return bytenr;
}
@@ -3803,8 +3800,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
/* Check RO and no space case before trying to activate it */
spin_lock(&block_group->lock);
- if (block_group->ro ||
- block_group->alloc_offset == block_group->zone_capacity) {
+ if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
ret = 1;
/*
* May need to clear fs_info->{treelog,data_reloc}_bg.
@@ -4272,7 +4268,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
return ret;
ffe_ctl->search_start = max(ffe_ctl->search_start,
- first_logical_byte(fs_info, 0));
+ first_logical_byte(fs_info));
ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
@@ -4959,7 +4955,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_key = skinny_metadata ? false : true;
extent_op->update_flags = true;
- extent_op->is_data = false;
extent_op->level = level;
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
@@ -5144,7 +5139,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, eb, flag,
- btrfs_header_level(eb), 0);
+ btrfs_header_level(eb));
BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
}
@@ -5818,7 +5813,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
btrfs_qgroup_free_meta_all_pertrans(root);
- if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
+ if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
btrfs_add_dropped_root(trans, root);
else
btrfs_put_root(root);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 33c19f51d79b..588c7c606a2c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -6,6 +6,7 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
+#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
@@ -28,6 +29,7 @@
#include "subpage.h"
#include "zoned.h"
#include "block-group.h"
+#include "compression.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -75,6 +77,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
if (!fs_info->allocated_ebs.next)
return;
+ WARN_ON(!list_empty(&fs_info->allocated_ebs));
spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
while (!list_empty(&fs_info->allocated_ebs)) {
eb = list_first_entry(&fs_info->allocated_ebs,
@@ -135,6 +138,17 @@ struct tree_entry {
struct rb_node rb_node;
};
+/*
+ * Structure to record info about the bio being assembled, plus related
+ * limits such as how many bytes remain before the stripe/ordered extent
+ * boundary.
+ */
+struct btrfs_bio_ctrl {
+ struct bio *bio;
+ enum btrfs_compression_type compress_type;
+ u32 len_to_stripe_boundary;
+ u32 len_to_oe_boundary;
+};
+
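btrfs_bio_ctrl replaces the old bio pointer plus bio_flags pair: the compression type is an explicit enum instead of EXTENT_BIO_COMPRESSED bits, and the two boundary lengths cap how much btrfs_bio_add_page() may append before the bio must be submitted. A simplified sketch of how the boundaries are consumed (the real code also checks sector contiguity first):

    u32 bio_size = bio_ctrl->bio->bi_iter.bi_size;
    u32 real_size;

    real_size = min(bio_ctrl->len_to_oe_boundary,
                    bio_ctrl->len_to_stripe_boundary) - bio_size;
    real_size = min(real_size, size);
    if (real_size == 0)
            return 0;   /* boundary hit: submit this bio and start a new one */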
struct extent_page_data {
struct btrfs_bio_ctrl bio_ctrl;
/* tells writepage not to lock the state bits for this range
@@ -164,24 +178,27 @@ static int add_extent_changeset(struct extent_state *state, u32 bits,
return ret;
}
-int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+static void submit_one_bio(struct bio *bio, int mirror_num,
+ enum btrfs_compression_type compress_type)
{
- blk_status_t ret = 0;
struct extent_io_tree *tree = bio->bi_private;
bio->bi_private = NULL;
/* Caller should ensure the bio has at least some range added */
ASSERT(bio->bi_iter.bi_size);
+
if (is_data_inode(tree->private_data))
- ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
- bio_flags);
+ btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
+ compress_type);
else
- ret = btrfs_submit_metadata_bio(tree->private_data, bio,
- mirror_num, bio_flags);
-
- return blk_status_to_errno(ret);
+ btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num);
+ /*
+ * Above submission hooks will handle the error by ending the bio,
+ * which will do the cleanup properly. So here we should not return
+ * any error, or the caller of submit_extent_page() will do cleanup
+ * again, causing problems.
+ */
}
/* Cleanup unsubmitted bios */
@@ -202,13 +219,12 @@ static void end_write_bio(struct extent_page_data *epd, int ret)
* Return 0 if everything is OK.
* Return <0 for error.
*/
-static int __must_check flush_write_bio(struct extent_page_data *epd)
+static void flush_write_bio(struct extent_page_data *epd)
{
- int ret = 0;
struct bio *bio = epd->bio_ctrl.bio;
if (bio) {
- ret = submit_one_bio(bio, 0, 0);
+ submit_one_bio(bio, 0, 0);
/*
* Clean up of epd->bio is handled by its endio function.
* And endio is either triggered by successful bio execution
@@ -218,7 +234,6 @@ static int __must_check flush_write_bio(struct extent_page_data *epd)
*/
epd->bio_ctrl.bio = NULL;
}
- return ret;
}
int __init extent_state_cache_init(void)
@@ -2303,12 +2318,13 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num)
{
- struct bio *bio;
struct btrfs_device *dev;
+ struct bio_vec bvec;
+ struct bio bio;
u64 map_length = 0;
u64 sector;
struct btrfs_io_context *bioc = NULL;
- int ret;
+ int ret = 0;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num);
@@ -2316,8 +2332,6 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
if (btrfs_repair_one_zone(fs_info, logical))
return 0;
- bio = btrfs_bio_alloc(1);
- bio->bi_iter.bi_size = 0;
map_length = length;
/*
@@ -2335,52 +2349,50 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
*/
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
&map_length, &bioc, 0);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
- }
+ if (ret)
+ goto out_counter_dec;
ASSERT(bioc->mirror_num == 1);
} else {
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
&map_length, &bioc, mirror_num);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
- }
+ if (ret)
+ goto out_counter_dec;
BUG_ON(mirror_num != bioc->mirror_num);
}
sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
- bio->bi_iter.bi_sector = sector;
dev = bioc->stripes[bioc->mirror_num - 1].dev;
btrfs_put_bioc(bioc);
+
if (!dev || !dev->bdev ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
+ ret = -EIO;
+ goto out_counter_dec;
}
- bio_set_dev(bio, dev->bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
- bio_add_page(bio, page, length, pg_offset);
- if (btrfsic_submit_bio_wait(bio)) {
+ bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+ bio.bi_iter.bi_sector = sector;
+ __bio_add_page(&bio, page, length, pg_offset);
+
+ btrfsic_check_bio(&bio);
+ ret = submit_bio_wait(&bio);
+ if (ret) {
/* try to remap that extent elsewhere? */
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
- return -EIO;
+ goto out_bio_uninit;
}
btrfs_info_rl_in_rcu(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
ino, start,
rcu_str_deref(dev->name), sector);
+ ret = 0;
+
+out_bio_uninit:
+ bio_uninit(&bio);
+out_counter_dec:
btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return 0;
+ return ret;
}
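This rewrite shows two of the bio cleanups at once: the bio now lives on the stack (bio_init() with a single inline bio_vec, bio_uninit() on the way out, no allocation and no bio_put()), and btrfsic_submit_bio_wait() gives way to btrfsic_check_bio(), a no-op unless the integrity checker is built in, followed by the stock submit_bio_wait(). The on-stack pattern in isolation (sketch):

    #include <linux/bio.h>

    struct bio_vec bvec;
    struct bio bio;
    int ret;

    bio_init(&bio, bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
    bio.bi_iter.bi_sector = sector;
    __bio_add_page(&bio, page, len, pg_offset);

    btrfsic_check_bio(&bio);
    ret = submit_bio_wait(&bio);
    bio_uninit(&bio);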
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
@@ -2527,7 +2539,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
failrec->start = start;
failrec->len = sectorsize;
failrec->this_mirror = 0;
- failrec->bio_flags = 0;
+ failrec->compress_type = BTRFS_COMPRESS_NONE;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
@@ -2551,8 +2563,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
logical = em->block_start + logical;
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
logical = em->block_start;
- failrec->bio_flags = EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&failrec->bio_flags, em->compress_type);
+ failrec->compress_type = em->compress_type;
}
btrfs_debug(fs_info,
@@ -2684,7 +2695,7 @@ int btrfs_repair_one_sector(struct inode *inode,
* will be handled by the endio on the repair_bio, so we can't return an
* error here.
*/
- submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags);
+ submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->compress_type);
return BLK_STS_OK;
}
@@ -2710,18 +2721,19 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
btrfs_page_set_error(fs_info, page, start, len);
}
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
unlock_page(page);
else
btrfs_subpage_end_reader(fs_info, page, start, len);
}
-static blk_status_t submit_read_repair(struct inode *inode,
- struct bio *failed_bio, u32 bio_offset,
- struct page *page, unsigned int pgoff,
- u64 start, u64 end, int failed_mirror,
- unsigned int error_bitmap,
- submit_bio_hook_t *submit_bio_hook)
+static blk_status_t submit_data_read_repair(struct inode *inode,
+ struct bio *failed_bio,
+ u32 bio_offset, struct page *page,
+ unsigned int pgoff,
+ u64 start, u64 end,
+ int failed_mirror,
+ unsigned int error_bitmap)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
@@ -2731,6 +2743,9 @@ static blk_status_t submit_read_repair(struct inode *inode,
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
+ /* This repair is only for data */
+ ASSERT(is_data_inode(inode));
+
/* We're here because we had some read errors or csum mismatch */
ASSERT(error_bitmap);
@@ -2759,7 +2774,7 @@ static blk_status_t submit_read_repair(struct inode *inode,
ret = btrfs_repair_one_sector(inode, failed_bio,
bio_offset + offset,
page, pgoff + offset, start + offset,
- failed_mirror, submit_bio_hook);
+ failed_mirror, btrfs_submit_data_bio);
if (!ret) {
/*
* We have submitted the read repair, the page release
@@ -2943,7 +2958,7 @@ update:
static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
{
ASSERT(PageLocked(page));
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page));
@@ -2951,7 +2966,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
}
/*
- * Find extent buffer for a givne bytenr.
+ * Find extent buffer for a given bytenr.
*
* This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
* in endio context.
@@ -2965,16 +2980,14 @@ static struct extent_buffer *find_extent_buffer_readpage(
* For regular sectorsize, we can use page->private to grab extent
* buffer
*/
- if (fs_info->sectorsize == PAGE_SIZE) {
+ if (fs_info->nodesize >= PAGE_SIZE) {
ASSERT(PagePrivate(page) && page->private);
return (struct extent_buffer *)page->private;
}
- /* For subpage case, we need to lookup buffer radix tree */
- rcu_read_lock();
- eb = radix_tree_lookup(&fs_info->buffer_radix,
- bytenr >> fs_info->sectorsize_bits);
- rcu_read_unlock();
+ /* For subpage case, we need to lookup extent buffer xarray */
+ eb = xa_load(&fs_info->extent_buffers,
+ bytenr >> fs_info->sectorsize_bits);
ASSERT(eb);
return eb;
}
@@ -3077,13 +3090,13 @@ static void end_bio_extent_readpage(struct bio *bio)
goto readpage_ok;
/*
- * btrfs_submit_read_repair() will handle all the good
+ * submit_data_read_repair() will handle all the good
* and bad sectors, we just continue to the next bvec.
*/
- submit_read_repair(inode, bio, bio_offset, page,
- start - page_offset(page), start,
- end, mirror, error_bitmap,
- btrfs_submit_data_bio);
+ submit_data_read_repair(inode, bio, bio_offset, page,
+ start - page_offset(page),
+ start, end, mirror,
+ error_bitmap);
ASSERT(bio_offset + len > bio_offset);
bio_offset += len;
@@ -3132,6 +3145,42 @@ readpage_ok:
bio_put(bio);
}
+/**
+ * Populate every free slot in a provided array with pages.
+ *
+ * @nr_pages: number of pages to allocate
+ * @page_array: the array to fill with pages; any existing non-null entries in
+ * the array will be skipped
+ *
+ * Return: 0 if all pages were able to be allocated;
+ * -ENOMEM otherwise, and the caller is responsible for freeing all
+ * non-null page pointers in the array.
+ */
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
+{
+ unsigned int allocated;
+
+ for (allocated = 0; allocated < nr_pages;) {
+ unsigned int last = allocated;
+
+ allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
+
+ if (allocated == nr_pages)
+ return 0;
+
+ /*
+ * During this iteration, no page could be allocated, even
+ * though alloc_pages_bulk_array() falls back to alloc_page()
+ * if it could not bulk-allocate. So we must be out of memory.
+ */
+ if (allocated == last)
+ return -ENOMEM;
+
+ memalloc_retry_wait(GFP_NOFS);
+ }
+ return 0;
+}
+
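btrfs_alloc_page_array() is the bulk page allocation helper mentioned in the merge summary: alloc_pages_bulk_array() fills only the NULL slots, so the loop retries after memalloc_retry_wait() and treats a pass with zero progress as out-of-memory. Hypothetical usage (a sketch; on failure the caller owns whatever was partially allocated):

    struct page *pages[16] = { NULL };   /* NULL slots will be filled */
    unsigned int nr = ARRAY_SIZE(pages);
    int ret;

    ret = btrfs_alloc_page_array(nr, pages);
    if (ret) {
            for (unsigned int i = 0; i < nr; i++)
                    if (pages[i])
                            __free_page(pages[i]);
    }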
/*
* Initialize the members up to but not including 'bio'. Use after allocating a
* new bio by bio_alloc_bioset as it does not initialize the bytes outside of
@@ -3157,13 +3206,13 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
return bio;
}
-struct bio *btrfs_bio_clone(struct bio *bio)
+struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio)
{
struct btrfs_bio *bbio;
struct bio *new;
/* Bio allocation backed by a bioset does not fail */
- new = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOFS, &btrfs_bioset);
+ new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset);
bbio = btrfs_bio(new);
btrfs_bio_init(bbio);
bbio->iter = bio->bi_iter;
@@ -3198,7 +3247,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
* a contiguous page to the previous one
* @size: portion of page that we want to write
* @pg_offset: starting offset in the page
- * @bio_flags: flags of the current bio to see if we can merge them
+ * @compress_type: compression type of the current bio to see if we can merge them
*
* Attempt to add a page to bio considering stripe alignment etc.
*
@@ -3210,7 +3259,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
struct page *page,
u64 disk_bytenr, unsigned int size,
unsigned int pg_offset,
- unsigned long bio_flags)
+ enum btrfs_compression_type compress_type)
{
struct bio *bio = bio_ctrl->bio;
u32 bio_size = bio->bi_iter.bi_size;
@@ -3222,10 +3271,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
ASSERT(bio);
/* The limit should be calculated when bio_ctrl->bio is allocated */
ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
- if (bio_ctrl->bio_flags != bio_flags)
+ if (bio_ctrl->compress_type != compress_type)
return 0;
- if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
+ if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
contig = bio->bi_iter.bi_sector == sector;
else
contig = bio_end_sector(bio) == sector;
@@ -3268,7 +3317,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
* The split happens for real compressed bio, which happens in
* btrfs_submit_compressed_read/write().
*/
- if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) {
+ if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
bio_ctrl->len_to_oe_boundary = U32_MAX;
bio_ctrl->len_to_stripe_boundary = U32_MAX;
return 0;
@@ -3311,7 +3360,7 @@ static int alloc_new_bio(struct btrfs_inode *inode,
unsigned int opf,
bio_end_io_t end_io_func,
u64 disk_bytenr, u32 offset, u64 file_offset,
- unsigned long bio_flags)
+ enum btrfs_compression_type compress_type)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio;
@@ -3322,12 +3371,12 @@ static int alloc_new_bio(struct btrfs_inode *inode,
* For compressed page range, its disk_bytenr is always @disk_bytenr
* passed in, no matter if we have added any range into previous bio.
*/
- if (bio_flags & EXTENT_BIO_COMPRESSED)
+ if (compress_type != BTRFS_COMPRESS_NONE)
bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
else
bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
bio_ctrl->bio = bio;
- bio_ctrl->bio_flags = bio_flags;
+ bio_ctrl->compress_type = compress_type;
bio->bi_end_io = end_io_func;
bio->bi_private = &inode->io_tree;
bio->bi_opf = opf;
@@ -3386,7 +3435,7 @@ error:
* @end_io_func: end_io callback for new bio
* @mirror_num: desired mirror to read/write
* @prev_bio_flags: flags of previous bio to see if we can merge the current one
- * @bio_flags: flags of the current bio to see if we can merge them
+ * @compress_type: compress type for current bio
*/
static int submit_extent_page(unsigned int opf,
struct writeback_control *wbc,
@@ -3395,7 +3444,7 @@ static int submit_extent_page(unsigned int opf,
size_t size, unsigned long pg_offset,
bio_end_io_t end_io_func,
int mirror_num,
- unsigned long bio_flags,
+ enum btrfs_compression_type compress_type,
bool force_bio_submit)
{
int ret = 0;
@@ -3407,10 +3456,8 @@ static int submit_extent_page(unsigned int opf,
ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
pg_offset + size <= PAGE_SIZE);
if (force_bio_submit && bio_ctrl->bio) {
- ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags);
+ submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type);
bio_ctrl->bio = NULL;
- if (ret < 0)
- return ret;
}
while (cur < pg_offset + size) {
@@ -3422,7 +3469,7 @@ static int submit_extent_page(unsigned int opf,
ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
end_io_func, disk_bytenr, offset,
page_offset(page) + cur,
- bio_flags);
+ compress_type);
if (ret < 0)
return ret;
}
@@ -3430,14 +3477,14 @@ static int submit_extent_page(unsigned int opf,
* We must go through btrfs_bio_add_page() to ensure each
* page range won't cross various boundaries.
*/
- if (bio_flags & EXTENT_BIO_COMPRESSED)
+ if (compress_type != BTRFS_COMPRESS_NONE)
added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
size - offset, pg_offset + offset,
- bio_flags);
+ compress_type);
else
added = btrfs_bio_add_page(bio_ctrl, page,
disk_bytenr + offset, size - offset,
- pg_offset + offset, bio_flags);
+ pg_offset + offset, compress_type);
/* Metadata page range should never be split */
if (!is_data_inode(&inode->vfs_inode))
@@ -3451,11 +3498,8 @@ static int submit_extent_page(unsigned int opf,
if (added < size - offset) {
/* The bio should contain some page(s) */
ASSERT(bio_ctrl->bio->bi_iter.bi_size);
- ret = submit_one_bio(bio_ctrl->bio, mirror_num,
- bio_ctrl->bio_flags);
+ submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type);
bio_ctrl->bio = NULL;
- if (ret < 0)
- return ret;
}
cur += added;
}
@@ -3478,7 +3522,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb,
if (page->mapping)
lockdep_assert_held(&page->mapping->private_lock);
- if (fs_info->sectorsize == PAGE_SIZE) {
+ if (fs_info->nodesize >= PAGE_SIZE) {
if (!PagePrivate(page))
attach_page_private(page, eb);
else
@@ -3513,7 +3557,7 @@ int set_page_extent_mapped(struct page *page)
fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (fs_info->sectorsize < PAGE_SIZE)
+ if (btrfs_is_subpage(fs_info, page))
return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
@@ -3530,7 +3574,7 @@ void clear_page_extent_mapped(struct page *page)
return;
fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (fs_info->sectorsize < PAGE_SIZE)
+ if (btrfs_is_subpage(fs_info, page))
return btrfs_detach_subpage(fs_info, page);
detach_page_private(page);
@@ -3569,7 +3613,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start)
{
@@ -3638,16 +3682,13 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- this_bio_flag |= EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&this_bio_flag,
- em->compress_type);
- }
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ this_bio_flag = em->compress_type;
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
iosize = ALIGN(iosize, blocksize);
- if (this_bio_flag & EXTENT_BIO_COMPRESSED)
+ if (this_bio_flag != BTRFS_COMPRESS_NONE)
disk_bytenr = em->block_start;
else
disk_bytenr = em->block_start + extent_offset;
@@ -3743,8 +3784,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
this_bio_flag,
force_bio_submit);
if (ret) {
- unlock_extent(tree, cur, cur + iosize - 1);
- end_page_read(page, false, cur, iosize);
+ /*
+ * We have to unlock the remaining range, or the page
+ * will never be unlocked.
+ */
+ unlock_extent(tree, cur, end);
+ end_page_read(page, false, cur, end + 1 - cur);
goto out;
}
cur = cur + iosize;
@@ -3754,6 +3799,26 @@ out:
return ret;
}
+int btrfs_readpage(struct file *file, struct page *page)
+{
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_SIZE - 1;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ int ret;
+
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+
+ ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
+ /*
+	 * If btrfs_do_readpage() failed, we still want to submit the assembled
+ * bio to do the cleanup.
+ */
+ if (bio_ctrl.bio)
+ submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type);
+ return ret;
+}
+
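
btrfs_readpage() also shows the shape every btrfs_bio_ctrl user takes after this patch (extent_readahead() and read_extent_buffer_pages() below follow suit): run the fill helper, then submit whatever bio is still being assembled, even on error, because submit_one_bio() no longer returns a status and failures are reported through the bio's end_io callback. Condensed:

    struct btrfs_bio_ctrl bio_ctrl = { 0 };
    int ret;

    ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
    if (bio_ctrl.bio) {
            /* Errors now surface via the bio's end_io handler. */
            submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type);
            bio_ctrl.bio = NULL;
    }
    return ret;
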
static inline void contiguous_readpages(struct page *pages[], int nr_pages,
u64 start, u64 end,
struct extent_map **em_cached,
@@ -3772,12 +3837,6 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
}
}
-static void update_nr_written(struct writeback_control *wbc,
- unsigned long nr_written)
-{
- wbc->nr_to_write -= nr_written;
-}
-
/*
* helper for __extent_writepage, doing all of the delayed allocation setup.
*
@@ -3877,7 +3936,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
* For regular sector size == page size case, since one page only
* contains one sector, we return the page offset directly.
*/
- if (fs_info->sectorsize == PAGE_SIZE) {
+ if (!btrfs_is_subpage(fs_info, page)) {
*start = page_offset(page);
*end = page_offset(page) + PAGE_SIZE;
return;
@@ -3920,10 +3979,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
u64 extent_offset;
u64 block_start;
struct extent_map *em;
+ int saved_ret = 0;
int ret = 0;
int nr = 0;
u32 opf = REQ_OP_WRITE;
const unsigned int write_flags = wbc_to_write_flags(wbc);
+ bool has_error = false;
bool compressed;
ret = btrfs_writepage_cow_fixup(page);
@@ -3938,7 +3999,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* we don't want to touch the inode after unlocking the page,
* so we update the mapping writeback index now
*/
- update_nr_written(wbc, 1);
+ wbc->nr_to_write--;
while (cur <= end) {
u64 disk_bytenr;
@@ -3973,6 +4034,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
if (IS_ERR(em)) {
btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
ret = PTR_ERR_OR_ZERO(em);
+ has_error = true;
+ if (!saved_ret)
+ saved_ret = ret;
break;
}
@@ -4036,6 +4100,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
end_bio_extent_writepage,
0, 0, false);
if (ret) {
+ has_error = true;
+ if (!saved_ret)
+ saved_ret = ret;
+
btrfs_page_set_error(fs_info, page, cur, iosize);
if (PageWriteback(page))
btrfs_page_clear_writeback(fs_info, page, cur,
@@ -4049,8 +4117,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* If we finish without problem, we should not only clear page dirty,
* but also empty subpage dirty bits
*/
- if (!ret)
+ if (!has_error)
btrfs_page_assert_not_dirty(fs_info, page);
+ else
+ ret = saved_ret;
*nr_ret = nr;
return ret;
}
@@ -4181,9 +4251,6 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
- if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags))
- btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len);
-
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
smp_mb__after_atomic();
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
@@ -4203,14 +4270,12 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
struct extent_page_data *epd)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i, num_pages, failed_page_nr;
+ int i, num_pages;
int flush = 0;
int ret = 0;
if (!btrfs_try_tree_write_lock(eb)) {
- ret = flush_write_bio(epd);
- if (ret < 0)
- return ret;
+ flush_write_bio(epd);
flush = 1;
btrfs_tree_lock(eb);
}
@@ -4220,9 +4285,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!epd->sync_io)
return 0;
if (!flush) {
- ret = flush_write_bio(epd);
- if (ret < 0)
- return ret;
+ flush_write_bio(epd);
flush = 1;
}
while (1) {
@@ -4260,7 +4323,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
* Subpage metadata doesn't use page locking at all, so we can skip
* the page locking.
*/
- if (!ret || fs_info->sectorsize < PAGE_SIZE)
+ if (!ret || fs_info->nodesize < PAGE_SIZE)
return ret;
num_pages = num_extent_pages(eb);
@@ -4269,14 +4332,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!trylock_page(p)) {
if (!flush) {
- int err;
-
- err = flush_write_bio(epd);
- if (err < 0) {
- ret = err;
- failed_page_nr = i;
- goto err_unlock;
- }
+ flush_write_bio(epd);
flush = 1;
}
lock_page(p);
@@ -4284,25 +4340,6 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
}
return ret;
-err_unlock:
- /* Unlock already locked pages */
- for (i = 0; i < failed_page_nr; i++)
- unlock_page(eb->pages[i]);
- /*
- * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
- * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
- * be made and undo everything done before.
- */
- btrfs_tree_lock(eb);
- spin_lock(&eb->refs_lock);
- set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- end_extent_buffer_writeback(eb);
- spin_unlock(&eb->refs_lock);
- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
- fs_info->dirty_metadata_batch);
- btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
- btrfs_tree_unlock(eb);
- return ret;
}
static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
@@ -4397,8 +4434,8 @@ static struct extent_buffer *find_extent_buffer_nolock(
struct extent_buffer *eb;
rcu_read_lock();
- eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits);
+ eb = xa_load(&fs_info->extent_buffers,
+ start >> fs_info->sectorsize_bits);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
return eb;
@@ -4420,7 +4457,7 @@ static void end_bio_subpage_eb_writepage(struct bio *bio)
struct bvec_iter_all iter_all;
fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
- ASSERT(fs_info->sectorsize < PAGE_SIZE);
+ ASSERT(fs_info->nodesize < PAGE_SIZE);
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
@@ -4572,7 +4609,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
* dirty anymore, we have submitted a page. Update nr_written in wbc.
*/
if (no_dirty_ebs)
- update_nr_written(wbc, 1);
+ wbc->nr_to_write--;
return ret;
}
@@ -4608,7 +4645,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
break;
}
disk_bytenr += PAGE_SIZE;
- update_nr_written(wbc, 1);
+ wbc->nr_to_write--;
unlock_page(p);
}
@@ -4747,7 +4784,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
if (!PagePrivate(page))
return 0;
- if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return submit_eb_subpage(page, wbc, epd);
spin_lock(&mapping->private_lock);
@@ -4803,8 +4840,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
/*
* Implies write in zoned mode. Mark the last eb in a block group.
*/
- if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
- set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
+ btrfs_schedule_zone_finish_bg(cache, eb);
btrfs_put_block_group(cache);
}
ret = write_one_eb(eb, wbc, epd);
@@ -4923,13 +4959,19 @@ retry:
* if the fs already has error.
*/
if (!BTRFS_FS_ERROR(fs_info)) {
- ret = flush_write_bio(&epd);
+ flush_write_bio(&epd);
} else {
ret = -EROFS;
end_write_bio(&epd, ret);
}
out:
btrfs_zoned_meta_io_unlock(fs_info);
+ /*
+ * We can get ret > 0 from submit_extent_page() indicating how many ebs
+ * were submitted. Reset it to 0 to avoid false alerts for the caller.
+ */
+ if (ret > 0)
+ ret = 0;
return ret;
}
@@ -5031,8 +5073,7 @@ retry:
* tmpfs file mapping
*/
if (!trylock_page(page)) {
- ret = flush_write_bio(epd);
- BUG_ON(ret < 0);
+ flush_write_bio(epd);
lock_page(page);
}
@@ -5042,10 +5083,8 @@ retry:
}
if (wbc->sync_mode != WB_SYNC_NONE) {
- if (PageWriteback(page)) {
- ret = flush_write_bio(epd);
- BUG_ON(ret < 0);
- }
+ if (PageWriteback(page))
+ flush_write_bio(epd);
wait_on_page_writeback(page);
}
@@ -5085,9 +5124,8 @@ retry:
* page in our current bio, and thus deadlock, so flush the
* write bio here.
*/
- ret = flush_write_bio(epd);
- if (!ret)
- goto retry;
+ flush_write_bio(epd);
+ goto retry;
}
if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
@@ -5113,8 +5151,7 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
return ret;
}
- ret = flush_write_bio(&epd);
- ASSERT(ret <= 0);
+ flush_write_bio(&epd);
return ret;
}
@@ -5176,7 +5213,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
}
if (!found_error)
- ret = flush_write_bio(&epd);
+ flush_write_bio(&epd);
else
end_write_bio(&epd, ret);
@@ -5209,7 +5246,7 @@ int extent_writepages(struct address_space *mapping,
end_write_bio(&epd, ret);
return ret;
}
- ret = flush_write_bio(&epd);
+ flush_write_bio(&epd);
return ret;
}
@@ -5232,10 +5269,8 @@ void extent_readahead(struct readahead_control *rac)
if (em_cached)
free_extent_map(em_cached);
- if (bio_ctrl.bio) {
- if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags))
- return;
- }
+ if (bio_ctrl.bio)
+ submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type);
}
/*
@@ -5804,7 +5839,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
return;
}
- if (fs_info->sectorsize == PAGE_SIZE) {
+ if (fs_info->nodesize >= PAGE_SIZE) {
/*
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
@@ -5911,9 +5946,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
int i;
- struct page *p;
struct extent_buffer *new;
int num_pages = num_extent_pages(src);
+ int ret;
new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
if (new == NULL)
@@ -5926,22 +5961,23 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+ memset(new->pages, 0, sizeof(*new->pages) * num_pages);
+ ret = btrfs_alloc_page_array(num_pages, new->pages);
+ if (ret) {
+ btrfs_release_extent_buffer(new);
+ return NULL;
+ }
+
for (i = 0; i < num_pages; i++) {
int ret;
+ struct page *p = new->pages[i];
- p = alloc_page(GFP_NOFS);
- if (!p) {
- btrfs_release_extent_buffer(new);
- return NULL;
- }
ret = attach_extent_buffer_page(new, p, NULL);
if (ret < 0) {
- put_page(p);
btrfs_release_extent_buffer(new);
return NULL;
}
WARN_ON(PageDirty(p));
- new->pages[i] = p;
copy_page(page_address(p), page_address(src->pages[i]));
}
set_extent_buffer_uptodate(new);
@@ -5955,31 +5991,36 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb;
int num_pages;
int i;
+ int ret;
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
num_pages = num_extent_pages(eb);
+ ret = btrfs_alloc_page_array(num_pages, eb->pages);
+ if (ret)
+ goto err;
+
for (i = 0; i < num_pages; i++) {
- int ret;
+ struct page *p = eb->pages[i];
- eb->pages[i] = alloc_page(GFP_NOFS);
- if (!eb->pages[i])
- goto err;
- ret = attach_extent_buffer_page(eb, eb->pages[i], NULL);
+ ret = attach_extent_buffer_page(eb, p, NULL);
if (ret < 0)
goto err;
}
+
set_extent_buffer_uptodate(eb);
btrfs_set_header_nritems(eb, 0);
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
return eb;
err:
- for (; i > 0; i--) {
- detach_extent_buffer_page(eb, eb->pages[i - 1]);
- __free_page(eb->pages[i - 1]);
+ for (i = 0; i < num_pages; i++) {
+ if (eb->pages[i]) {
+ detach_extent_buffer_page(eb, eb->pages[i]);
+ __free_page(eb->pages[i]);
+ }
}
__free_extent_buffer(eb);
return NULL;
@@ -6086,24 +6127,22 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
if (!eb)
return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
-again:
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- exists = ERR_PTR(ret);
- goto free_eb;
- }
- spin_lock(&fs_info->buffer_lock);
- ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits, eb);
- spin_unlock(&fs_info->buffer_lock);
- radix_tree_preload_end();
- if (ret == -EEXIST) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
+
+ do {
+ ret = xa_insert(&fs_info->extent_buffers,
+ start >> fs_info->sectorsize_bits,
+ eb, GFP_NOFS);
+ if (ret == -ENOMEM) {
+ exists = ERR_PTR(ret);
goto free_eb;
- else
- goto again;
- }
+ }
+ if (ret == -EBUSY) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
+ goto free_eb;
+ }
+ } while (ret);
+
check_buffer_tree_ref(eb);
set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
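
The xarray conversion replaces the radix-tree preload/lock/insert sequence with xa_insert(), which allocates internally: -ENOMEM is a hard failure, -EBUSY means another task inserted first (look the winner up and reuse it), and the loop only repeats in the narrow window where the racing buffer disappeared between the failed insert and the lookup. The pattern in isolation (names as in this patch):

    do {
            ret = xa_insert(&fs_info->extent_buffers,
                            start >> fs_info->sectorsize_bits, eb, GFP_NOFS);
            if (ret == -ENOMEM)
                    return ERR_PTR(ret);    /* hard failure */
            if (ret == -EBUSY) {
                    exists = find_extent_buffer(fs_info, start);
                    if (exists)
                            return exists;  /* lost the race, reuse the winner */
                    /* Winner already vanished again: retry the insert. */
            }
    } while (ret);
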
@@ -6124,7 +6163,7 @@ static struct extent_buffer *grab_extent_buffer(
* don't try to insert two ebs for the same bytenr. So here we always
* return NULL and just continue.
*/
- if (fs_info->sectorsize < PAGE_SIZE)
+ if (fs_info->nodesize < PAGE_SIZE)
return NULL;
/* Page not yet attached to an extent buffer */
@@ -6146,6 +6185,30 @@ static struct extent_buffer *grab_extent_buffer(
return NULL;
}
+static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
+{
+ if (!IS_ALIGNED(start, fs_info->sectorsize)) {
+ btrfs_err(fs_info, "bad tree block start %llu", start);
+ return -EINVAL;
+ }
+
+ if (fs_info->nodesize < PAGE_SIZE &&
+ offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
+ btrfs_err(fs_info,
+ "tree block crosses page boundary, start %llu nodesize %u",
+ start, fs_info->nodesize);
+ return -EINVAL;
+ }
+ if (fs_info->nodesize >= PAGE_SIZE &&
+ !IS_ALIGNED(start, PAGE_SIZE)) {
+ btrfs_err(fs_info,
+ "tree block is not page aligned, start %llu nodesize %u",
+ start, fs_info->nodesize);
+ return -EINVAL;
+ }
+ return 0;
+}
+
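
check_eb_alignment() gathers the start-address checks into one place and also corrects the old error message, which printed the buffer length under a "nodesize" label. The rules: every start must be sectorsize aligned, a subpage tree block (nodesize < PAGE_SIZE) must not straddle a page, and a full-size tree block must begin on a page boundary. A small userspace demonstration of the same arithmetic (4K pages assumed):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096u

    /* Mirrors offset_in_page(start) + nodesize > PAGE_SIZE */
    static int crosses_page(uint64_t start, unsigned int nodesize)
    {
            return (start % PAGE_SIZE) + nodesize > PAGE_SIZE;
    }

    int main(void)
    {
            uint64_t start = 65536;

            /* 2K nodes on 4K pages: a start of 3072 crosses a page. */
            printf("%d\n", crosses_page(3072, 2048)); /* 1 -> rejected */
            printf("%d\n", crosses_page(2048, 2048)); /* 0 -> accepted */

            /* nodesize >= PAGE_SIZE: the start must be page aligned. */
            printf("%d\n", start % PAGE_SIZE == 0);   /* 1 -> accepted */
            return 0;
    }
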
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
@@ -6160,10 +6223,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
int uptodate = 1;
int ret;
- if (!IS_ALIGNED(start, fs_info->sectorsize)) {
- btrfs_err(fs_info, "bad tree block start %llu", start);
+ if (check_eb_alignment(fs_info, start))
return ERR_PTR(-EINVAL);
- }
#if BITS_PER_LONG == 32
if (start >= MAX_LFS_FILESIZE) {
@@ -6176,14 +6237,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
btrfs_warn_32bit_limit(fs_info);
#endif
- if (fs_info->sectorsize < PAGE_SIZE &&
- offset_in_page(start) + len > PAGE_SIZE) {
- btrfs_err(fs_info,
- "tree block crosses page boundary, start %llu nodesize %lu",
- start, len);
- return ERR_PTR(-EINVAL);
- }
-
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
@@ -6213,7 +6266,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* page, but it may change in the future for 16K page size
* support, so we still preallocate the memory in the loop.
*/
- if (fs_info->sectorsize < PAGE_SIZE) {
+ if (fs_info->nodesize < PAGE_SIZE) {
prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
ret = PTR_ERR(prealloc);
@@ -6264,25 +6317,22 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-again:
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- exists = ERR_PTR(ret);
- goto free_eb;
- }
-
- spin_lock(&fs_info->buffer_lock);
- ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits, eb);
- spin_unlock(&fs_info->buffer_lock);
- radix_tree_preload_end();
- if (ret == -EEXIST) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
+
+ do {
+ ret = xa_insert(&fs_info->extent_buffers,
+ start >> fs_info->sectorsize_bits,
+ eb, GFP_NOFS);
+ if (ret == -ENOMEM) {
+ exists = ERR_PTR(ret);
goto free_eb;
- else
- goto again;
- }
+ }
+ if (ret == -EBUSY) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
+ goto free_eb;
+ }
+ } while (ret);
+
/* add one reference for the tree */
check_buffer_tree_ref(eb);
set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
@@ -6327,10 +6377,8 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
- spin_lock(&fs_info->buffer_lock);
- radix_tree_delete(&fs_info->buffer_radix,
- eb->start >> fs_info->sectorsize_bits);
- spin_unlock(&fs_info->buffer_lock);
+ xa_erase(&fs_info->extent_buffers,
+ eb->start >> fs_info->sectorsize_bits);
} else {
spin_unlock(&eb->refs_lock);
}
@@ -6432,7 +6480,7 @@ void clear_extent_buffer_dirty(const struct extent_buffer *eb)
int num_pages;
struct page *page;
- if (eb->fs_info->sectorsize < PAGE_SIZE)
+ if (eb->fs_info->nodesize < PAGE_SIZE)
return clear_subpage_extent_buffer_dirty(eb);
num_pages = num_extent_pages(eb);
@@ -6464,7 +6512,7 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
if (!was_dirty) {
- bool subpage = eb->fs_info->sectorsize < PAGE_SIZE;
+ bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
/*
* For subpage case, we can have other extent buffers in the
@@ -6504,9 +6552,18 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb)
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
- if (page)
- btrfs_page_clear_uptodate(fs_info, page,
- eb->start, eb->len);
+ if (!page)
+ continue;
+
+ /*
+ * This is special handling for metadata subpage, as regular
+	 * btrfs_is_subpage() cannot handle cloned/dummy metadata.
+ */
+ if (fs_info->nodesize >= PAGE_SIZE)
+ ClearPageUptodate(page);
+ else
+ btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
+ eb->len);
}
}
@@ -6521,7 +6578,16 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
- btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len);
+
+ /*
+ * This is special handling for metadata subpage, as regular
+	 * btrfs_is_subpage() cannot handle cloned/dummy metadata.
+ */
+ if (fs_info->nodesize >= PAGE_SIZE)
+ SetPageUptodate(page);
+ else
+ btrfs_subpage_set_uptodate(fs_info, page, eb->start,
+ eb->len);
}
}
@@ -6577,12 +6643,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
atomic_dec(&eb->io_pages);
}
if (bio_ctrl.bio) {
- int tmp;
-
- tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0);
+ submit_one_bio(bio_ctrl.bio, mirror_num, 0);
bio_ctrl.bio = NULL;
- if (tmp < 0)
- return tmp;
}
if (ret || wait != WAIT_COMPLETE)
return ret;
@@ -6616,7 +6678,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
return -EIO;
- if (eb->fs_info->sectorsize < PAGE_SIZE)
+ if (eb->fs_info->nodesize < PAGE_SIZE)
return read_extent_buffer_subpage(eb, wait, mirror_num);
num_pages = num_extent_pages(eb);
@@ -6695,10 +6757,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
if (bio_ctrl.bio) {
- err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags);
+ submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.compress_type);
bio_ctrl.bio = NULL;
- if (err)
- return err;
}
if (ret || wait != WAIT_COMPLETE)
@@ -6871,7 +6931,7 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
* would have !PageUptodate && !PageError, as we clear PageError before
* reading.
*/
- if (fs_info->sectorsize < PAGE_SIZE) {
+ if (fs_info->nodesize < PAGE_SIZE) {
bool uptodate, error;
uptodate = btrfs_subpage_test_uptodate(fs_info, page,
@@ -6973,7 +7033,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst,
ASSERT(dst->len == src->len);
- if (dst->fs_info->sectorsize == PAGE_SIZE) {
+ if (dst->fs_info->nodesize >= PAGE_SIZE) {
num_pages = num_extent_pages(dst);
for (i = 0; i < num_pages; i++)
copy_page(page_address(dst->pages[i]),
@@ -6982,7 +7042,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst,
size_t src_offset = get_eb_offset_in_page(src, 0);
size_t dst_offset = get_eb_offset_in_page(dst, 0);
- ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
+ ASSERT(src->fs_info->nodesize < PAGE_SIZE);
memcpy(page_address(dst->pages[0]) + dst_offset,
page_address(src->pages[0]) + src_offset,
src->len);
@@ -7263,42 +7323,25 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
-#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
- struct extent_buffer *gang[GANG_LOOKUP_SIZE];
- struct extent_buffer *found = NULL;
+ struct extent_buffer *eb;
+ unsigned long index;
u64 page_start = page_offset(page);
- u64 cur = page_start;
ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
lockdep_assert_held(&fs_info->buffer_lock);
- while (cur < page_start + PAGE_SIZE) {
- int ret;
- int i;
-
- ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
- (void **)gang, cur >> fs_info->sectorsize_bits,
- min_t(unsigned int, GANG_LOOKUP_SIZE,
- PAGE_SIZE / fs_info->nodesize));
- if (ret == 0)
- goto out;
- for (i = 0; i < ret; i++) {
- /* Already beyond page end */
- if (gang[i]->start >= page_start + PAGE_SIZE)
- goto out;
- /* Found one */
- if (gang[i]->start >= bytenr) {
- found = gang[i];
- goto out;
- }
- }
- cur = gang[ret - 1]->start + gang[ret - 1]->len;
+ xa_for_each_start(&fs_info->extent_buffers, index, eb,
+ page_start >> fs_info->sectorsize_bits) {
+ if (in_range(eb->start, page_start, PAGE_SIZE))
+ return eb;
+ else if (eb->start >= page_start + PAGE_SIZE)
+ /* Already beyond page end */
+ return NULL;
}
-out:
- return found;
+ return NULL;
}
static int try_release_subpage_extent_buffer(struct page *page)
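
xa_for_each_start() visits entries in ascending index order beginning at a given index, which is why the batched gang lookup (and GANG_LOOKUP_SIZE) above could collapse into a plain loop with an early exit once the walk passes the end of the page. The general idiom, with hypothetical names (xa, first, limit, process):

    struct extent_buffer *eb;
    unsigned long index;

    xa_for_each_start(&xa, index, eb, first) {
            if (eb->start >= limit)
                    break;          /* ascending walk: nothing further matches */
            process(eb);
    }
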
@@ -7375,7 +7418,7 @@ int try_release_extent_buffer(struct page *page)
{
struct extent_buffer *eb;
- if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return try_release_subpage_extent_buffer(page);
/*
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 151e9da5da2d..956fa434df43 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -7,15 +7,9 @@
#include <linux/refcount.h>
#include <linux/fiemap.h>
#include <linux/btrfs_tree.h>
+#include "compression.h"
#include "ulist.h"
-/*
- * flags for bio submission. The high bits indicate the compression
- * type for this bio
- */
-#define EXTENT_BIO_COMPRESSED 1
-#define EXTENT_BIO_FLAG_SHIFT 16
-
enum {
EXTENT_BUFFER_UPTODATE,
EXTENT_BUFFER_DIRTY,
@@ -32,7 +26,6 @@ enum {
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
EXTENT_BUFFER_NO_CHECK,
- EXTENT_BUFFER_ZONE_FINISH,
};
/* these are flags for __process_pages_contig */
@@ -71,9 +64,9 @@ struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
-typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
+typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
int mirror_num,
- unsigned long bio_flags);
+ enum btrfs_compression_type compress_type);
typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode,
struct bio *bio, u64 dio_file_offset);
@@ -103,17 +96,6 @@ struct extent_buffer {
};
/*
- * Structure to record info about the bio being assembled, and other info like
- * how many bytes are there before stripe/ordered extent boundary.
- */
-struct btrfs_bio_ctrl {
- struct bio *bio;
- unsigned long bio_flags;
- u32 len_to_stripe_boundary;
- u32 len_to_oe_boundary;
-};
-
-/*
* Structure to record how many bytes and which ranges are set/cleared
*/
struct extent_changeset {
@@ -158,17 +140,6 @@ static inline void extent_changeset_free(struct extent_changeset *changeset)
kfree(changeset);
}
-static inline void extent_set_compress_type(unsigned long *bio_flags,
- int compress_type)
-{
- *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
-}
-
-static inline int extent_compress_type(unsigned long bio_flags)
-{
- return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
-}
-
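
For contrast, the helpers removed above packed the compression type into the high bits of the old bio_flags word, with EXTENT_BIO_COMPRESSED marking compression at all; the patch instead carries the enum directly and lets BTRFS_COMPRESS_NONE stand for "not compressed". Side by side (reconstructed from the removed definitions):

    /* Old encoding: flag bit plus the type shifted into the high bits. */
    unsigned long bio_flags = EXTENT_BIO_COMPRESSED;
    bio_flags |= BTRFS_COMPRESS_ZLIB << EXTENT_BIO_FLAG_SHIFT;  /* set */
    int type = bio_flags >> EXTENT_BIO_FLAG_SHIFT;              /* get */

    /* New encoding: store and compare the enum itself. */
    enum btrfs_compression_type compress_type = BTRFS_COMPRESS_ZLIB;
    bool compressed = (compress_type != BTRFS_COMPRESS_NONE);
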
struct extent_map_tree;
typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
@@ -178,11 +149,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags);
-int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl,
- unsigned int read_flags, u64 *prev_em_start);
+int btrfs_readpage(struct file *file, struct page *page);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
int extent_writepages(struct address_space *mapping,
@@ -277,8 +244,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
+
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone(struct bio *bio);
+struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
@@ -297,7 +266,7 @@ struct io_failure_record {
u64 start;
u64 len;
u64 logical;
- unsigned long bio_flags;
+ enum btrfs_compression_type compress_type;
int this_mirror;
int failed_mirror;
};
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 380054c94e4b..46c2baa8fdf5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1460,8 +1460,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
return ret;
}
-static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes, bool nowait)
+/*
+ * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
+ *
+ * @pos: File offset.
+ * @write_bytes: The length to write, will be updated to the nocow writeable
+ * range.
+ *
+ * This function will flush ordered extents in the range to ensure proper
+ * nocow checks.
+ *
+ * Return:
+ * > 0 If we can nocow, and updates @write_bytes.
+ * 0 If we can't do a nocow write.
+ * -EAGAIN If we can't do a nocow write because snapshotting of the inode's
+ * root is in progress.
+ * < 0 If an error happened.
+ *
+ * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
+ */
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
@@ -1472,7 +1491,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
return 0;
- if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
@@ -1480,71 +1499,21 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
fs_info->sectorsize) - 1;
num_bytes = lockend - lockstart + 1;
- if (nowait) {
- struct btrfs_ordered_extent *ordered;
-
- if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
- return -EAGAIN;
-
- ordered = btrfs_lookup_ordered_range(inode, lockstart,
- num_bytes);
- if (ordered) {
- btrfs_put_ordered_extent(ordered);
- ret = -EAGAIN;
- goto out_unlock;
- }
- } else {
- btrfs_lock_and_flush_ordered_range(inode, lockstart,
- lockend, NULL);
- }
-
+ btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
NULL, NULL, NULL, false);
if (ret <= 0) {
ret = 0;
- if (!nowait)
- btrfs_drew_write_unlock(&root->snapshot_lock);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
}
-out_unlock:
unlock_extent(&inode->io_tree, lockstart, lockend);
return ret;
}
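
With the nowait variant folded away there is a single contract: a return value > 0 means the (possibly shrunk) *write_bytes can be written NOCOW and the snapshot drew lock stays held until btrfs_check_nocow_unlock(). A minimal caller sketch (kernel context; do_nocow_write() is a hypothetical placeholder):

    size_t write_bytes = count;
    int ret;

    ret = btrfs_check_nocow_lock(inode, pos, &write_bytes);
    if (ret > 0) {
            /* write_bytes may have shrunk to the NOCOW-able prefix. */
            do_nocow_write(inode, pos, write_bytes);
            btrfs_check_nocow_unlock(inode);
    } else if (ret == 0 || ret == -EAGAIN) {
            /* Fall back to the COW path. */
    } else {
            return ret;     /* real error */
    }
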
-static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
-{
- return check_can_nocow(inode, pos, write_bytes, true);
-}
-
-/*
- * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
- *
- * @pos: File offset
- * @write_bytes: The length to write, will be updated to the nocow writeable
- * range
- *
- * This function will flush ordered extents in the range to ensure proper
- * nocow checks.
- *
- * Return:
- * >0 and update @write_bytes if we can do nocow write
- * 0 if we can't do nocow write
- * -EAGAIN if we can't get the needed lock or there are ordered extents
- * for * (nowait == true) case
- * <0 if other error happened
- *
- * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
- */
-int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
-{
- return check_can_nocow(inode, pos, write_bytes, false);
-}
-
void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
{
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
@@ -1579,20 +1548,15 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
loff_t oldsize;
loff_t start_pos;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- size_t nocow_bytes = count;
-
- /* We will allocate space in case nodatacow is not set, so bail */
- if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0)
- return -EAGAIN;
- /*
- * There are holes in the range or parts of the range that must
- * be COWed (shared extents, RO block groups, etc), so just bail
- * out.
- */
- if (nocow_bytes < count)
- return -EAGAIN;
- }
+ /*
+ * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
+ * prealloc flags, as without those flags we always have to COW. We will
+	 * later check if we can really NOCOW into the target range (using
+ * can_nocow_extent() at btrfs_get_blocks_direct_write()).
+ */
+ if ((iocb->ki_flags & IOCB_NOWAIT) &&
+ !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ return -EAGAIN;
current->backing_dev_info = inode_to_bdi(inode);
ret = file_remove_privs(file);
@@ -1720,7 +1684,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
WARN_ON(reserve_bytes == 0);
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
reserve_bytes,
- reserve_bytes);
+ reserve_bytes, false);
if (ret) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -1965,8 +1929,7 @@ relock:
*/
again:
from->nofault = true;
- err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, written);
+ err = btrfs_dio_rw(iocb, from, written);
from->nofault = false;
/* No increment (+=) because iomap returns a cumulative value. */
@@ -2570,10 +2533,10 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
return ret;
}
-static int btrfs_punch_hole_lock_range(struct inode *inode,
- const u64 lockstart,
- const u64 lockend,
- struct extent_state **cached_state)
+static void btrfs_punch_hole_lock_range(struct inode *inode,
+ const u64 lockstart,
+ const u64 lockend,
+ struct extent_state **cached_state)
{
/*
* For subpage case, if the range is not at page boundary, we could
@@ -2587,40 +2550,29 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
while (1) {
- struct btrfs_ordered_extent *ordered;
- int ret;
-
truncate_pagecache_range(inode, lockstart, lockend);
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
- ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
- lockend);
-
/*
- * We need to make sure we have no ordered extents in this range
- * and nobody raced in and read a page in this range, if we did
- * we need to try again.
+ * We can't have ordered extents in the range, nor dirty/writeback
+ * pages, because we have locked the inode's VFS lock in exclusive
+ * mode, we have locked the inode's i_mmap_lock in exclusive mode,
+ * we have flushed all delalloc in the range and we have waited
+ * for any ordered extents in the range to complete.
+	 * We can race with anyone reading pages from this range, so after
+	 * locking the range, check if there are any pages in it; if there
+	 * are, unlock the range and retry.
*/
- if ((!ordered ||
- (ordered->file_offset + ordered->num_bytes <= lockstart ||
- ordered->file_offset > lockend)) &&
- !filemap_range_has_page(inode->i_mapping,
- page_lockstart, page_lockend)) {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
+ if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
+ page_lockend))
break;
- }
- if (ordered)
- btrfs_put_ordered_extent(ordered);
+
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
lockend, cached_state);
- ret = btrfs_wait_ordered_range(inode, lockstart,
- lockend - lockstart + 1);
- if (ret)
- return ret;
}
- return 0;
+
+ btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
}
static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
@@ -2976,11 +2928,12 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
bool truncated_block = false;
bool updated_inode = false;
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
- return ret;
+ goto out_only_mutex;
- btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
ino_size = round_up(inode->i_size, fs_info->sectorsize);
ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
@@ -3072,10 +3025,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
goto out_only_mutex;
}
- ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
- &cached_state);
- if (ret)
- goto out_only_mutex;
+ btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
path = btrfs_alloc_path();
if (!path) {
@@ -3237,8 +3187,6 @@ static int btrfs_zero_range(struct inode *inode,
u64 bytes_to_reserve = 0;
bool space_reserved = false;
- inode_dio_wait(inode);
-
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
alloc_end - alloc_start);
if (IS_ERR(em)) {
@@ -3368,10 +3316,8 @@ reserve_space:
if (ret < 0)
goto out;
space_reserved = true;
- ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
- &cached_state);
- if (ret)
- goto out;
+ btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+ &cached_state);
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
if (ret) {
@@ -3417,6 +3363,9 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 alloc_hint = 0;
u64 locked_end;
u64 actual_end = 0;
+ u64 data_space_needed = 0;
+ u64 data_space_reserved = 0;
+ u64 qgroup_reserved = 0;
struct extent_map *em;
int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
int ret;
@@ -3437,18 +3386,6 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_PUNCH_HOLE)
return btrfs_punch_hole(file, offset, len);
- /*
- * Only trigger disk allocation, don't trigger qgroup reserve
- *
- * For qgroup space, it will be checked later.
- */
- if (!(mode & FALLOC_FL_ZERO_RANGE)) {
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
- alloc_end - alloc_start);
- if (ret < 0)
- return ret;
- }
-
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
@@ -3485,8 +3422,12 @@ static long btrfs_fallocate(struct file *file, int mode,
}
/*
- * wait for ordered IO before we have any locks. We'll loop again
- * below with the locks held.
+ * We have locked the inode at the VFS level (in exclusive mode) and we
+	 * have locked the i_mmap_lock (in exclusive mode). Now, before
+	 * locking the file range, flush all delalloc in the range and wait for
+ * all ordered extents in the range to complete. After this we can lock
+ * the file range and, due to the previous locking we did, we know there
+ * can't be more delalloc or ordered extents in the range.
*/
ret = btrfs_wait_ordered_range(inode, alloc_start,
alloc_end - alloc_start);
@@ -3500,38 +3441,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
locked_end = alloc_end - 1;
- while (1) {
- struct btrfs_ordered_extent *ordered;
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
- /* the extent lock is ordered inside the running
- * transaction
- */
- lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
- locked_end);
-
- if (ordered &&
- ordered->file_offset + ordered->num_bytes > alloc_start &&
- ordered->file_offset < alloc_end) {
- btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- alloc_start, locked_end,
- &cached_state);
- /*
- * we can't wait on the range with the transaction
- * running or with the extent lock held
- */
- ret = btrfs_wait_ordered_range(inode, alloc_start,
- alloc_end - alloc_start);
- if (ret)
- goto out;
- } else {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- break;
- }
- }
+ btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
/* First, check if we exceed the qgroup limit */
INIT_LIST_HEAD(&reserve_list);
@@ -3548,48 +3461,64 @@ static long btrfs_fallocate(struct file *file, int mode,
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = add_falloc_range(&reserve_list, cur_offset,
- last_byte - cur_offset);
+ const u64 range_len = last_byte - cur_offset;
+
+ ret = add_falloc_range(&reserve_list, cur_offset, range_len);
if (ret < 0) {
free_extent_map(em);
break;
}
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
- &data_reserved, cur_offset,
- last_byte - cur_offset);
+ &data_reserved, cur_offset, range_len);
if (ret < 0) {
- cur_offset = last_byte;
free_extent_map(em);
break;
}
- } else {
- /*
- * Do not need to reserve unwritten extent for this
- * range, free reserved data space first, otherwise
- * it'll result in false ENOSPC error.
- */
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- data_reserved, cur_offset,
- last_byte - cur_offset);
+ qgroup_reserved += range_len;
+ data_space_needed += range_len;
}
free_extent_map(em);
cur_offset = last_byte;
}
+ if (!ret && data_space_needed > 0) {
+ /*
+		 * It is safe to reserve space here, as we can't have delalloc
+		 * in the range (see above).
+ */
+ ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+ data_space_needed);
+ if (!ret)
+ data_space_reserved = data_space_needed;
+ }
+
/*
* If ret is still 0, means we're OK to fallocate.
* Or just cleanup the list and exit.
*/
list_for_each_entry_safe(range, tmp, &reserve_list, list) {
- if (!ret)
+ if (!ret) {
ret = btrfs_prealloc_file_range(inode, mode,
range->start,
range->len, i_blocksize(inode),
offset + len, &alloc_hint);
- else
+ /*
+ * btrfs_prealloc_file_range() releases space even
+ * if it returns an error.
+ */
+ data_space_reserved -= range->len;
+ qgroup_reserved -= range->len;
+ } else if (data_space_reserved > 0) {
btrfs_free_reserved_data_space(BTRFS_I(inode),
- data_reserved, range->start,
- range->len);
+ data_reserved, range->start,
+ range->len);
+ data_space_reserved -= range->len;
+ qgroup_reserved -= range->len;
+ } else if (qgroup_reserved > 0) {
+ btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
+ range->start, range->len);
+ qgroup_reserved -= range->len;
+ }
list_del(&range->list);
kfree(range);
}
@@ -3606,10 +3535,6 @@ out_unlock:
&cached_state);
out:
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
- /* Let go of our reservation. */
- if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
- btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
- cur_offset, alloc_end - cur_offset);
extent_changeset_free(data_reserved);
return ret;
}
@@ -3767,8 +3692,7 @@ again:
*/
pagefault_disable();
to->nofault = true;
- ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, read);
+ ret = btrfs_dio_rw(iocb, to, read);
to->nofault = false;
pagefault_enable();
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 01a408db5683..f7adee6fa05e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2630,16 +2630,19 @@ out:
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 bytenr, u64 size, bool used)
{
- struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_space_info *sinfo = block_group->space_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
- const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
+ int bg_reclaim_threshold = 0;
bool initial = (size == block_group->length);
u64 reclaimable_unusable;
WARN_ON(!initial && offset + size > block_group->zone_capacity);
+ if (!initial)
+ bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
+
spin_lock(&ctl->tree_lock);
if (!used)
to_free = size;
@@ -4069,7 +4072,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
btrfs_info(fs_info, "cleaning free space cache v1");
- node = rb_first(&fs_info->block_group_cache_tree);
+ node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
block_group = rb_entry(node, struct btrfs_block_group, cache_node);
ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 0ae54d8c10d6..1bf89aa67216 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1178,7 +1178,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
goto abort;
}
- node = rb_first(&fs_info->block_group_cache_tree);
+ node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95c499b8424e..da13bd0d10f1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -64,8 +64,36 @@ struct btrfs_iget_args {
struct btrfs_dio_data {
ssize_t submitted;
struct extent_changeset *data_reserved;
+ bool data_space_reserved;
+ bool nocow_done;
};
+struct btrfs_dio_private {
+ struct inode *inode;
+
+ /*
+	 * Since DIO can use anonymous pages, we cannot use page_offset() to
+	 * grab the file offset, thus we need a dedicated member for it.
+ */
+ u64 file_offset;
+ /* Used for bio::bi_size */
+ u32 bytes;
+
+ /*
+ * References to this structure. There is one reference per in-flight
+ * bio plus one while we're still setting up.
+ */
+ refcount_t refs;
+
+ /* Array of checksums */
+ u8 *csums;
+
+ /* This must be last */
+ struct bio bio;
+};
+
+static struct bio_set btrfs_dio_bioset;
+
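
Embedding the bio as the final member is what allows a single allocation to cover both the iomap dio bio and the btrfs wrapper: the bioset is created with the offset of 'bio' as front padding, and completion code walks back to the wrapper with container_of(). A sketch of the two halves (the bioset_init() call is an assumption about the matching init code, which lies outside this hunk):

    /* Init (assumed): reserve the wrapper in front of every bio. */
    ret = bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
                      offsetof(struct btrfs_dio_private, bio),
                      BIOSET_NEED_BVECS);

    /* Completion side: recover the wrapper from the embedded bio. */
    struct btrfs_dio_private *dip =
            container_of(bio, struct btrfs_dio_private, bio);
    /* dip->inode, dip->file_offset, dip->csums are now reachable. */
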
struct btrfs_rename_ctx {
/* Output field. Stores the index number of the old directory entry. */
u64 index;
@@ -222,15 +250,25 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
static int btrfs_dirty_inode(struct inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+ struct btrfs_new_inode_args *args)
{
int err;
- err = btrfs_init_acl(trans, inode, dir);
- if (!err)
- err = btrfs_xattr_security_init(trans, inode, dir, qstr);
- return err;
+ if (args->default_acl) {
+ err = __btrfs_set_acl(trans, args->inode, args->default_acl,
+ ACL_TYPE_DEFAULT);
+ if (err)
+ return err;
+ }
+ if (args->acl) {
+ err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
+ if (err)
+ return err;
+ }
+ if (!args->default_acl && !args->acl)
+ cache_no_acl(args->inode);
+ return btrfs_xattr_security_init(trans, args->inode, args->dir,
+ &args->dentry->d_name);
}
/*
@@ -1607,6 +1645,141 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
nr_written, 1);
}
+struct can_nocow_file_extent_args {
+ /* Input fields. */
+
+ /* Start file offset of the range we want to NOCOW. */
+ u64 start;
+ /* End file offset (inclusive) of the range we want to NOCOW. */
+ u64 end;
+ bool writeback_path;
+ bool strict;
+ /*
+ * Free the path passed to can_nocow_file_extent() once it's not needed
+ * anymore.
+ */
+ bool free_path;
+
+ /* Output fields. Only set when can_nocow_file_extent() returns 1. */
+
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 extent_offset;
+ /* Number of bytes that can be written to in NOCOW mode. */
+ u64 num_bytes;
+};
+
+/*
+ * Check if we can NOCOW the file extent that the path points to.
+ * This function may return with the path released, so the caller should check
+ * whether path->nodes[0] is NULL if it needs to use the path afterwards.
+ *
+ * Returns: < 0 on error
+ * 0 if we can not NOCOW
+ * 1 if we can NOCOW
+ */
+static int can_nocow_file_extent(struct btrfs_path *path,
+ struct btrfs_key *key,
+ struct btrfs_inode *inode,
+ struct can_nocow_file_extent_args *args)
+{
+ const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ u64 extent_end;
+ u8 extent_type;
+ int can_nocow = 0;
+ int ret = 0;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ goto out;
+
+ /* Can't access these fields unless we know it's not an inline extent. */
+ args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ args->extent_offset = btrfs_file_extent_offset(leaf, fi);
+
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
+ extent_type == BTRFS_FILE_EXTENT_REG)
+ goto out;
+
+ /*
+ * If the extent was created before the generation where the last snapshot
+ * for its subvolume was created, then this implies the extent is shared,
+ * hence we must COW.
+ */
+ if (!args->strict &&
+ btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ goto out;
+
+ /* An explicit hole, must COW. */
+ if (args->disk_bytenr == 0)
+ goto out;
+
+ /* Compressed/encrypted/encoded extents must be COWed. */
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ goto out;
+
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * The following checks can be expensive, as they need to take other
+ * locks and do btree or rbtree searches, so release the path to avoid
+ * blocking other tasks for too long.
+ */
+ btrfs_release_path(path);
+
+ ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
+ key->offset - args->extent_offset,
+ args->disk_bytenr, false, path);
+ WARN_ON_ONCE(ret > 0 && is_freespace_inode);
+ if (ret != 0)
+ goto out;
+
+ if (args->free_path) {
+ /*
+		 * We don't need the path anymore, and the csum_exist_in_range()
+		 * call below will end up allocating another one. So free the
+		 * path now to avoid unnecessary memory usage.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+ }
+
+ /* If there are pending snapshots for this root, we must COW. */
+ if (args->writeback_path && !is_freespace_inode &&
+ atomic_read(&root->snapshot_force_cow))
+ goto out;
+
+ args->disk_bytenr += args->extent_offset;
+ args->disk_bytenr += args->start - key->offset;
+ args->num_bytes = min(args->end + 1, extent_end) - args->start;
+
+ /*
+ * Force COW if csums exist in the range. This ensures that csums for a
+ * given extent are either valid or do not exist.
+ */
+ ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes);
+ WARN_ON_ONCE(ret > 0 && is_freespace_inode);
+ if (ret != 0)
+ goto out;
+
+ can_nocow = 1;
+ out:
+ if (args->free_path && path)
+ btrfs_free_path(path);
+
+ return ret < 0 ? ret : can_nocow;
+}
+
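
A caller fills only the input half of the args struct and reads the outputs when 1 is returned; pared down from the run_delalloc_nocow() usage below (labels and the surrounding search loop elided):

    struct can_nocow_file_extent_args nocow_args = { 0 };
    int ret;

    nocow_args.start = cur_offset;
    nocow_args.end = end;
    nocow_args.writeback_path = true;

    ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
    if (ret < 0)
            return ret;     /* btree lookup error */
    if (ret == 0) {
            /* Must COW this range. */
    } else {
            /* NOCOW: nocow_args.disk_bytenr / .num_bytes describe the
             * range that can be written in place. */
    }
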
/*
 * Called for NOCOW writeback. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
@@ -1627,11 +1800,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
u64 cur_offset = start;
int ret;
bool check_prev = true;
- const bool freespace_inode = btrfs_is_free_space_inode(inode);
u64 ino = btrfs_ino(inode);
+ struct btrfs_block_group *bg;
bool nocow = false;
- u64 disk_bytenr = 0;
- const bool force = inode->flags & BTRFS_INODE_NODATACOW;
+ struct can_nocow_file_extent_args nocow_args = { 0 };
path = btrfs_alloc_path();
if (!path) {
@@ -1644,15 +1816,16 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
return -ENOMEM;
}
+ nocow_args.end = end;
+ nocow_args.writeback_path = true;
+
while (1) {
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
u64 extent_end;
- u64 extent_offset;
- u64 num_bytes = 0;
- u64 disk_num_bytes;
u64 ram_bytes;
+ u64 nocow_end;
int extent_type;
nocow = false;
@@ -1728,116 +1901,38 @@ next_slot:
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
-
+		/* If this is triggered then we have memory corruption. */
+ ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
+ if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
+ ret = -EUCLEAN;
+ goto error;
+ }
ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
- if (extent_type == BTRFS_FILE_EXTENT_REG ||
- extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- extent_offset = btrfs_file_extent_offset(leaf, fi);
- extent_end = found_key.offset +
- btrfs_file_extent_num_bytes(leaf, fi);
- disk_num_bytes =
- btrfs_file_extent_disk_num_bytes(leaf, fi);
- /*
- * If the extent we got ends before our current offset,
- * skip to the next extent.
- */
- if (extent_end <= cur_offset) {
- path->slots[0]++;
- goto next_slot;
- }
- /* Skip holes */
- if (disk_bytenr == 0)
- goto out_check;
- /* Skip compressed/encrypted/encoded extents */
- if (btrfs_file_extent_compression(leaf, fi) ||
- btrfs_file_extent_encryption(leaf, fi) ||
- btrfs_file_extent_other_encoding(leaf, fi))
- goto out_check;
- /*
- * If extent is created before the last volume's snapshot
- * this implies the extent is shared, hence we can't do
- * nocow. This is the same check as in
- * btrfs_cross_ref_exist but without calling
- * btrfs_search_slot.
- */
- if (!freespace_inode &&
- btrfs_file_extent_generation(leaf, fi) <=
- btrfs_root_last_snapshot(&root->root_item))
- goto out_check;
- if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
- goto out_check;
+ extent_end = btrfs_file_extent_end(path);
- /*
- * The following checks can be expensive, as they need to
- * take other locks and do btree or rbtree searches, so
- * release the path to avoid blocking other tasks for too
- * long.
- */
- btrfs_release_path(path);
+ /*
+ * If the extent we got ends before our current offset, skip to
+ * the next extent.
+ */
+ if (extent_end <= cur_offset) {
+ path->slots[0]++;
+ goto next_slot;
+ }
- ret = btrfs_cross_ref_exist(root, ino,
- found_key.offset -
- extent_offset, disk_bytenr, false);
- if (ret) {
- /*
- * ret could be -EIO if the above fails to read
- * metadata.
- */
- if (ret < 0) {
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
- goto error;
- }
+ nocow_args.start = cur_offset;
+ ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
+ if (ret < 0) {
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+ goto error;
+ } else if (ret == 0) {
+ goto out_check;
+ }
- WARN_ON_ONCE(freespace_inode);
- goto out_check;
- }
- disk_bytenr += extent_offset;
- disk_bytenr += cur_offset - found_key.offset;
- num_bytes = min(end + 1, extent_end) - cur_offset;
- /*
- * If there are pending snapshots for this root, we
- * fall into common COW way
- */
- if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
- goto out_check;
- /*
- * force cow if csum exists in the range.
- * this ensure that csum for a given extent are
- * either valid or do not exist.
- */
- ret = csum_exist_in_range(fs_info, disk_bytenr,
- num_bytes);
- if (ret) {
- /*
- * ret could be -EIO if the above fails to read
- * metadata.
- */
- if (ret < 0) {
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
- goto error;
- }
- WARN_ON_ONCE(freespace_inode);
- goto out_check;
- }
- /* If the extent's block group is RO, we must COW */
- if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
- goto out_check;
+ ret = 0;
+ bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
+ if (bg)
nocow = true;
- } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- extent_end = found_key.offset + ram_bytes;
- extent_end = ALIGN(extent_end, fs_info->sectorsize);
- /* Skip extents outside of our requested range */
- if (extent_end <= start) {
- path->slots[0]++;
- goto next_slot;
- }
- } else {
- /* If this triggers then we have a memory corruption */
- BUG();
- }
out_check:
/*
* If nocow is false then record the beginning of the range
@@ -1869,15 +1964,17 @@ out_check:
cow_start = (u64)-1;
}
+ nocow_end = cur_offset + nocow_args.num_bytes - 1;
+
if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
- u64 orig_start = found_key.offset - extent_offset;
+ u64 orig_start = found_key.offset - nocow_args.extent_offset;
struct extent_map *em;
- em = create_io_em(inode, cur_offset, num_bytes,
+ em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
orig_start,
- disk_bytenr, /* block_start */
- num_bytes, /* block_len */
- disk_num_bytes, /* orig_block_len */
+ nocow_args.disk_bytenr, /* block_start */
+ nocow_args.num_bytes, /* block_len */
+ nocow_args.disk_num_bytes, /* orig_block_len */
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
@@ -1886,20 +1983,23 @@ out_check:
}
free_extent_map(em);
ret = btrfs_add_ordered_extent(inode,
- cur_offset, num_bytes, num_bytes,
- disk_bytenr, num_bytes, 0,
+ cur_offset, nocow_args.num_bytes,
+ nocow_args.num_bytes,
+ nocow_args.disk_bytenr,
+ nocow_args.num_bytes, 0,
1 << BTRFS_ORDERED_PREALLOC,
BTRFS_COMPRESS_NONE);
if (ret) {
btrfs_drop_extent_cache(inode, cur_offset,
- cur_offset + num_bytes - 1,
- 0);
+ nocow_end, 0);
goto error;
}
} else {
ret = btrfs_add_ordered_extent(inode, cur_offset,
- num_bytes, num_bytes,
- disk_bytenr, num_bytes,
+ nocow_args.num_bytes,
+ nocow_args.num_bytes,
+ nocow_args.disk_bytenr,
+ nocow_args.num_bytes,
0,
1 << BTRFS_ORDERED_NOCOW,
BTRFS_COMPRESS_NONE);
@@ -1907,9 +2007,10 @@ out_check:
goto error;
}
- if (nocow)
- btrfs_dec_nocow_writers(fs_info, disk_bytenr);
- nocow = false;
+ if (nocow) {
+ btrfs_dec_nocow_writers(bg);
+ nocow = false;
+ }
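btrfs_inc_nocow_writers() now returns the block group it pinned (NULL if the group is read-only or missing), so the matching decrement takes the pointer directly instead of re-deriving the block group from fs_info and the disk bytenr. The resulting pattern, condensed from this function:

    struct btrfs_block_group *bg;

    bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
    if (bg)
        nocow = true;        /* otherwise fall through to the COW path */

    /* ... after the NOCOW ordered extent has been created ... */
    if (nocow) {
        btrfs_dec_nocow_writers(bg);    /* no fs_info + bytenr re-lookup */
        nocow = false;
    }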
if (btrfs_is_data_reloc_root(root))
/*
@@ -1918,10 +2019,9 @@ out_check:
* from freeing metadata of created ordered extent.
*/
ret = btrfs_reloc_clone_csums(inode, cur_offset,
- num_bytes);
+ nocow_args.num_bytes);
- extent_clear_unlock_delalloc(inode, cur_offset,
- cur_offset + num_bytes - 1,
+ extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
@@ -1954,7 +2054,7 @@ out_check:
error:
if (nocow)
- btrfs_dec_nocow_writers(fs_info, disk_bytenr);
+ btrfs_dec_nocow_writers(bg);
if (ret && cur_offset < end)
extent_clear_unlock_delalloc(inode, cur_offset, end,
@@ -2498,9 +2598,8 @@ out:
*
* c-3) otherwise: async submit
*/
-blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
-
+void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2529,16 +2628,14 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
if (ret)
goto out;
- if (bio_flags & EXTENT_BIO_COMPRESSED) {
+ if (compress_type != BTRFS_COMPRESS_NONE) {
/*
* btrfs_submit_compressed_read will handle completing
* the bio if there were any errors, so just return
* here.
*/
- ret = btrfs_submit_compressed_read(inode, bio,
- mirror_num,
- bio_flags);
- goto out_no_endio;
+ btrfs_submit_compressed_read(inode, bio, mirror_num);
+ return;
} else {
/*
* Lookup bio sums does extra checks around whether we
@@ -2555,7 +2652,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
if (btrfs_is_data_reloc_root(root))
goto mapit;
/* we're doing a write, do the async checksumming */
- ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
+ ret = btrfs_wq_submit_bio(inode, bio, mirror_num,
0, btrfs_submit_bio_start);
goto out;
} else if (!skip_sum) {
@@ -2572,8 +2669,6 @@ out:
bio->bi_status = ret;
bio_endio(bio);
}
-out_no_endio:
- return ret;
}
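With btrfs_submit_data_bio() now returning void, submission errors are terminal inside the function: the bio is completed with the error status rather than bubbled up as a blk_status_t, which is what allowed the out_no_endio return path to be dropped. The contract at the out: label boils down to this sketch (ret is already a blk_status_t at that point):

    if (ret) {
        bio->bi_status = ret;
        bio_endio(bio);     /* the caller must not touch the bio afterwards */
    }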
/*
@@ -3264,11 +3359,11 @@ static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
shash->tfm = fs_info->csum_shash;
crypto_shash_digest(shash, kaddr + pgoff, len, csum);
+ kunmap_atomic(kaddr);
if (memcmp(csum, csum_expected, csum_size))
goto zeroit;
- kunmap_atomic(kaddr);
return 0;
zeroit:
btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
@@ -3276,9 +3371,7 @@ zeroit:
if (bbio->device)
btrfs_dev_stat_inc_and_print(bbio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
- memset(kaddr + pgoff, 1, len);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
+ memzero_page(page, pgoff, len);
return -EIO;
}
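Two small fixes in one hunk: kunmap_atomic() is hoisted above the memcmp(), which is safe because csum and csum_expected are ordinary buffers rather than part of the mapped page, and the open-coded map/fill/flush/unmap sequence becomes memzero_page(). Note the old code filled the range with 0x01 while the helper writes zeroes; since the range is failed with -EIO either way, the fill byte should not matter. memzero_page() (linux/highmem.h) is roughly shorthand for:

    void *kaddr = kmap_local_page(page);

    memset(kaddr + pgoff, 0, len);
    flush_dcache_page(page);
    kunmap_local(kaddr);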
@@ -3483,6 +3576,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
+ /* Bail out if the cleanup is already running. */
if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
return 0;
@@ -3565,17 +3659,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
*
* btrfs_find_orphan_roots() ran before us, which has
* found all deleted roots and loaded them into
- * fs_info->fs_roots_radix. So here we can find if an
+ * fs_info->fs_roots. So here we can find if an
* orphan item corresponds to a deleted root by looking
- * up the root from that radix tree.
+ * up the root from that xarray.
*/
- spin_lock(&fs_info->fs_roots_radix_lock);
- dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
- (unsigned long)found_key.objectid);
+ spin_lock(&fs_info->fs_roots_lock);
+ dead_root = xa_load(&fs_info->fs_roots,
+ (unsigned long)found_key.objectid);
if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
is_dead_root = 1;
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
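This is the lookup side of the radix tree to xarray conversion in this series; the index type (unsigned long) stays the same, and the existing spinlock is kept to serialize against the root being removed and freed. For reference, the API correspondence the conversion relies on, not taken from this patch:

    /*
     * radix_tree_lookup(&tree, i)        ->  xa_load(&xa, i)
     * radix_tree_insert(&tree, i, p)     ->  xa_insert(&xa, i, p, gfp)
     * radix_tree_delete(&tree, i)        ->  xa_erase(&xa, i)
     * gang lookups                       ->  xa_for_each(&xa, i, entry)
     */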
if (is_dead_root) {
/* prevent this orphan from being found again */
@@ -3815,7 +3909,7 @@ cache_index:
* cache.
*
* This is required for both inode re-read from disk and delayed inode
- * in delayed_nodes_tree.
+ * in the delayed_nodes xarray.
*/
if (BTRFS_I(inode)->last_trans == fs_info->generation)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
@@ -4199,8 +4293,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
* 1 for the dir index
* 1 for the inode ref
* 1 for the inode
+ * 1 for the parent inode
*/
- return btrfs_start_transaction_fallback_global_rsv(root, 5);
+ return btrfs_start_transaction_fallback_global_rsv(root, 6);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -4693,7 +4788,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
goto out;
}
}
- ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize);
+ ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
if (ret < 0) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode, data_reserved,
@@ -5780,8 +5875,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
struct list_head ins_list;
struct list_head del_list;
int ret;
- struct extent_buffer *leaf;
- int slot;
char *name_ptr;
int name_len;
int entries = 0;
@@ -5808,35 +5901,19 @@ again:
key.offset = ctx->pos;
key.objectid = btrfs_ino(BTRFS_I(inode));
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto err;
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, ret) {
struct dir_entry *entry;
-
- leaf = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto err;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ struct extent_buffer *leaf = path->nodes[0];
if (found_key.objectid != key.objectid)
break;
if (found_key.type != BTRFS_DIR_INDEX_KEY)
break;
if (found_key.offset < ctx->pos)
- goto next;
+ continue;
if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
- goto next;
- di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ continue;
+ di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
name_len = btrfs_dir_name_len(leaf, di);
if ((total_len + sizeof(struct dir_entry) + name_len) >=
PAGE_SIZE) {
@@ -5863,9 +5940,11 @@ again:
entries++;
addr += sizeof(struct dir_entry) + name_len;
total_len += sizeof(struct dir_entry) + name_len;
-next:
- path->slots[0]++;
}
+ /* Catch error encountered during iteration */
+ if (ret < 0)
+ goto err;
+
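The open-coded search_slot/next_leaf/slot++ loop is replaced by btrfs_for_each_slot(), one of the new b-tree item iterators. The usage contract as seen above; the key setup shown is an assumption reconstructed from the checks inside the loop:

    struct btrfs_key key = {
        .objectid = btrfs_ino(BTRFS_I(inode)),
        .type = BTRFS_DIR_INDEX_KEY,
        .offset = ctx->pos,
    };
    struct btrfs_key found_key;
    int ret;

    btrfs_for_each_slot(root, &key, &found_key, path, ret) {
        /*
         * Runs once per item at path->nodes[0]/path->slots[0], with
         * found_key already decoded; leaf crossing happens inside the
         * iterator, so a plain continue replaces the old goto next_slot.
         */
    }
    if (ret < 0)        /* ret > 0 only means the tree ran out of items */
        goto err;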
btrfs_release_path(path);
ret = btrfs_filldir(private->filldir_buf, entries, ctx);
@@ -6053,6 +6132,57 @@ static int btrfs_insert_inode_locked(struct inode *inode)
btrfs_find_actor, &args);
}
+int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
+ unsigned int *trans_num_items)
+{
+ struct inode *dir = args->dir;
+ struct inode *inode = args->inode;
+ int ret;
+
+ ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
+ if (ret)
+ return ret;
+
+ /* 1 to add inode item */
+ *trans_num_items = 1;
+ /* 1 to add compression property */
+ if (BTRFS_I(dir)->prop_compress)
+ (*trans_num_items)++;
+ /* 1 to add default ACL xattr */
+ if (args->default_acl)
+ (*trans_num_items)++;
+ /* 1 to add access ACL xattr */
+ if (args->acl)
+ (*trans_num_items)++;
+#ifdef CONFIG_SECURITY
+ /* 1 to add LSM xattr */
+ if (dir->i_security)
+ (*trans_num_items)++;
+#endif
+ if (args->orphan) {
+ /* 1 to add orphan item */
+ (*trans_num_items)++;
+ } else {
+ /*
+ * 1 to add dir item
+ * 1 to add dir index
+ * 1 to update parent inode item
+ *
+ * No need for 1 unit for the inode ref item because it is
+ * inserted in a batch together with the inode item at
+ * btrfs_create_new_inode().
+ */
+ *trans_num_items += 3;
+ }
+ return 0;
+}
+
+void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
+{
+ posix_acl_release(args->acl);
+ posix_acl_release(args->default_acl);
+}
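Inode creation is now a prepare/execute/destroy triple around btrfs_new_inode_args, so the transaction can be sized before it is started. The expected sequence, condensed from btrfs_create_common() further down (error handling trimmed):

    struct btrfs_new_inode_args args = {
        .dir = dir, .dentry = dentry, .inode = inode,
    };
    unsigned int trans_num_items;
    int ret;

    ret = btrfs_new_inode_prepare(&args, &trans_num_items); /* ACLs + item count */
    if (ret)
        goto out_inode;
    trans = btrfs_start_transaction(root, trans_num_items);
    if (IS_ERR(trans)) {
        ret = PTR_ERR(trans);
        goto out_args;
    }
    ret = btrfs_create_new_inode(trans, &args); /* inserts items, adds the link */
    btrfs_end_transaction(trans);
out_args:
    btrfs_new_inode_args_destroy(&args);        /* releases the cached ACLs */
out_inode:
    if (ret)
        iput(inode);                            /* caller-owned reference */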
+
/*
* Inherit flags from the parent inode.
*
@@ -6062,9 +6192,6 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
{
unsigned int flags;
- if (!dir)
- return;
-
flags = BTRFS_I(dir)->flags;
if (flags & BTRFS_INODE_NOCOMPRESS) {
@@ -6084,76 +6211,86 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
btrfs_sync_inode_flags_to_i_flags(inode);
}
-static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct user_namespace *mnt_userns,
- struct inode *dir,
- const char *name, int name_len,
- u64 ref_objectid, u64 objectid,
- umode_t mode, u64 *index)
+int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
+ struct inode *dir = args->dir;
+ struct inode *inode = args->inode;
+ const char *name = args->orphan ? NULL : args->dentry->d_name.name;
+ int name_len = args->orphan ? 0 : args->dentry->d_name.len;
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_root *root;
struct btrfs_inode_item *inode_item;
struct btrfs_key *location;
struct btrfs_path *path;
+ u64 objectid;
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
struct btrfs_item_batch batch;
unsigned long ptr;
- unsigned int nofs_flag;
int ret;
path = btrfs_alloc_path();
if (!path)
- return ERR_PTR(-ENOMEM);
-
- nofs_flag = memalloc_nofs_save();
- inode = new_inode(fs_info->sb);
- memalloc_nofs_restore(nofs_flag);
- if (!inode) {
- btrfs_free_path(path);
- return ERR_PTR(-ENOMEM);
- }
+ return -ENOMEM;
- /*
- * O_TMPFILE, set link count to 0, so that after this point,
- * we fill in an inode item with the correct link count.
- */
- if (!name)
- set_nlink(inode, 0);
+ if (!args->subvol)
+ BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
+ root = BTRFS_I(inode)->root;
- /*
- * we have to initialize this early, so we can reclaim the inode
- * number if we fail afterwards in this function.
- */
+ ret = btrfs_get_free_objectid(root, &objectid);
+ if (ret)
+ goto out;
inode->i_ino = objectid;
- if (dir && name) {
+ if (args->orphan) {
+ /*
+ * O_TMPFILE, set link count to 0, so that after this point, we
+ * fill in an inode item with the correct link count.
+ */
+ set_nlink(inode, 0);
+ } else {
trace_btrfs_inode_request(dir);
- ret = btrfs_set_inode_index(BTRFS_I(dir), index);
- if (ret) {
- btrfs_free_path(path);
- iput(inode);
- return ERR_PTR(ret);
- }
- } else if (dir) {
- *index = 0;
+ ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
+ if (ret)
+ goto out;
}
- /*
- * index_cnt is ignored for everything but a dir,
- * btrfs_set_inode_index_count has an explanation for the magic
- * number
- */
- BTRFS_I(inode)->index_cnt = 2;
- BTRFS_I(inode)->dir_index = *index;
- BTRFS_I(inode)->root = btrfs_grab_root(root);
+ /* index_cnt is ignored for everything but a dir. */
+ BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
/*
+ * Subvolumes don't inherit flags from their parent directory.
+ * Originally this was probably by accident, but we probably can't
+ * change it now without compatibility issues.
+ */
+ if (!args->subvol)
+ btrfs_inherit_iflags(inode, dir);
+
+ if (S_ISREG(inode->i_mode)) {
+ if (btrfs_test_opt(fs_info, NODATASUM))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ if (btrfs_test_opt(fs_info, NODATACOW))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
+ }
+
+ location = &BTRFS_I(inode)->location;
+ location->objectid = objectid;
+ location->offset = 0;
+ location->type = BTRFS_INODE_ITEM_KEY;
+
+ ret = btrfs_insert_inode_locked(inode);
+ if (ret < 0) {
+ if (!args->orphan)
+ BTRFS_I(dir)->index_cnt--;
+ goto out;
+ }
+
+ /*
* We could have gotten an inode number from somebody who was fsynced
* and then removed in this same transaction, so let's just set full
* sync since it will be a full sync anyway and this will blow away the
@@ -6167,7 +6304,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
sizes[0] = sizeof(struct btrfs_inode_item);
- if (name) {
+ if (!args->orphan) {
/*
* Start new inodes with an inode_ref. This is slightly more
* efficient for small numbers of hard links since they will
@@ -6176,64 +6313,95 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
*/
key[1].objectid = objectid;
key[1].type = BTRFS_INODE_REF_KEY;
- key[1].offset = ref_objectid;
-
- sizes[1] = name_len + sizeof(*ref);
- }
-
- location = &BTRFS_I(inode)->location;
- location->objectid = objectid;
- location->offset = 0;
- location->type = BTRFS_INODE_ITEM_KEY;
-
- ret = btrfs_insert_inode_locked(inode);
- if (ret < 0) {
- iput(inode);
- goto fail;
+ if (args->subvol) {
+ key[1].offset = objectid;
+ sizes[1] = 2 + sizeof(*ref);
+ } else {
+ key[1].offset = btrfs_ino(BTRFS_I(dir));
+ sizes[1] = name_len + sizeof(*ref);
+ }
}
batch.keys = &key[0];
batch.data_sizes = &sizes[0];
- batch.total_data_size = sizes[0] + (name ? sizes[1] : 0);
- batch.nr = name ? 2 : 1;
+ batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
+ batch.nr = args->orphan ? 1 : 2;
ret = btrfs_insert_empty_items(trans, root, path, &batch);
- if (ret != 0)
- goto fail_unlock;
-
- inode_init_owner(mnt_userns, inode, dir, mode);
- inode_set_bytes(inode, 0);
+ if (ret != 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
inode->i_mtime = current_time(inode);
inode->i_atime = inode->i_mtime;
inode->i_ctime = inode->i_mtime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
+ /*
+ * We're going to fill the inode item now, so at this point the inode
+ * must be fully initialized.
+ */
+
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
sizeof(*inode_item));
fill_inode_item(trans, path->nodes[0], inode_item, inode);
- if (name) {
+ if (!args->orphan) {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
struct btrfs_inode_ref);
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
- btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
ptr = (unsigned long)(ref + 1);
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ if (args->subvol) {
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
+ write_extent_buffer(path->nodes[0], "..", ptr, 2);
+ } else {
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref,
+ BTRFS_I(inode)->dir_index);
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ }
}
btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_free_path(path);
+ btrfs_release_path(path);
- btrfs_inherit_iflags(inode, dir);
+ if (args->subvol) {
+ struct inode *parent;
- if (S_ISREG(mode)) {
- if (btrfs_test_opt(fs_info, NODATASUM))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
- if (btrfs_test_opt(fs_info, NODATACOW))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
- BTRFS_INODE_NODATASUM;
+ /*
+ * Subvolumes inherit properties from their parent subvolume,
+ * not the directory they were created in.
+ */
+ parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_I(dir)->root);
+ if (IS_ERR(parent)) {
+ ret = PTR_ERR(parent);
+ } else {
+ ret = btrfs_inode_inherit_props(trans, inode, parent);
+ iput(parent);
+ }
+ } else {
+ ret = btrfs_inode_inherit_props(trans, inode, dir);
+ }
+ if (ret) {
+ btrfs_err(fs_info,
+ "error inheriting props for ino %llu (root %llu): %d",
+ btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
+ ret);
+ }
+
+ /*
+ * Subvolumes don't inherit ACLs or get passed to the LSM. This is
+ * probably a bug.
+ */
+ if (!args->subvol) {
+ ret = btrfs_init_inode_security(trans, args);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
}
inode_tree_add(inode);
@@ -6243,21 +6411,30 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_update_root_times(trans, root);
- ret = btrfs_inode_inherit_props(trans, inode, dir);
- if (ret)
- btrfs_err(fs_info,
- "error inheriting props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);
+ if (args->orphan) {
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ } else {
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
+ name_len, 0, BTRFS_I(inode)->dir_index);
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
- return inode;
+ ret = 0;
+ goto out;
-fail_unlock:
+discard:
+ /*
+ * discard_new_inode() calls iput(), but the caller owns the reference
+ * to the inode.
+ */
+ ihold(inode);
discard_new_inode(inode);
-fail:
- if (dir && name)
- BTRFS_I(dir)->index_cnt--;
+out:
btrfs_free_path(path);
- return ERR_PTR(ret);
+ return ret;
}
/*
@@ -6349,147 +6526,71 @@ fail_dir_item:
return ret;
}
-static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
- struct btrfs_inode *dir, struct dentry *dentry,
- struct btrfs_inode *inode, int backref, u64 index)
-{
- int err = btrfs_add_link(trans, dir, inode,
- dentry->d_name.name, dentry->d_name.len,
- backref, index);
- if (err > 0)
- err = -EEXIST;
- return err;
-}
-
-static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t rdev)
+static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = NULL;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .inode = inode,
+ };
+ unsigned int trans_num_items;
+ struct btrfs_trans_handle *trans;
int err;
- u64 objectid;
- u64 index = 0;
- /*
- * 2 for inode item and ref
- * 2 for dir items
- * 1 for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- err = btrfs_get_free_objectid(root, &objectid);
+ err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (err)
- goto out_unlock;
+ goto out_inode;
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name, dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_unlock;
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_new_inode_args;
}
- /*
- * If the active LSM wants to access the inode during
- * d_instantiate it needs these. Smack checks to see
- * if the filesystem supports xattrs by looking at the
- * ops vector.
- */
- inode->i_op = &btrfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode, rdev);
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_unlock;
-
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
- 0, index);
- if (err)
- goto out_unlock;
-
- btrfs_update_inode(trans, root, BTRFS_I(inode));
- d_instantiate_new(dentry, inode);
+ err = btrfs_create_new_inode(trans, &new_inode_args);
+ if (!err)
+ d_instantiate_new(dentry, inode);
-out_unlock:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (err)
+ iput(inode);
return err;
}
-static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode, bool excl)
+static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = NULL;
- int err;
- u64 objectid;
- u64 index = 0;
+ struct inode *inode;
- /*
- * 2 for inode item and ref
- * 2 for dir items
- * 1 for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, rdev);
+ return btrfs_create_common(dir, dentry, inode);
+}
- err = btrfs_get_free_objectid(root, &objectid);
- if (err)
- goto out_unlock;
+static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
+{
+ struct inode *inode;
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name, dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_unlock;
- }
- /*
- * If the active LSM wants to access the inode during
- * d_instantiate it needs these. Smack checks to see
- * if the filesystem supports xattrs by looking at the
- * ops vector.
- */
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_unlock;
-
- err = btrfs_update_inode(trans, root, BTRFS_I(inode));
- if (err)
- goto out_unlock;
-
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
- 0, index);
- if (err)
- goto out_unlock;
-
- d_instantiate_new(dentry, inode);
-
-out_unlock:
- btrfs_end_transaction(trans);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
- btrfs_btree_balance_dirty(fs_info);
- return err;
+ return btrfs_create_common(dir, dentry, inode);
}
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -6535,8 +6636,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
- 1, index);
+ err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ dentry->d_name.name, dentry->d_name.len, 1, index);
if (err) {
drop_inode = 1;
@@ -6573,66 +6674,15 @@ fail:
static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct inode *inode = NULL;
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(dir)->root;
- int err = 0;
- u64 objectid = 0;
- u64 index = 0;
-
- /*
- * 2 items for inode and ref
- * 2 items for dir items
- * 1 for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- err = btrfs_get_free_objectid(root, &objectid);
- if (err)
- goto out_fail;
-
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name, dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)), objectid,
- S_IFDIR | mode, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_fail;
- }
+ struct inode *inode;
- /* these must be set before we unlock the inode */
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode);
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_fail;
-
- btrfs_i_size_write(BTRFS_I(inode), 0);
- err = btrfs_update_inode(trans, root, BTRFS_I(inode));
- if (err)
- goto out_fail;
-
- err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- dentry->d_name.name,
- dentry->d_name.len, 0, index);
- if (err)
- goto out_fail;
-
- d_instantiate_new(dentry, inode);
-
-out_fail:
- btrfs_end_transaction(trans);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
- btrfs_btree_balance_dirty(fs_info);
- return err;
+ return btrfs_create_common(dir, dentry, inode);
}
static noinline int uncompress_inline(struct btrfs_path *path,
@@ -7141,6 +7191,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *ram_bytes, bool strict)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct can_nocow_file_extent_args nocow_args = { 0 };
struct btrfs_path *path;
int ret;
struct extent_buffer *leaf;
@@ -7148,13 +7199,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
- u64 disk_bytenr;
- u64 backref_offset;
- u64 extent_end;
- u64 num_bytes;
- int slot;
int found_type;
- bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
path = btrfs_alloc_path();
if (!path)
@@ -7165,18 +7210,17 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
if (ret < 0)
goto out;
- slot = path->slots[0];
if (ret == 1) {
- if (slot == 0) {
+ if (path->slots[0] == 0) {
/* can't find the item, must cow */
ret = 0;
goto out;
}
- slot--;
+ path->slots[0]--;
}
ret = 0;
leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, slot);
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
key.type != BTRFS_EXTENT_DATA_KEY) {
/* not our file or wrong item type, must cow */
@@ -7188,55 +7232,38 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
goto out;
}
- fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- found_type = btrfs_file_extent_type(leaf, fi);
- if (found_type != BTRFS_FILE_EXTENT_REG &&
- found_type != BTRFS_FILE_EXTENT_PREALLOC) {
- /* not a regular extent, must cow */
- goto out;
- }
-
- if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+ if (btrfs_file_extent_end(path) <= offset)
goto out;
- extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- if (extent_end <= offset)
- goto out;
+ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, fi);
+ if (ram_bytes)
+ *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (disk_bytenr == 0)
- goto out;
+ nocow_args.start = offset;
+ nocow_args.end = offset + *len - 1;
+ nocow_args.strict = strict;
+ nocow_args.free_path = true;
- if (btrfs_file_extent_compression(leaf, fi) ||
- btrfs_file_extent_encryption(leaf, fi) ||
- btrfs_file_extent_other_encoding(leaf, fi))
- goto out;
+ ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
+ /* can_nocow_file_extent() has freed the path. */
+ path = NULL;
- /*
- * Do the same check as in btrfs_cross_ref_exist but without the
- * unnecessary search.
- */
- if (!strict &&
- (btrfs_file_extent_generation(leaf, fi) <=
- btrfs_root_last_snapshot(&root->root_item)))
+ if (ret != 1) {
+ /* Treat errors as not being able to NOCOW. */
+ ret = 0;
goto out;
-
- backref_offset = btrfs_file_extent_offset(leaf, fi);
-
- if (orig_start) {
- *orig_start = key.offset - backref_offset;
- *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
- *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
}
- if (btrfs_extent_readonly(fs_info, disk_bytenr))
+ ret = 0;
+ if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
goto out;
- num_bytes = min(offset + *len, extent_end) - offset;
- if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
- range_end = round_up(offset + num_bytes,
+ range_end = round_up(offset + nocow_args.num_bytes,
root->fs_info->sectorsize) - 1;
ret = test_range_bit(io_tree, offset, range_end,
EXTENT_DELALLOC, 0, NULL);
@@ -7246,36 +7273,12 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
}
}
- btrfs_release_path(path);
-
- /*
- * look for other files referencing this extent, if we
- * find any we must cow
- */
-
- ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
- key.offset - backref_offset, disk_bytenr,
- strict);
- if (ret) {
- ret = 0;
- goto out;
- }
+ if (orig_start)
+ *orig_start = key.offset - nocow_args.extent_offset;
+ if (orig_block_len)
+ *orig_block_len = nocow_args.disk_num_bytes;
- /*
- * adjust disk_bytenr and num_bytes to cover just the bytes
- * in this extent we are about to write. If there
- * are any csums in that range we have to cow in order
- * to keep the csums correct
- */
- disk_bytenr += backref_offset;
- disk_bytenr += offset - key.offset;
- if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
- goto out;
- /*
- * all of the above have passed, it is safe to overwrite this extent
- * without cow
- */
- *len = num_bytes;
+ *len = nocow_args.num_bytes;
ret = 1;
out:
btrfs_free_path(path);
@@ -7283,14 +7286,22 @@ out:
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state, bool writing)
+ struct extent_state **cached_state,
+ unsigned int iomap_flags)
{
+ const bool writing = (iomap_flags & IOMAP_WRITE);
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
int ret = 0;
while (1) {
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ if (nowait) {
+ if (!try_lock_extent(io_tree, lockstart, lockend))
+ return -EAGAIN;
+ } else {
+ lock_extent_bits(io_tree, lockstart, lockend, cached_state);
+ }
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure there's no ordered
@@ -7311,10 +7322,14 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
lockstart, lockend)))
break;
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ unlock_extent_cached(io_tree, lockstart, lockend, cached_state);
if (ordered) {
+ if (nowait) {
+ btrfs_put_ordered_extent(ordered);
+ ret = -EAGAIN;
+ break;
+ }
/*
* If we are doing a DIO read and the ordered extent we
* found is for a buffered write, we can not wait for it
@@ -7334,7 +7349,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
btrfs_start_ordered_extent(ordered, 1);
else
- ret = -ENOTBLK;
+ ret = nowait ? -EAGAIN : -ENOTBLK;
btrfs_put_ordered_extent(ordered);
} else {
/*
@@ -7350,7 +7365,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
* ordered extent to complete while holding a lock on
* that page.
*/
- ret = -ENOTBLK;
+ ret = nowait ? -EAGAIN : -ENOTBLK;
}
if (ret)
@@ -7424,12 +7439,15 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
static int btrfs_get_blocks_direct_write(struct extent_map **map,
struct inode *inode,
struct btrfs_dio_data *dio_data,
- u64 start, u64 len)
+ u64 start, u64 len,
+ unsigned int iomap_flags)
{
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em = *map;
int type;
u64 block_start, orig_start, orig_block_len, ram_bytes;
+ struct btrfs_block_group *bg;
bool can_nocow = false;
bool space_reserved = false;
u64 prev_len;
@@ -7455,9 +7473,11 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes, false) == 1 &&
- btrfs_inc_nocow_writers(fs_info, block_start))
- can_nocow = true;
+ &orig_block_len, &ram_bytes, false) == 1) {
+ bg = btrfs_inc_nocow_writers(fs_info, block_start);
+ if (bg)
+ can_nocow = true;
+ }
}
prev_len = len;
@@ -7465,12 +7485,15 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
struct extent_map *em2;
/* We can NOCOW, so only need to reserve metadata space. */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len);
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ nowait);
if (ret < 0) {
/* Our caller expects us to free the input extent map. */
free_extent_map(em);
*map = NULL;
- btrfs_dec_nocow_writers(fs_info, block_start);
+ btrfs_dec_nocow_writers(bg);
+ if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
+ ret = -EAGAIN;
goto out;
}
space_reserved = true;
@@ -7479,7 +7502,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
orig_start, block_start,
len, orig_block_len,
ram_bytes, type);
- btrfs_dec_nocow_writers(fs_info, block_start);
+ btrfs_dec_nocow_writers(bg);
if (type == BTRFS_ORDERED_PREALLOC) {
free_extent_map(em);
*map = em = em2;
@@ -7489,15 +7512,29 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
ret = PTR_ERR(em2);
goto out;
}
+
+ dio_data->nocow_done = true;
} else {
/* Our caller expects us to free the input extent map. */
free_extent_map(em);
*map = NULL;
- /* We have to COW, so need to reserve metadata and data space. */
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
- &dio_data->data_reserved,
- start, len);
+ if (nowait)
+ return -EAGAIN;
+
+ /*
+ * If we could not allocate data space before locking the file
+ * range and we can't do a NOCOW write, then we have to fail.
+ */
+ if (!dio_data->data_space_reserved)
+ return -ENOSPC;
+
+ /*
+ * We have to COW and we have already reserved data space before,
+ * so now we reserve only metadata.
+ */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ false);
if (ret < 0)
goto out;
space_reserved = true;
@@ -7510,10 +7547,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
*map = em;
len = min(len, em->len - (start - em->start));
if (len < prev_len)
- btrfs_delalloc_release_space(BTRFS_I(inode),
- dio_data->data_reserved,
- start + len, prev_len - len,
- true);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ prev_len - len, true);
}
/*
@@ -7531,15 +7566,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
out:
if (ret && space_reserved) {
btrfs_delalloc_release_extents(BTRFS_I(inode), len);
- if (can_nocow) {
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
- } else {
- btrfs_delalloc_release_space(BTRFS_I(inode),
- dio_data->data_reserved,
- start, len, true);
- extent_changeset_free(dio_data->data_reserved);
- dio_data->data_reserved = NULL;
- }
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
}
return ret;
}
@@ -7548,14 +7575,16 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
loff_t length, unsigned int flags, struct iomap *iomap,
struct iomap *srcmap)
{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
- struct btrfs_dio_data *dio_data = NULL;
+ struct btrfs_dio_data *dio_data = iter->private;
u64 lockstart, lockend;
const bool write = !!(flags & IOMAP_WRITE);
int ret = 0;
u64 len = length;
+ const u64 data_alloc_len = length;
bool unlock_extents = false;
if (!write)
@@ -7565,34 +7594,67 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
lockend = start + len - 1;
/*
- * The generic stuff only does filemap_write_and_wait_range, which
- * isn't enough if we've written compressed pages to this area, so we
- * need to flush the dirty pages again to make absolutely sure that any
- * outstanding dirty pages are on disk.
+ * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
+ * enough if we've written compressed pages to this area, so we need to
+ * flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk - the first flush only starts
+ * compression on the data, while keeping the pages locked, so by the
+ * time the second flush returns we know bios for the compressed pages
+ * were submitted and finished, and the pages are no longer under writeback.
+ *
+ * If we have a NOWAIT request and we have any pages in the range that
+ * are locked, likely due to compression still in progress, we don't want
+ * to block on page locks. We also don't want to block on pages marked as
+ * dirty or under writeback (same as for the non-compression case).
+ * iomap_dio_rw() did the same check, but after that and before we got
+ * here, mmap'ed writes may have happened or buffered reads started
+ * (readpage() and readahead(), which lock pages), as we haven't locked
+ * the file range yet.
*/
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags)) {
- ret = filemap_fdatawrite_range(inode->i_mapping, start,
- start + length - 1);
- if (ret)
- return ret;
+ if (flags & IOMAP_NOWAIT) {
+ if (filemap_range_needs_writeback(inode->i_mapping,
+ lockstart, lockend))
+ return -EAGAIN;
+ } else {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
+ }
}
- dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
- if (!dio_data)
- return -ENOMEM;
-
- iomap->private = dio_data;
+ memset(dio_data, 0, sizeof(*dio_data));
+ /*
+ * We always try to allocate data space and must do it before locking
+ * the file range, to avoid deadlocks with concurrent writes to the same
+ * range if the range has several extents and the writes don't expand the
+ * current i_size (the inode lock is taken in shared mode). If we fail to
+ * allocate data space here we continue and later, after locking the
+ * file range, we fail with ENOSPC only if we figure out we can not do a
+ * NOCOW write.
+ */
+ if (write && !(flags & IOMAP_NOWAIT)) {
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, data_alloc_len);
+ if (!ret)
+ dio_data->data_space_reserved = true;
+ else if (ret && !(BTRFS_I(inode)->flags &
+ (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ goto err;
+ }
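Condensing the comment above into control flow (names from this patch, error handling trimmed): the data reservation is taken optimistically before the extent lock, and a failure is only fatal once NOCOW is ruled out:

    /* Phase 1: optimistic data reservation, before locking the file range. */
    if (write && !(flags & IOMAP_NOWAIT) &&
        btrfs_check_data_free_space(BTRFS_I(inode), &dio_data->data_reserved,
                                    start, data_alloc_len) == 0)
        dio_data->data_space_reserved = true;
    /* ENOSPC is deferred here: a NOCOW write needs no data allocation. */

    /* Phase 2: lock the range, then decide between NOCOW and COW. */
    ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);

    /* Phase 3, in a later hunk: return whatever reservation went unused. */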
/*
* If this errors out it's because we couldn't invalidate pagecache for
- * this range and we need to fallback to buffered.
+ * this range and we need to fall back to buffered IO, or we are doing a
+ * NOWAIT read/write and we would otherwise need to block.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
- ret = -ENOTBLK;
+ ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
+ if (ret < 0)
goto err;
- }
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
if (IS_ERR(em)) {
@@ -7652,12 +7714,30 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (write) {
ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
- start, len);
+ start, len, flags);
if (ret < 0)
goto unlock_err;
unlock_extents = true;
/* Recalc len in case the new em is smaller than requested */
len = min(len, em->len - (start - em->start));
+ if (dio_data->data_space_reserved) {
+ u64 release_offset;
+ u64 release_len = 0;
+
+ if (dio_data->nocow_done) {
+ release_offset = start;
+ release_len = data_alloc_len;
+ } else if (len < data_alloc_len) {
+ release_offset = start + len;
+ release_len = data_alloc_len - len;
+ }
+
+ if (release_len > 0)
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ release_offset,
+ release_len);
+ }
} else {
/*
* We need to unlock only the end area that we aren't using.
@@ -7702,7 +7782,12 @@ unlock_err:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
- kfree(dio_data);
+ if (dio_data->data_space_reserved) {
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ start, data_alloc_len);
+ extent_changeset_free(dio_data->data_reserved);
+ }
return ret;
}
@@ -7710,15 +7795,16 @@ err:
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
ssize_t written, unsigned int flags, struct iomap *iomap)
{
- int ret = 0;
- struct btrfs_dio_data *dio_data = iomap->private;
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+ struct btrfs_dio_data *dio_data = iter->private;
size_t submitted = dio_data->submitted;
const bool write = !!(flags & IOMAP_WRITE);
+ int ret = 0;
if (!write && (iomap->type == IOMAP_HOLE)) {
/* If reading from a hole, unlock and return */
unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
- goto out;
+ return 0;
}
if (submitted < length) {
@@ -7735,10 +7821,6 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (write)
extent_changeset_free(dio_data->data_reserved);
-out:
- kfree(dio_data);
- iomap->private = NULL;
-
return ret;
}
@@ -7751,40 +7833,36 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
if (!refcount_dec_and_test(&dip->refs))
return;
- if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
+ if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) {
__endio_write_update_ordered(BTRFS_I(dip->inode),
dip->file_offset,
dip->bytes,
- !dip->dio_bio->bi_status);
+ !dip->bio.bi_status);
} else {
unlock_extent(&BTRFS_I(dip->inode)->io_tree,
dip->file_offset,
dip->file_offset + dip->bytes - 1);
}
- bio_endio(dip->dio_bio);
- kfree(dip);
+ kfree(dip->csums);
+ bio_endio(&dip->bio);
}
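btrfs_dio_private is now wrapped around the bio itself (the two are allocated together via the bioset added near the bottom of this patch), so the private data is reached through the embedded bio instead of being kfree()'d separately. A self-contained user-space demonstration of the container_of() recovery trick used above; the types are stand-ins, not btrfs's, and the macro shown drops the kernel version's type checking:

    #include <stddef.h>
    #include <stdio.h>

    struct bio { int dummy; };

    struct btrfs_dio_private {
        long bytes;
        struct bio bio;     /* must be last: the bio's inline bvecs tail it */
    };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    int main(void)
    {
        struct btrfs_dio_private dip = { .bytes = 42 };
        struct bio *bio = &dip.bio;     /* what the block layer hands back */
        struct btrfs_dio_private *back =
            container_of(bio, struct btrfs_dio_private, bio);

        printf("%ld\n", back->bytes);   /* prints 42 */
        return 0;
    }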
-static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+ int mirror_num,
+ enum btrfs_compression_type compress_type)
{
struct btrfs_dio_private *dip = bio->bi_private;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- blk_status_t ret;
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- return ret;
+ if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA))
+ return;
refcount_inc(&dip->refs);
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- if (ret)
+ if (btrfs_map_bio(fs_info, bio, mirror_num))
refcount_dec(&dip->refs);
- return ret;
}
static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
@@ -7869,7 +7947,7 @@ static void btrfs_end_dio_bio(struct bio *bio)
err = btrfs_check_read_dio_bio(dip, bbio, !err);
if (err)
- dip->dio_bio->bi_status = err;
+ dip->bio.bi_status = err;
btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
@@ -7899,7 +7977,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
goto map;
if (write && async_submit) {
- ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
+ ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset,
btrfs_submit_bio_start_direct_io);
goto err;
} else if (write) {
@@ -7924,50 +8002,16 @@ err:
return ret;
}
-/*
- * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked
- * or ordered extents whether or not we submit any bios.
- */
-static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
- struct inode *inode,
- loff_t file_offset)
-{
- const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
- const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
- size_t dip_size;
- struct btrfs_dio_private *dip;
-
- dip_size = sizeof(*dip);
- if (!write && csum) {
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- size_t nblocks;
-
- nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits;
- dip_size += fs_info->csum_size * nblocks;
- }
-
- dip = kzalloc(dip_size, GFP_NOFS);
- if (!dip)
- return NULL;
-
- dip->inode = inode;
- dip->file_offset = file_offset;
- dip->bytes = dio_bio->bi_iter.bi_size;
- dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
- dip->dio_bio = dio_bio;
- refcount_set(&dip->refs, 1);
- return dip;
-}
-
static void btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
+ struct btrfs_dio_private *dip =
+ container_of(dio_bio, struct btrfs_dio_private, bio);
struct inode *inode = iter->inode;
const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
BTRFS_BLOCK_GROUP_RAID56_MASK);
- struct btrfs_dio_private *dip;
struct bio *bio;
u64 start_sector;
int async_submit = 0;
@@ -7978,27 +8022,28 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
- struct btrfs_dio_data *dio_data = iter->iomap.private;
+ struct btrfs_dio_data *dio_data = iter->private;
struct extent_map *em = NULL;
- dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
- if (!dip) {
- if (!write) {
- unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
- file_offset + dio_bio->bi_iter.bi_size - 1);
- }
- dio_bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(dio_bio);
- return;
- }
+ dip->inode = inode;
+ dip->file_offset = file_offset;
+ dip->bytes = dio_bio->bi_iter.bi_size;
+ refcount_set(&dip->refs, 1);
+ dip->csums = NULL;
+
+ if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ unsigned int nr_sectors =
+ (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
- if (!write) {
/*
* Load the csums up front to reduce csum tree searches and
* contention when submitting bios.
- *
- * If we have csums disabled this will do nothing.
*/
+ status = BLK_STS_RESOURCE;
+ dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS);
+ if (!dip->csums)
+ goto out_err;
+
status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
if (status != BLK_STS_OK)
goto out_err;
@@ -8088,19 +8133,28 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
out_err_em:
free_extent_map(em);
out_err:
- dip->dio_bio->bi_status = status;
+ dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
}
-const struct iomap_ops btrfs_dio_iomap_ops = {
+static const struct iomap_ops btrfs_dio_iomap_ops = {
.iomap_begin = btrfs_dio_iomap_begin,
.iomap_end = btrfs_dio_iomap_end,
};
-const struct iomap_dio_ops btrfs_dio_ops = {
+static const struct iomap_dio_ops btrfs_dio_ops = {
.submit_io = btrfs_submit_direct,
+ .bio_set = &btrfs_dio_bioset,
};
+ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
+{
+ struct btrfs_dio_data data;
+
+ return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
+}
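btrfs_dio_rw() keeps the btrfs_dio_data on its own stack and hands it to iomap_dio_rw() as the per-iteration private pointer (the iomap_iter private field added earlier in this series), replacing the old kzalloc/kfree pair in iomap_begin/iomap_end. A hedged sketch of a caller; buffered_write_fallback() is a hypothetical stand-in for the fallback logic in the real read/write paths:

    ssize_t ret;

    ret = btrfs_dio_rw(iocb, from, 0);  /* done_before is 0 on the first attempt */
    if (ret == -ENOTBLK)
        ret = buffered_write_fallback(iocb, from);  /* hypothetical helper */

done_before matters on restarts: with IOMAP_DIO_PARTIAL, a resumed call passes the bytes already completed so the final return value accounts for them.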
+
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
@@ -8113,27 +8167,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
-int btrfs_readpage(struct file *file, struct page *page)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- struct btrfs_bio_ctrl bio_ctrl = { 0 };
- int ret;
-
- btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-
- ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
- if (bio_ctrl.bio) {
- int ret2;
-
- ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
- if (ret == 0)
- ret = ret2;
- }
- return ret;
-}
-
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
@@ -8182,7 +8215,7 @@ static void wait_subpage_spinlock(struct page *page)
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
struct btrfs_subpage *subpage;
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->private);
@@ -8764,46 +8797,23 @@ out:
return ret;
}
-/*
- * create a new subvolume directory/inode (helper for the ioctl).
- */
-int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- struct user_namespace *mnt_userns)
+struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
+ struct inode *dir)
{
struct inode *inode;
- int err;
- u64 index = 0;
- u64 ino;
-
- err = btrfs_get_free_objectid(new_root, &ino);
- if (err < 0)
- return err;
-
- inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
- ino, ino,
- S_IFDIR | (~current_umask() & S_IRWXUGO),
- &index);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- inode->i_op = &btrfs_dir_inode_operations;
- inode->i_fop = &btrfs_dir_file_operations;
-
- set_nlink(inode, 1);
- btrfs_i_size_write(BTRFS_I(inode), 0);
- unlock_new_inode(inode);
-
- err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
- if (err)
- btrfs_err(new_root->fs_info,
- "error inheriting subvolume %llu properties: %d",
- new_root->root_key.objectid, err);
- err = btrfs_update_inode(trans, new_root, BTRFS_I(inode));
-
- iput(inode);
- return err;
+ inode = new_inode(dir->i_sb);
+ if (inode) {
+ /*
+ * Subvolumes don't inherit the sgid bit or the parent's gid if
+ * the parent's sgid bit is set. This is probably a bug.
+ */
+ inode_init_owner(mnt_userns, inode, NULL,
+ S_IFDIR | (~current_umask() & S_IRWXUGO));
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
+ }
+ return inode;
}
struct inode *btrfs_alloc_inode(struct super_block *sb)
@@ -8943,7 +8953,7 @@ int btrfs_drop_inode(struct inode *inode)
static void init_once(void *foo)
{
- struct btrfs_inode *ei = (struct btrfs_inode *) foo;
+ struct btrfs_inode *ei = foo;
inode_init_once(&ei->vfs_inode);
}
@@ -8955,6 +8965,7 @@ void __cold btrfs_destroy_cachep(void)
* destroy cache.
*/
rcu_barrier();
+ bioset_exit(&btrfs_dio_bioset);
kmem_cache_destroy(btrfs_inode_cachep);
kmem_cache_destroy(btrfs_trans_handle_cachep);
kmem_cache_destroy(btrfs_path_cachep);
@@ -8995,6 +9006,11 @@ int __init btrfs_init_cachep(void)
if (!btrfs_free_space_bitmap_cachep)
goto fail;
+ if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_dio_private, bio),
+ BIOSET_NEED_BVECS))
+ goto fail;
+
return 0;
fail:
btrfs_destroy_cachep();
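The new bioset uses offsetof(struct btrfs_dio_private, bio) as its front pad, so every bio allocated from btrfs_dio_bioset is born embedded at the tail of a btrfs_dio_private:

    /*
     * |<- front pad: btrfs_dio_private fields ->|<- struct bio ->|<- bvecs ->|
     *
     * Allocation-side sketch; in this series iomap allocates from the
     * bio_set plumbed through btrfs_dio_ops, bio_alloc_bioset() is shown
     * only for illustration.
     */
    struct bio *bio = bio_alloc_bioset(bdev, nr_vecs, opf, GFP_NOFS,
                                       &btrfs_dio_bioset);
    struct btrfs_dio_private *dip =
        container_of(bio, struct btrfs_dio_private, bio);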
@@ -9050,6 +9066,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
+ unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = new_dentry->d_inode;
@@ -9081,14 +9098,37 @@ static int btrfs_rename_exchange(struct inode *old_dir,
down_read(&fs_info->subvol_sem);
/*
- * We want to reserve the absolute worst case amount of items. So if
- * both inodes are subvols and we need to unlink them then that would
- * require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume their normal
- * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items
- * should cover the worst case number of items we'll modify.
+ * For each inode:
+ * 1 to remove old dir item
+ * 1 to remove old dir index
+ * 1 to add new dir item
+ * 1 to add new dir index
+ * 1 to update parent inode
+ *
+ * If the parents are the same, we only need to account for one
*/
- trans = btrfs_start_transaction(root, 12);
+ trans_num_items = (old_dir == new_dir ? 9 : 10);
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * 1 to remove old root ref
+ * 1 to remove old root backref
+ * 1 to add new root ref
+ * 1 to add new root backref
+ */
+ trans_num_items += 4;
+ } else {
+ /*
+ * 1 to update inode item
+ * 1 to remove old inode ref
+ * 1 to add new inode ref
+ */
+ trans_num_items += 3;
+ }
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ trans_num_items += 4;
+ else
+ trans_num_items += 3;
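Worked examples of the accounting above:

    /*
     * Exchange of two regular inodes in the same directory:  9 + 3 + 3 = 15
     * Exchange of two subvolumes across directories:        10 + 4 + 4 = 18
     */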
+ trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_notrans;
@@ -9255,56 +9295,19 @@ out_notrans:
return ret;
}
-static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct user_namespace *mnt_userns,
- struct inode *dir,
- struct dentry *dentry)
+static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns,
+ struct inode *dir)
{
- int ret;
struct inode *inode;
- u64 objectid;
- u64 index;
-
- ret = btrfs_get_free_objectid(root, &objectid);
- if (ret)
- return ret;
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name,
- dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)),
- objectid,
- S_IFCHR | WHITEOUT_MODE,
- &index);
-
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- return ret;
+ inode = new_inode(dir->i_sb);
+ if (inode) {
+ inode_init_owner(mnt_userns, inode, dir,
+ S_IFCHR | WHITEOUT_MODE);
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
}
-
- inode->i_op = &btrfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode,
- WHITEOUT_DEV);
-
- ret = btrfs_init_inode_security(trans, inode, dir,
- &dentry->d_name);
- if (ret)
- goto out;
-
- ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
- BTRFS_I(inode), 0, index);
- if (ret)
- goto out;
-
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
-out:
- unlock_new_inode(inode);
- if (ret)
- inode_dec_link_count(inode);
- iput(inode);
-
- return ret;
+ return inode;
}
static int btrfs_rename(struct user_namespace *mnt_userns,
@@ -9313,6 +9316,10 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
unsigned int flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
+ struct btrfs_new_inode_args whiteout_args = {
+ .dir = old_dir,
+ .dentry = old_dentry,
+ };
struct btrfs_trans_handle *trans;
unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
@@ -9367,23 +9374,56 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
filemap_flush(old_inode->i_mapping);
- /* close the racy window with snapshot create/destroy ioctl */
- if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (flags & RENAME_WHITEOUT) {
+ whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir);
+ if (!whiteout_args.inode)
+ return -ENOMEM;
+ ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
+ if (ret)
+ goto out_whiteout_inode;
+ } else {
+ /* 1 to update the old parent inode. */
+ trans_num_items = 1;
+ }
+
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /* Close the race window with snapshot create/destroy ioctl */
down_read(&fs_info->subvol_sem);
+ /*
+ * 1 to remove old root ref
+ * 1 to remove old root backref
+ * 1 to add new root ref
+ * 1 to add new root backref
+ */
+ trans_num_items += 4;
+ } else {
+ /*
+ * 1 to update inode
+ * 1 to remove old inode ref
+ * 1 to add new inode ref
+ */
+ trans_num_items += 3;
+ }
/*
- * We want to reserve the absolute worst case amount of items. So if
- * both inodes are subvols and we need to unlink them then that would
- * require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume they are normal
- * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
- * should cover the worst case number of items we'll modify.
- * If our rename has the whiteout flag, we need more 5 units for the
- * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
- * when selinux is enabled).
+ * 1 to remove old dir item
+ * 1 to remove old dir index
+ * 1 to add new dir item
+ * 1 to add new dir index
*/
- trans_num_items = 11;
- if (flags & RENAME_WHITEOUT)
+ trans_num_items += 4;
+ /* 1 to update new parent inode if it's not the same as the old parent */
+ if (new_dir != old_dir)
+ trans_num_items++;
+ if (new_inode) {
+ /*
+ * 1 to update inode
+ * 1 to remove inode ref
+ * 1 to remove dir item
+ * 1 to remove dir index
+ * 1 to possibly add orphan item
+ */
trans_num_items += 5;
+ }
trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -9479,12 +9519,14 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
rename_ctx.index, new_dentry->d_parent);
if (flags & RENAME_WHITEOUT) {
- ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
- old_dir, old_dentry);
-
+ ret = btrfs_create_new_inode(trans, &whiteout_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
+ } else {
+ unlock_new_inode(whiteout_args.inode);
+ iput(whiteout_args.inode);
+ whiteout_args.inode = NULL;
}
}
out_fail:
@@ -9493,7 +9535,11 @@ out_fail:
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
-
+ if (flags & RENAME_WHITEOUT)
+ btrfs_new_inode_args_destroy(&whiteout_args);
+out_whiteout_inode:
+ if (flags & RENAME_WHITEOUT)
+ iput(whiteout_args.inode);
return ret;
}
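
The same kind of model works for the plain rename path above. Here whiteout_items stands in for whatever btrfs_new_inode_prepare() computes for the whiteout inode (zero when RENAME_WHITEOUT is not set), so this is a sketch of the bookkeeping only, not the kernel code:

#include <stdbool.h>

static unsigned int rename_items(unsigned int whiteout_items,
				 bool src_is_subvol, bool cross_dir,
				 bool target_exists)
{
	/* Base: whiteout inode items, or 1 for the old parent update. */
	unsigned int n = whiteout_items ? whiteout_items : 1;

	n += src_is_subvol ? 4 : 3;	/* root refs vs. inode item/refs */
	n += 4;				/* old/new dir item and dir index */
	if (cross_dir)
		n++;			/* new parent inode update */
	if (target_exists)
		n += 5;			/* unlink of the existing target */
	return n;
}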
@@ -9712,10 +9758,13 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
struct btrfs_key key;
- struct inode *inode = NULL;
+ struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ };
+ unsigned int trans_num_items;
int err;
- u64 objectid;
- u64 index = 0;
int name_len;
int datasize;
unsigned long ptr;
@@ -9726,49 +9775,40 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
return -ENAMETOOLONG;
- /*
- * 2 items for inode item and ref
- * 2 items for dir items
- * 1 item for updating parent inode item
- * 1 item for the inline extent item
- * 1 item for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 7);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO);
+ inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &btrfs_aops;
+ btrfs_i_size_write(BTRFS_I(inode), name_len);
+ inode_set_bytes(inode, name_len);
- err = btrfs_get_free_objectid(root, &objectid);
+ new_inode_args.inode = inode;
+ err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (err)
- goto out_unlock;
+ goto out_inode;
+ /* 1 additional item for the inline extent */
+ trans_num_items++;
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name, dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)), objectid,
- S_IFLNK | S_IRWXUGO, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_unlock;
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_new_inode_args;
}
- /*
- * If the active LSM wants to access the inode during
- * d_instantiate it needs these. Smack checks to see
- * if the filesystem supports xattrs by looking at the
- * ops vector.
- */
- inode->i_fop = &btrfs_file_operations;
- inode->i_op = &btrfs_file_inode_operations;
- inode->i_mapping->a_ops = &btrfs_aops;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+ err = btrfs_create_new_inode(trans, &new_inode_args);
if (err)
- goto out_unlock;
+ goto out;
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
- goto out_unlock;
+ btrfs_abort_transaction(trans, err);
+ discard_new_inode(inode);
+ inode = NULL;
+ goto out;
}
key.objectid = btrfs_ino(BTRFS_I(inode));
key.offset = 0;
@@ -9777,8 +9817,11 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
err = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
if (err) {
+ btrfs_abort_transaction(trans, err);
btrfs_free_path(path);
- goto out_unlock;
+ discard_new_inode(inode);
+ inode = NULL;
+ goto out;
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -9796,31 +9839,16 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- inode->i_op = &btrfs_symlink_inode_operations;
- inode_nohighmem(inode);
- inode_set_bytes(inode, name_len);
- btrfs_i_size_write(BTRFS_I(inode), name_len);
- err = btrfs_update_inode(trans, root, BTRFS_I(inode));
- /*
- * Last step, add directory indexes for our symlink inode. This is the
- * last step to avoid extra cleanup of these indexes if an error happens
- * elsewhere above.
- */
- if (!err)
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
- BTRFS_I(inode), 0, index);
- if (err)
- goto out_unlock;
-
d_instantiate_new(dentry, inode);
-
-out_unlock:
+ err = 0;
+out:
btrfs_end_transaction(trans);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
btrfs_btree_balance_dirty(fs_info);
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (err)
+ iput(inode);
return err;
}
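
The conversion above follows the series' two-phase creation pattern: everything that can sleep or fail cheaply (VFS inode allocation, security/xattr sizing via btrfs_new_inode_prepare()) happens before the transaction starts, and all tree modifications are concentrated in btrfs_create_new_inode(). A condensed sketch of the shape, with error handling elided:

	inode = new_inode(dir->i_sb);		/* VFS allocation only */
	inode_init_owner(mnt_userns, inode, dir, mode);
	/* ... set i_op / a_ops / sizes before the inode is visible ... */
	args.inode = inode;
	btrfs_new_inode_prepare(&args, &trans_num_items);
	trans = btrfs_start_transaction(root, trans_num_items);
	btrfs_create_new_inode(trans, &args);	/* all tree modifications */
	d_instantiate_new(dentry, inode);
	btrfs_end_transaction(trans);
	btrfs_new_inode_args_destroy(&args);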
@@ -10071,62 +10099,58 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = NULL;
- u64 objectid;
- u64 index;
- int ret = 0;
-
- /*
- * 5 units required for adding orphan entry
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_get_free_objectid(root, &objectid);
- if (ret)
- goto out;
-
- inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
- btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- inode = NULL;
- goto out;
- }
+ struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .orphan = true,
+ };
+ unsigned int trans_num_items;
+ int ret;
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
-
inode->i_mapping->a_ops = &btrfs_aops;
- ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+ new_inode_args.inode = inode;
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (ret)
- goto out;
+ goto out_inode;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
- if (ret)
- goto out;
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (ret)
- goto out;
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_new_inode_args;
+ }
+
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
/*
- * We set number of links to 0 in btrfs_new_inode(), and here we set
- * it to 1 because d_tmpfile() will issue a warning if the count is 0,
- * through:
+ * We set number of links to 0 in btrfs_create_new_inode(), and here we
+ * set it to 1 because d_tmpfile() will issue a warning if the count is
+ * 0, through:
*
* d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
*/
set_nlink(inode, 1);
- d_tmpfile(dentry, inode);
- unlock_new_inode(inode);
- mark_inode_dirty(inode);
-out:
+
+ if (!ret) {
+ d_tmpfile(dentry, inode);
+ unlock_new_inode(inode);
+ mark_inode_dirty(inode);
+ }
+
btrfs_end_transaction(trans);
- if (ret && inode)
- discard_new_inode(inode);
btrfs_btree_balance_dirty(fs_info);
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (ret)
+ iput(inode);
return ret;
}
@@ -10458,13 +10482,11 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
- for (i = 0; i < nr_pages; i++) {
- pages[i] = alloc_page(GFP_NOFS);
- if (!pages[i]) {
- ret = -ENOMEM;
- goto out;
+ ret = btrfs_alloc_page_array(nr_pages, pages);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
}
- }
ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
disk_io_size, pages);
@@ -10800,7 +10822,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
if (ret)
goto out_free_data_space;
- ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes);
+ ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
+ false);
if (ret)
goto out_qgroup_free_data;
@@ -11309,6 +11332,41 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode,
spin_unlock(&inode->lock);
}
+/**
+ * Verify that there are no ordered extents for a given file range.
+ *
+ * @inode: The target inode.
+ * @start: Start offset of the file range, should be sector size aligned.
+ * @end: End offset (inclusive) of the file range, its value +1 should be
+ * sector size aligned.
+ *
+ * This should typically be used for cases where we have locked the inode's
+ * VFS lock in exclusive mode, have also locked the inode's i_mmap_lock in
+ * exclusive mode, have flushed all delalloc in the range, have waited for
+ * all ordered extents in the range to complete, and finally have locked the
+ * file range in the inode's io_tree.
+ */
+void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_ordered_extent *ordered;
+
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
+ if (ordered) {
+ btrfs_err(root->fs_info,
+"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
+ start, end, btrfs_ino(inode), root->root_key.objectid,
+ ordered->file_offset,
+ ordered->file_offset + ordered->num_bytes - 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ ASSERT(ordered == NULL);
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b2c692b2fd8d..43b6f23bbd89 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -540,9 +540,35 @@ int __pure btrfs_is_empty_uuid(u8 *uuid)
return 1;
}
+/*
+ * Calculate the number of transaction items to reserve for creating a subvolume
+ * or snapshot, not including the inode, directory entries, or parent directory.
+ */
+static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
+{
+ /*
+ * 1 to add root block
+ * 1 to add root item
+ * 1 to add root ref
+ * 1 to add root backref
+ * 1 to add UUID item
+ * 1 to add qgroup info
+ * 1 to add qgroup limit
+ *
+ * Ideally the last two would only be accounted if qgroups are enabled,
+ * but that can change between now and the time we would insert them.
+ */
+ unsigned int num_items = 7;
+
+ if (inherit) {
+ /* 2 to add qgroup relations for each inherited qgroup */
+ num_items += 2 * inherit->num_qgroups;
+ }
+ return num_items;
+}
+
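+/*
+ * Worked example of the accounting below:
+ *   No inheritance:          7 items
+ *   Inherit 2 qgroups:       7 + 2 * 2 = 11 items
+ *   Snapshot on top of that: 11 + 3    = 14 items reserved
+ * (create_snapshot() adds 3 for the dir item, dir index and parent
+ * inode update.)
+ */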
static noinline int create_subvol(struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *dentry,
- const char *name, int namelen,
struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
@@ -555,11 +581,15 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
struct btrfs_root *new_root;
struct btrfs_block_rsv block_rsv;
struct timespec64 cur_time = current_time(dir);
- struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .subvol = true,
+ };
+ unsigned int trans_num_items;
int ret;
- dev_t anon_dev = 0;
+ dev_t anon_dev;
u64 objectid;
- u64 index = 0;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
@@ -567,11 +597,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
if (ret)
- goto fail_free;
-
- ret = get_anon_bdev(&anon_dev);
- if (ret < 0)
- goto fail_free;
+ goto out_root_item;
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
@@ -579,36 +605,47 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
*/
if (btrfs_qgroup_level(objectid)) {
ret = -ENOSPC;
- goto fail_free;
+ goto out_root_item;
}
+ ret = get_anon_bdev(&anon_dev);
+ if (ret < 0)
+ goto out_root_item;
+
+ new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir);
+ if (!new_inode_args.inode) {
+ ret = -ENOMEM;
+ goto out_anon_dev;
+ }
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
+ goto out_inode;
+ trans_num_items += create_subvol_num_items(inherit);
+
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
- /*
- * The same as the snapshot creation, please see the comment
- * of create_snapshot().
- */
- ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ trans_num_items, false);
if (ret)
- goto fail_free;
+ goto out_new_inode_args;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_subvolume_release_metadata(root, &block_rsv);
- goto fail_free;
+ goto out_new_inode_args;
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
if (ret)
- goto fail;
+ goto out;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
- goto fail;
+ goto out;
}
btrfs_mark_buffer_dirty(leaf);
@@ -663,75 +700,46 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
btrfs_tree_unlock(leaf);
btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
free_extent_buffer(leaf);
- goto fail;
+ goto out;
}
free_extent_buffer(leaf);
leaf = NULL;
- key.offset = (u64)-1;
new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
if (IS_ERR(new_root)) {
- free_anon_bdev(anon_dev);
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- /* Freeing will be done in btrfs_put_root() of new_root */
+ /* anon_dev is owned by new_root now. */
anon_dev = 0;
+ BTRFS_I(new_inode_args.inode)->root = new_root;
+ /* ... and new_root is owned by new_inode_args.inode now. */
ret = btrfs_record_root_in_trans(trans, new_root);
if (ret) {
- btrfs_put_root(new_root);
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
- ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns);
- btrfs_put_root(new_root);
- if (ret) {
- /* We potentially lose an unused inode item here */
btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
- /*
- * insert the directory item
- */
- ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
- ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
- BTRFS_FT_DIR, index);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
- ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+ ret = btrfs_uuid_tree_add(trans, root_item->uuid,
+ BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
- btrfs_ino(BTRFS_I(dir)), index, name, namelen);
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- ret = btrfs_uuid_tree_add(trans, root_item->uuid,
- BTRFS_UUID_KEY_SUBVOL, objectid);
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ d_instantiate_new(dentry, new_inode_args.inode);
+ new_inode_args.inode = NULL;
-fail:
- kfree(root_item);
+out:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
btrfs_subvolume_release_metadata(root, &block_rsv);
@@ -740,18 +748,14 @@ fail:
btrfs_end_transaction(trans);
else
ret = btrfs_commit_transaction(trans);
-
- if (!ret) {
- inode = btrfs_lookup_dentry(dir, dentry);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- d_instantiate(dentry, inode);
- }
- return ret;
-
-fail_free:
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ iput(new_inode_args.inode);
+out_anon_dev:
if (anon_dev)
free_anon_bdev(anon_dev);
+out_root_item:
kfree(root_item);
return ret;
}
@@ -763,6 +767,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
+ unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
int ret;
@@ -800,16 +805,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
/*
- * 1 - parent dir inode
- * 2 - dir entries
- * 1 - root item
- * 2 - root ref/backref
- * 1 - root of snapshot
- * 1 - UUID item
+ * 1 to add dir item
+ * 1 to add dir index
+ * 1 to update parent inode item
*/
+ trans_num_items = create_subvol_num_items(inherit) + 3;
ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
- &pending_snapshot->block_rsv, 8,
- false);
+ &pending_snapshot->block_rsv,
+ trans_num_items, false);
if (ret)
goto free_pending;
@@ -979,7 +982,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
if (snap_src)
error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
else
- error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit);
+ error = create_subvol(mnt_userns, dir, dentry, inherit);
if (!error)
fsnotify_mkdir(dir, dentry);
@@ -1413,8 +1416,19 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (!em)
break;
- /* Skip hole/inline/preallocated extents */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+ /*
+ * If the file extent is an inline one, we may still want to
+ * defrag it (fall through) if doing so would produce a regular
+ * extent. This is for users who want to convert inline extents
+ * to regular ones through the max_inline= mount option.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE &&
+ em->len <= inode->root->fs_info->max_inline)
+ goto next;
+
+ /* Skip hole/delalloc/preallocated extents */
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ em->block_start == EXTENT_MAP_DELALLOC ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
goto next;
@@ -1473,6 +1487,15 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (em->len >= get_extent_max_capacity(em))
goto next;
+ /*
+ * Normally there are no more extents after an inline one, thus
+ * @next_mergeable would normally be false and the extent would
+ * not be defragged. So if an inline extent passed all the above
+ * checks, just add it for defrag so it can be converted to a
+ * regular extent.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE)
+ goto add;
+
next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
extent_thresh, newer_than, locked);
if (!next_mergeable) {
@@ -2594,7 +2617,7 @@ err:
static noinline int btrfs_ioctl_tree_search(struct inode *inode,
void __user *argp)
{
- struct btrfs_ioctl_search_args __user *uargs;
+ struct btrfs_ioctl_search_args __user *uargs = argp;
struct btrfs_ioctl_search_key sk;
int ret;
size_t buf_size;
@@ -2602,8 +2625,6 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- uargs = (struct btrfs_ioctl_search_args __user *)argp;
-
if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
return -EFAULT;
@@ -2626,7 +2647,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
void __user *argp)
{
- struct btrfs_ioctl_search_args_v2 __user *uarg;
+ struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
struct btrfs_ioctl_search_args_v2 args;
int ret;
size_t buf_size;
@@ -2636,7 +2657,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
return -EPERM;
/* copy search header and buffer size */
- uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
if (copy_from_user(&args, uarg, sizeof(args)))
return -EFAULT;
@@ -4344,10 +4364,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
bool need_unlock; /* for mut. excl. ops lock */
int ret;
- if (!arg)
- btrfs_warn(fs_info,
- "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18");
-
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4355,6 +4371,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
if (ret)
return ret;
+ bargs = memdup_user(arg, sizeof(*bargs));
+ if (IS_ERR(bargs)) {
+ ret = PTR_ERR(bargs);
+ bargs = NULL;
+ goto out;
+ }
+
again:
if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
mutex_lock(&fs_info->balance_mutex);
@@ -4402,59 +4425,42 @@ again:
}
locked:
-
- if (arg) {
- bargs = memdup_user(arg, sizeof(*bargs));
- if (IS_ERR(bargs)) {
- ret = PTR_ERR(bargs);
+ if (bargs->flags & BTRFS_BALANCE_RESUME) {
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
goto out_unlock;
}
- if (bargs->flags & BTRFS_BALANCE_RESUME) {
- if (!fs_info->balance_ctl) {
- ret = -ENOTCONN;
- goto out_bargs;
- }
+ bctl = fs_info->balance_ctl;
+ spin_lock(&fs_info->balance_lock);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
- bctl = fs_info->balance_ctl;
- spin_lock(&fs_info->balance_lock);
- bctl->flags |= BTRFS_BALANCE_RESUME;
- spin_unlock(&fs_info->balance_lock);
- btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
+ goto do_balance;
+ }
- goto do_balance;
- }
- } else {
- bargs = NULL;
+ if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
+ ret = -EINVAL;
+ goto out_unlock;
}
if (fs_info->balance_ctl) {
ret = -EINPROGRESS;
- goto out_bargs;
+ goto out_unlock;
}
bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
if (!bctl) {
ret = -ENOMEM;
- goto out_bargs;
- }
-
- if (arg) {
- memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
- memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
- memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
-
- bctl->flags = bargs->flags;
- } else {
- /* balance everything - no filters */
- bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+ goto out_unlock;
}
- if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
- ret = -EINVAL;
- goto out_bctl;
- }
+ memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+ memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+ memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+ bctl->flags = bargs->flags;
do_balance:
/*
* Ownership of bctl and exclusive operation goes to btrfs_balance.
@@ -4467,21 +4473,19 @@ do_balance:
ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
- if ((ret == 0 || ret == -ECANCELED) && arg) {
+ if (ret == 0 || ret == -ECANCELED) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}
-out_bctl:
kfree(bctl);
-out_bargs:
- kfree(bargs);
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
btrfs_exclop_finish(fs_info);
out:
mnt_drop_write_file(file);
+ kfree(bargs);
return ret;
}
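
With the long-deprecated NULL-argument (v1) variant gone, the balance arguments can be copied up front with memdup_user() and freed on a single exit path. Sketched (the real function funnels every failure through its out label rather than returning directly):

	bargs = memdup_user(arg, sizeof(*bargs));
	if (IS_ERR(bargs))
		return PTR_ERR(bargs);
	/* ... every later failure path falls through to ... */
	kfree(bargs);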
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 1b31481f9e72..a2ec8ecae8de 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -380,9 +380,8 @@ static struct prop_handler prop_handlers[] = {
},
};
-static int inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode,
- struct inode *parent)
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *parent)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -457,41 +456,6 @@ static int inherit_props(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode,
- struct inode *dir)
-{
- if (!dir)
- return 0;
-
- return inherit_props(trans, inode, dir);
-}
-
-int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *parent_root)
-{
- struct super_block *sb = root->fs_info->sb;
- struct inode *parent_inode, *child_inode;
- int ret;
-
- parent_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, parent_root);
- if (IS_ERR(parent_inode))
- return PTR_ERR(parent_inode);
-
- child_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, root);
- if (IS_ERR(child_inode)) {
- iput(parent_inode);
- return PTR_ERR(child_inode);
- }
-
- ret = inherit_props(trans, child_inode, parent_inode);
- iput(child_inode);
- iput(parent_inode);
-
- return ret;
-}
-
void __init btrfs_props_init(void)
{
int i;
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 59bea741cfcf..ca9dd3df129b 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -23,8 +23,4 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
struct inode *dir);
-int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *parent_root);
-
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 1866b1f0da01..db723c0026bd 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2290,7 +2290,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
return 0;
if (!extent_buffer_uptodate(root_eb)) {
- ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
+ ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
if (ret)
goto out;
}
@@ -3939,12 +3939,13 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
}
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- enum btrfs_qgroup_rsv_type type, bool enforce)
+ enum btrfs_qgroup_rsv_type type, bool enforce,
+ bool noflush)
{
int ret;
ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
- if (ret <= 0 && ret != -EDQUOT)
+ if ((ret <= 0 && ret != -EDQUOT) || noflush)
return ret;
ret = try_flush_qgroup(root);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 880e9df0dac1..0c4dd2a9af96 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -364,19 +364,23 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode,
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- enum btrfs_qgroup_rsv_type type, bool enforce);
+ enum btrfs_qgroup_rsv_type type, bool enforce,
+ bool noflush);
/* Reserve metadata space for pertrans and prealloc type */
static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
int num_bytes, bool enforce)
{
return __btrfs_qgroup_reserve_meta(root, num_bytes,
- BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
+ BTRFS_QGROUP_RSV_META_PERTRANS,
+ enforce, false);
}
static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
- int num_bytes, bool enforce)
+ int num_bytes, bool enforce,
+ bool noflush)
{
return __btrfs_qgroup_reserve_meta(root, num_bytes,
- BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
+ BTRFS_QGROUP_RSV_META_PREALLOC,
+ enforce, noflush);
}
void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0e239a4c3b26..a5b623ee6fac 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -52,6 +52,17 @@ struct btrfs_stripe_hash_table {
struct btrfs_stripe_hash table[];
};
+/*
+ * A bvec like structure to present a sector inside a page.
+ *
+ * Unlike bvec we don't need bv_len, as it's fixed to sectorsize.
+ */
+struct sector_ptr {
+ struct page *page;
+ unsigned int pgoff:24;
+ unsigned int uptodate:8;
+};
+
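
Packing pgoff and uptodate into bitfields keeps sector_ptr to a pointer plus one word. A user-space check of the layout assumptions (struct page * modeled here as void *):

#include <assert.h>
#include <stdio.h>

struct sector_ptr {
	void *page;		/* struct page * in the kernel */
	unsigned int pgoff:24;	/* byte offset inside the page */
	unsigned int uptodate:8;
};

int main(void)
{
	/* 24 bits of pgoff are enough for page sizes up to 16 MiB. */
	assert((1u << 24) > 64 * 1024);
	printf("%zu\n", sizeof(struct sector_ptr));	/* 16 on LP64 */
	return 0;
}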
enum btrfs_rbio_ops {
BTRFS_RBIO_WRITE,
BTRFS_RBIO_READ_REBUILD,
@@ -77,7 +88,7 @@ struct btrfs_raid_bio {
/*
* for scheduling work in the helper threads
*/
- struct btrfs_work work;
+ struct work_struct work;
/*
* bio list and bio_list_lock are used
@@ -101,15 +112,6 @@ struct btrfs_raid_bio {
*/
unsigned long flags;
- /* size of each individual stripe on disk */
- int stripe_len;
-
- /* number of data stripes (no p/q) */
- int nr_data;
-
- int real_stripes;
-
- int stripe_npages;
/*
* set if we're doing a parity rebuild
* for a read from higher up, which is handled
@@ -118,18 +120,35 @@ struct btrfs_raid_bio {
*/
enum btrfs_rbio_ops operation;
- /* first bad stripe */
- int faila;
+ /* Size of each individual stripe on disk */
+ u32 stripe_len;
- /* second bad stripe (for raid6 use) */
- int failb;
+ /* How many pages there are for the full stripe including P/Q */
+ u16 nr_pages;
- int scrubp;
- /*
- * number of pages needed to represent the full
- * stripe
- */
- int nr_pages;
+ /* How many sectors there are for the full stripe including P/Q */
+ u16 nr_sectors;
+
+ /* Number of data stripes (no p/q) */
+ u8 nr_data;
+
+ /* Number of all stripes (including P/Q) */
+ u8 real_stripes;
+
+ /* How many pages there are for each stripe */
+ u8 stripe_npages;
+
+ /* How many sectors there are for each stripe */
+ u8 stripe_nsectors;
+
+ /* First bad stripe, -1 means no corruption */
+ s8 faila;
+
+ /* Second bad stripe (for RAID6 use) */
+ s8 failb;
+
+ /* Stripe number that we're scrubbing */
+ u8 scrubp;
/*
* size of all the bios in the bio_list. This
@@ -156,28 +175,29 @@ struct btrfs_raid_bio {
*/
struct page **stripe_pages;
- /*
- * pointers to the pages in the bio_list. Stored
- * here for faster lookup
- */
- struct page **bio_pages;
+ /* Pointers to the sectors in the bio_list, for faster lookup */
+ struct sector_ptr *bio_sectors;
/*
- * bitmap to record which horizontal stripe has data
+ * For subpage support, we need to map each sector to the pages
+ * in stripe_pages[] above.
*/
+ struct sector_ptr *stripe_sectors;
+
+ /* Bitmap to record which horizontal stripe has data */
unsigned long *dbitmap;
/* allocated with real_stripes-many pointers for finish_*() calls */
void **finish_pointers;
- /* allocated with stripe_npages-many bits for finish_*() calls */
+ /* Allocated with stripe_nsectors-many bits for finish_*() calls */
unsigned long *finish_pbitmap;
};
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
-static void rmw_work(struct btrfs_work *work);
-static void read_rebuild_work(struct btrfs_work *work);
+static void rmw_work(struct work_struct *work);
+static void read_rebuild_work(struct work_struct *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
@@ -186,12 +206,12 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check);
-static void scrub_parity_work(struct btrfs_work *work);
+static void scrub_parity_work(struct work_struct *work);
-static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
+static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
- btrfs_init_work(&rbio->work, work_func, NULL, NULL);
- btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
+ INIT_WORK(&rbio->work, work_func);
+ queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
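
This is the standard workqueue shape the RAID56 code moves to; each handler now recovers its rbio with container_of(), a step the btrfs_work wrapper previously hid. A sketch of the handler side (body elided):

static void rmw_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio =
		container_of(work, struct btrfs_raid_bio, work);

	/* ... perform the read/modify/write for @rbio ... */
}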
/*
@@ -239,7 +259,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
/*
* caching an rbio means to copy anything from the
- * bio_pages array into the stripe_pages array. We
+ * bio_sectors array into the stripe_pages array. We
* use the page uptodate bit in the stripe cache array
* to indicate if it has valid data
*
@@ -255,12 +275,18 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
if (ret)
return;
- for (i = 0; i < rbio->nr_pages; i++) {
- if (!rbio->bio_pages[i])
+ for (i = 0; i < rbio->nr_sectors; i++) {
+ /* Some range not covered by bio (partial write), skip it */
+ if (!rbio->bio_sectors[i].page)
continue;
- copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
- SetPageUptodate(rbio->stripe_pages[i]);
+ ASSERT(rbio->stripe_sectors[i].page);
+ memcpy_page(rbio->stripe_sectors[i].page,
+ rbio->stripe_sectors[i].pgoff,
+ rbio->bio_sectors[i].page,
+ rbio->bio_sectors[i].pgoff,
+ rbio->bioc->fs_info->sectorsize);
+ rbio->stripe_sectors[i].uptodate = 1;
}
set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}
@@ -283,9 +309,50 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio)
return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
+static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
+ unsigned int page_nr)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ int i;
+
+ ASSERT(page_nr < rbio->nr_pages);
+
+ for (i = sectors_per_page * page_nr;
+ i < sectors_per_page * page_nr + sectors_per_page;
+ i++) {
+ if (!rbio->stripe_sectors[i].uptodate)
+ return false;
+ }
+ return true;
+}
+
/*
- * stealing an rbio means taking all the uptodate pages from the stripe
- * array in the source rbio and putting them into the destination rbio
+ * Update the stripe_sectors[] array to use correct page and pgoff
+ *
+ * Should be called every time any page pointer in stripe_pages[] is modified.
+ */
+static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ u32 offset;
+ int i;
+
+ for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
+ int page_index = offset >> PAGE_SHIFT;
+
+ ASSERT(page_index < rbio->nr_pages);
+ rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
+ rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
+ }
+}
+
+/*
+ * Stealing an rbio means taking all the uptodate pages from the stripe array
+ * in the source rbio and putting them into the destination rbio.
+ *
+ * This will also update the involved stripe_sectors[] which are referring to
+ * the old pages.
*/
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
@@ -298,9 +365,8 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
for (i = 0; i < dest->nr_pages; i++) {
s = src->stripe_pages[i];
- if (!s || !PageUptodate(s)) {
+ if (!s || !full_page_sectors_uptodate(src, i))
continue;
- }
d = dest->stripe_pages[i];
if (d)
@@ -309,6 +375,8 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
dest->stripe_pages[i] = s;
src->stripe_pages[i] = NULL;
}
+ index_stripe_sectors(dest);
+ index_stripe_sectors(src);
}
/*
@@ -600,39 +668,39 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
return 1;
}
-static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
- int index)
+static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr)
{
- return stripe * rbio->stripe_npages + index;
+ ASSERT(stripe_nr < rbio->real_stripes);
+ ASSERT(sector_nr < rbio->stripe_nsectors);
+
+ return stripe_nr * rbio->stripe_nsectors + sector_nr;
}
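
The new index helper is plain row-major math: with 64 KiB stripes and 4 KiB sectors (stripe_nsectors = 16), stripe 2 / sector 5 lands at linear index 37. A user-space check:

#include <stdio.h>

int main(void)
{
	unsigned int stripe_nsectors = 64 * 1024 / 4096;	/* 16 */
	unsigned int stripe_nr = 2, sector_nr = 5;

	printf("%u\n", stripe_nr * stripe_nsectors + sector_nr);  /* 37 */
	return 0;
}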
-/*
- * these are just the pages from the rbio array, not from anything
- * the FS sent down to us
- */
-static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
- int index)
+/* Return a sector from rbio->stripe_sectors, not from the bio list */
+static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr)
{
- return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
+ return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
+ sector_nr)];
}
-/*
- * helper to index into the pstripe
- */
-static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
+/* Grab a sector inside P stripe */
+static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr)
{
- return rbio_stripe_page(rbio, rbio->nr_data, index);
+ return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}
-/*
- * helper to index into the qstripe, returns null
- * if there is no qstripe
- */
-static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
+/* Grab a sector inside Q stripe, return NULL if not RAID6 */
+static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr)
{
if (rbio->nr_data + 1 == rbio->real_stripes)
return NULL;
- return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
+ return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}
/*
@@ -911,47 +979,43 @@ static void raid_write_end_io(struct bio *bio)
rbio_orig_end_io(rbio, err);
}
-/*
- * the read/modify/write code wants to use the original bio for
- * any pages it included, and then use the rbio for everything
- * else. This function decides if a given index (stripe number)
- * and page number in that stripe fall inside the original bio
- * or the rbio.
- *
- * if you set bio_list_only, you'll get a NULL back for any ranges
- * that are outside the bio_list
+/**
+ * Get a sector pointer specified by its @stripe_nr and @sector_nr
*
- * This doesn't take any refs on anything, you get a bare page pointer
- * and the caller must bump refs as required.
+ * @rbio: The raid bio
+ * @stripe_nr: Stripe number, valid range [0, real_stripe)
+ * @sector_nr: Sector number inside the stripe,
+ * valid range [0, stripe_nsectors)
+ * @bio_list_only: Whether to use sectors inside the bio list only.
*
- * You must call index_rbio_pages once before you can trust
- * the answers from this function.
+ * The read/modify/write code wants to reuse the original bio pages as much
+ * as possible, and only use stripe_sectors as fallback.
*/
-static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
- int index, int pagenr, int bio_list_only)
+static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
+ int stripe_nr, int sector_nr,
+ bool bio_list_only)
{
- int chunk_page;
- struct page *p = NULL;
+ struct sector_ptr *sector;
+ int index;
+
+ ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
+ ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
- chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
+ index = stripe_nr * rbio->stripe_nsectors + sector_nr;
+ ASSERT(index >= 0 && index < rbio->nr_sectors);
spin_lock_irq(&rbio->bio_list_lock);
- p = rbio->bio_pages[chunk_page];
+ sector = &rbio->bio_sectors[index];
+ if (sector->page || bio_list_only) {
+ /* Don't return sector without a valid page pointer */
+ if (!sector->page)
+ sector = NULL;
+ spin_unlock_irq(&rbio->bio_list_lock);
+ return sector;
+ }
spin_unlock_irq(&rbio->bio_list_lock);
- if (p || bio_list_only)
- return p;
-
- return rbio->stripe_pages[chunk_page];
-}
-
-/*
- * number of pages we need for the entire stripe across all the
- * drives
- */
-static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
-{
- return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
+ return &rbio->stripe_sectors[index];
}
/*
@@ -960,22 +1024,28 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
struct btrfs_io_context *bioc,
- u64 stripe_len)
+ u32 stripe_len)
{
+ const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
+ const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
+ const unsigned int num_pages = stripe_npages * real_stripes;
+ const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
+ const unsigned int num_sectors = stripe_nsectors * real_stripes;
struct btrfs_raid_bio *rbio;
int nr_data = 0;
- int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
- int num_pages = rbio_nr_pages(stripe_len, real_stripes);
- int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
void *p;
+ ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
+ /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
+ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
+
rbio = kzalloc(sizeof(*rbio) +
sizeof(*rbio->stripe_pages) * num_pages +
- sizeof(*rbio->bio_pages) * num_pages +
+ sizeof(*rbio->bio_sectors) * num_sectors +
+ sizeof(*rbio->stripe_sectors) * num_sectors +
sizeof(*rbio->finish_pointers) * real_stripes +
- sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
- sizeof(*rbio->finish_pbitmap) *
- BITS_TO_LONGS(stripe_npages),
+ sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) +
+ sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors),
GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
@@ -988,8 +1058,10 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
rbio->bioc = bioc;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
+ rbio->nr_sectors = num_sectors;
rbio->real_stripes = real_stripes;
rbio->stripe_npages = stripe_npages;
+ rbio->stripe_nsectors = stripe_nsectors;
rbio->faila = -1;
rbio->failb = -1;
refcount_set(&rbio->refs, 1);
@@ -997,8 +1069,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
atomic_set(&rbio->stripes_pending, 0);
/*
- * the stripe_pages, bio_pages, etc arrays point to the extra
- * memory we allocated past the end of the rbio
+ * The stripe_pages, bio_sectors, etc arrays point to the extra memory
+ * we allocated past the end of the rbio.
*/
p = rbio + 1;
#define CONSUME_ALLOC(ptr, count) do { \
@@ -1006,10 +1078,11 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
} while (0)
CONSUME_ALLOC(rbio->stripe_pages, num_pages);
- CONSUME_ALLOC(rbio->bio_pages, num_pages);
+ CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
+ CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
- CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
- CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
+ CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors));
+ CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors));
#undef CONSUME_ALLOC
if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
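
alloc_rbio() keeps the single-allocation layout: one kzalloc() holds the struct plus all trailing arrays, and CONSUME_ALLOC just carves the tail into typed slices. A user-space model of the same idea (two arrays only; names hypothetical):

#include <stdlib.h>

struct model {
	void **stripe_pages;	/* both point into the same allocation */
	void **bio_sectors;
};

static struct model *alloc_model(size_t num_pages, size_t num_sectors)
{
	struct model *m;
	unsigned char *p;

	m = calloc(1, sizeof(*m) +
		      sizeof(void *) * (num_pages + num_sectors));
	if (!m)
		return NULL;
	p = (unsigned char *)(m + 1);
	m->stripe_pages = (void **)p;
	p += sizeof(void *) * num_pages;
	m->bio_sectors = (void **)p;
	return m;	/* a single free(m) releases everything */
}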
@@ -1026,59 +1099,63 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
- int i;
- struct page *page;
+ int ret;
- for (i = 0; i < rbio->nr_pages; i++) {
- if (rbio->stripe_pages[i])
- continue;
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[i] = page;
- }
+ ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
+ if (ret < 0)
+ return ret;
+ /* Mapping all sectors */
+ index_stripe_sectors(rbio);
return 0;
}
/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
- int i;
- struct page *page;
+ const int data_pages = rbio->nr_data * rbio->stripe_npages;
+ int ret;
- i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
+ ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
+ rbio->stripe_pages + data_pages);
+ if (ret < 0)
+ return ret;
- for (; i < rbio->nr_pages; i++) {
- if (rbio->stripe_pages[i])
- continue;
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[i] = page;
- }
+ index_stripe_sectors(rbio);
return 0;
}
/*
- * add a single page from a specific stripe into our list of bios for IO
- * this will try to merge into existing bios if possible, and returns
- * zero if all went well.
+ * Add a single sector @sector into our list of bios for IO.
+ *
+ * Return 0 if everything went well.
+ * Return <0 for error.
*/
-static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
- struct bio_list *bio_list,
- struct page *page,
- int stripe_nr,
- unsigned long page_index,
- unsigned long bio_max_len)
-{
+static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list,
+ struct sector_ptr *sector,
+ unsigned int stripe_nr,
+ unsigned int sector_nr,
+ unsigned long bio_max_len,
+ unsigned int opf)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
struct bio *last = bio_list->tail;
int ret;
struct bio *bio;
struct btrfs_io_stripe *stripe;
u64 disk_start;
+ /*
+ * Note: here stripe_nr has taken device replace into consideration,
+ * thus it can be larger than rbio->real_stripes.
+ * So here we check against bioc->num_stripes, not rbio->real_stripes.
+ */
+ ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
+ ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+ ASSERT(sector->page);
+
stripe = &rbio->bioc->stripes[stripe_nr];
- disk_start = stripe->physical + (page_index << PAGE_SHIFT);
+ disk_start = stripe->physical + sector_nr * sectorsize;
/* if the device is missing, just fail this stripe */
if (!stripe->dev->bdev)
@@ -1095,20 +1172,20 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
*/
if (last_end == disk_start && !last->bi_status &&
last->bi_bdev == stripe->dev->bdev) {
- ret = bio_add_page(last, page, PAGE_SIZE, 0);
- if (ret == PAGE_SIZE)
+ ret = bio_add_page(last, sector->page, sectorsize,
+ sector->pgoff);
+ if (ret == sectorsize)
return 0;
}
}
/* put a new bio on the list */
- bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
- btrfs_bio(bio)->device = stripe->dev;
- bio->bi_iter.bi_size = 0;
- bio_set_dev(bio, stripe->dev->bdev);
+ bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL),
+ opf, GFP_NOFS);
bio->bi_iter.bi_sector = disk_start >> 9;
+ bio->bi_private = rbio;
- bio_add_page(bio, page, PAGE_SIZE, 0);
+ bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
bio_list_add(bio_list, bio);
return 0;
}
@@ -1130,6 +1207,32 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
}
}
+static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+ rbio->bioc->raid_map[0];
+
+ if (bio_flagged(bio, BIO_CLONED))
+ bio->bi_iter = btrfs_bio(bio)->iter;
+
+ bio_for_each_segment(bvec, bio, iter) {
+ u32 bvec_offset;
+
+ for (bvec_offset = 0; bvec_offset < bvec.bv_len;
+ bvec_offset += sectorsize, offset += sectorsize) {
+ int index = offset / sectorsize;
+ struct sector_ptr *sector = &rbio->bio_sectors[index];
+
+ sector->page = bvec.bv_page;
+ sector->pgoff = bvec.bv_offset + bvec_offset;
+ ASSERT(sector->pgoff < PAGE_SIZE);
+ }
+ }
+}
+
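
index_one_bio() walks each bvec in sector-sized steps, recording a (page, pgoff) pair per sector. The offset bookkeeping alone, modeled in user space (4 KiB sectors assumed; the real code also records the page pointer):

#include <stdio.h>

int main(void)
{
	const unsigned int sectorsize = 4096;
	/* A bvec covering 16 KiB, starting 8 KiB into the full stripe. */
	unsigned int offset = 8192, bv_offset = 0, bv_len = 16384;
	unsigned int o;

	for (o = 0; o < bv_len; o += sectorsize, offset += sectorsize)
		printf("sector index %u <- bvec offset %u\n",
		       offset / sectorsize, bv_offset + o);
	return 0;
}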
/*
* helper function to walk our bio list and populate the bio_pages array with
* the result. This seems expensive, but it is faster than constantly
@@ -1141,28 +1244,11 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
struct bio *bio;
- u64 start;
- unsigned long stripe_offset;
- unsigned long page_index;
spin_lock_irq(&rbio->bio_list_lock);
- bio_list_for_each(bio, &rbio->bio_list) {
- struct bio_vec bvec;
- struct bvec_iter iter;
- int i = 0;
+ bio_list_for_each(bio, &rbio->bio_list)
+ index_one_bio(rbio, bio);
- start = bio->bi_iter.bi_sector << 9;
- stripe_offset = start - rbio->bioc->raid_map[0];
- page_index = stripe_offset >> PAGE_SHIFT;
-
- if (bio_flagged(bio, BIO_CLONED))
- bio->bi_iter = btrfs_bio(bio)->iter;
-
- bio_for_each_segment(bvec, bio, iter) {
- rbio->bio_pages[page_index + i] = bvec.bv_page;
- i++;
- }
- }
spin_unlock_irq(&rbio->bio_list_lock);
}
@@ -1177,10 +1263,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
struct btrfs_io_context *bioc = rbio->bioc;
+ const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
int stripe;
- int pagenr;
+ int sectornr;
bool has_qstripe;
struct bio_list bio_list;
struct bio *bio;
@@ -1224,35 +1311,37 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
- /* first collect one page from each data stripe */
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
+ /* First collect one sector from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
- p = page_in_rbio(rbio, stripe, pagenr, 0);
- pointers[stripe] = kmap_local_page(p);
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
}
- /* then add the parity stripe */
- p = rbio_pstripe_page(rbio, pagenr);
- SetPageUptodate(p);
- pointers[stripe++] = kmap_local_page(p);
+ /* Then add the parity stripe */
+ sector = rbio_pstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
if (has_qstripe) {
-
/*
- * raid6, add the qstripe and call the
- * library function to fill in our p/q
+ * RAID6, add the qstripe and call the library function
+ * to fill in our p/q
*/
- p = rbio_qstripe_page(rbio, pagenr);
- SetPageUptodate(p);
- pointers[stripe++] = kmap_local_page(p);
+ sector = rbio_qstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) +
+ sector->pgoff;
- raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
} else {
/* raid5 */
- copy_page(pointers[nr_data], pointers[0]);
- run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
+ memcpy(pointers[nr_data], pointers[0], sectorsize);
+ run_xor(pointers + 1, nr_data - 1, sectorsize);
}
for (stripe = stripe - 1; stripe >= 0; stripe--)
kunmap_local(pointers[stripe]);
@@ -1264,18 +1353,20 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
* everything else.
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *page;
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
if (stripe < rbio->nr_data) {
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (!page)
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
continue;
} else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- ret = rbio_add_io_page(rbio, &bio_list,
- page, stripe, pagenr, rbio->stripe_len);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, rbio->stripe_len,
+ REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -1288,19 +1379,21 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
if (!bioc->tgtdev_map[stripe])
continue;
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *page;
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
if (stripe < rbio->nr_data) {
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (!page)
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
continue;
} else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- ret = rbio_add_io_page(rbio, &bio_list, page,
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
rbio->bioc->tgtdev_map[stripe],
- pagenr, rbio->stripe_len);
+ sectornr, rbio->stripe_len,
+ REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -1311,9 +1404,7 @@ write_data:
BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
- bio->bi_opf = REQ_OP_WRITE;
submit_bio(bio);
}
@@ -1417,18 +1508,48 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
}
/*
+ * For the subpage case, we can no longer set page Uptodate directly for
+ * stripe_pages[], thus we need to locate the sector.
+ */
+static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
+ struct page *page,
+ unsigned int pgoff)
+{
+ int i;
+
+ for (i = 0; i < rbio->nr_sectors; i++) {
+ struct sector_ptr *sector = &rbio->stripe_sectors[i];
+
+ if (sector->page == page && sector->pgoff == pgoff)
+ return sector;
+ }
+ return NULL;
+}
+
+/*
* this sets each page in the bio uptodate. It should only be used on private
* rbio pages, nothing that comes in from the higher layers
*/
-static void set_bio_pages_uptodate(struct bio *bio)
+static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all)
- SetPageUptodate(bvec->bv_page);
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct sector_ptr *sector;
+ int pgoff;
+
+ for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
+ pgoff += sectorsize) {
+ sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
+ ASSERT(sector);
+ if (sector)
+ sector->uptodate = 1;
+ }
+ }
}
/*
@@ -1446,7 +1567,7 @@ static void raid_rmw_end_io(struct bio *bio)
if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
- set_bio_pages_uptodate(bio);
+ set_bio_pages_uptodate(rbio, bio);
bio_put(bio);
@@ -1478,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int pagenr;
+ int sectornr;
int stripe;
struct bio *bio;
@@ -1496,28 +1617,30 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
* stripe
*/
for (stripe = 0; stripe < rbio->nr_data; stripe++) {
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *page;
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
/*
- * we want to find all the pages missing from
- * the rbio and read them from the disk. If
- * page_in_rbio finds a page in the bio list
- * we don't need to read it off the stripe.
+ * We want to find all the sectors missing from the
+ * rbio and read them from the disk. If sector_in_rbio()
+ * finds a sector in the bio list we don't need to read
+ * it off the stripe.
*/
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (page)
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
continue;
- page = rbio_stripe_page(rbio, stripe, pagenr);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
/*
- * the bio cache may have handed us an uptodate
- * page. If so, be happy and use it
+ * The bio cache may have handed us an uptodate page.
+ * If so, be happy and use it.
*/
- if (PageUptodate(page))
+ if (sector->uptodate)
continue;
- ret = rbio_add_io_page(rbio, &bio_list, page,
- stripe, pagenr, rbio->stripe_len);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ stripe, sectornr, rbio->stripe_len,
+ REQ_OP_READ);
if (ret)
goto cleanup;
}
@@ -1540,9 +1663,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
bio->bi_end_io = raid_rmw_end_io;
- bio->bi_opf = REQ_OP_READ;
btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
@@ -1624,7 +1745,7 @@ struct btrfs_plug_cb {
struct blk_plug_cb cb;
struct btrfs_fs_info *info;
struct list_head rbio_list;
- struct btrfs_work work;
+ struct work_struct work;
};
/*
@@ -1692,7 +1813,7 @@ static void run_plug(struct btrfs_plug_cb *plug)
* if the unplug comes from schedule, we have to push the
* work off to a helper thread
*/
-static void unplug_work(struct btrfs_work *work)
+static void unplug_work(struct work_struct *work)
{
struct btrfs_plug_cb *plug;
plug = container_of(work, struct btrfs_plug_cb, work);
@@ -1705,9 +1826,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
- btrfs_queue_work(plug->info->rmw_workers,
- &plug->work);
+ INIT_WORK(&plug->work, unplug_work);
+ queue_work(plug->info->rmw_workers, &plug->work);
return;
}
run_plug(plug);
@@ -1716,8 +1836,7 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
/*
* our main entry point for writes from the rest of the FS.
*/
-int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
- u64 stripe_len)
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
@@ -1772,14 +1891,18 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
*/
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
- int pagenr, stripe;
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ int sectornr, stripe;
void **pointers;
void **unmap_array;
int faila = -1, failb = -1;
- struct page *page;
blk_status_t err;
int i;
+ /*
+	 * This array stores the pointer for each sector; each pointer already
+	 * includes the sector's pgoff, so it addresses the sector data within
+	 * its page rather than the page start.
+ */
pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
if (!pointers) {
err = BLK_STS_RESOURCE;
@@ -1808,43 +1931,44 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
/*
* Now we just use bitmap to mark the horizontal stripes in
* which we have data when doing parity scrub.
*/
if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
- !test_bit(pagenr, rbio->dbitmap))
+ !test_bit(sectornr, rbio->dbitmap))
continue;
/*
- * Setup our array of pointers with pages from each stripe
+ * Setup our array of pointers with sectors from each stripe
*
* NOTE: store a duplicate array of pointers to preserve the
* pointer order
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
/*
- * if we're rebuilding a read, we have to use
+ * If we're rebuilding a read, we have to use
* pages from the bio list
*/
if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
(stripe == faila || stripe == failb)) {
- page = page_in_rbio(rbio, stripe, pagenr, 0);
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
} else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- pointers[stripe] = kmap_local_page(page);
+ ASSERT(sector->page);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
unmap_array[stripe] = pointers[stripe];
}
- /* all raid6 handling here */
+ /* All raid6 handling here */
if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
- /*
- * single failure, rebuild from parity raid5
- * style
- */
+ /* Single failure, rebuild from parity raid5 style */
if (failb < 0) {
if (faila == rbio->nr_data) {
/*
@@ -1887,10 +2011,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
raid6_datap_recov(rbio->real_stripes,
- PAGE_SIZE, faila, pointers);
+ sectorsize, faila, pointers);
} else {
raid6_2data_recov(rbio->real_stripes,
- PAGE_SIZE, faila, failb,
+ sectorsize, faila, failb,
pointers);
}
} else {
@@ -1900,7 +2024,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
BUG_ON(failb != -1);
pstripe:
/* Copy parity block into failed block to start with */
- copy_page(pointers[faila], pointers[rbio->nr_data]);
+ memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
/* rearrange the pointer array */
p = pointers[faila];
@@ -1909,7 +2033,7 @@ pstripe:
pointers[rbio->nr_data - 1] = p;
/* xor in the rest */
- run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
+ run_xor(pointers, rbio->nr_data - 1, sectorsize);
}
/* if we're doing this rebuild as part of an rmw, go through
* and set all of our private rbio pages in the
@@ -1918,14 +2042,14 @@ pstripe:
* other endio functions will fiddle the uptodate bits
*/
if (rbio->operation == BTRFS_RBIO_WRITE) {
- for (i = 0; i < rbio->stripe_npages; i++) {
+ for (i = 0; i < rbio->stripe_nsectors; i++) {
if (faila != -1) {
- page = rbio_stripe_page(rbio, faila, i);
- SetPageUptodate(page);
+ sector = rbio_stripe_sector(rbio, faila, i);
+ sector->uptodate = 1;
}
if (failb != -1) {
- page = rbio_stripe_page(rbio, failb, i);
- SetPageUptodate(page);
+ sector = rbio_stripe_sector(rbio, failb, i);
+ sector->uptodate = 1;
}
}
}
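With sector_ptr, every pointer mapped for reconstruction carries the sector's in-page offset, and the recovery helpers operate on sectorsize bytes instead of a whole page. A hedged sketch of the addressing used above (fields as in the patch):

	/* each pointer addresses one sectorsize-wide block, not a page */
	pointers[stripe] = kmap_local_page(sector->page) + sector->pgoff;
	...
	raid6_2data_recov(rbio->real_stripes, sectorsize, faila, failb,
			  pointers);

This is what makes the subpage case work: two 4KiB sectors sharing one 64KiB page get distinct pointers through pgoff.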
@@ -1998,7 +2122,7 @@ static void raid_recover_end_io(struct bio *bio)
if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
- set_bio_pages_uptodate(bio);
+ set_bio_pages_uptodate(rbio, bio);
bio_put(bio);
if (!atomic_dec_and_test(&rbio->stripes_pending))
@@ -2023,7 +2147,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int pagenr;
+ int sectornr;
int stripe;
struct bio *bio;
@@ -2046,20 +2170,20 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
continue;
}
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
/*
* the rmw code may have already read this
* page in
*/
- p = rbio_stripe_page(rbio, stripe, pagenr);
- if (PageUptodate(p))
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ if (sector->uptodate)
continue;
- ret = rbio_add_io_page(rbio, &bio_list,
- rbio_stripe_page(rbio, stripe, pagenr),
- stripe, pagenr, rbio->stripe_len);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ stripe, sectornr, rbio->stripe_len,
+ REQ_OP_READ);
if (ret < 0)
goto cleanup;
}
@@ -2086,9 +2210,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
bio->bi_end_io = raid_recover_end_io;
- bio->bi_opf = REQ_OP_READ;
btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
@@ -2115,7 +2237,7 @@ cleanup:
* of the drive.
*/
int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- u64 stripe_len, int mirror_num, int generic_io)
+ u32 stripe_len, int mirror_num, int generic_io)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
@@ -2193,7 +2315,7 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
}
-static void rmw_work(struct btrfs_work *work)
+static void rmw_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
@@ -2201,7 +2323,7 @@ static void rmw_work(struct btrfs_work *work)
raid56_rmw_stripe(rbio);
}
-static void read_rebuild_work(struct btrfs_work *work)
+static void read_rebuild_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
@@ -2221,7 +2343,7 @@ static void read_rebuild_work(struct btrfs_work *work)
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
struct btrfs_io_context *bioc,
- u64 stripe_len, struct btrfs_device *scrub_dev,
+ u32 stripe_len, struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
@@ -2252,9 +2374,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
}
ASSERT(i < rbio->real_stripes);
- /* Now we just support the sectorsize equals to page size */
- ASSERT(fs_info->sectorsize == PAGE_SIZE);
- ASSERT(rbio->stripe_npages == stripe_nsectors);
bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
/*
@@ -2268,17 +2387,19 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
/* Used for both parity scrub and missing. */
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
- u64 logical)
+ unsigned int pgoff, u64 logical)
{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
int stripe_offset;
int index;
ASSERT(logical >= rbio->bioc->raid_map[0]);
- ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
+ ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
rbio->stripe_len * rbio->nr_data);
stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
- index = stripe_offset >> PAGE_SHIFT;
- rbio->bio_pages[index] = page;
+ index = stripe_offset / sectorsize;
+ rbio->bio_sectors[index].page = page;
+ rbio->bio_sectors[index].pgoff = pgoff;
}
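The new index computation is the sector-size generalization of the old page shift. A worked example, assuming a 4KiB sectorsize, 64KiB pages and raid_map[0] at the logical start:

	stripe_offset = logical - raid_map[0];	/* say 20KiB = 20480 */
	index = stripe_offset / sectorsize;	/* 20480 / 4096 = 5 */
	/* old code: stripe_offset >> PAGE_SHIFT = 20480 >> 16 = 0, which
	 * would collapse every sector of a 64KiB page into one slot */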
/*
@@ -2287,14 +2408,16 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
*/
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
- int i;
- int bit;
- int index;
- struct page *page;
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ int stripe;
+ int sectornr;
+
+ for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ struct page *page;
+ int index = (stripe * rbio->stripe_nsectors + sectornr) *
+ sectorsize >> PAGE_SHIFT;
- for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
- for (i = 0; i < rbio->real_stripes; i++) {
- index = i * rbio->stripe_npages + bit;
if (rbio->stripe_pages[index])
continue;
@@ -2304,6 +2427,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
rbio->stripe_pages[index] = page;
}
}
+ index_stripe_sectors(rbio);
return 0;
}
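Several sectors can now share one stripe page, so the per-sector position is scaled back to a page index before allocation. A worked example under the same assumed geometry (4KiB sectors, 64KiB pages, stripe_nsectors = 16):

	/* stripe 1, sectornr 3:
	 * byte offset = (1 * 16 + 3) * 4096 = 77824
	 * page index  = 77824 >> 16 = 1
	 * All 16 sectors of stripe 1 fall into stripe_pages[1]; the final
	 * index_stripe_sectors() call rebuilds each sector's page/pgoff pair.
	 */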
@@ -2311,14 +2435,15 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
struct btrfs_io_context *bioc = rbio->bioc;
+ const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
unsigned long *pbitmap = rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
int stripe;
- int pagenr;
+ int sectornr;
bool has_qstripe;
- struct page *p_page = NULL;
- struct page *q_page = NULL;
+ struct sector_ptr p_sector = { 0 };
+ struct sector_ptr q_sector = { 0 };
struct bio_list bio_list;
struct bio *bio;
int is_replace = 0;
@@ -2335,7 +2460,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
- bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
+ bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors);
}
/*
@@ -2348,54 +2473,59 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (!need_check)
goto writeback;
- p_page = alloc_page(GFP_NOFS);
- if (!p_page)
+ p_sector.page = alloc_page(GFP_NOFS);
+ if (!p_sector.page)
goto cleanup;
- SetPageUptodate(p_page);
+ p_sector.pgoff = 0;
+ p_sector.uptodate = 1;
if (has_qstripe) {
/* RAID6, allocate and map temp space for the Q stripe */
- q_page = alloc_page(GFP_NOFS);
- if (!q_page) {
- __free_page(p_page);
+ q_sector.page = alloc_page(GFP_NOFS);
+ if (!q_sector.page) {
+ __free_page(p_sector.page);
+ p_sector.page = NULL;
goto cleanup;
}
- SetPageUptodate(q_page);
- pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
+ q_sector.pgoff = 0;
+ q_sector.uptodate = 1;
+ pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
}
atomic_set(&rbio->error, 0);
/* Map the parity stripe just once */
- pointers[nr_data] = kmap_local_page(p_page);
+ pointers[nr_data] = kmap_local_page(p_sector.page);
- for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
- struct page *p;
+ for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
void *parity;
+
		/* first collect one sector from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
- p = page_in_rbio(rbio, stripe, pagenr, 0);
- pointers[stripe] = kmap_local_page(p);
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
}
if (has_qstripe) {
/* RAID6, call the library function to fill in our P/Q */
- raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
} else {
/* raid5 */
- copy_page(pointers[nr_data], pointers[0]);
- run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
+ memcpy(pointers[nr_data], pointers[0], sectorsize);
+ run_xor(pointers + 1, nr_data - 1, sectorsize);
}
/* Check scrubbing parity and repair it */
- p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- parity = kmap_local_page(p);
- if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
- copy_page(parity, pointers[rbio->scrubp]);
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ parity = kmap_local_page(sector->page) + sector->pgoff;
+ if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
+ memcpy(parity, pointers[rbio->scrubp], sectorsize);
else
			/* Parity is right, no need to write it back */
- bitmap_clear(rbio->dbitmap, pagenr, 1);
+ bitmap_clear(rbio->dbitmap, sectornr, 1);
kunmap_local(parity);
for (stripe = nr_data - 1; stripe >= 0; stripe--)
@@ -2403,10 +2533,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
}
kunmap_local(pointers[nr_data]);
- __free_page(p_page);
- if (q_page) {
+ __free_page(p_sector.page);
+ p_sector.page = NULL;
+ if (q_sector.page) {
kunmap_local(pointers[rbio->real_stripes - 1]);
- __free_page(q_page);
+ __free_page(q_sector.page);
+ q_sector.page = NULL;
}
writeback:
@@ -2415,12 +2547,12 @@ writeback:
* higher layers (the bio_list in our rbio) and our p/q. Ignore
* everything else.
*/
- for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
- struct page *page;
+ for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
- page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- ret = rbio_add_io_page(rbio, &bio_list,
- page, rbio->scrubp, pagenr, rbio->stripe_len);
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
+ sectornr, rbio->stripe_len, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -2428,13 +2560,13 @@ writeback:
if (!is_replace)
goto submit_write;
- for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
- struct page *page;
+ for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
- page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- ret = rbio_add_io_page(rbio, &bio_list, page,
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
bioc->tgtdev_map[rbio->scrubp],
- pagenr, rbio->stripe_len);
+ sectornr, rbio->stripe_len, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -2450,9 +2582,7 @@ submit_write:
atomic_set(&rbio->stripes_pending, nr_data);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
- bio->bi_opf = REQ_OP_WRITE;
submit_bio(bio);
}
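Both parity paths in finish_parity_scrub() now work on one sector column at a time. Reading run_xor(ptrs, n, len) as XOR-ing ptrs[0..n-1] into ptrs[n] (its use throughout this file), the RAID5 branch computes parity over nr_data sectors as:

	memcpy(pointers[nr_data], pointers[0], sectorsize);	/* P  = D0 */
	run_xor(pointers + 1, nr_data - 1, sectorsize);		/* P ^= D1..D(n-1) */

since (pointers + 1)[nr_data - 1] is pointers[nr_data], the parity slot; RAID6 instead hands the same sectorsize-wide column to raid6_call.gen_syndrome().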
@@ -2548,7 +2678,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio)
if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
- set_bio_pages_uptodate(bio);
+ set_bio_pages_uptodate(rbio, bio);
bio_put(bio);
@@ -2568,7 +2698,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int pagenr;
+ int sectornr;
int stripe;
struct bio *bio;
@@ -2584,28 +2714,29 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
* stripe
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
- struct page *page;
+		for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
/*
- * we want to find all the pages missing from
- * the rbio and read them from the disk. If
- * page_in_rbio finds a page in the bio list
- * we don't need to read it off the stripe.
+ * We want to find all the sectors missing from the
+			 * rbio and read them from the disk. If sector_in_rbio()
+ * finds a sector in the bio list we don't need to read
+ * it off the stripe.
*/
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (page)
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
continue;
- page = rbio_stripe_page(rbio, stripe, pagenr);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
/*
- * the bio cache may have handed us an uptodate
- * page. If so, be happy and use it
+ * The bio cache may have handed us an uptodate sector.
+ * If so, be happy and use it.
*/
- if (PageUptodate(page))
+ if (sector->uptodate)
continue;
- ret = rbio_add_io_page(rbio, &bio_list, page,
- stripe, pagenr, rbio->stripe_len);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ stripe, sectornr, rbio->stripe_len,
+ REQ_OP_READ);
if (ret)
goto cleanup;
}
@@ -2628,9 +2759,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
bio->bi_end_io = raid56_parity_scrub_end_io;
- bio->bi_opf = REQ_OP_READ;
btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
@@ -2651,7 +2780,7 @@ finish:
validate_rbio_for_parity_scrub(rbio);
}
-static void scrub_parity_work(struct btrfs_work *work)
+static void scrub_parity_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 72c00fc284b5..aaad08aefd7d 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -31,15 +31,14 @@ struct btrfs_raid_bio;
struct btrfs_device;
int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- u64 stripe_len, int mirror_num, int generic_io);
-int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
- u64 stripe_len);
+ u32 stripe_len, int mirror_num, int generic_io);
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
- u64 logical);
+ unsigned int pgoff, u64 logical);
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
- struct btrfs_io_context *bioc, u64 stripe_len,
+ struct btrfs_io_context *bioc, u32 stripe_len,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 998e3f180d90..c39f8b3a5a4a 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -614,14 +614,23 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
+ u64 range1_end = loff1 + len - 1;
+ u64 range2_end = loff2 + len - 1;
+
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
+ swap(range1_end, range2_end);
} else if (inode1 == inode2 && loff2 < loff1) {
swap(loff1, loff2);
+ swap(range1_end, range2_end);
}
- lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+
+ lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end);
+ lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end);
+
+ btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
+ btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
}
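The helper orders the two locks by inode address, falling back to offset order within a single inode, so concurrent reflinks on the same pair always take the extent locks in the same order. A hedged usage sketch:

	/* clone len bytes; both extent ranges stay locked for the copy */
	btrfs_double_extent_lock(src, src_off, dst, dst_off, len);
	/* ... reflink the extents ... */
	btrfs_double_extent_unlock(src, src_off, dst, dst_off, len);

Both ranges span the same len, so swapping the precomputed ends together with the offsets keeps each (start, end) pair intact for the lock and assertion calls.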
static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
@@ -771,7 +780,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
- bool same_inode = inode_out == inode_in;
u64 wb_len;
int ret;
@@ -810,15 +818,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
wb_len = ALIGN(*len, bs);
/*
- * Since we don't lock ranges, wait for ongoing lockless dio writes (as
- * any in progress could create its ordered extents after we wait for
- * existing ordered extents below).
- */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- /*
* Workaround to make sure NOCOW buffered write reach disk as NOCOW.
*
* Btrfs' back references do not have a block level granularity, they
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index fdc2c4b411f0..edddd93d2118 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -362,7 +362,7 @@ struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
- root = (struct btrfs_root *)node->data;
+ root = node->data;
}
spin_unlock(&rc->reloc_root_tree.lock);
return btrfs_grab_root(root);
@@ -2997,7 +2997,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
/* Reserve metadata for this range */
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- clamped_len, clamped_len);
+ clamped_len, clamped_len,
+ false);
if (ret)
goto release_page;
@@ -3845,8 +3846,7 @@ out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (err) {
- if (inode)
- iput(inode);
+ iput(inode);
inode = ERR_PTR(err);
}
return inode;
@@ -3977,6 +3977,17 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
if (!bg)
return -ENOENT;
+ /*
+ * Relocation of a data block group creates ordered extents. Without
+ * sb_start_write(), we can freeze the filesystem while unfinished
+ * ordered extents are left. Such ordered extents can cause a deadlock
+ * e.g. when syncfs() is waiting for their completion but they can't
+ * finish because they block when joining a transaction, due to the
+ * fact that the freeze locks are being held in write mode.
+ */
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+ ASSERT(sb_write_started(fs_info->sb));
+
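The assertion encodes the freeze-safety rule: a caller relocating a data block group must already hold an active write section. In hedged sketch form, the expected VFS pairing is:

	sb_start_write(fs_info->sb);	/* freeze protection */
	ret = btrfs_relocate_block_group(fs_info, group_start);
	sb_end_write(fs_info->sb);

so a concurrent freeze waits in the freezing path until relocation's ordered extents have been able to complete.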
if (btrfs_pinned_by_swapfile(fs_info, bg)) {
btrfs_put_block_group(bg);
return -ETXTBSY;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index ca7426ef61c8..a64b26b16904 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -509,7 +509,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
/* One for parent inode, two for dir entries */
qgroup_num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_prealloc(root,
- qgroup_num_bytes, true);
+ qgroup_num_bytes, true,
+ false);
if (ret)
return ret;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 8cd713d37ad2..e7b0323e6efd 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -45,14 +45,14 @@ struct scrub_ctx;
* operations. The first one configures an upper limit for the number
* of (dynamically allocated) pages that are added to a bio.
*/
-#define SCRUB_PAGES_PER_BIO 32 /* 128KiB per bio for x86 */
-#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for x86 */
+#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */
+#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */
/*
* The following value times PAGE_SIZE needs to be large enough to match the
* largest node/leaf/sector size that shall be supported.
*/
-#define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
+#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
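For reference: BTRFS_MAX_METADATA_BLOCKSIZE is 64KiB, so the renamed constant evaluates to SCRUB_MAX_SECTORS_PER_BLOCK = 64KiB / 4KiB = 16. The value is unchanged from the old SCRUB_MAX_PAGES_PER_BLOCK; only the unit being counted is now a sector.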
struct scrub_recover {
refcount_t refs;
@@ -60,7 +60,7 @@ struct scrub_recover {
u64 map_length;
};
-struct scrub_page {
+struct scrub_sector {
struct scrub_block *sblock;
struct page *page;
struct btrfs_device *dev;
@@ -87,16 +87,16 @@ struct scrub_bio {
blk_status_t status;
u64 logical;
u64 physical;
- struct scrub_page *pagev[SCRUB_PAGES_PER_BIO];
- int page_count;
+ struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
+ int sector_count;
int next_free;
- struct btrfs_work work;
+ struct work_struct work;
};
struct scrub_block {
- struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
- int page_count;
- atomic_t outstanding_pages;
+ struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
+ int sector_count;
+ atomic_t outstanding_sectors;
refcount_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
struct scrub_parity *sparity;
@@ -110,7 +110,7 @@ struct scrub_block {
/* It is for the data with checksum */
unsigned int data_corrected:1;
};
- struct btrfs_work work;
+ struct work_struct work;
};
/* Used for the chunks with parity stripe such as RAID5/6 */
@@ -129,10 +129,10 @@ struct scrub_parity {
refcount_t refs;
- struct list_head spages;
+ struct list_head sectors_list;
/* Work of parity check and repair */
- struct btrfs_work work;
+ struct work_struct work;
/* Mark the parity blocks which have data */
unsigned long *dbitmap;
@@ -158,7 +158,7 @@ struct scrub_ctx {
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
- int pages_per_bio;
+ int sectors_per_bio;
/* State of IO submission throttling affecting the associated device */
ktime_t throttle_deadline;
@@ -212,43 +212,43 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good);
-static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
- int page_num, int force_write);
+ int sector_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
-static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
- int page_num);
+static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
+ int sector_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
-static void scrub_page_get(struct scrub_page *spage);
-static void scrub_page_put(struct scrub_page *spage);
+static void scrub_sector_get(struct scrub_sector *sector);
+static void scrub_sector_put(struct scrub_sector *sector);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
-static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
- u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u8 *csum,
- u64 physical_for_dev_replace);
+static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u8 *csum,
+ u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
-static void scrub_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_bio_end_io_worker(struct work_struct *work);
static void scrub_block_complete(struct scrub_block *sblock);
-static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u32 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num);
-static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage);
+static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u32 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num);
+static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
-static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_wr_bio_end_io_worker(struct work_struct *work);
static void scrub_put_ctx(struct scrub_ctx *sctx);
-static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
+static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
{
- return spage->recover &&
- (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ return sector->recover &&
+ (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -535,9 +535,9 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
if (sctx->curr != -1) {
struct scrub_bio *sbio = sctx->bios[sctx->curr];
- for (i = 0; i < sbio->page_count; i++) {
- WARN_ON(!sbio->pagev[i]->page);
- scrub_block_put(sbio->pagev[i]->sblock);
+ for (i = 0; i < sbio->sector_count; i++) {
+ WARN_ON(!sbio->sectors[i]->page);
+ scrub_block_put(sbio->sectors[i]->sblock);
}
bio_put(sbio->bio);
}
@@ -572,7 +572,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
goto nomem;
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
- sctx->pages_per_bio = SCRUB_PAGES_PER_BIO;
+ sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
sctx->curr = -1;
sctx->fs_info = fs_info;
INIT_LIST_HEAD(&sctx->csum_list);
@@ -586,9 +586,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
sbio->index = i;
sbio->sctx = sctx;
- sbio->page_count = 0;
- btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
- NULL);
+ sbio->sector_count = 0;
+ INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
@@ -728,16 +727,16 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
u8 ref_level = 0;
int ret;
- WARN_ON(sblock->page_count < 1);
- dev = sblock->pagev[0]->dev;
+ WARN_ON(sblock->sector_count < 1);
+ dev = sblock->sectors[0]->dev;
fs_info = sblock->sctx->fs_info;
path = btrfs_alloc_path();
if (!path)
return;
- swarn.physical = sblock->pagev[0]->physical;
- swarn.logical = sblock->pagev[0]->logical;
+ swarn.physical = sblock->sectors[0]->physical;
+ swarn.logical = sblock->sectors[0]->logical;
swarn.errstr = errstr;
swarn.dev = NULL;
@@ -798,8 +797,8 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
/*
* scrub_handle_errored_block gets called when either verification of the
- * pages failed or the bio failed to read, e.g. with EIO. In the latter
- * case, this function handles all pages in the bio, even though only one
+ * sectors failed or the bio failed to read, e.g. with EIO. In the latter
+ * case, this function handles all sectors in the bio, even though only one
* may be bad.
* The goal of this function is to repair the errored block by using the
* contents of one of the mirrors.
@@ -817,16 +816,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
struct scrub_block *sblock_bad;
int ret;
int mirror_index;
- int page_num;
+ int sector_num;
int success;
bool full_stripe_locked;
unsigned int nofs_flag;
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- BUG_ON(sblock_to_check->page_count < 1);
+ BUG_ON(sblock_to_check->sector_count < 1);
fs_info = sctx->fs_info;
- if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
/*
* if we find an error in a super block, we just report it.
* They will get written with the next transaction commit
@@ -837,13 +836,13 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_unlock(&sctx->stat_lock);
return 0;
}
- logical = sblock_to_check->pagev[0]->logical;
- BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
- failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
- is_metadata = !(sblock_to_check->pagev[0]->flags &
+ logical = sblock_to_check->sectors[0]->logical;
+ BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
+ failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
+ is_metadata = !(sblock_to_check->sectors[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
- have_csum = sblock_to_check->pagev[0]->have_csum;
- dev = sblock_to_check->pagev[0]->dev;
+ have_csum = sblock_to_check->sectors[0]->have_csum;
+ dev = sblock_to_check->sectors[0]->dev;
if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
return 0;
@@ -854,7 +853,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* might be waiting the scrub task to pause (which needs to wait for all
* the worker tasks to complete before pausing).
* We do allocations in the workers through insert_full_stripe_lock()
- * and scrub_add_page_to_wr_bio(), which happens down the call chain of
+ * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
* this function.
*/
nofs_flag = memalloc_nofs_save();
@@ -918,7 +917,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
goto out;
}
- /* setup the context, map the logical blocks and alloc the pages */
+ /* Setup the context, map the logical blocks and alloc the sectors */
ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
if (ret) {
spin_lock(&sctx->stat_lock);
@@ -937,7 +936,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
/*
- * the error disappeared after reading page by page, or
+ * The error disappeared after reading sector by sector, or
* the area was part of a huge bio and other parts of the
* bio caused I/O errors, or the block layer merged several
* read requests into one and the error is caused by a
@@ -998,10 +997,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* that is known to contain an error is rewritten. Afterwards
* the block is known to be corrected.
* If a mirror is found which is completely correct, and no
- * checksum is present, only those pages are rewritten that had
+ * checksum is present, only those sectors are rewritten that had
* an I/O error in the block to be repaired, since it cannot be
- * determined, which copy of the other pages is better (and it
- * could happen otherwise that a correct page would be
+ * determined, which copy of the other sectors is better (and it
+ * could happen otherwise that a correct sector would be
* overwritten by a bad one).
*/
for (mirror_index = 0; ;mirror_index++) {
@@ -1011,25 +1010,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
continue;
/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
- if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
+ if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
if (mirror_index >= BTRFS_MAX_MIRRORS)
break;
- if (!sblocks_for_recheck[mirror_index].page_count)
+ if (!sblocks_for_recheck[mirror_index].sector_count)
break;
sblock_other = sblocks_for_recheck + mirror_index;
} else {
- struct scrub_recover *r = sblock_bad->pagev[0]->recover;
+ struct scrub_recover *r = sblock_bad->sectors[0]->recover;
int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
if (mirror_index >= max_allowed)
break;
- if (!sblocks_for_recheck[1].page_count)
+ if (!sblocks_for_recheck[1].sector_count)
break;
ASSERT(failed_mirror_index == 0);
sblock_other = sblocks_for_recheck + 1;
- sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
+ sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
}
/* build and submit the bios, check checksums */
@@ -1078,16 +1077,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* area are unreadable.
*/
success = 1;
- for (page_num = 0; page_num < sblock_bad->page_count;
- page_num++) {
- struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
+ for (sector_num = 0; sector_num < sblock_bad->sector_count;
+ sector_num++) {
+ struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
struct scrub_block *sblock_other = NULL;
- /* skip no-io-error page in scrub */
- if (!spage_bad->io_error && !sctx->is_dev_replace)
+ /* Skip no-io-error sectors in scrub */
+ if (!sector_bad->io_error && !sctx->is_dev_replace)
continue;
- if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
+ if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
/*
* In case of dev replace, if raid56 rebuild process
* didn't work out correct data, then copy the content
@@ -1096,14 +1095,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* sblock_for_recheck array to target device.
*/
sblock_other = NULL;
- } else if (spage_bad->io_error) {
- /* try to find no-io-error page in mirrors */
+ } else if (sector_bad->io_error) {
+ /* Try to find no-io-error sector in mirrors */
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].page_count > 0;
+ sblocks_for_recheck[mirror_index].sector_count > 0;
mirror_index++) {
if (!sblocks_for_recheck[mirror_index].
- pagev[page_num]->io_error) {
+ sectors[sector_num]->io_error) {
sblock_other = sblocks_for_recheck +
mirror_index;
break;
@@ -1115,27 +1114,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (sctx->is_dev_replace) {
/*
- * did not find a mirror to fetch the page
- * from. scrub_write_page_to_dev_replace()
- * handles this case (page->io_error), by
- * filling the block with zeros before
- * submitting the write request
+ * Did not find a mirror to fetch the sector from.
+ * scrub_write_sector_to_dev_replace() handles this
+ * case (sector->io_error), by filling the block with
+ * zeros before submitting the write request
*/
if (!sblock_other)
sblock_other = sblock_bad;
- if (scrub_write_page_to_dev_replace(sblock_other,
- page_num) != 0) {
+ if (scrub_write_sector_to_dev_replace(sblock_other,
+ sector_num) != 0) {
atomic64_inc(
&fs_info->dev_replace.num_write_errors);
success = 0;
}
} else if (sblock_other) {
- ret = scrub_repair_page_from_good_copy(sblock_bad,
- sblock_other,
- page_num, 0);
+ ret = scrub_repair_sector_from_good_copy(sblock_bad,
+ sblock_other,
+ sector_num, 0);
if (0 == ret)
- spage_bad->io_error = 0;
+ sector_bad->io_error = 0;
else
success = 0;
}
@@ -1186,18 +1184,16 @@ out:
struct scrub_block *sblock = sblocks_for_recheck +
mirror_index;
struct scrub_recover *recover;
- int page_index;
+ int i;
- for (page_index = 0; page_index < sblock->page_count;
- page_index++) {
- sblock->pagev[page_index]->sblock = NULL;
- recover = sblock->pagev[page_index]->recover;
+ for (i = 0; i < sblock->sector_count; i++) {
+ sblock->sectors[i]->sblock = NULL;
+ recover = sblock->sectors[i]->recover;
if (recover) {
scrub_put_recover(fs_info, recover);
- sblock->pagev[page_index]->recover =
- NULL;
+ sblock->sectors[i]->recover = NULL;
}
- scrub_page_put(sblock->pagev[page_index]);
+ scrub_sector_put(sblock->sectors[i]);
}
}
kfree(sblocks_for_recheck);
@@ -1255,26 +1251,25 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
{
struct scrub_ctx *sctx = original_sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 length = original_sblock->page_count * fs_info->sectorsize;
- u64 logical = original_sblock->pagev[0]->logical;
- u64 generation = original_sblock->pagev[0]->generation;
- u64 flags = original_sblock->pagev[0]->flags;
- u64 have_csum = original_sblock->pagev[0]->have_csum;
+ u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
+ u64 logical = original_sblock->sectors[0]->logical;
+ u64 generation = original_sblock->sectors[0]->generation;
+ u64 flags = original_sblock->sectors[0]->flags;
+ u64 have_csum = original_sblock->sectors[0]->have_csum;
struct scrub_recover *recover;
struct btrfs_io_context *bioc;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
int stripe_index;
- int page_index = 0;
+ int sector_index = 0;
int mirror_index;
int nmirrors;
int ret;
/*
- * note: the two members refs and outstanding_pages
- * are not used (and not set) in the blocks that are used for
- * the recheck procedure
+ * Note: the two members refs and outstanding_sectors are not used (and
+ * not set) in the blocks that are used for the recheck procedure.
*/
while (length > 0) {
@@ -1306,20 +1301,20 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
recover->bioc = bioc;
recover->map_length = mapped_length;
- ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK);
+ ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
- struct scrub_page *spage;
+ struct scrub_sector *sector;
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
- spage = kzalloc(sizeof(*spage), GFP_NOFS);
- if (!spage) {
+ sector = kzalloc(sizeof(*sector), GFP_NOFS);
+ if (!sector) {
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -1327,16 +1322,16 @@ leave_nomem:
scrub_put_recover(fs_info, recover);
return -ENOMEM;
}
- scrub_page_get(spage);
- sblock->pagev[page_index] = spage;
- spage->sblock = sblock;
- spage->flags = flags;
- spage->generation = generation;
- spage->logical = logical;
- spage->have_csum = have_csum;
+ scrub_sector_get(sector);
+ sblock->sectors[sector_index] = sector;
+ sector->sblock = sblock;
+ sector->flags = flags;
+ sector->generation = generation;
+ sector->logical = logical;
+ sector->have_csum = have_csum;
if (have_csum)
- memcpy(spage->csum,
- original_sblock->pagev[0]->csum,
+ memcpy(sector->csum,
+ original_sblock->sectors[0]->csum,
sctx->fs_info->csum_size);
scrub_stripe_index_and_offset(logical,
@@ -1348,28 +1343,28 @@ leave_nomem:
mirror_index,
&stripe_index,
&stripe_offset);
- spage->physical = bioc->stripes[stripe_index].physical +
+ sector->physical = bioc->stripes[stripe_index].physical +
stripe_offset;
- spage->dev = bioc->stripes[stripe_index].dev;
+ sector->dev = bioc->stripes[stripe_index].dev;
- BUG_ON(page_index >= original_sblock->page_count);
- spage->physical_for_dev_replace =
- original_sblock->pagev[page_index]->
+ BUG_ON(sector_index >= original_sblock->sector_count);
+ sector->physical_for_dev_replace =
+ original_sblock->sectors[sector_index]->
physical_for_dev_replace;
- /* for missing devices, dev->bdev is NULL */
- spage->mirror_num = mirror_index + 1;
- sblock->page_count++;
- spage->page = alloc_page(GFP_NOFS);
- if (!spage->page)
+ /* For missing devices, dev->bdev is NULL */
+ sector->mirror_num = mirror_index + 1;
+ sblock->sector_count++;
+ sector->page = alloc_page(GFP_NOFS);
+ if (!sector->page)
goto leave_nomem;
scrub_get_recover(recover);
- spage->recover = recover;
+ sector->recover = recover;
}
scrub_put_recover(fs_info, recover);
length -= sublen;
logical += sublen;
- page_index++;
+ sector_index++;
}
return 0;
@@ -1382,19 +1377,19 @@ static void scrub_bio_wait_endio(struct bio *bio)
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct bio *bio,
- struct scrub_page *spage)
+ struct scrub_sector *sector)
{
DECLARE_COMPLETION_ONSTACK(done);
int ret;
int mirror_num;
- bio->bi_iter.bi_sector = spage->logical >> 9;
+ bio->bi_iter.bi_sector = sector->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
- mirror_num = spage->sblock->pagev[0]->mirror_num;
- ret = raid56_parity_recover(bio, spage->recover->bioc,
- spage->recover->map_length,
+ mirror_num = sector->sblock->sectors[0]->mirror_num;
+ ret = raid56_parity_recover(bio, sector->recover->bioc,
+ sector->recover->map_length,
mirror_num, 0);
if (ret)
return ret;
@@ -1406,26 +1401,25 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock)
{
- struct scrub_page *first_page = sblock->pagev[0];
+ struct scrub_sector *first_sector = sblock->sectors[0];
struct bio *bio;
- int page_num;
+ int i;
- /* All pages in sblock belong to the same stripe on the same device. */
- ASSERT(first_page->dev);
- if (!first_page->dev->bdev)
+ /* All sectors in sblock belong to the same stripe on the same device. */
+ ASSERT(first_sector->dev);
+ if (!first_sector->dev->bdev)
goto out;
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
- bio_set_dev(bio, first_page->dev->bdev);
+ bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
- for (page_num = 0; page_num < sblock->page_count; page_num++) {
- struct scrub_page *spage = sblock->pagev[page_num];
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
- WARN_ON(!spage->page);
- bio_add_page(bio, spage->page, PAGE_SIZE, 0);
+ WARN_ON(!sector->page);
+ bio_add_page(bio, sector->page, PAGE_SIZE, 0);
}
- if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
+ if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
bio_put(bio);
goto out;
}
@@ -1436,65 +1430,63 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
return;
out:
- for (page_num = 0; page_num < sblock->page_count; page_num++)
- sblock->pagev[page_num]->io_error = 1;
+ for (i = 0; i < sblock->sector_count; i++)
+ sblock->sectors[i]->io_error = 1;
sblock->no_io_error_seen = 0;
}
/*
- * this function will check the on disk data for checksum errors, header
- * errors and read I/O errors. If any I/O errors happen, the exact pages
- * which are errored are marked as being bad. The goal is to enable scrub
- * to take those pages that are not errored from all the mirrors so that
- * the pages that are errored in the just handled mirror can be repaired.
+ * This function will check the on disk data for checksum errors, header errors
+ * and read I/O errors. If any I/O errors happen, the exact sectors which are
+ * errored are marked as being bad. The goal is to enable scrub to take those
+ * sectors that are not errored from all the mirrors so that the sectors that
+ * are errored in the just handled mirror can be repaired.
*/
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int retry_failed_mirror)
{
- int page_num;
+ int i;
sblock->no_io_error_seen = 1;
/* short cut for raid56 */
- if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
+ if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
return scrub_recheck_block_on_raid56(fs_info, sblock);
- for (page_num = 0; page_num < sblock->page_count; page_num++) {
- struct bio *bio;
- struct scrub_page *spage = sblock->pagev[page_num];
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
+ struct bio bio;
+ struct bio_vec bvec;
- if (spage->dev->bdev == NULL) {
- spage->io_error = 1;
+ if (sector->dev->bdev == NULL) {
+ sector->io_error = 1;
sblock->no_io_error_seen = 0;
continue;
}
- WARN_ON(!spage->page);
- bio = btrfs_bio_alloc(1);
- bio_set_dev(bio, spage->dev->bdev);
-
- bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
- bio->bi_iter.bi_sector = spage->physical >> 9;
- bio->bi_opf = REQ_OP_READ;
+ WARN_ON(!sector->page);
+ bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
+ bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
+ bio.bi_iter.bi_sector = sector->physical >> 9;
- if (btrfsic_submit_bio_wait(bio)) {
- spage->io_error = 1;
+ btrfsic_check_bio(&bio);
+ if (submit_bio_wait(&bio)) {
+ sector->io_error = 1;
sblock->no_io_error_seen = 0;
}
- bio_put(bio);
+ bio_uninit(&bio);
}
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
}
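scrub_recheck_block() now follows the on-stack bio pattern used across this series: bio_init() with a caller-provided bio_vec, a synchronous submit_bio_wait(), and bio_uninit() in place of bio_put(). A condensed sketch of the pattern (single-segment read, error handling elided):

	struct bio bio;
	struct bio_vec bvec;
	int ret;

	bio_init(&bio, bdev, &bvec, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = physical >> 9;	/* block layer uses 512B units */
	bio_add_page(&bio, page, sectorsize, 0);
	ret = submit_bio_wait(&bio);		/* sleeps until completion */
	bio_uninit(&bio);			/* stack bios are not bio_put() */

This removes a bio allocation per rechecked sector.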
-static inline int scrub_check_fsid(u8 fsid[],
- struct scrub_page *spage)
+static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
{
- struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
+ struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
int ret;
ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -1507,7 +1499,7 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock)
sblock->checksum_error = 0;
sblock->generation_error = 0;
- if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
+ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
scrub_checksum_data(sblock);
else
scrub_checksum_tree_block(sblock);
@@ -1516,15 +1508,14 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock)
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good)
{
- int page_num;
+ int i;
int ret = 0;
- for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ for (i = 0; i < sblock_bad->sector_count; i++) {
int ret_sub;
- ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
- sblock_good,
- page_num, 1);
+ ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
+ sblock_good, i, 1);
if (ret_sub)
ret = ret_sub;
}
@@ -1532,47 +1523,43 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
return ret;
}
-static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good,
- int page_num, int force_write)
+static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int sector_num, int force_write)
{
- struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
- struct scrub_page *spage_good = sblock_good->pagev[page_num];
+ struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
+ struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
const u32 sectorsize = fs_info->sectorsize;
- BUG_ON(spage_bad->page == NULL);
- BUG_ON(spage_good->page == NULL);
+ BUG_ON(sector_bad->page == NULL);
+ BUG_ON(sector_good->page == NULL);
if (force_write || sblock_bad->header_error ||
- sblock_bad->checksum_error || spage_bad->io_error) {
- struct bio *bio;
+ sblock_bad->checksum_error || sector_bad->io_error) {
+ struct bio bio;
+ struct bio_vec bvec;
int ret;
- if (!spage_bad->dev->bdev) {
+ if (!sector_bad->dev->bdev) {
btrfs_warn_rl(fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
return -EIO;
}
- bio = btrfs_bio_alloc(1);
- bio_set_dev(bio, spage_bad->dev->bdev);
- bio->bi_iter.bi_sector = spage_bad->physical >> 9;
- bio->bi_opf = REQ_OP_WRITE;
+ bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
+ bio.bi_iter.bi_sector = sector_bad->physical >> 9;
+ __bio_add_page(&bio, sector_good->page, sectorsize, 0);
- ret = bio_add_page(bio, spage_good->page, sectorsize, 0);
- if (ret != sectorsize) {
- bio_put(bio);
- return -EIO;
- }
+ btrfsic_check_bio(&bio);
+ ret = submit_bio_wait(&bio);
+ bio_uninit(&bio);
- if (btrfsic_submit_bio_wait(bio)) {
- btrfs_dev_stat_inc_and_print(spage_bad->dev,
+ if (ret) {
+ btrfs_dev_stat_inc_and_print(sector_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
atomic64_inc(&fs_info->dev_replace.num_write_errors);
- bio_put(bio);
return -EIO;
}
- bio_put(bio);
}
return 0;
@@ -1581,7 +1568,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
- int page_num;
+ int i;
/*
* This block is used for the check of the parity on the source device,
@@ -1590,25 +1577,24 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
if (sblock->sparity)
return;
- for (page_num = 0; page_num < sblock->page_count; page_num++) {
+ for (i = 0; i < sblock->sector_count; i++) {
int ret;
- ret = scrub_write_page_to_dev_replace(sblock, page_num);
+ ret = scrub_write_sector_to_dev_replace(sblock, i);
if (ret)
atomic64_inc(&fs_info->dev_replace.num_write_errors);
}
}
-static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
- int page_num)
+static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
{
- struct scrub_page *spage = sblock->pagev[page_num];
+ struct scrub_sector *sector = sblock->sectors[sector_num];
- BUG_ON(spage->page == NULL);
- if (spage->io_error)
- clear_page(page_address(spage->page));
+ BUG_ON(sector->page == NULL);
+ if (sector->io_error)
+ clear_page(page_address(sector->page));
- return scrub_add_page_to_wr_bio(sblock->sctx, spage);
+ return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
}
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
@@ -1633,8 +1619,8 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
return ret;
}
-static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage)
+static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector)
{
struct scrub_bio *sbio;
int ret;
@@ -1650,45 +1636,38 @@ again:
return -ENOMEM;
}
sctx->wr_curr_bio->sctx = sctx;
- sctx->wr_curr_bio->page_count = 0;
+ sctx->wr_curr_bio->sector_count = 0;
}
sbio = sctx->wr_curr_bio;
- if (sbio->page_count == 0) {
- struct bio *bio;
-
- ret = fill_writer_pointer_gap(sctx,
- spage->physical_for_dev_replace);
+ if (sbio->sector_count == 0) {
+ ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
if (ret) {
mutex_unlock(&sctx->wr_lock);
return ret;
}
- sbio->physical = spage->physical_for_dev_replace;
- sbio->logical = spage->logical;
+ sbio->physical = sector->physical_for_dev_replace;
+ sbio->logical = sector->logical;
sbio->dev = sctx->wr_tgtdev;
- bio = sbio->bio;
- if (!bio) {
- bio = btrfs_bio_alloc(sctx->pages_per_bio);
- sbio->bio = bio;
+ if (!sbio->bio) {
+ sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
+ REQ_OP_WRITE, GFP_NOFS);
}
-
- bio->bi_private = sbio;
- bio->bi_end_io = scrub_wr_bio_end_io;
- bio_set_dev(bio, sbio->dev->bdev);
- bio->bi_iter.bi_sector = sbio->physical >> 9;
- bio->bi_opf = REQ_OP_WRITE;
+ sbio->bio->bi_private = sbio;
+ sbio->bio->bi_end_io = scrub_wr_bio_end_io;
+ sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * sectorsize !=
- spage->physical_for_dev_replace ||
- sbio->logical + sbio->page_count * sectorsize !=
- spage->logical) {
+ } else if (sbio->physical + sbio->sector_count * sectorsize !=
+ sector->physical_for_dev_replace ||
+ sbio->logical + sbio->sector_count * sectorsize !=
+ sector->logical) {
scrub_wr_submit(sctx);
goto again;
}
- ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+ ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
if (ret != sectorsize) {
- if (sbio->page_count < 1) {
+ if (sbio->sector_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
mutex_unlock(&sctx->wr_lock);
@@ -1698,10 +1677,10 @@ again:
goto again;
}
- sbio->pagev[sbio->page_count] = spage;
- scrub_page_get(spage);
- sbio->page_count++;
- if (sbio->page_count == sctx->pages_per_bio)
+ sbio->sectors[sbio->sector_count] = sector;
+ scrub_sector_get(sector);
+ sbio->sector_count++;
+ if (sbio->sector_count == sctx->sectors_per_bio)
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
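A sector is appended to the current write bio only while it stays both physically and logically contiguous with what the bio already holds; anything else flushes the bio and retries with a fresh one (the goto again). The contiguity test, restated:

	/* positions expected after the sector_count sectors already queued */
	u64 next_physical = sbio->physical + sbio->sector_count * sectorsize;
	u64 next_logical  = sbio->logical  + sbio->sector_count * sectorsize;

	if (next_physical != sector->physical_for_dev_replace ||
	    next_logical != sector->logical)
		scrub_wr_submit(sctx);	/* flush, then build a new bio */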
@@ -1717,16 +1696,16 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
sbio = sctx->wr_curr_bio;
sctx->wr_curr_bio = NULL;
- WARN_ON(!sbio->bio->bi_bdev);
scrub_pending_bio_inc(sctx);
/* process all writes in a single worker thread. Then the block layer
* orders the requests before sending them to the driver which
* doubled the write performance on spinning disks when measured
* with Linux 3.5 */
- btrfsic_submit_bio(sbio->bio);
+ btrfsic_check_bio(sbio->bio);
+ submit_bio(sbio->bio);
if (btrfs_is_zoned(sctx->fs_info))
- sctx->write_pointer = sbio->physical + sbio->page_count *
+ sctx->write_pointer = sbio->physical + sbio->sector_count *
sctx->fs_info->sectorsize;
}
@@ -1738,31 +1717,31 @@ static void scrub_wr_bio_end_io(struct bio *bio)
sbio->status = bio->bi_status;
sbio->bio = bio;
- btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
- btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
+ INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
+ queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
-static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
+static void scrub_wr_bio_end_io_worker(struct work_struct *work)
{
struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
struct scrub_ctx *sctx = sbio->sctx;
int i;
- ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
+ ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
if (sbio->status) {
struct btrfs_dev_replace *dev_replace =
&sbio->sctx->fs_info->dev_replace;
- for (i = 0; i < sbio->page_count; i++) {
- struct scrub_page *spage = sbio->pagev[i];
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
- spage->io_error = 1;
+ sector->io_error = 1;
atomic64_inc(&dev_replace->num_write_errors);
}
}
- for (i = 0; i < sbio->page_count; i++)
- scrub_page_put(sbio->pagev[i]);
+ for (i = 0; i < sbio->sector_count; i++)
+ scrub_sector_put(sbio->sectors[i]);
bio_put(sbio->bio);
kfree(sbio);
@@ -1786,8 +1765,8 @@ static int scrub_checksum(struct scrub_block *sblock)
sblock->generation_error = 0;
sblock->checksum_error = 0;
- WARN_ON(sblock->page_count < 1);
- flags = sblock->pagev[0]->flags;
+ WARN_ON(sblock->sector_count < 1);
+ flags = sblock->sectors[0]->flags;
ret = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA)
ret = scrub_checksum_data(sblock);
@@ -1809,26 +1788,26 @@ static int scrub_checksum_data(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 csum[BTRFS_CSUM_SIZE];
- struct scrub_page *spage;
+ struct scrub_sector *sector;
char *kaddr;
- BUG_ON(sblock->page_count < 1);
- spage = sblock->pagev[0];
- if (!spage->have_csum)
+ BUG_ON(sblock->sector_count < 1);
+ sector = sblock->sectors[0];
+ if (!sector->have_csum)
return 0;
- kaddr = page_address(spage->page);
+ kaddr = page_address(sector->page);
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
/*
- * In scrub_pages() and scrub_pages_for_parity() we ensure each spage
+ * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector
* only contains one sector of data.
*/
crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
- if (memcmp(csum, spage->csum, fs_info->csum_size))
+ if (memcmp(csum, sector->csum, fs_info->csum_size))
sblock->checksum_error = 1;
return sblock->checksum_error;
}
@@ -1849,16 +1828,16 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
const u32 sectorsize = sctx->fs_info->sectorsize;
const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
int i;
- struct scrub_page *spage;
+ struct scrub_sector *sector;
char *kaddr;
- BUG_ON(sblock->page_count < 1);
+ BUG_ON(sblock->sector_count < 1);
- /* Each member in pagev is just one block, not a full page */
- ASSERT(sblock->page_count == num_sectors);
+ /* Each member in sectors is just one sector */
+ ASSERT(sblock->sector_count == num_sectors);
- spage = sblock->pagev[0];
- kaddr = page_address(spage->page);
+ sector = sblock->sectors[0];
+ kaddr = page_address(sector->page);
h = (struct btrfs_header *)kaddr;
memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
@@ -1867,15 +1846,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
- if (spage->logical != btrfs_stack_header_bytenr(h))
+ if (sector->logical != btrfs_stack_header_bytenr(h))
sblock->header_error = 1;
- if (spage->generation != btrfs_stack_header_generation(h)) {
+ if (sector->generation != btrfs_stack_header_generation(h)) {
sblock->header_error = 1;
sblock->generation_error = 1;
}
- if (!scrub_check_fsid(h->fsid, spage))
+ if (!scrub_check_fsid(h->fsid, sector))
sblock->header_error = 1;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
@@ -1888,7 +1867,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
sectorsize - BTRFS_CSUM_SIZE);
for (i = 1; i < num_sectors; i++) {
- kaddr = page_address(sblock->pagev[i]->page);
+ kaddr = page_address(sblock->sectors[i]->page);
crypto_shash_update(shash, kaddr, sectorsize);
}
@@ -1906,23 +1885,23 @@ static int scrub_checksum_super(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
- struct scrub_page *spage;
+ struct scrub_sector *sector;
char *kaddr;
int fail_gen = 0;
int fail_cor = 0;
- BUG_ON(sblock->page_count < 1);
- spage = sblock->pagev[0];
- kaddr = page_address(spage->page);
+ BUG_ON(sblock->sector_count < 1);
+ sector = sblock->sectors[0];
+ kaddr = page_address(sector->page);
s = (struct btrfs_super_block *)kaddr;
- if (spage->logical != btrfs_super_bytenr(s))
+ if (sector->logical != btrfs_super_bytenr(s))
++fail_cor;
- if (spage->generation != btrfs_super_generation(s))
+ if (sector->generation != btrfs_super_generation(s))
++fail_gen;
- if (!scrub_check_fsid(s->fsid, spage))
+ if (!scrub_check_fsid(s->fsid, sector))
++fail_cor;
shash->tfm = fs_info->csum_shash;
@@ -1943,10 +1922,10 @@ static int scrub_checksum_super(struct scrub_block *sblock)
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
if (fail_cor)
- btrfs_dev_stat_inc_and_print(spage->dev,
+ btrfs_dev_stat_inc_and_print(sector->dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
else
- btrfs_dev_stat_inc_and_print(spage->dev,
+ btrfs_dev_stat_inc_and_print(sector->dev,
BTRFS_DEV_STAT_GENERATION_ERRS);
}
@@ -1966,23 +1945,23 @@ static void scrub_block_put(struct scrub_block *sblock)
if (sblock->sparity)
scrub_parity_put(sblock->sparity);
- for (i = 0; i < sblock->page_count; i++)
- scrub_page_put(sblock->pagev[i]);
+ for (i = 0; i < sblock->sector_count; i++)
+ scrub_sector_put(sblock->sectors[i]);
kfree(sblock);
}
}
-static void scrub_page_get(struct scrub_page *spage)
+static void scrub_sector_get(struct scrub_sector *sector)
{
- atomic_inc(&spage->refs);
+ atomic_inc(&sector->refs);
}
-static void scrub_page_put(struct scrub_page *spage)
+static void scrub_sector_put(struct scrub_sector *sector)
{
- if (atomic_dec_and_test(&spage->refs)) {
- if (spage->page)
- __free_page(spage->page);
- kfree(spage);
+ if (atomic_dec_and_test(&sector->refs)) {
+ if (sector->page)
+ __free_page(sector->page);
+ kfree(sector);
}
}
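
The get/put pair above is plain reference counting: the final put frees both the backing page and the sector itself. A minimal user-space sketch of the same lifetime rule, with illustrative names and C11 atomics standing in for the kernel's atomic_t:

#include <stdatomic.h>
#include <stdlib.h>

struct sector {
	atomic_int refs;
	void *page;
};

static void sector_get(struct sector *s)
{
	atomic_fetch_add(&s->refs, 1);
}

static void sector_put(struct sector *s)
{
	/* fetch_sub() returning 1 means we just dropped the last reference */
	if (atomic_fetch_sub(&s->refs, 1) == 1) {
		free(s->page);
		free(s);
	}
}

int main(void)
{
	struct sector *s = calloc(1, sizeof(*s));

	atomic_init(&s->refs, 1);	/* creation holds one reference */
	s->page = malloc(4096);
	sector_get(s);			/* e.g. the sector is added to a bio */
	sector_put(s);			/* bio completion drops its reference */
	sector_put(s);			/* owner drops the last one; all freed */
	return 0;
}
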
@@ -2057,13 +2036,14 @@ static void scrub_submit(struct scrub_ctx *sctx)
sbio = sctx->bios[sctx->curr];
sctx->curr = -1;
scrub_pending_bio_inc(sctx);
- btrfsic_submit_bio(sbio->bio);
+ btrfsic_check_bio(sbio->bio);
+ submit_bio(sbio->bio);
}
-static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage)
+static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector)
{
- struct scrub_block *sblock = spage->sblock;
+ struct scrub_block *sblock = sector->sblock;
struct scrub_bio *sbio;
const u32 sectorsize = sctx->fs_info->sectorsize;
int ret;
@@ -2078,7 +2058,7 @@ again:
if (sctx->curr != -1) {
sctx->first_free = sctx->bios[sctx->curr]->next_free;
sctx->bios[sctx->curr]->next_free = -1;
- sctx->bios[sctx->curr]->page_count = 0;
+ sctx->bios[sctx->curr]->sector_count = 0;
spin_unlock(&sctx->list_lock);
} else {
spin_unlock(&sctx->list_lock);
@@ -2086,37 +2066,31 @@ again:
}
}
sbio = sctx->bios[sctx->curr];
- if (sbio->page_count == 0) {
- struct bio *bio;
-
- sbio->physical = spage->physical;
- sbio->logical = spage->logical;
- sbio->dev = spage->dev;
- bio = sbio->bio;
- if (!bio) {
- bio = btrfs_bio_alloc(sctx->pages_per_bio);
- sbio->bio = bio;
+ if (sbio->sector_count == 0) {
+ sbio->physical = sector->physical;
+ sbio->logical = sector->logical;
+ sbio->dev = sector->dev;
+ if (!sbio->bio) {
+ sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
+ REQ_OP_READ, GFP_NOFS);
}
-
- bio->bi_private = sbio;
- bio->bi_end_io = scrub_bio_end_io;
- bio_set_dev(bio, sbio->dev->bdev);
- bio->bi_iter.bi_sector = sbio->physical >> 9;
- bio->bi_opf = REQ_OP_READ;
+ sbio->bio->bi_private = sbio;
+ sbio->bio->bi_end_io = scrub_bio_end_io;
+ sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * sectorsize !=
- spage->physical ||
- sbio->logical + sbio->page_count * sectorsize !=
- spage->logical ||
- sbio->dev != spage->dev) {
+ } else if (sbio->physical + sbio->sector_count * sectorsize !=
+ sector->physical ||
+ sbio->logical + sbio->sector_count * sectorsize !=
+ sector->logical ||
+ sbio->dev != sector->dev) {
scrub_submit(sctx);
goto again;
}
- sbio->pagev[sbio->page_count] = spage;
- ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+ sbio->sectors[sbio->sector_count] = sector;
+ ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
if (ret != sectorsize) {
- if (sbio->page_count < 1) {
+ if (sbio->sector_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
return -EIO;
@@ -2126,9 +2100,9 @@ again:
}
scrub_block_get(sblock); /* one for the page added to the bio */
- atomic_inc(&sblock->outstanding_pages);
- sbio->page_count++;
- if (sbio->page_count == sctx->pages_per_bio)
+ atomic_inc(&sblock->outstanding_sectors);
+ sbio->sector_count++;
+ if (sbio->sector_count == sctx->sectors_per_bio)
scrub_submit(sctx);
return 0;
@@ -2144,10 +2118,10 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
bio_put(bio);
- btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
+ queue_work(fs_info->scrub_workers, &sblock->work);
}
-static void scrub_missing_raid56_worker(struct btrfs_work *work)
+static void scrub_missing_raid56_worker(struct work_struct *work)
{
struct scrub_block *sblock = container_of(work, struct scrub_block, work);
struct scrub_ctx *sctx = sblock->sctx;
@@ -2155,8 +2129,8 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
u64 logical;
struct btrfs_device *dev;
- logical = sblock->pagev[0]->logical;
- dev = sblock->pagev[0]->dev;
+ logical = sblock->sectors[0]->logical;
+ dev = sblock->sectors[0]->dev;
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
@@ -2193,8 +2167,8 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
{
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 length = sblock->page_count * PAGE_SIZE;
- u64 logical = sblock->pagev[0]->logical;
+ u64 length = sblock->sector_count << fs_info->sectorsize_bits;
+ u64 logical = sblock->sectors[0]->logical;
struct btrfs_io_context *bioc = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
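
The length calculation in the hunk above shifts by sectorsize_bits instead of multiplying by PAGE_SIZE, which matters on subpage setups where each 4K sector still occupies its own 64K page. A quick user-space check of the arithmetic, with assumed sizes:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* Subpage setup: 64K pages, 4K sectors, 16 sectors in the block */
	const uint64_t page_size = 64 << 10;
	const unsigned int sectorsize_bits = 12;
	const uint64_t sector_count = 16;

	uint64_t old_len = sector_count * page_size;		/* 1 MiB, 16x too big */
	uint64_t new_len = sector_count << sectorsize_bits;	/* 64K, correct */

	assert(new_len == 64 << 10);
	assert(old_len == 16 * new_len);
	return 0;
}
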
@@ -2213,12 +2187,12 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
* We shouldn't be scrubbing a missing device. Even for dev
* replace, we should only get here for RAID 5/6. We either
* managed to mount something with no mirrors remaining or
- * there's a bug in scrub_remap_extent()/btrfs_map_block().
+ * there's a bug in scrub_find_good_copy()/btrfs_map_block().
*/
goto bioc_out;
}
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = logical >> 9;
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
@@ -2227,13 +2201,17 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
if (!rbio)
goto rbio_out;
- for (i = 0; i < sblock->page_count; i++) {
- struct scrub_page *spage = sblock->pagev[i];
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
- raid56_add_scrub_pages(rbio, spage->page, spage->logical);
+ /*
+ * For now, our scrub is still one page per sector, so pgoff
+ * is always 0.
+ */
+ raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
}
- btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
+ INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
scrub_block_get(sblock);
scrub_pending_bio_inc(sctx);
raid56_submit_missing_rbio(rbio);
@@ -2249,7 +2227,7 @@ bioc_out:
spin_unlock(&sctx->stat_lock);
}
-static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
+static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum,
u64 physical_for_dev_replace)
@@ -2273,7 +2251,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
sblock->no_io_error_seen = 1;
for (index = 0; len > 0; index++) {
- struct scrub_page *spage;
+ struct scrub_sector *sector;
/*
* Here we will allocate one page for one sector to scrub.
* This is fine if PAGE_SIZE == sectorsize, but will cost
@@ -2281,8 +2259,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
*/
u32 l = min(sectorsize, len);
- spage = kzalloc(sizeof(*spage), GFP_KERNEL);
- if (!spage) {
+ sector = kzalloc(sizeof(*sector), GFP_KERNEL);
+ if (!sector) {
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2290,26 +2268,26 @@ leave_nomem:
scrub_block_put(sblock);
return -ENOMEM;
}
- ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
- scrub_page_get(spage);
- sblock->pagev[index] = spage;
- spage->sblock = sblock;
- spage->dev = dev;
- spage->flags = flags;
- spage->generation = gen;
- spage->logical = logical;
- spage->physical = physical;
- spage->physical_for_dev_replace = physical_for_dev_replace;
- spage->mirror_num = mirror_num;
+ ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
+ scrub_sector_get(sector);
+ sblock->sectors[index] = sector;
+ sector->sblock = sblock;
+ sector->dev = dev;
+ sector->flags = flags;
+ sector->generation = gen;
+ sector->logical = logical;
+ sector->physical = physical;
+ sector->physical_for_dev_replace = physical_for_dev_replace;
+ sector->mirror_num = mirror_num;
if (csum) {
- spage->have_csum = 1;
- memcpy(spage->csum, csum, sctx->fs_info->csum_size);
+ sector->have_csum = 1;
+ memcpy(sector->csum, csum, sctx->fs_info->csum_size);
} else {
- spage->have_csum = 0;
+ sector->have_csum = 0;
}
- sblock->page_count++;
- spage->page = alloc_page(GFP_KERNEL);
- if (!spage->page)
+ sblock->sector_count++;
+ sector->page = alloc_page(GFP_KERNEL);
+ if (!sector->page)
goto leave_nomem;
len -= l;
logical += l;
@@ -2317,7 +2295,7 @@ leave_nomem:
physical_for_dev_replace += l;
}
- WARN_ON(sblock->page_count == 0);
+ WARN_ON(sblock->sector_count == 0);
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
/*
* This case should only be hit for RAID 5/6 device replace. See
@@ -2325,11 +2303,11 @@ leave_nomem:
*/
scrub_missing_raid56_pages(sblock);
} else {
- for (index = 0; index < sblock->page_count; index++) {
- struct scrub_page *spage = sblock->pagev[index];
+ for (index = 0; index < sblock->sector_count; index++) {
+ struct scrub_sector *sector = sblock->sectors[index];
int ret;
- ret = scrub_add_page_to_rd_bio(sctx, spage);
+ ret = scrub_add_sector_to_rd_bio(sctx, sector);
if (ret) {
scrub_block_put(sblock);
return ret;
@@ -2353,31 +2331,31 @@ static void scrub_bio_end_io(struct bio *bio)
sbio->status = bio->bi_status;
sbio->bio = bio;
- btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
+ queue_work(fs_info->scrub_workers, &sbio->work);
}
-static void scrub_bio_end_io_worker(struct btrfs_work *work)
+static void scrub_bio_end_io_worker(struct work_struct *work)
{
struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
struct scrub_ctx *sctx = sbio->sctx;
int i;
- ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
+ ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
if (sbio->status) {
- for (i = 0; i < sbio->page_count; i++) {
- struct scrub_page *spage = sbio->pagev[i];
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
- spage->io_error = 1;
- spage->sblock->no_io_error_seen = 0;
+ sector->io_error = 1;
+ sector->sblock->no_io_error_seen = 0;
}
}
- /* now complete the scrub_block items that have all pages completed */
- for (i = 0; i < sbio->page_count; i++) {
- struct scrub_page *spage = sbio->pagev[i];
- struct scrub_block *sblock = spage->sblock;
+ /* Now complete the scrub_block items that have all pages completed */
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
+ struct scrub_block *sblock = sector->sblock;
- if (atomic_dec_and_test(&sblock->outstanding_pages))
+ if (atomic_dec_and_test(&sblock->outstanding_sectors))
scrub_block_complete(sblock);
scrub_block_put(sblock);
}
@@ -2456,8 +2434,8 @@ static void scrub_block_complete(struct scrub_block *sblock)
}
if (sblock->sparity && corrupted && !sblock->data_corrected) {
- u64 start = sblock->pagev[0]->logical;
- u64 end = sblock->pagev[sblock->page_count - 1]->logical +
+ u64 start = sblock->sectors[0]->logical;
+ u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
sblock->sctx->fs_info->sectorsize;
ASSERT(end - start <= U32_MAX);
@@ -2532,8 +2510,11 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u64 physical_for_dev_replace)
+ u64 gen, int mirror_num)
{
+ struct btrfs_device *src_dev = dev;
+ u64 src_physical = physical;
+ int src_mirror = mirror_num;
int ret;
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
@@ -2561,6 +2542,18 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
WARN_ON(1);
}
+ /*
+	 * In the dev-replace case, @dev can be a missing device.
+	 * Regular scrub avoids running on a missing device entirely,
+	 * as that would trigger tons of read errors.
+	 *
+	 * Reading from a missing device would cause the read error counts
+	 * to increase unnecessarily, so here we change the read source to
+	 * a good mirror.
+ */
+ if (sctx->is_dev_replace && !dev->bdev)
+ scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
+ &src_dev, &src_mirror);
while (len) {
u32 l = min(len, blocksize);
int have_csum = 0;
@@ -2571,20 +2564,20 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
if (have_csum == 0)
++sctx->stat.no_csum;
}
- ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
- mirror_num, have_csum ? csum : NULL,
- physical_for_dev_replace);
+ ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
+ flags, gen, src_mirror,
+ have_csum ? csum : NULL, physical);
if (ret)
return ret;
len -= l;
logical += l;
physical += l;
- physical_for_dev_replace += l;
+ src_physical += l;
}
return 0;
}
-static int scrub_pages_for_parity(struct scrub_parity *sparity,
+static int scrub_sectors_for_parity(struct scrub_parity *sparity,
u64 logical, u32 len,
u64 physical, struct btrfs_device *dev,
u64 flags, u64 gen, int mirror_num, u8 *csum)
@@ -2613,10 +2606,10 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
scrub_parity_get(sparity);
for (index = 0; len > 0; index++) {
- struct scrub_page *spage;
+ struct scrub_sector *sector;
- spage = kzalloc(sizeof(*spage), GFP_KERNEL);
- if (!spage) {
+ sector = kzalloc(sizeof(*sector), GFP_KERNEL);
+ if (!sector) {
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2624,29 +2617,29 @@ leave_nomem:
scrub_block_put(sblock);
return -ENOMEM;
}
- ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
+ ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
/* For scrub block */
- scrub_page_get(spage);
- sblock->pagev[index] = spage;
+ scrub_sector_get(sector);
+ sblock->sectors[index] = sector;
/* For scrub parity */
- scrub_page_get(spage);
- list_add_tail(&spage->list, &sparity->spages);
- spage->sblock = sblock;
- spage->dev = dev;
- spage->flags = flags;
- spage->generation = gen;
- spage->logical = logical;
- spage->physical = physical;
- spage->mirror_num = mirror_num;
+ scrub_sector_get(sector);
+ list_add_tail(&sector->list, &sparity->sectors_list);
+ sector->sblock = sblock;
+ sector->dev = dev;
+ sector->flags = flags;
+ sector->generation = gen;
+ sector->logical = logical;
+ sector->physical = physical;
+ sector->mirror_num = mirror_num;
if (csum) {
- spage->have_csum = 1;
- memcpy(spage->csum, csum, sctx->fs_info->csum_size);
+ sector->have_csum = 1;
+ memcpy(sector->csum, csum, sctx->fs_info->csum_size);
} else {
- spage->have_csum = 0;
+ sector->have_csum = 0;
}
- sblock->page_count++;
- spage->page = alloc_page(GFP_KERNEL);
- if (!spage->page)
+ sblock->sector_count++;
+ sector->page = alloc_page(GFP_KERNEL);
+ if (!sector->page)
goto leave_nomem;
@@ -2656,19 +2649,19 @@ leave_nomem:
physical += sectorsize;
}
- WARN_ON(sblock->page_count == 0);
- for (index = 0; index < sblock->page_count; index++) {
- struct scrub_page *spage = sblock->pagev[index];
+ WARN_ON(sblock->sector_count == 0);
+ for (index = 0; index < sblock->sector_count; index++) {
+ struct scrub_sector *sector = sblock->sectors[index];
int ret;
- ret = scrub_add_page_to_rd_bio(sctx, spage);
+ ret = scrub_add_sector_to_rd_bio(sctx, sector);
if (ret) {
scrub_block_put(sblock);
return ret;
}
}
- /* last one frees, either here or in bio completion for last page */
+ /* Last one frees, either here or in bio completion for last sector */
scrub_block_put(sblock);
return 0;
}
@@ -2707,7 +2700,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
if (have_csum == 0)
goto skip;
}
- ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
+ ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
flags, gen, mirror_num,
have_csum ? csum : NULL);
if (ret)
@@ -2767,7 +2760,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
static void scrub_free_parity(struct scrub_parity *sparity)
{
struct scrub_ctx *sctx = sparity->sctx;
- struct scrub_page *curr, *next;
+ struct scrub_sector *curr, *next;
int nbits;
nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
@@ -2778,15 +2771,15 @@ static void scrub_free_parity(struct scrub_parity *sparity)
spin_unlock(&sctx->stat_lock);
}
- list_for_each_entry_safe(curr, next, &sparity->spages, list) {
+ list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
list_del_init(&curr->list);
- scrub_page_put(curr);
+ scrub_sector_put(curr);
}
kfree(sparity);
}
-static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
+static void scrub_parity_bio_endio_worker(struct work_struct *work)
{
struct scrub_parity *sparity = container_of(work, struct scrub_parity,
work);
@@ -2798,7 +2791,7 @@ static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
static void scrub_parity_bio_endio(struct bio *bio)
{
- struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
+ struct scrub_parity *sparity = bio->bi_private;
struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
if (bio->bi_status)
@@ -2807,9 +2800,8 @@ static void scrub_parity_bio_endio(struct bio *bio)
bio_put(bio);
- btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
- NULL);
- btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
+ INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
+ queue_work(fs_info->scrub_parity_workers, &sparity->work);
}
static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
@@ -2834,7 +2826,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
if (ret || !bioc || !bioc->raid_map)
goto bioc_out;
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = sparity->logic_start >> 9;
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
@@ -2882,6 +2874,251 @@ static void scrub_parity_put(struct scrub_parity *sparity)
scrub_parity_check_and_repair(sparity);
}
+/*
+ * Return 0 if the extent item range covers any byte of the range.
+ * Return <0 if the extent item is before @search_start.
+ * Return >0 if the extent item is after @search_start + @search_len.
+ */
+static int compare_extent_item_range(struct btrfs_path *path,
+ u64 search_start, u64 search_len)
+{
+ struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
+ u64 len;
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY);
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ len = fs_info->nodesize;
+ else
+ len = key.offset;
+
+ if (key.objectid + len <= search_start)
+ return -1;
+ if (key.objectid >= search_start + search_len)
+ return 1;
+ return 0;
+}
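
A user-space model of the tri-state contract documented above, checked against a few boundary cases; the names and sizes are illustrative only:

#include <assert.h>
#include <stdint.h>

/* 0 = overlap, -1 = entirely before, 1 = entirely after, for the item
 * [objectid, objectid + len) against [search_start, search_start + search_len) */
static int cmp_range(uint64_t objectid, uint64_t len,
		     uint64_t search_start, uint64_t search_len)
{
	if (objectid + len <= search_start)
		return -1;
	if (objectid >= search_start + search_len)
		return 1;
	return 0;
}

int main(void)
{
	assert(cmp_range(0, 4096, 4096, 4096) == -1);	/* ends exactly at start */
	assert(cmp_range(8192, 4096, 4096, 4096) == 1);	/* begins at range end */
	assert(cmp_range(0, 8192, 4096, 4096) == 0);	/* straddles the start */
	return 0;
}
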
+
+/*
+ * Locate one extent item which covers any byte in range
+ * [@search_start, @search_start + @search_length)
+ *
+ * If the path is not initialized, we will initialize the search by doing
+ * a btrfs_search_slot().
+ * If the path is already initialized, we will continue from its current
+ * slot, to avoid duplicated btrfs_search_slot() calls.
+ *
+ * NOTE: If an extent item starts before @search_start, we will still
+ * return the extent item. This is for data extents crossing stripe boundaries.
+ *
+ * Return 0 if we found such an extent item, and @path will point to it.
+ * Return >0 if no such extent item can be found, and @path will be released.
+ * Return <0 if we hit a fatal error, and @path will be released.
+ */
+static int find_first_extent_item(struct btrfs_root *extent_root,
+ struct btrfs_path *path,
+ u64 search_start, u64 search_len)
+{
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
+ struct btrfs_key key;
+ int ret;
+
+ /* Continue using the existing path */
+ if (path->nodes[0])
+ goto search_forward;
+
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.objectid = search_start;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ ASSERT(ret > 0);
+ /*
+ * Here we intentionally pass 0 as @min_objectid, as there could be
+ * an extent item starting before @search_start.
+ */
+ ret = btrfs_previous_extent_item(extent_root, path, 0);
+ if (ret < 0)
+ return ret;
+ /*
+ * No matter whether we have found an extent item, the next loop will
+ * properly do every check on the key.
+ */
+search_forward:
+ while (true) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid >= search_start + search_len)
+ break;
+ if (key.type != BTRFS_METADATA_ITEM_KEY &&
+ key.type != BTRFS_EXTENT_ITEM_KEY)
+ goto next;
+
+ ret = compare_extent_item_range(path, search_start, search_len);
+ if (ret == 0)
+ return ret;
+ if (ret > 0)
+ break;
+next:
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret) {
+ /* Either no more item or fatal error */
+ btrfs_release_path(path);
+ return ret;
+ }
+ }
+ }
+ btrfs_release_path(path);
+ return 1;
+}
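
As a sketch of the same search contract, here is a user-space model over a sorted array instead of a b-tree, including the case the NOTE above calls out, where the returned extent starts before @search_start; all names are illustrative:

#include <stdint.h>
#include <stdio.h>

struct item { uint64_t objectid, len; };

/* Return the index of the first item overlapping [start, start + len),
 * including an item that begins before @start; -1 if none. */
static int find_first(const struct item *items, int nr,
		      uint64_t start, uint64_t len)
{
	for (int i = 0; i < nr; i++) {
		if (items[i].objectid + items[i].len <= start)
			continue;		/* entirely before the range */
		if (items[i].objectid >= start + len)
			break;			/* entirely after the range */
		return i;
	}
	return -1;
}

int main(void)
{
	const struct item items[] = {
		{ 0, 8192 }, { 16384, 4096 }, { 65536, 4096 },
	};

	/* the extent starting at 0 still covers byte 4096 */
	printf("first for [4096, 8192): %d\n",
	       find_first(items, 3, 4096, 4096));	/* -> 0 */
	/* nothing overlaps [32768, 65536) */
	printf("first for [32768, 65536): %d\n",
	       find_first(items, 3, 32768, 32768));	/* -> -1 */
	return 0;
}
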
+
+static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
+ u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
+{
+ struct btrfs_key key;
+ struct btrfs_extent_item *ei;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
+ key.type == BTRFS_EXTENT_ITEM_KEY);
+ *extent_start_ret = key.objectid;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ *size_ret = path->nodes[0]->fs_info->nodesize;
+ else
+ *size_ret = key.offset;
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
+ *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
+ *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
+}
+
+static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
+				      u64 boundary_start, u64 boundary_len)
+{
+	return (extent_start < boundary_start &&
+		extent_start + extent_len > boundary_start) ||
+	       (extent_start < boundary_start + boundary_len &&
+		extent_start + extent_len > boundary_start + boundary_len);
+}
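
The helper reports whether an extent pokes past either edge of the boundary range. A small runnable check with an assumed 64K stripe starting at offset 64K:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool crosses(uint64_t es, uint64_t el, uint64_t bs, uint64_t bl)
{
	return (es < bs && es + el > bs) ||
	       (es < bs + bl && es + el > bs + bl);
}

int main(void)
{
	/* stripe [64K, 128K): an extent fully inside never crosses */
	assert(!crosses(64 << 10, 32 << 10, 64 << 10, 64 << 10));
	/* an extent starting before the stripe crosses its front edge */
	assert(crosses(32 << 10, 48 << 10, 64 << 10, 64 << 10));
	/* an extent running past the stripe crosses its back edge */
	assert(crosses(96 << 10, 64 << 10, 64 << 10, 64 << 10));
	return 0;
}
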
+
+static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
+ struct scrub_parity *sparity,
+ struct map_lookup *map,
+ struct btrfs_device *sdev,
+ struct btrfs_path *path,
+ u64 logical)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
+ u64 cur_logical = logical;
+ int ret;
+
+ ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+
+ /* Path must not be populated */
+ ASSERT(!path->nodes[0]);
+
+ while (cur_logical < logical + map->stripe_len) {
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_device *extent_dev;
+ u64 extent_start;
+ u64 extent_size;
+ u64 mapped_length;
+ u64 extent_flags;
+ u64 extent_gen;
+ u64 extent_physical;
+ u64 extent_mirror_num;
+
+ ret = find_first_extent_item(extent_root, path, cur_logical,
+ logical + map->stripe_len - cur_logical);
+ /* No more extent item in this data stripe */
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ if (ret < 0)
+ break;
+ get_extent_info(path, &extent_start, &extent_size, &extent_flags,
+ &extent_gen);
+
+ /* Metadata should not cross stripe boundaries */
+ if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ does_range_cross_boundary(extent_start, extent_size,
+ logical, map->stripe_len)) {
+ btrfs_err(fs_info,
+ "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+ extent_start, logical);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ cur_logical += extent_size;
+ continue;
+ }
+
+ /* Skip hole range which doesn't have any extent */
+ cur_logical = max(extent_start, cur_logical);
+
+ /* Truncate the range inside this data stripe */
+ extent_size = min(extent_start + extent_size,
+ logical + map->stripe_len) - cur_logical;
+ extent_start = cur_logical;
+ ASSERT(extent_size <= U32_MAX);
+
+ scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
+
+ mapped_length = extent_size;
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
+ &mapped_length, &bioc, 0);
+ if (!ret && (!bioc || mapped_length < extent_size))
+ ret = -EIO;
+ if (ret) {
+ btrfs_put_bioc(bioc);
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+ extent_physical = bioc->stripes[0].physical;
+ extent_mirror_num = bioc->mirror_num;
+ extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
+
+ ret = btrfs_lookup_csums_range(csum_root, extent_start,
+ extent_start + extent_size - 1,
+ &sctx->csum_list, 1);
+ if (ret) {
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+
+ ret = scrub_extent_for_parity(sparity, extent_start,
+ extent_size, extent_physical,
+ extent_dev, extent_flags,
+ extent_gen, extent_mirror_num);
+ scrub_free_csums(sctx);
+
+ if (ret) {
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+
+ cond_resched();
+ cur_logical += extent_size;
+ }
+ btrfs_release_path(path);
+ return ret;
+}
+
static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *sdev,
@@ -2889,28 +3126,12 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
u64 logic_end)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start);
- struct btrfs_root *csum_root;
- struct btrfs_extent_item *extent;
- struct btrfs_io_context *bioc = NULL;
struct btrfs_path *path;
- u64 flags;
+ u64 cur_logical;
int ret;
- int slot;
- struct extent_buffer *l;
- struct btrfs_key key;
- u64 generation;
- u64 extent_logical;
- u64 extent_physical;
- /* Check the comment in scrub_stripe() for why u32 is enough here */
- u32 extent_len;
- u64 mapped_length;
- struct btrfs_device *extent_dev;
struct scrub_parity *sparity;
int nsectors;
int bitmap_len;
- int extent_mirror_num;
- int stop_loop = 0;
path = btrfs_alloc_path();
if (!path) {
@@ -2943,178 +3164,19 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
sparity->logic_start = logic_start;
sparity->logic_end = logic_end;
refcount_set(&sparity->refs, 1);
- INIT_LIST_HEAD(&sparity->spages);
+ INIT_LIST_HEAD(&sparity->sectors_list);
sparity->dbitmap = sparity->bitmap;
sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
ret = 0;
- while (logic_start < logic_end) {
- if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
- key.type = BTRFS_METADATA_ITEM_KEY;
- else
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = logic_start;
- key.offset = (u64)-1;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ for (cur_logical = logic_start; cur_logical < logic_end;
+ cur_logical += map->stripe_len) {
+ ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
+ sdev, path, cur_logical);
if (ret < 0)
- goto out;
-
- if (ret > 0) {
- ret = btrfs_previous_extent_item(root, path, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, &key,
- path, 0, 0);
- if (ret < 0)
- goto out;
- }
- }
-
- stop_loop = 0;
- while (1) {
- u64 bytes;
-
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
-
- stop_loop = 1;
- break;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
-
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
-
- if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = fs_info->nodesize;
- else
- bytes = key.offset;
-
- if (key.objectid + bytes <= logic_start)
- goto next;
-
- if (key.objectid >= logic_end) {
- stop_loop = 1;
- break;
- }
-
- while (key.objectid >= logic_start + map->stripe_len)
- logic_start += map->stripe_len;
-
- extent = btrfs_item_ptr(l, slot,
- struct btrfs_extent_item);
- flags = btrfs_extent_flags(l, extent);
- generation = btrfs_extent_generation(l, extent);
-
- if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
- (key.objectid < logic_start ||
- key.objectid + bytes >
- logic_start + map->stripe_len)) {
- btrfs_err(fs_info,
- "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
- key.objectid, logic_start);
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- goto next;
- }
-again:
- extent_logical = key.objectid;
- ASSERT(bytes <= U32_MAX);
- extent_len = bytes;
-
- if (extent_logical < logic_start) {
- extent_len -= logic_start - extent_logical;
- extent_logical = logic_start;
- }
-
- if (extent_logical + extent_len >
- logic_start + map->stripe_len)
- extent_len = logic_start + map->stripe_len -
- extent_logical;
-
- scrub_parity_mark_sectors_data(sparity, extent_logical,
- extent_len);
-
- mapped_length = extent_len;
- bioc = NULL;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
- extent_logical, &mapped_length, &bioc,
- 0);
- if (!ret) {
- if (!bioc || mapped_length < extent_len)
- ret = -EIO;
- }
- if (ret) {
- btrfs_put_bioc(bioc);
- goto out;
- }
- extent_physical = bioc->stripes[0].physical;
- extent_mirror_num = bioc->mirror_num;
- extent_dev = bioc->stripes[0].dev;
- btrfs_put_bioc(bioc);
-
- csum_root = btrfs_csum_root(fs_info, extent_logical);
- ret = btrfs_lookup_csums_range(csum_root,
- extent_logical,
- extent_logical + extent_len - 1,
- &sctx->csum_list, 1);
- if (ret)
- goto out;
-
- ret = scrub_extent_for_parity(sparity, extent_logical,
- extent_len,
- extent_physical,
- extent_dev, flags,
- generation,
- extent_mirror_num);
-
- scrub_free_csums(sctx);
-
- if (ret)
- goto out;
-
- if (extent_logical + extent_len <
- key.objectid + bytes) {
- logic_start += map->stripe_len;
-
- if (logic_start >= logic_end) {
- stop_loop = 1;
- break;
- }
-
- if (logic_start < key.objectid + bytes) {
- cond_resched();
- goto again;
- }
- }
-next:
- path->slots[0]++;
- }
-
- btrfs_release_path(path);
-
- if (stop_loop)
break;
-
- logic_start += map->stripe_len;
- }
-out:
- if (ret < 0) {
- ASSERT(logic_end - logic_start <= U32_MAX);
- scrub_parity_mark_sectors_error(sparity, logic_start,
- logic_end - logic_start);
}
+
scrub_parity_put(sparity);
scrub_submit(sctx);
mutex_lock(&sctx->wr_lock);
@@ -3165,6 +3227,206 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
return ret;
}
+/*
+ * Scrub one range which can only have a simple mirror based profile.
+ * (This includes all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
+ * RAID0/RAID10).
+ *
+ * Since we may need to handle a subset of a block group, we need the
+ * @logical_start and @logical_length parameters.
+ */
+static int scrub_simple_mirror(struct scrub_ctx *sctx,
+ struct btrfs_root *extent_root,
+ struct btrfs_root *csum_root,
+ struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ u64 logical_start, u64 logical_length,
+ struct btrfs_device *device,
+ u64 physical, int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ const u64 logical_end = logical_start + logical_length;
+	/* An artificial limit, inherited from the old scrub behavior */
+ const u32 max_length = SZ_64K;
+ struct btrfs_path path = { 0 };
+ u64 cur_logical = logical_start;
+ int ret;
+
+ /* The range must be inside the bg */
+ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
+
+ path.search_commit_root = 1;
+ path.skip_locking = 1;
+	/* Go through each extent item inside the logical range */
+ while (cur_logical < logical_end) {
+ u64 extent_start;
+ u64 extent_len;
+ u64 extent_flags;
+ u64 extent_gen;
+ u64 scrub_len;
+
+ /* Canceled? */
+ if (atomic_read(&fs_info->scrub_cancel_req) ||
+ atomic_read(&sctx->cancel_req)) {
+ ret = -ECANCELED;
+ break;
+ }
+ /* Paused? */
+ if (atomic_read(&fs_info->scrub_pause_req)) {
+ /* Push queued extents */
+ sctx->flush_all_writes = true;
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ sctx->flush_all_writes = false;
+ scrub_blocked_if_needed(fs_info);
+ }
+ /* Block group removed? */
+ spin_lock(&bg->lock);
+ if (bg->removed) {
+ spin_unlock(&bg->lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&bg->lock);
+
+ ret = find_first_extent_item(extent_root, &path, cur_logical,
+ logical_end - cur_logical);
+ if (ret > 0) {
+ /* No more extent, just update the accounting */
+ sctx->stat.last_physical = physical + logical_length;
+ ret = 0;
+ break;
+ }
+ if (ret < 0)
+ break;
+ get_extent_info(&path, &extent_start, &extent_len,
+ &extent_flags, &extent_gen);
+ /* Skip hole range which doesn't have any extent */
+ cur_logical = max(extent_start, cur_logical);
+
+ /*
+ * Scrub len has three limits:
+ * - Extent size limit
+ * - Scrub range limit
+		 *   This is especially important for RAID0/RAID10, which reuse
+		 *   this function
+ * - Max scrub size limit
+ */
+ scrub_len = min(min(extent_start + extent_len,
+ logical_end), cur_logical + max_length) -
+ cur_logical;
+
+ if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
+ ret = btrfs_lookup_csums_range(csum_root, cur_logical,
+ cur_logical + scrub_len - 1,
+ &sctx->csum_list, 1);
+ if (ret)
+ break;
+ }
+ if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ does_range_cross_boundary(extent_start, extent_len,
+ logical_start, logical_length)) {
+ btrfs_err(fs_info,
+"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
+ extent_start, logical_start, logical_end);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ cur_logical += scrub_len;
+ continue;
+ }
+ ret = scrub_extent(sctx, map, cur_logical, scrub_len,
+ cur_logical - logical_start + physical,
+ device, extent_flags, extent_gen,
+ mirror_num);
+ scrub_free_csums(sctx);
+ if (ret)
+ break;
+ if (sctx->is_dev_replace)
+ sync_replace_for_zoned(sctx);
+ cur_logical += scrub_len;
+		/* Don't hold the CPU for too long */
+ cond_resched();
+ }
+ btrfs_release_path(&path);
+ return ret;
+}
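
The scrub_len computation in the loop above clamps each pass three ways. A worked user-space example, with assumed sizes, where the logical range end is the binding limit:

#include <assert.h>
#include <stdint.h>

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
	/* Limits: extent end, logical range end, and the 64K batch cap */
	const uint64_t max_len = 64 << 10;
	uint64_t extent_start = 0, extent_len = 1 << 20;	/* 1 MiB extent */
	uint64_t logical_end = 256 << 10;			/* range ends at 256K */
	uint64_t cur = 200 << 10;

	uint64_t scrub_len = min_u64(min_u64(extent_start + extent_len,
					     logical_end),
				     cur + max_len) - cur;

	/* the 256K range end wins over the 1M extent end and 264K batch end */
	assert(scrub_len == 56 << 10);
	return 0;
}
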
+
+/* Calculate the full stripe length for simple stripe based profiles */
+static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+
+ return map->num_stripes / map->sub_stripes * map->stripe_len;
+}
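
Worked numbers for the helper above, assuming a 64K stripe_len: RAID0 on four devices rotates through 256K of new logical space, while RAID10 on the same four devices mirrors in pairs and only covers 128K per rotation.

#include <assert.h>
#include <stdint.h>

static uint64_t full_stripe_len(int num_stripes, int sub_stripes,
				uint64_t stripe_len)
{
	return (uint64_t)(num_stripes / sub_stripes) * stripe_len;
}

int main(void)
{
	/* RAID0 on 4 devices (sub_stripes == 1): 4 * 64K per rotation */
	assert(full_stripe_len(4, 1, 64 << 10) == 256 << 10);
	/* RAID10 on 4 devices (sub_stripes == 2): only 2 * 64K */
	assert(full_stripe_len(4, 2, 64 << 10) == 128 << 10);
	return 0;
}
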
+
+/* Get the logical bytenr for the stripe */
+static u64 simple_stripe_get_logical(struct map_lookup *map,
+ struct btrfs_block_group *bg,
+ int stripe_index)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+ ASSERT(stripe_index < map->num_stripes);
+
+ /*
+ * (stripe_index / sub_stripes) gives how many data stripes we need to
+ * skip.
+ */
+ return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
+}
+
+/* Get the mirror number for the stripe */
+static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+ ASSERT(stripe_index < map->num_stripes);
+
+	/* For RAID0 it's fixed to 1, for RAID10 it alternates 1,2,1,2,... */
+ return stripe_index % map->sub_stripes + 1;
+}
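
A user-space model of the two helpers above, assuming a 64K stripe_len and a block group at logical 1 GiB. It shows RAID10 stripes pairing up on the same logical slice with mirror numbers 1 and 2, while RAID0 stripes each own their own slice:

#include <assert.h>
#include <stdint.h>

static uint64_t stripe_logical(uint64_t bg_start, uint64_t stripe_len,
			       int sub_stripes, int stripe_index)
{
	return (uint64_t)(stripe_index / sub_stripes) * stripe_len + bg_start;
}

static int stripe_mirror(int sub_stripes, int stripe_index)
{
	return stripe_index % sub_stripes + 1;
}

int main(void)
{
	const uint64_t bg = 1ULL << 30, sl = 64 << 10;

	/* RAID10 on 4 devices: stripes 0/1 mirror the first 64K slice,
	 * stripes 2/3 mirror the second one. */
	assert(stripe_logical(bg, sl, 2, 0) == bg);
	assert(stripe_logical(bg, sl, 2, 1) == bg);
	assert(stripe_logical(bg, sl, 2, 2) == bg + sl);
	assert(stripe_mirror(2, 0) == 1 && stripe_mirror(2, 1) == 2);

	/* RAID0 (sub_stripes == 1): every stripe is its own data, mirror 1 */
	assert(stripe_logical(bg, sl, 1, 3) == bg + 3 * sl);
	assert(stripe_mirror(1, 3) == 1);
	return 0;
}
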
+
+static int scrub_simple_stripe(struct scrub_ctx *sctx,
+ struct btrfs_root *extent_root,
+ struct btrfs_root *csum_root,
+ struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct btrfs_device *device,
+ int stripe_index)
+{
+ const u64 logical_increment = simple_stripe_full_stripe_len(map);
+ const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
+ const u64 orig_physical = map->stripes[stripe_index].physical;
+ const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
+ u64 cur_logical = orig_logical;
+ u64 cur_physical = orig_physical;
+ int ret = 0;
+
+ while (cur_logical < bg->start + bg->length) {
+ /*
+ * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
+ * just RAID1, so we can reuse scrub_simple_mirror() to scrub
+ * this stripe.
+ */
+ ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
+ cur_logical, map->stripe_len, device,
+ cur_physical, mirror_num);
+ if (ret)
+ return ret;
+ /* Skip to next stripe which belongs to the target device */
+ cur_logical += logical_increment;
+ /* For physical offset, we just go to next stripe */
+ cur_physical += map->stripe_len;
+ }
+ return ret;
+}
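
Tracing the loop above for one device of an assumed four-device RAID0 block group makes the two increments concrete: the logical position advances a full rotation per iteration, while the device-local physical offset only advances one stripe.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* 1 MiB block group at logical 1 GiB, 64K stripes on 4 devices */
	const uint64_t bg_start = 1ULL << 30, bg_len = 1 << 20;
	const uint64_t stripe_len = 64 << 10, increment = 4 * stripe_len;
	uint64_t logical = bg_start, physical = 0;

	while (logical < bg_start + bg_len) {
		printf("scrub logical %llu at physical %llu\n",
		       (unsigned long long)logical,
		       (unsigned long long)physical);
		logical += increment;		/* skip the other devices' stripes */
		physical += stripe_len;		/* next stripe on this device */
	}
	return 0;
}
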
+
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
struct map_lookup *map,
@@ -3175,59 +3437,22 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root;
struct btrfs_root *csum_root;
- struct btrfs_extent_item *extent;
struct blk_plug plug;
+ const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
const u64 chunk_logical = bg->start;
- u64 flags;
int ret;
- int slot;
- u64 nstripes;
- struct extent_buffer *l;
- u64 physical;
+ u64 physical = map->stripes[stripe_index].physical;
+ const u64 physical_end = physical + dev_extent_len;
u64 logical;
u64 logic_end;
- u64 physical_end;
- u64 generation;
- int mirror_num;
- struct btrfs_key key;
+ /* The logical increment after finishing one stripe */
u64 increment;
+ /* Offset inside the chunk */
u64 offset;
- u64 extent_logical;
- u64 extent_physical;
- /*
- * Unlike chunk length, extent length should never go beyond
- * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
- */
- u32 extent_len;
u64 stripe_logical;
u64 stripe_end;
- struct btrfs_device *extent_dev;
- int extent_mirror_num;
int stop_loop = 0;
- physical = map->stripes[stripe_index].physical;
- offset = 0;
- nstripes = div64_u64(dev_extent_len, map->stripe_len);
- mirror_num = 1;
- increment = map->stripe_len;
- if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- offset = map->stripe_len * stripe_index;
- increment = map->stripe_len * map->num_stripes;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- int factor = map->num_stripes / map->sub_stripes;
- offset = map->stripe_len * (stripe_index / map->sub_stripes);
- increment = map->stripe_len * factor;
- mirror_num = stripe_index % map->sub_stripes + 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- mirror_num = stripe_index % map->num_stripes + 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- mirror_num = stripe_index % map->num_stripes + 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- get_raid56_logic_offset(physical, stripe_index, map, &offset,
- NULL);
- increment = map->stripe_len * nr_data_stripes(map);
- }
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -3241,21 +3466,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
path->skip_locking = 1;
path->reada = READA_FORWARD;
- logical = chunk_logical + offset;
- physical_end = physical + nstripes * map->stripe_len;
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- get_raid56_logic_offset(physical_end, stripe_index,
- map, &logic_end, NULL);
- logic_end += chunk_logical;
- } else {
- logic_end = logical + increment * nstripes;
- }
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
scrub_blocked_if_needed(fs_info);
- root = btrfs_extent_root(fs_info, logical);
- csum_root = btrfs_csum_root(fs_info, logical);
+ root = btrfs_extent_root(fs_info, bg->start);
+ csum_root = btrfs_csum_root(fs_info, bg->start);
/*
* collect all data csums for the stripe to avoid seeking during
@@ -3272,241 +3488,83 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
}
/*
- * now find all extents for each stripe and scrub them
+ * There used to be a big double loop to handle all profiles using the
+	 * same routine, which grew larger and messier over time.
+	 *
+	 * So here we handle each profile differently, so that simpler profiles
+	 * have a simpler scrubbing function.
*/
- ret = 0;
- while (physical < physical_end) {
- /*
- * canceled?
- */
- if (atomic_read(&fs_info->scrub_cancel_req) ||
- atomic_read(&sctx->cancel_req)) {
- ret = -ECANCELED;
- goto out;
- }
+ if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID56_MASK))) {
/*
- * check to see if we have to pause
+		 * The check above rules out all complex profiles; the remaining
+		 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
+		 * mirrored duplication without striping.
+		 *
+		 * Only @physical and @mirror_num need to be calculated using
+		 * @stripe_index.
*/
- if (atomic_read(&fs_info->scrub_pause_req)) {
- /* push queued extents */
- sctx->flush_all_writes = true;
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
- wait_event(sctx->list_wait,
- atomic_read(&sctx->bios_in_flight) == 0);
- sctx->flush_all_writes = false;
- scrub_blocked_if_needed(fs_info);
- }
-
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ret = get_raid56_logic_offset(physical, stripe_index,
- map, &logical,
- &stripe_logical);
- logical += chunk_logical;
- if (ret) {
- /* it is parity strip */
- stripe_logical += chunk_logical;
- stripe_end = stripe_logical + increment;
- ret = scrub_raid56_parity(sctx, map, scrub_dev,
- stripe_logical,
- stripe_end);
- if (ret)
- goto out;
- goto skip;
- }
- }
-
- if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
- key.type = BTRFS_METADATA_ITEM_KEY;
- else
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = logical;
- key.offset = (u64)-1;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- if (ret > 0) {
- ret = btrfs_previous_extent_item(root, path, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- /* there's no smaller item, so stick with the
- * larger one */
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, &key,
- path, 0, 0);
- if (ret < 0)
- goto out;
- }
- }
-
- stop_loop = 0;
- while (1) {
- u64 bytes;
-
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
-
- stop_loop = 1;
- break;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
-
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
-
- if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = fs_info->nodesize;
- else
- bytes = key.offset;
-
- if (key.objectid + bytes <= logical)
- goto next;
-
- if (key.objectid >= logical + map->stripe_len) {
- /* out of this device extent */
- if (key.objectid >= logic_end)
- stop_loop = 1;
- break;
- }
-
- /*
- * If our block group was removed in the meanwhile, just
- * stop scrubbing since there is no point in continuing.
- * Continuing would prevent reusing its device extents
- * for new block groups for a long time.
- */
- spin_lock(&bg->lock);
- if (bg->removed) {
- spin_unlock(&bg->lock);
- ret = 0;
- goto out;
- }
- spin_unlock(&bg->lock);
-
- extent = btrfs_item_ptr(l, slot,
- struct btrfs_extent_item);
- flags = btrfs_extent_flags(l, extent);
- generation = btrfs_extent_generation(l, extent);
-
- if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
- (key.objectid < logical ||
- key.objectid + bytes >
- logical + map->stripe_len)) {
- btrfs_err(fs_info,
- "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
- key.objectid, logical);
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- goto next;
- }
-
-again:
- extent_logical = key.objectid;
- ASSERT(bytes <= U32_MAX);
- extent_len = bytes;
-
- /*
- * trim extent to this stripe
- */
- if (extent_logical < logical) {
- extent_len -= logical - extent_logical;
- extent_logical = logical;
- }
- if (extent_logical + extent_len >
- logical + map->stripe_len) {
- extent_len = logical + map->stripe_len -
- extent_logical;
- }
+ ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
+ bg->start, bg->length, scrub_dev,
+ map->stripes[stripe_index].physical,
+ stripe_index + 1);
+ offset = 0;
+ goto out;
+ }
+ if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+ ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
+ scrub_dev, stripe_index);
+ offset = map->stripe_len * (stripe_index / map->sub_stripes);
+ goto out;
+ }
- extent_physical = extent_logical - logical + physical;
- extent_dev = scrub_dev;
- extent_mirror_num = mirror_num;
- if (sctx->is_dev_replace)
- scrub_remap_extent(fs_info, extent_logical,
- extent_len, &extent_physical,
- &extent_dev,
- &extent_mirror_num);
-
- if (flags & BTRFS_EXTENT_FLAG_DATA) {
- ret = btrfs_lookup_csums_range(csum_root,
- extent_logical,
- extent_logical + extent_len - 1,
- &sctx->csum_list, 1);
- if (ret)
- goto out;
- }
+ /* Only RAID56 goes through the old code */
+ ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ ret = 0;
- ret = scrub_extent(sctx, map, extent_logical, extent_len,
- extent_physical, extent_dev, flags,
- generation, extent_mirror_num,
- extent_logical - logical + physical);
+ /* Calculate the logical end of the stripe */
+ get_raid56_logic_offset(physical_end, stripe_index,
+ map, &logic_end, NULL);
+ logic_end += chunk_logical;
- scrub_free_csums(sctx);
+ /* Initialize @offset in case we need to go to out: label */
+ get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
+ increment = map->stripe_len * nr_data_stripes(map);
+ /*
+	 * Due to the rotation, for RAID56 it's better to iterate the stripes
+	 * using their physical offsets.
+ */
+ while (physical < physical_end) {
+ ret = get_raid56_logic_offset(physical, stripe_index, map,
+ &logical, &stripe_logical);
+ logical += chunk_logical;
+ if (ret) {
+			/* It is a parity stripe */
+ stripe_logical += chunk_logical;
+ stripe_end = stripe_logical + increment;
+ ret = scrub_raid56_parity(sctx, map, scrub_dev,
+ stripe_logical,
+ stripe_end);
if (ret)
goto out;
+ goto next;
+ }
- if (sctx->is_dev_replace)
- sync_replace_for_zoned(sctx);
-
- if (extent_logical + extent_len <
- key.objectid + bytes) {
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- /*
- * loop until we find next data stripe
- * or we have finished all stripes.
- */
-loop:
- physical += map->stripe_len;
- ret = get_raid56_logic_offset(physical,
- stripe_index, map,
- &logical, &stripe_logical);
- logical += chunk_logical;
-
- if (ret && physical < physical_end) {
- stripe_logical += chunk_logical;
- stripe_end = stripe_logical +
- increment;
- ret = scrub_raid56_parity(sctx,
- map, scrub_dev,
- stripe_logical,
- stripe_end);
- if (ret)
- goto out;
- goto loop;
- }
- } else {
- physical += map->stripe_len;
- logical += increment;
- }
- if (logical < key.objectid + bytes) {
- cond_resched();
- goto again;
- }
-
- if (physical >= physical_end) {
- stop_loop = 1;
- break;
- }
- }
+ /*
+	 * Now we're at a data stripe; scrub each extent in the range.
+	 *
+	 * At this stage, if we ignore the repair part, inside each data
+	 * stripe it is no different from the SINGLE profile.
+ * We can reuse scrub_simple_mirror() here, as the repair part
+ * is still based on @mirror_num.
+ */
+ ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
+ logical, map->stripe_len,
+ scrub_dev, physical, 1);
+ if (ret < 0)
+ goto out;
next:
- path->slots[0]++;
- }
- btrfs_release_path(path);
-skip:
logical += increment;
physical += map->stripe_len;
spin_lock(&sctx->stat_lock);
@@ -3964,9 +4022,9 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (!btrfs_check_super_location(scrub_dev, bytenr))
continue;
- ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
- scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
- NULL, bytenr);
+ ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+ scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
+ NULL, bytenr);
if (ret)
return ret;
}
@@ -3979,22 +4037,23 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
&fs_info->scrub_lock)) {
- struct btrfs_workqueue *scrub_workers = NULL;
- struct btrfs_workqueue *scrub_wr_comp = NULL;
- struct btrfs_workqueue *scrub_parity = NULL;
-
- scrub_workers = fs_info->scrub_workers;
- scrub_wr_comp = fs_info->scrub_wr_completion_workers;
- scrub_parity = fs_info->scrub_parity_workers;
+ struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
+ struct workqueue_struct *scrub_wr_comp =
+ fs_info->scrub_wr_completion_workers;
+ struct workqueue_struct *scrub_parity =
+ fs_info->scrub_parity_workers;
fs_info->scrub_workers = NULL;
fs_info->scrub_wr_completion_workers = NULL;
fs_info->scrub_parity_workers = NULL;
mutex_unlock(&fs_info->scrub_lock);
- btrfs_destroy_workqueue(scrub_workers);
- btrfs_destroy_workqueue(scrub_wr_comp);
- btrfs_destroy_workqueue(scrub_parity);
+ if (scrub_workers)
+ destroy_workqueue(scrub_workers);
+ if (scrub_wr_comp)
+ destroy_workqueue(scrub_wr_comp);
+ if (scrub_parity)
+ destroy_workqueue(scrub_parity);
}
}
@@ -4004,9 +4063,9 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
- struct btrfs_workqueue *scrub_workers = NULL;
- struct btrfs_workqueue *scrub_wr_comp = NULL;
- struct btrfs_workqueue *scrub_parity = NULL;
+ struct workqueue_struct *scrub_workers = NULL;
+ struct workqueue_struct *scrub_wr_comp = NULL;
+ struct workqueue_struct *scrub_parity = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
int ret = -ENOMEM;
@@ -4014,18 +4073,16 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
return 0;
- scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
- is_dev_replace ? 1 : max_active, 4);
+ scrub_workers = alloc_workqueue("btrfs-scrub", flags,
+ is_dev_replace ? 1 : max_active);
if (!scrub_workers)
goto fail_scrub_workers;
- scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
- max_active, 2);
+ scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
if (!scrub_wr_comp)
goto fail_scrub_wr_completion_workers;
- scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
- max_active, 2);
+ scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
if (!scrub_parity)
goto fail_scrub_parity_workers;
@@ -4046,11 +4103,11 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->scrub_lock);
ret = 0;
- btrfs_destroy_workqueue(scrub_parity);
+ destroy_workqueue(scrub_parity);
fail_scrub_parity_workers:
- btrfs_destroy_workqueue(scrub_wr_comp);
+ destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
- btrfs_destroy_workqueue(scrub_workers);
+ destroy_workqueue(scrub_workers);
fail_scrub_workers:
return ret;
}
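
This hunk, together with the end-io changes earlier in the diff, replaces the btrfs_workqueue wrappers with stock kernel workqueues. A minimal module-sized sketch of the resulting pattern, with INIT_WORK() at submit time, container_of() in the worker, and destroy_workqueue() on teardown; every name here is illustrative, not a btrfs API:

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_item {
	struct work_struct work;
	int payload;
};

static struct workqueue_struct *demo_wq;

static void demo_worker(struct work_struct *work)
{
	struct demo_item *item = container_of(work, struct demo_item, work);

	pr_info("demo: payload %d\n", item->payload);
	kfree(item);
}

static int __init demo_init(void)
{
	struct demo_item *item;

	/* same flags the scrub queues use above */
	demo_wq = alloc_workqueue("demo", WQ_FREEZABLE | WQ_UNBOUND, 0);
	if (!demo_wq)
		return -ENOMEM;

	item = kmalloc(sizeof(*item), GFP_KERNEL);
	if (!item) {
		destroy_workqueue(demo_wq);
		return -ENOMEM;
	}
	item->payload = 42;
	INIT_WORK(&item->work, demo_worker);
	queue_work(demo_wq, &item->work);
	return 0;
}

static void __exit demo_exit(void)
{
	destroy_workqueue(demo_wq);	/* flushes pending work first */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
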
@@ -4082,18 +4139,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
}
if (fs_info->nodesize >
- PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
- fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
+ SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
+ fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
/*
- * would exhaust the array bounds of pagev member in
+	 * Would exhaust the array bounds of the sectors member in
* struct scrub_block
*/
btrfs_err(fs_info,
- "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
- fs_info->nodesize,
- SCRUB_MAX_PAGES_PER_BLOCK,
- fs_info->sectorsize,
- SCRUB_MAX_PAGES_PER_BLOCK);
+"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
+ fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
+ fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
return -EINVAL;
}
@@ -4161,7 +4216,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
/*
* In order to avoid deadlock with reclaim when there is a transaction
* trying to pause scrub, make sure we use GFP_NOFS for all the
- * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+	 * allocations done at scrub_sectors() and scrub_sectors_for_parity()
* invoked by our callees. The pausing request is done when the
* transaction commit starts, and it blocks the transaction until scrub
* is paused (done at specific points at scrub_stripe() or right above
@@ -4295,11 +4350,11 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}
-static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u32 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num)
+static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u32 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num)
{
u64 mapped_length;
struct btrfs_io_context *bioc = NULL;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 7d1642937274..5a05beabf0c3 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -10,7 +10,6 @@
#include <linux/mount.h>
#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
-#include <linux/radix-tree.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/compat.h>
@@ -128,11 +127,18 @@ struct send_ctx {
struct list_head new_refs;
struct list_head deleted_refs;
- struct radix_tree_root name_cache;
+ struct xarray name_cache;
struct list_head name_cache_list;
int name_cache_size;
+ /*
+	 * The inode we are currently processing. It is only non-NULL when we
+ * need to issue write commands for data extents from this inode.
+ */
+ struct inode *cur_inode;
struct file_ra_state ra;
+ u64 page_cache_clear_start;
+ bool clean_page_cache;
/*
* We process inodes by their increasing order, so if before an
@@ -262,14 +268,13 @@ struct orphan_dir_info {
struct name_cache_entry {
struct list_head list;
/*
- * radix_tree has only 32bit entries but we need to handle 64bit inums.
- * We use the lower 32bit of the 64bit inum to store it in the tree. If
- * more then one inum would fall into the same entry, we use radix_list
- * to store the additional entries. radix_list is also used to store
- * entries where two entries have the same inum but different
- * generations.
+ * On 32bit kernels, xarray has only 32bit indices, but we need to
+ * handle 64bit inums. We use the lower 32bit of the 64bit inum to store
+ * it in the tree. If more than one inum would fall into the same entry,
+ * we use inum_aliases to store the additional entries. inum_aliases is
+ * also used to store entries with the same inum but different generations.
*/
- struct list_head radix_list;
+ struct list_head inum_aliases;
u64 ino;
u64 gen;
u64 parent_ino;
@@ -2019,9 +2024,9 @@ out:
}
/*
- * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
+ * Insert a name cache entry. On 32bit kernels the xarray index is 32bit,
* so we need to do some special handling in case we have clashes. This function
- * takes care of this with the help of name_cache_entry::radix_list.
+ * takes care of this with the help of name_cache_entry::inum_aliases.
* In case of error, nce is kfreed.
*/
static int name_cache_insert(struct send_ctx *sctx,
@@ -2030,8 +2035,7 @@ static int name_cache_insert(struct send_ctx *sctx,
int ret = 0;
struct list_head *nce_head;
- nce_head = radix_tree_lookup(&sctx->name_cache,
- (unsigned long)nce->ino);
+ nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino);
if (!nce_head) {
nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
if (!nce_head) {
@@ -2040,14 +2044,14 @@ static int name_cache_insert(struct send_ctx *sctx,
}
INIT_LIST_HEAD(nce_head);
- ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
+ ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL);
if (ret < 0) {
kfree(nce_head);
kfree(nce);
return ret;
}
}
- list_add_tail(&nce->radix_list, nce_head);
+ list_add_tail(&nce->inum_aliases, nce_head);
list_add_tail(&nce->list, &sctx->name_cache_list);
sctx->name_cache_size++;
@@ -2059,15 +2063,14 @@ static void name_cache_delete(struct send_ctx *sctx,
{
struct list_head *nce_head;
- nce_head = radix_tree_lookup(&sctx->name_cache,
- (unsigned long)nce->ino);
+ nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino);
if (!nce_head) {
btrfs_err(sctx->send_root->fs_info,
"name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
nce->ino, sctx->name_cache_size);
}
- list_del(&nce->radix_list);
+ list_del(&nce->inum_aliases);
list_del(&nce->list);
sctx->name_cache_size--;
@@ -2075,7 +2078,7 @@ static void name_cache_delete(struct send_ctx *sctx,
* We may not get to the final release of nce_head if the lookup fails
*/
if (nce_head && list_empty(nce_head)) {
- radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
+ xa_erase(&sctx->name_cache, (unsigned long)nce->ino);
kfree(nce_head);
}
}
@@ -2086,11 +2089,11 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
struct list_head *nce_head;
struct name_cache_entry *cur;
- nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
+ nce_head = xa_load(&sctx->name_cache, (unsigned long)ino);
if (!nce_head)
return NULL;
- list_for_each_entry(cur, nce_head, radix_list) {
+ list_for_each_entry(cur, nce_head, inum_aliases) {
if (cur->ino == ino && cur->gen == gen)
return cur;
}
@@ -2675,61 +2678,43 @@ out:
static int did_create_dir(struct send_ctx *sctx, u64 dir)
{
int ret = 0;
+ int iter_ret = 0;
struct btrfs_path *path = NULL;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_key di_key;
- struct extent_buffer *eb;
struct btrfs_dir_item *di;
- int slot;
path = alloc_path_for_send();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(sctx->send_root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
+ btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *eb = path->nodes[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
- goto out;
+ break;
}
- di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
di_key.objectid < sctx->send_progress) {
ret = 1;
- goto out;
+ break;
}
-
- path->slots[0]++;
}
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
-out:
btrfs_free_path(path);
return ret;
}
@@ -2933,6 +2918,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
u64 send_progress)
{
int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root = sctx->parent_root;
struct btrfs_path *path;
struct btrfs_key key;
@@ -2959,23 +2945,9 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
if (odi)
key.offset = odi->last_dir_index_offset;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct waiting_dir_move *dm;
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
- continue;
- }
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
if (found_key.objectid != key.objectid ||
found_key.type != key.type)
break;
@@ -3010,8 +2982,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
ret = 0;
goto out;
}
-
- path->slots[0]++;
+ }
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
}
free_orphan_dir_info(sctx, odi);
@@ -3579,7 +3553,7 @@ static int check_ino_in_path(struct btrfs_root *root,
}
/*
- * Check if ino ino1 is an ancestor of inode ino2 in the given root for any
+ * Check if inode ino1 is an ancestor of inode ino2 in the given root for any
* possible path (in case ino2 is not a directory and has multiple hard links).
* Return 1 if true, 0 if false and < 0 on error.
*/
@@ -3591,6 +3565,7 @@ static int is_ancestor(struct btrfs_root *root,
{
bool free_fs_path = false;
int ret = 0;
+ int iter_ret = 0;
struct btrfs_path *path = NULL;
struct btrfs_key key;
@@ -3611,26 +3586,12 @@ static int is_ancestor(struct btrfs_root *root,
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (true) {
+ btrfs_for_each_slot(root, &key, &key, path, iter_ret) {
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
u32 cur_offset = 0;
u32 item_size;
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != ino2)
break;
if (key.type != BTRFS_INODE_REF_KEY &&
@@ -3668,10 +3629,12 @@ static int is_ancestor(struct btrfs_root *root,
if (ret)
goto out;
}
- path->slots[0]++;
}
ret = 0;
- out:
+ if (iter_ret < 0)
+ ret = iter_ret;
+
+out:
btrfs_free_path(path);
if (free_fs_path)
fs_path_free(fs_path);
@@ -4551,13 +4514,12 @@ out:
static int process_all_refs(struct send_ctx *sctx,
enum btrfs_compare_tree_result cmd)
{
- int ret;
+ int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
- struct extent_buffer *eb;
- int slot;
iterate_inode_ref_t cb;
int pending_move = 0;
@@ -4581,24 +4543,7 @@ static int process_all_refs(struct send_ctx *sctx,
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &found_key, slot);
-
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
(found_key.type != BTRFS_INODE_REF_KEY &&
found_key.type != BTRFS_INODE_EXTREF_KEY))
@@ -4607,8 +4552,11 @@ static int process_all_refs(struct send_ctx *sctx,
ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
if (ret < 0)
goto out;
-
- path->slots[0]++;
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
}
btrfs_release_path(path);
@@ -4870,13 +4818,12 @@ out:
static int process_all_new_xattrs(struct send_ctx *sctx)
{
- int ret;
+ int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
- struct extent_buffer *eb;
- int slot;
path = alloc_path_for_send();
if (!path)
@@ -4887,39 +4834,21 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &found_key, slot);
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
- goto out;
+ break;
}
ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
if (ret < 0)
- goto out;
-
- path->slots[0]++;
+ break;
}
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
-out:
btrfs_free_path(path);
return ret;
}
@@ -4946,7 +4875,6 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
struct page *page;
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
@@ -4957,37 +4885,30 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
if (ret)
return ret;
- inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
last_index = (offset + len - 1) >> PAGE_SHIFT;
- /* initial readahead */
- memset(&sctx->ra, 0, sizeof(struct file_ra_state));
- file_ra_state_init(&sctx->ra, inode->i_mapping);
-
while (index <= last_index) {
unsigned cur_len = min_t(unsigned, len,
PAGE_SIZE - pg_offset);
- page = find_lock_page(inode->i_mapping, index);
+ page = find_lock_page(sctx->cur_inode->i_mapping, index);
if (!page) {
- page_cache_sync_readahead(inode->i_mapping, &sctx->ra,
- NULL, index, last_index + 1 - index);
+ page_cache_sync_readahead(sctx->cur_inode->i_mapping,
+ &sctx->ra, NULL, index,
+ last_index + 1 - index);
- page = find_or_create_page(inode->i_mapping, index,
- GFP_KERNEL);
+ page = find_or_create_page(sctx->cur_inode->i_mapping,
+ index, GFP_KERNEL);
if (!page) {
ret = -ENOMEM;
break;
}
}
- if (PageReadahead(page)) {
- page_cache_async_readahead(inode->i_mapping, &sctx->ra,
- NULL, page, index, last_index + 1 - index);
- }
+ if (PageReadahead(page))
+ page_cache_async_readahead(sctx->cur_inode->i_mapping,
+ &sctx->ra, NULL, page, index,
+ last_index + 1 - index);
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
@@ -5013,7 +4934,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
len -= cur_len;
sctx->send_size += cur_len;
}
- iput(inode);
+
return ret;
}
@@ -5220,12 +5141,49 @@ static int send_extent_data(struct send_ctx *sctx,
const u64 offset,
const u64 len)
{
+ const u64 end = offset + len;
u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
+ if (sctx->cur_inode == NULL) {
+ struct btrfs_root *root = sctx->send_root;
+
+ sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(sctx->cur_inode)) {
+ int err = PTR_ERR(sctx->cur_inode);
+
+ sctx->cur_inode = NULL;
+ return err;
+ }
+ memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+ file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
+
+ /*
+ * It's very likely there are no pages from this inode in the page
+ * cache, so after reading extents and sending their data, we clean
+ * the page cache to avoid trashing the page cache (adding pressure
+ * to the page cache and forcing eviction of other data more useful
+ * for applications).
+ *
+ * We decide if we should clean the page cache simply by checking
+ * if the inode's mapping nrpages is 0 when we first open it, and
+ * not by using something like filemap_range_has_page() before
+ * reading an extent because when we ask the readahead code to
+ * read a given file range, it may (and almost always does) read
+ * pages from beyond that range (see the documentation for
+ * page_cache_sync_readahead()), so it would not be reliable,
+ * because after reading the first extent future calls to
+ * filemap_range_has_page() would return true because the readahead
+ * on the previous extent resulted in reading pages of the current
+ * extent as well.
+ */
+ sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
+ sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
+ }
+
while (sent < len) {
u64 size = min(len - sent, read_size);
int ret;
@@ -5235,6 +5193,37 @@ static int send_extent_data(struct send_ctx *sctx,
return ret;
sent += size;
}
+
+ if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) {
+ /*
+ * Always operate only on ranges that are a multiple of the page
+ * size. This is not only to prevent zeroing parts of a page in
+ * the case of subpage sector size, but also to guarantee we evict
+ * pages, as passing a range that is smaller than page size does
+ * not evict the respective page (only zeroes part of its content).
+ *
+ * Always start from the end offset of the last range cleared.
+ * This is because the readahead code may (and very often does)
+	 * read pages beyond the range we request for readahead. So if
+ * we have an extent layout like this:
+ *
+ * [ extent A ] [ extent B ] [ extent C ]
+ *
+ * When we ask page_cache_sync_readahead() to read extent A, it
+ * may also trigger reads for pages of extent B. If we are doing
+ * an incremental send and extent B has not changed between the
+ * parent and send snapshots, some or all of its pages may end
+ * up being read and placed in the page cache. So when truncating
+ * the page cache we always start from the end offset of the
+ * previously processed extent up to the end of the current
+ * extent.
+ */
+ truncate_inode_pages_range(&sctx->cur_inode->i_data,
+ sctx->page_cache_clear_start,
+ end - 1);
+ sctx->page_cache_clear_start = end;
+ }
+
return 0;
}
@@ -5965,13 +5954,12 @@ out:
static int process_all_extents(struct send_ctx *sctx)
{
- int ret;
+ int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
- struct extent_buffer *eb;
- int slot;
root = sctx->send_root;
path = alloc_path_for_send();
@@ -5981,41 +5969,21 @@ static int process_all_extents(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
-
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &found_key, slot);
-
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
- goto out;
+ break;
}
ret = process_extent(sctx, path, &found_key);
if (ret < 0)
- goto out;
-
- path->slots[0]++;
+ break;
}
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
-out:
btrfs_free_path(path);
return ret;
}
@@ -6205,8 +6173,11 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx)
{
LIST_HEAD(deleted_refs);
struct btrfs_path *path;
+ struct btrfs_root *root = sctx->parent_root;
struct btrfs_key key;
+ struct btrfs_key found_key;
struct parent_paths_ctx ctx;
+ int iter_ret = 0;
int ret;
path = alloc_path_for_send();
@@ -6216,39 +6187,26 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx)
key.objectid = sctx->cur_ino;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
ctx.refs = &deleted_refs;
ctx.sctx = sctx;
- while (true) {
- struct extent_buffer *eb = path->nodes[0];
- int slot = path->slots[0];
-
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(sctx->parent_root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &key, slot);
- if (key.objectid != sctx->cur_ino)
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ if (found_key.objectid != key.objectid)
break;
- if (key.type != BTRFS_INODE_REF_KEY &&
- key.type != BTRFS_INODE_EXTREF_KEY)
+ if (found_key.type != key.type &&
+ found_key.type != BTRFS_INODE_EXTREF_KEY)
break;
- ret = iterate_inode_ref(sctx->parent_root, path, &key, 1,
+ ret = iterate_inode_ref(root, path, &found_key, 1,
record_parent_ref, &ctx);
if (ret < 0)
goto out;
-
- path->slots[0]++;
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
}
while (!list_empty(&deleted_refs)) {
@@ -6270,6 +6228,30 @@ out:
return ret;
}
+static void close_current_inode(struct send_ctx *sctx)
+{
+ u64 i_size;
+
+ if (sctx->cur_inode == NULL)
+ return;
+
+ i_size = i_size_read(sctx->cur_inode);
+
+ /*
+ * If we are doing an incremental send, we may have extents between the
+ * last processed extent and the i_size that have not been processed
+ * because they haven't changed but we may have read some of their pages
+ * through readahead, see the comments at send_extent_data().
+ */
+ if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
+ truncate_inode_pages_range(&sctx->cur_inode->i_data,
+ sctx->page_cache_clear_start,
+ round_up(i_size, PAGE_SIZE) - 1);
+
+ iput(sctx->cur_inode);
+ sctx->cur_inode = NULL;
+}
+
static int changed_inode(struct send_ctx *sctx,
enum btrfs_compare_tree_result result)
{
@@ -6280,6 +6262,8 @@ static int changed_inode(struct send_ctx *sctx,
u64 left_gen = 0;
u64 right_gen = 0;
+ close_current_inode(sctx);
+
sctx->cur_ino = key->objectid;
sctx->cur_inode_new_gen = 0;
sctx->cur_inode_last_extent = (u64)-1;
@@ -7534,7 +7518,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
INIT_LIST_HEAD(&sctx->new_refs);
INIT_LIST_HEAD(&sctx->deleted_refs);
- INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
+ xa_init_flags(&sctx->name_cache, GFP_KERNEL);
INIT_LIST_HEAD(&sctx->name_cache_list);
sctx->flags = arg->flags;
@@ -7766,6 +7750,8 @@ out:
name_cache_free(sctx);
+ close_current_inode(sctx);
+
kfree(sctx);
}
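
The send name cache keeps its old clash-handling scheme across the xarray conversion: a 64-bit inum is truncated to the tree index, and entries whose index collides hang off a per-slot list (now named inum_aliases). A self-contained sketch of that scheme, with a toy 16-slot table standing in for the xarray and a singly linked list for the alias chain:

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

#define SLOTS 16 /* tiny stand-in for the truncated index space */

struct entry {
        uint64_t ino;
        uint64_t gen;
        struct entry *next; /* plays the role of inum_aliases */
};

static struct entry *table[SLOTS];

static void cache_insert(struct entry *e)
{
        unsigned idx = (unsigned)(e->ino % SLOTS); /* truncated index */

        e->next = table[idx];
        table[idx] = e;
}

static struct entry *cache_search(uint64_t ino, uint64_t gen)
{
        struct entry *e;

        /* Walk the alias chain: same slot may hold many (ino, gen) pairs. */
        for (e = table[ino % SLOTS]; e; e = e->next)
                if (e->ino == ino && e->gen == gen)
                        return e;
        return NULL;
}

int main(void)
{
        struct entry a = { .ino = 5, .gen = 1 };
        struct entry b = { .ino = 5 + SLOTS, .gen = 1 }; /* clashes with a */

        cache_insert(&a);
        cache_insert(&b);
        printf("found: %d\n", cache_search(5 + SLOTS, 1) == &b); /* 1 */
        return 0;
}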
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index b87931a458eb..2dd8754cb990 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -181,6 +181,12 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
found->full = 0;
}
+/*
+ * Block groups with more than this value (percent) of unusable space will be
+ * scheduled for background reclaim.
+ */
+#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
@@ -203,6 +209,9 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->priority_tickets);
space_info->clamp = 1;
+ if (btrfs_is_zoned(info))
+ space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
+
ret = btrfs_sysfs_add_space_info_type(info, space_info);
if (ret)
return ret;
@@ -519,7 +528,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
}
- trans = (struct btrfs_trans_handle *)current->journal_info;
+ trans = current->journal_info;
/*
* If we are doing more ordered than delalloc we need to just wait on
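
With the 75% default set for zoned filesystems, reclaim scheduling reduces to a percentage comparison against the block group size. A sketch of that decision under the assumption that unusable space is measured against the block group length (the real check lives in the block group reclaim code; the struct fields here are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEFAULT_ZONED_RECLAIM_THRESH 75 /* percent, as above */

struct block_group {
        uint64_t length;
        uint64_t zone_unusable;
};

/* Reclaim once unusable space exceeds the configured percentage. */
static bool should_reclaim(const struct block_group *bg, int thresh_percent)
{
        return bg->zone_unusable * 100 > (uint64_t)thresh_percent * bg->length;
}

int main(void)
{
        /* 200 MiB of 256 MiB unusable: 78% > 75%, so schedule reclaim. */
        struct block_group bg = {
                .length = 256u << 20,
                .zone_unusable = 200u << 20,
        };

        printf("reclaim: %d\n", should_reclaim(&bg, DEFAULT_ZONED_RECLAIM_THRESH));
        return 0;
}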
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index d841fed73492..c096695598c1 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_SPACE_INFO_H
#define BTRFS_SPACE_INFO_H
+#include "volumes.h"
+
struct btrfs_space_info {
spinlock_t lock;
@@ -24,6 +26,12 @@ struct btrfs_space_info {
the space info if we had an ENOSPC in the
allocator. */
+ /*
+	 * Once a block group drops below this threshold (percent), we'll
+ * schedule it for reclaim.
+ */
+ int bg_reclaim_threshold;
+
int clamp; /* Used to scale our threshold for preemptive
flushing. The value is >> clamp, so turns
out to be a 2^clamp divisor. */
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index ef7ae20d2b77..a105b291444f 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -63,6 +63,29 @@
* This means a slightly higher tree locking latency.
*/
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
+{
+ if (fs_info->sectorsize >= PAGE_SIZE)
+ return false;
+
+ /*
+ * Only data pages (either through DIO or compression) can have no
+	 * mapping. And if page->mapping->host is a data inode, it's subpage,
+	 * as we have already ruled out the sectorsize >= PAGE_SIZE case.
+ */
+ if (!page->mapping || !page->mapping->host ||
+ is_data_inode(page->mapping->host))
+ return true;
+
+ /*
+	 * Now the only remaining case is metadata, which goes through the
+	 * subpage routine only if nodesize < PAGE_SIZE.
+ */
+ if (fs_info->nodesize < PAGE_SIZE)
+ return true;
+ return false;
+}
+
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
{
unsigned int cur = 0;
@@ -107,7 +130,7 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
ASSERT(PageLocked(page));
/* Either not subpage, or the page already has private attached */
- if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
+ if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page))
return 0;
subpage = btrfs_alloc_subpage(fs_info, type);
@@ -124,10 +147,10 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
struct btrfs_subpage *subpage;
/* Either not subpage, or already detached */
- if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
+ if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page))
return;
- subpage = (struct btrfs_subpage *)detach_page_private(page);
+ subpage = detach_page_private(page);
ASSERT(subpage);
btrfs_free_subpage(subpage);
}
@@ -175,7 +198,7 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
{
struct btrfs_subpage *subpage;
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->mapping);
@@ -190,7 +213,7 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
{
struct btrfs_subpage *subpage;
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->mapping);
@@ -319,7 +342,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {
lock_page(page);
return 0;
}
@@ -336,7 +359,7 @@ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))
return unlock_page(page);
btrfs_subpage_clamp_range(page, &start, &len);
if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
@@ -620,7 +643,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
set_page_func(page); \
return; \
} \
@@ -629,7 +652,7 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
clear_page_func(page); \
return; \
} \
@@ -638,14 +661,14 @@ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
return test_page_func(page); \
return btrfs_subpage_test_##name(fs_info, page, start, len); \
} \
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
set_page_func(page); \
return; \
} \
@@ -655,7 +678,7 @@ void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
clear_page_func(page); \
return; \
} \
@@ -665,7 +688,7 @@ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
return test_page_func(page); \
btrfs_subpage_clamp_range(page, &start, &len); \
return btrfs_subpage_test_##name(fs_info, page, start, len); \
@@ -694,7 +717,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
return;
ASSERT(!PageDirty(page));
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->private);
@@ -722,8 +745,8 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
struct btrfs_subpage *subpage;
ASSERT(PageLocked(page));
- /* For regular page size case, we just unlock the page */
- if (fs_info->sectorsize == PAGE_SIZE)
+ /* For non-subpage case, we just unlock the page */
+ if (!btrfs_is_subpage(fs_info, page))
return unlock_page(page);
ASSERT(PagePrivate(page) && page->private);
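
btrfs_is_subpage() turns the old sectorsize == PAGE_SIZE test into a per-page decision, so 4K-sector data stays subpage on any larger page size while metadata only goes subpage when nodesize is smaller than a page. A runnable model of the same decision order, using stub types in place of the kernel's page and fs_info structures:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define TOY_PAGE_SIZE 4096u

struct inode_stub { bool is_data; };
struct mapping_stub { struct inode_stub *host; };
struct page_stub { struct mapping_stub *mapping; };
struct fs_info_stub { unsigned sectorsize; unsigned nodesize; };

/* Mirrors the decision order of btrfs_is_subpage() above. */
static bool is_subpage(const struct fs_info_stub *fs, const struct page_stub *p)
{
        if (fs->sectorsize >= TOY_PAGE_SIZE)
                return false;
        /* No mapping (DIO/compression) or a data inode: data is subpage here. */
        if (!p->mapping || !p->mapping->host || p->mapping->host->is_data)
                return true;
        /* Metadata only goes subpage when nodesize < PAGE_SIZE. */
        return fs->nodesize < TOY_PAGE_SIZE;
}

int main(void)
{
        struct fs_info_stub fs = { .sectorsize = 2048, .nodesize = 16384 };
        struct page_stub anon = { .mapping = NULL };

        printf("%d\n", is_subpage(&fs, &anon)); /* 1: sub-page data */
        return 0;
}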
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 7accb5c40d33..0e80ad336904 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -74,6 +74,8 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
+
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b228efe8ab6e..b1fdc6a26c76 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -261,7 +261,7 @@ static struct ratelimit_state printk_limits[] = {
RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};
-void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
@@ -292,10 +292,10 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
char statestr[STATE_STRING_BUF_LEN];
btrfs_state_to_string(fs_info, statestr);
- printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
+ _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
fs_info->sb->s_id, statestr, &vaf);
} else {
- printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
+ _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
}
}
@@ -1903,6 +1903,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
old_pool_size, new_pool_size);
btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
@@ -1912,8 +1913,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
- new_pool_size);
}
static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index ba78ca5aabbb..92a1fa8e3da6 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -394,11 +394,9 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
{
ssize_t ret = 0;
- /* 4K sector size is also supported with 64K page size */
- if (PAGE_SIZE == SZ_64K)
+ /* An artificial limit to only support 4K and PAGE_SIZE */
+ if (PAGE_SIZE > SZ_4K)
ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
-
- /* Only sectorsize == PAGE_SIZE is now supported */
ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
return ret;
@@ -722,6 +720,42 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
+static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ ssize_t ret;
+
+ ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
+
+ return ret;
+}
+
+static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ int thresh;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &thresh);
+ if (ret)
+ return ret;
+
+ if (thresh < 0 || thresh > 100)
+ return -EINVAL;
+
+ WRITE_ONCE(space_info->bg_reclaim_threshold, thresh);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(space_info, bg_reclaim_threshold,
+ btrfs_sinfo_bg_reclaim_threshold_show,
+ btrfs_sinfo_bg_reclaim_threshold_store);
+
/*
* Allocation information about block group types.
*
@@ -738,6 +772,7 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
+ BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
NULL,
};
ATTRIBUTE_GROUPS(space_info);
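
The store handler above is the usual parse-validate-publish pattern: kstrtoint(), a 0-100 range check, then WRITE_ONCE() so the unlocked reader in the reclaim path (which uses READ_ONCE()) sees a consistent value. A userspace sketch of the parse and range check, with strtol() approximating kstrtoint(), which also tolerates a trailing newline:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_thresh(const char *buf, int *out)
{
        char *end;
        long v = strtol(buf, &end, 10);

        /* Reject empty input and trailing junk (a final newline is fine). */
        if (end == buf || (*end != '\0' && *end != '\n'))
                return -EINVAL;
        if (v < 0 || v > 100)
                return -EINVAL;
        *out = (int)v;
        return 0;
}

int main(void)
{
        int thresh = 0;

        printf("%d\n", parse_thresh("75\n", &thresh)); /* 0, thresh = 75 */
        printf("%d\n", parse_thresh("123", &thresh));  /* -EINVAL */
        return 0;
}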
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index d8e56edd6991..1591bfa55bcc 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -150,8 +150,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
- struct radix_tree_iter iter;
- void **slot;
+ unsigned long index;
+ struct extent_buffer *eb;
struct btrfs_device *dev, *tmp;
if (!fs_info)
@@ -163,25 +163,9 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
test_mnt->mnt_sb->s_fs_info = NULL;
- spin_lock(&fs_info->buffer_lock);
- radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
- struct extent_buffer *eb;
-
- eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
- if (!eb)
- continue;
- /* Shouldn't happen but that kind of thinking creates CVE's */
- if (radix_tree_exception(eb)) {
- if (radix_tree_deref_retry(eb))
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- slot = radix_tree_iter_resume(slot, &iter);
- spin_unlock(&fs_info->buffer_lock);
+ xa_for_each(&fs_info->extent_buffers, index, eb) {
free_extent_buffer_stale(eb);
- spin_lock(&fs_info->buffer_lock);
}
- spin_unlock(&fs_info->buffer_lock);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
@@ -202,7 +186,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
if (!root)
return;
/* Will be freed by btrfs_free_fs_roots */
- if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
+ if (WARN_ON(test_bit(BTRFS_ROOT_REGISTERED, &root->state)))
return;
btrfs_global_root_delete(root);
btrfs_put_root(root);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b008c5110958..06c0a958d114 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -23,7 +23,7 @@
#include "space-info.h"
#include "zoned.h"
-#define BTRFS_ROOT_TRANS_TAG 0
+#define BTRFS_ROOT_TRANS_TAG XA_MARK_0
/*
* Transaction states and transitions
@@ -221,7 +221,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
 * the caching thread will restart its search from 3, and thus find
* the hole from [4,6) to add to the free space cache.
*/
- spin_lock(&fs_info->block_group_cache_lock);
+ write_lock(&fs_info->block_group_cache_lock);
list_for_each_entry_safe(caching_ctl, next,
&fs_info->caching_block_groups, list) {
struct btrfs_block_group *cache = caching_ctl->block_group;
@@ -234,7 +234,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
cache->last_byte_to_unpin = caching_ctl->progress;
}
}
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
up_write(&fs_info->commit_root_sem);
}
@@ -437,15 +437,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
*/
smp_wmb();
- spin_lock(&fs_info->fs_roots_radix_lock);
+ spin_lock(&fs_info->fs_roots_lock);
if (root->last_trans == trans->transid && !force) {
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
return 0;
}
- radix_tree_tag_set(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ xa_set_mark(&fs_info->fs_roots,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_lock);
root->last_trans = trans->transid;
/* this is pretty tricky. We don't want to
@@ -487,11 +487,9 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
spin_unlock(&cur_trans->dropped_roots_lock);
/* Make sure we don't try to update the root at commit time */
- spin_lock(&fs_info->fs_roots_radix_lock);
- radix_tree_tag_clear(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ xa_clear_mark(&fs_info->fs_roots,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
}
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
@@ -1404,9 +1402,8 @@ void btrfs_add_dead_root(struct btrfs_root *root)
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *gang[8];
- int i;
- int ret;
+ struct btrfs_root *root;
+ unsigned long index;
/*
* At this point no one can be using this transaction to modify any tree
@@ -1414,57 +1411,46 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
*/
ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
- spin_lock(&fs_info->fs_roots_radix_lock);
- while (1) {
- ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
- (void **)gang, 0,
- ARRAY_SIZE(gang),
- BTRFS_ROOT_TRANS_TAG);
- if (ret == 0)
- break;
- for (i = 0; i < ret; i++) {
- struct btrfs_root *root = gang[i];
- int ret2;
-
- /*
- * At this point we can neither have tasks logging inodes
- * from a root nor trying to commit a log tree.
- */
- ASSERT(atomic_read(&root->log_writers) == 0);
- ASSERT(atomic_read(&root->log_commit[0]) == 0);
- ASSERT(atomic_read(&root->log_commit[1]) == 0);
-
- radix_tree_tag_clear(&fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
- spin_unlock(&fs_info->fs_roots_radix_lock);
-
- btrfs_free_log(trans, root);
- ret2 = btrfs_update_reloc_root(trans, root);
- if (ret2)
- return ret2;
-
- /* see comments in should_cow_block() */
- clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
- smp_mb__after_atomic();
-
- if (root->commit_root != root->node) {
- list_add_tail(&root->dirty_list,
- &trans->transaction->switch_commits);
- btrfs_set_root_node(&root->root_item,
- root->node);
- }
+ spin_lock(&fs_info->fs_roots_lock);
+ xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) {
+ int ret;
+
+ /*
+ * At this point we can neither have tasks logging inodes
+ * from a root nor trying to commit a log tree.
+ */
+ ASSERT(atomic_read(&root->log_writers) == 0);
+ ASSERT(atomic_read(&root->log_commit[0]) == 0);
+ ASSERT(atomic_read(&root->log_commit[1]) == 0);
+
+ xa_clear_mark(&fs_info->fs_roots,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_lock);
- ret2 = btrfs_update_root(trans, fs_info->tree_root,
- &root->root_key,
- &root->root_item);
- if (ret2)
- return ret2;
- spin_lock(&fs_info->fs_roots_radix_lock);
- btrfs_qgroup_free_meta_all_pertrans(root);
+ btrfs_free_log(trans, root);
+ ret = btrfs_update_reloc_root(trans, root);
+ if (ret)
+ return ret;
+
+ /* See comments in should_cow_block() */
+ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+ smp_mb__after_atomic();
+
+ if (root->commit_root != root->node) {
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
+ btrfs_set_root_node(&root->root_item, root->node);
}
+
+ ret = btrfs_update_root(trans, fs_info->tree_root,
+ &root->root_key, &root->root_item);
+ if (ret)
+ return ret;
+ spin_lock(&fs_info->fs_roots_lock);
+ btrfs_qgroup_free_meta_all_pertrans(root);
}
- spin_unlock(&fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_lock);
return 0;
}
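
commit_fs_roots() previously gang-looked-up tagged roots eight at a time; with the xarray it simply walks the entries carrying the transaction mark, clearing the mark before processing each root. A toy model of that visit-marked-and-clear pattern (trans_tag stands in for XA_MARK_0; the locking around the real walk is omitted):

#include <stdbool.h>
#include <stdio.h>

struct root_stub {
        unsigned long id;
        bool trans_tag; /* "dirty in this transaction" mark */
};

static void commit_roots(struct root_stub *roots, int nr)
{
        for (int i = 0; i < nr; i++) {
                if (!roots[i].trans_tag)
                        continue;
                roots[i].trans_tag = false; /* clear before processing */
                printf("committing root %lu\n", roots[i].id);
        }
}

int main(void)
{
        struct root_stub roots[] = {
                { 256, true }, { 257, false }, { 258, true },
        };

        commit_roots(roots, 3); /* visits 256 and 258 only */
        return 0;
}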
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index e56c0107eea3..9e0e0ae2288c 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1855,3 +1855,58 @@ out:
return ret;
}
ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);
+
+int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
+{
+ const bool is_subvol = is_fstree(root_owner);
+ const u64 eb_owner = btrfs_header_owner(eb);
+
+ /*
+ * Skip dummy fs, as selftests don't create unique ebs for each dummy
+ * root.
+ */
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state))
+ return 0;
+ /*
+ * There are several call sites (backref walking, qgroup, and data
+ * reloc) passing 0 as @root_owner, as they are not holding the
+ * tree root. In that case, we can not do a reliable ownership check,
+ * so just exit.
+ */
+ if (root_owner == 0)
+ return 0;
+ /*
+	 * These trees use key.offset as their owner; our callers don't have
+ * the extra capacity to pass key.offset here. So we just skip them.
+ */
+ if (root_owner == BTRFS_TREE_LOG_OBJECTID ||
+ root_owner == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
+ if (!is_subvol) {
+ /* For non-subvolume trees, the eb owner should match root owner */
+ if (unlikely(root_owner != eb_owner)) {
+ btrfs_crit(eb->fs_info,
+"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ root_owner, btrfs_header_bytenr(eb), eb_owner,
+ root_owner);
+ return -EUCLEAN;
+ }
+ return 0;
+ }
+
+ /*
+ * For subvolume trees, owners can mismatch, but they should all belong
+ * to subvolume trees.
+ */
+ if (unlikely(is_subvol != is_fstree(eb_owner))) {
+ btrfs_crit(eb->fs_info,
+"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ root_owner, btrfs_header_bytenr(eb), eb_owner,
+ BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID);
+ return -EUCLEAN;
+ }
+ return 0;
+}
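
The new tree-checker rule distinguishes two regimes: non-subvolume trees must have an extent buffer owner equal to the root owner, while subvolume trees may differ as long as both owners are subvolume ids. A compact model of that logic; is_fstree_stub() is a simplification (the real is_fstree() also covers BTRFS_FS_TREE_OBJECTID and the data reloc tree), and -1 stands in for -EUCLEAN:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FIRST_FREE_OBJECTID 256ULL
#define LAST_FREE_OBJECTID  ((uint64_t)-256)

/* Simplified stand-in for is_fstree(): subvolume ids live in this range. */
static bool is_fstree_stub(uint64_t id)
{
        return id >= FIRST_FREE_OBJECTID && id <= LAST_FREE_OBJECTID;
}

static int check_eb_owner(uint64_t root_owner, uint64_t eb_owner)
{
        /* Non-subvolume trees: owners must match exactly. */
        if (!is_fstree_stub(root_owner))
                return root_owner == eb_owner ? 0 : -1;
        /* Subvolume trees: owner may differ but must stay a subvolume id. */
        return is_fstree_stub(eb_owner) ? 0 : -1;
}

int main(void)
{
        printf("%d\n", check_eb_owner(2, 2));     /* extent tree, exact: 0 */
        printf("%d\n", check_eb_owner(256, 260)); /* two subvolumes: 0 */
        printf("%d\n", check_eb_owner(2, 256));   /* mismatch: -1 */
        return 0;
}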
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 32fecc9dc1dd..ece497e26558 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -25,5 +25,6 @@ int btrfs_check_node(struct extent_buffer *node);
int btrfs_check_chunk_valid(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 logical);
+int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e65633686378..370388fadf96 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -333,7 +333,7 @@ static int process_one_buffer(struct btrfs_root *log,
* pin down any logged extents, so we have to read the block.
*/
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
- ret = btrfs_read_buffer(eb, gen, level, NULL);
+ ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
if (ret)
return ret;
}
@@ -894,8 +894,7 @@ update_inode:
btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
- if (inode)
- iput(inode);
+ iput(inode);
return ret;
}
@@ -2575,7 +2574,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
int i;
int ret;
- ret = btrfs_read_buffer(eb, gen, level, NULL);
+ ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
if (ret)
return ret;
@@ -2786,7 +2785,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
- ret = btrfs_read_buffer(next, ptr_gen,
+ ret = btrfs_read_extent_buffer(next, ptr_gen,
*level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
@@ -2815,7 +2814,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
continue;
}
- ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
return ret;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b6b00338037c..9c20049d1fec 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -164,24 +164,12 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
*/
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
- if (flags & BTRFS_BLOCK_GROUP_RAID10)
- return BTRFS_RAID_RAID10;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- return BTRFS_RAID_RAID1;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
- return BTRFS_RAID_RAID1C3;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
- return BTRFS_RAID_RAID1C4;
- else if (flags & BTRFS_BLOCK_GROUP_DUP)
- return BTRFS_RAID_DUP;
- else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return BTRFS_RAID_RAID0;
- else if (flags & BTRFS_BLOCK_GROUP_RAID5)
- return BTRFS_RAID_RAID5;
- else if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return BTRFS_RAID_RAID6;
-
- return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+ const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+
+ if (!profile)
+ return BTRFS_RAID_SINGLE;
+
+ return BTRFS_BG_FLAG_TO_INDEX(profile);
}
const char *btrfs_bg_type_to_raid_name(u64 flags)
@@ -4062,13 +4050,6 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return true;
- if (fs_info->sectorsize < PAGE_SIZE &&
- bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- btrfs_err(fs_info,
- "RAID56 is not yet supported for sectorsize %u with page size %lu",
- fs_info->sectorsize, PAGE_SIZE);
- return false;
- }
/* Profile is valid and does not have bits outside of the allowed set */
if (alloc_profile_is_valid(bargs->target, 1) &&
(bargs->target & ~allowed) == 0)
@@ -6312,7 +6293,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
u64 offset;
u64 stripe_offset;
u64 stripe_nr;
- u64 stripe_len;
+ u32 stripe_len;
u64 raid56_full_stripe_start = (u64)-1;
int data_stripes;
@@ -6323,19 +6304,13 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
offset = logical - em->start;
/* Len of a stripe in a chunk */
stripe_len = map->stripe_len;
- /* Stripe where this block falls in */
- stripe_nr = div64_u64(offset, stripe_len);
- /* Offset of stripe in the chunk */
- stripe_offset = stripe_nr * stripe_len;
- if (offset < stripe_offset) {
- btrfs_crit(fs_info,
-"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
- stripe_offset, offset, em->start, logical, stripe_len);
- return -EINVAL;
- }
+ /*
+	 * stripe_nr is the stripe where this block falls in, and
+	 * stripe_offset is the offset of this block within its stripe.
+ */
+ stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
+ ASSERT(stripe_offset < U32_MAX);
- /* stripe_offset is the offset of this block in its stripe */
- stripe_offset = offset - stripe_offset;
data_stripes = nr_data_stripes(map);
/* Only stripe based profiles needs to check against stripe length. */
@@ -6737,11 +6712,11 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
dev->devid, bio->bi_iter.bi_size);
- bio_set_dev(bio, dev->bdev);
btrfs_bio_counter_inc_noblocked(fs_info);
- btrfsic_submit_bio(bio);
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
}
static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
@@ -6823,10 +6798,12 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
continue;
}
- if (dev_nr < total_devs - 1)
- bio = btrfs_bio_clone(first_bio);
- else
+ if (dev_nr < total_devs - 1) {
+ bio = btrfs_bio_clone(dev->bdev, first_bio);
+ } else {
bio = first_bio;
+ bio_set_dev(bio, dev->bdev);
+ }
submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
}
@@ -7359,7 +7336,6 @@ static int read_one_dev(struct extent_buffer *leaf,
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *root = fs_info->tree_root;
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
@@ -7375,30 +7351,16 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
+
/*
- * This will create extent buffer of nodesize, superblock size is
- * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
- * overallocate but we can keep it as-is, only the first page is used.
+	 * We allocate a dummy extent buffer, just to use the extent buffer
+	 * accessors. There will be unused space after BTRFS_SUPER_INFO_SIZE,
+	 * but that's fine, we will not go beyond the system chunk array anyway.
*/
- sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
- root->root_key.objectid, 0);
- if (IS_ERR(sb))
- return PTR_ERR(sb);
+ sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
+ if (!sb)
+ return -ENOMEM;
set_extent_buffer_uptodate(sb);
- /*
- * The sb extent buffer is artificial and just used to read the system array.
- * set_extent_buffer_uptodate() call does not properly mark all it's
- * pages up-to-date when the page is larger: extent does not cover the
- * whole page and consequently check_page_uptodate does not find all
- * the page's extents up-to-date (the hole beyond sb),
- * write_extent_buffer then triggers a WARN_ON.
- *
- * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
- * but sb spans only this function. Add an explicit SetPageUptodate call
- * to silence the warning eg. on PowerPC 64.
- */
- if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
- SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
@@ -7561,6 +7523,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
struct btrfs_key found_key;
int ret;
int slot;
+ int iter_ret = 0;
u64 total_dev = 0;
u64 last_ra_node = 0;
@@ -7604,30 +7567,18 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto error;
- while (1) {
- struct extent_buffer *node;
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *node = path->nodes[1];
leaf = path->nodes[0];
slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto error;
- break;
- }
- node = path->nodes[1];
+
if (node) {
if (last_ra_node != node->start) {
readahead_tree_node_children(node);
last_ra_node = node->start;
}
}
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_DEV_ITEM_KEY) {
struct btrfs_dev_item *dev_item;
dev_item = btrfs_item_ptr(leaf, slot,
@@ -7652,7 +7603,11 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
if (ret)
goto error;
}
- path->slots[0]++;
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto error;
}
/*
@@ -7660,12 +7615,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* do another round of validation checks.
*/
if (total_dev != fs_info->fs_devices->total_devices) {
- btrfs_err(fs_info,
- "super_num_devices %llu mismatch with num_devices %llu found here",
+ btrfs_warn(fs_info,
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
btrfs_super_num_devices(fs_info->super_copy),
total_dev);
- ret = -EINVAL;
- goto error;
+ fs_info->fs_devices->total_devices = total_dev;
+ btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
}
if (btrfs_super_total_bytes(fs_info->super_copy) <
fs_info->fs_devices->total_rw_bytes) {
@@ -8277,7 +8232,7 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
static int relocating_repair_kthread(void *data)
{
- struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
+ struct btrfs_block_group *cache = data;
struct btrfs_fs_info *fs_info = cache->fs_info;
u64 target;
int ret = 0;
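
btrfs_get_io_geometry() now obtains quotient and remainder from a single div64_u64_rem() call and asserts the remainder fits in 32 bits, matching the narrowing of stripe_len to u32. The equivalent arithmetic in plain C:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define STRIPE_LEN (64u * 1024) /* BTRFS_STRIPE_LEN */

int main(void)
{
        uint64_t offset = 300 * 1024; /* logical offset into the chunk */
        uint64_t stripe_nr;
        uint32_t stripe_offset;

        /* One division replaces the old divide/multiply/subtract dance. */
        stripe_nr = offset / STRIPE_LEN;
        stripe_offset = (uint32_t)(offset % STRIPE_LEN);

        printf("stripe_nr=%" PRIu64 " stripe_offset=%u\n",
               stripe_nr, stripe_offset); /* 4 and 44K (45056) */
        return 0;
}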
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index b11c563d2025..6721002000ee 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -17,17 +17,51 @@ extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN SZ_64K
+/* Used by sanity check for btrfs_raid_types. */
+#define const_ffs(n) (__builtin_ctzll(n) + 1)
+
+/*
+ * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires
+ * RAID0 always to be the lowest profile bit.
+ * Although it's part of the on-disk format and should never change, do extra
+ * compile-time sanity checks.
+ */
+static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
+ const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
+static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) >
+ ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
+
+/* ilog2() can handle both constants and variables */
+#define BTRFS_BG_FLAG_TO_INDEX(profile) \
+ ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1))
+
+enum btrfs_raid_types {
+	/* SINGLE is the special one as it doesn't have an on-disk bit. */
+ BTRFS_RAID_SINGLE = 0,
+
+ BTRFS_RAID_RAID0 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0),
+ BTRFS_RAID_RAID1 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1),
+ BTRFS_RAID_DUP = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP),
+ BTRFS_RAID_RAID10 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10),
+ BTRFS_RAID_RAID5 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5),
+ BTRFS_RAID_RAID6 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6),
+ BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3),
+ BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4),
+
+ BTRFS_NR_RAID_TYPES
+};
+
struct btrfs_io_geometry {
/* remaining bytes before crossing a stripe */
u64 len;
/* offset of logical address in chunk */
u64 offset;
/* length of single IO stripe */
- u64 stripe_len;
+ u32 stripe_len;
+ /* offset of address in stripe */
+ u32 stripe_offset;
/* number of stripe where address falls */
u64 stripe_nr;
- /* offset of address in stripe */
- u64 stripe_offset;
/* offset of raid56 stripe into the chunk */
u64 raid56_stripe_offset;
};
@@ -430,7 +464,7 @@ struct map_lookup {
u64 type;
int io_align;
int io_width;
- u64 stripe_len;
+ u32 stripe_len;
int num_stripes;
int sub_stripes;
int verified_stripes; /* For mount time dev extent verification */
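
The rewritten enum derives each RAID index from the position of its profile bit, so btrfs_bg_flags_to_raid_index() collapses to a single ilog2. A runnable check of the mapping; the profile bit values are copied from the btrfs UAPI, and ILOG2 here is a userspace stand-in for the kernel's const_ilog2/ilog2:

#include <stdio.h>

#define BLOCK_GROUP_RAID0  (1ULL << 3) /* on-disk profile bits, per the UAPI */
#define BLOCK_GROUP_RAID1  (1ULL << 4)
#define BLOCK_GROUP_RAID10 (1ULL << 6)

/* ilog2 of a single set bit == its bit position. */
#define ILOG2(n) (63 - __builtin_clzll(n))
#define BG_FLAG_TO_INDEX(profile) \
        ILOG2((profile) >> (ILOG2(BLOCK_GROUP_RAID0) - 1))

int main(void)
{
        /* RAID0 is the lowest profile bit, so it maps to index 1;
         * SINGLE (no bit set) keeps index 0. */
        printf("RAID0  -> %d\n", BG_FLAG_TO_INDEX(BLOCK_GROUP_RAID0));  /* 1 */
        printf("RAID1  -> %d\n", BG_FLAG_TO_INDEX(BLOCK_GROUP_RAID1));  /* 2 */
        printf("RAID10 -> %d\n", BG_FLAG_TO_INDEX(BLOCK_GROUP_RAID10)); /* 4 */
        return 0;
}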
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 85691dc2232f..7421abcf325a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -272,10 +272,12 @@ out:
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
+ struct btrfs_key found_key;
struct btrfs_key key;
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
+ int iter_ret = 0;
int ret = 0;
size_t total_size = 0, size_left = size;
@@ -294,44 +296,23 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
path->reada = READA_FORWARD;
/* search for our xattrs */
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto err;
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *leaf;
int slot;
struct btrfs_dir_item *di;
- struct btrfs_key found_key;
u32 item_size;
u32 cur;
leaf = path->nodes[0];
slot = path->slots[0];
- /* this is where we start walking through the path */
- if (slot >= btrfs_header_nritems(leaf)) {
- /*
- * if we've reached the last slot in this leaf we need
- * to go to the next leaf and reset everything
- */
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto err;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
/* check to make sure this item is what we want */
if (found_key.objectid != key.objectid)
break;
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
break;
if (found_key.type < BTRFS_XATTR_ITEM_KEY)
- goto next_item;
+ continue;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
item_size = btrfs_item_size(leaf, slot);
@@ -351,8 +332,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
goto next;
if (!buffer || (name_len + 1) > size_left) {
- ret = -ERANGE;
- goto err;
+ iter_ret = -ERANGE;
+ break;
}
read_extent_buffer(leaf, buffer, name_ptr, name_len);
@@ -364,12 +345,13 @@ next:
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
-next_item:
- path->slots[0]++;
}
- ret = total_size;
-err:
+ if (iter_ret < 0)
+ ret = iter_ret;
+ else
+ ret = total_size;
+
btrfs_free_path(path);
return ret;
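
This listxattr rewrite is the same btrfs_for_each_slot() conversion applied throughout the series: the iterator hides the search and next-leaf stepping, the loop body only ever sees valid items, and a separate iter_ret distinguishes normal termination from an error after the loop. A toy model of that iterator contract, with a flat array standing in for the b-tree and -5 playing a read error:

#include <stdio.h>

static int next_item(const int *items, int nr, int *pos, int *item)
{
        if (*pos >= nr)
                return 1;  /* no more items, clean end of iteration */
        if (items[*pos] < 0)
                return -5; /* pretend -EIO from reading the next leaf */
        *item = items[(*pos)++];
        return 0;
}

#define for_each_item(items, nr, pos, item, iter_ret)                   \
        for (pos = 0, iter_ret = next_item(items, nr, &pos, &item);     \
             iter_ret == 0;                                             \
             iter_ret = next_item(items, nr, &pos, &item))

int main(void)
{
        const int items[] = { 10, 20, -1, 40 };
        int pos, item, iter_ret, ret = 0;

        for_each_item(items, 4, pos, item, iter_ret)
                printf("item %d\n", item);
        /* Catch error found during iteration */
        if (iter_ret < 0)
                ret = iter_ret;
        printf("ret = %d\n", ret); /* -5 */
        return 0;
}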
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 29b54fd9c128..11237a913bee 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -51,11 +51,13 @@
#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
/*
- * Maximum supported zone size. Currently, SMR disks have a zone size of
- * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
- * expect the zone size to become larger than 8GiB in the near future.
+ * Minimum / maximum supported zone size. Currently, SMR disks have a zone
+ * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
+ * We do not expect the zone size to become larger than 8GiB or smaller than
+ * 4MiB in the near future.
*/
#define BTRFS_MAX_ZONE_SIZE SZ_8G
+#define BTRFS_MIN_ZONE_SIZE SZ_4M
#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
@@ -401,6 +403,13 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
ret = -EINVAL;
goto out;
+ } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: %s: zone size %llu smaller than supported minimum %u",
+ rcu_str_deref(device->name),
+ zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
+ ret = -EINVAL;
+ goto out;
}
nr_sectors = bdev_nr_sectors(bdev);
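
With the new BTRFS_MIN_ZONE_SIZE, a device's zone size must land inside [4MiB, 8GiB] or the mount is rejected with -EINVAL. A standalone sketch of the bounds check, with constants mirroring the ones added here:

	#include <stdint.h>
	#include <stdio.h>

	#define MIN_ZONE_SIZE (4ULL << 20)   /* 4MiB */
	#define MAX_ZONE_SIZE (8ULL << 30)   /* 8GiB */

	static int check_zone_size(uint64_t zone_size)
	{
		if (zone_size > MAX_ZONE_SIZE || zone_size < MIN_ZONE_SIZE)
			return -1;   /* -EINVAL in the kernel */
		return 0;
	}

	int main(void)
	{
		printf("%d\n", check_zone_size(2ULL << 20));    /* 2MiB: rejected */
		printf("%d\n", check_zone_size(256ULL << 20));  /* 256MiB: accepted */
		return 0;
	}
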
@@ -1835,7 +1844,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
}
/* No space left */
- if (block_group->alloc_offset == block_group->zone_capacity) {
+ if (btrfs_zoned_bg_is_full(block_group)) {
ret = false;
goto out_unlock;
}
@@ -1872,20 +1881,14 @@ out_unlock:
return ret;
}
-int btrfs_zone_finish(struct btrfs_block_group *block_group)
+static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct map_lookup *map;
- struct btrfs_device *device;
- u64 physical;
+ bool need_zone_finish;
int ret = 0;
int i;
- if (!btrfs_is_zoned(fs_info))
- return 0;
-
- map = block_group->physical_map;
-
spin_lock(&block_group->lock);
if (!block_group->zone_is_active) {
spin_unlock(&block_group->lock);
@@ -1895,40 +1898,56 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
/* Check if we have unwritten allocated space */
if ((block_group->flags &
(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
- block_group->alloc_offset > block_group->meta_write_pointer) {
+ block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
spin_unlock(&block_group->lock);
return -EAGAIN;
}
- spin_unlock(&block_group->lock);
-
- ret = btrfs_inc_block_group_ro(block_group, false);
- if (ret)
- return ret;
-
- /* Ensure all writes in this block group finish */
- btrfs_wait_block_group_reservations(block_group);
- /* No need to wait for NOCOW writers. Zoned mode does not allow that. */
- btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
- block_group->length);
-
- spin_lock(&block_group->lock);
/*
- * Bail out if someone already deactivated the block group, or
- * allocated space is left in the block group.
+ * If we are sure that the block group is full (= no more room left for
+ * new allocation) and the IO for the last usable block is completed, we
+ * don't need to wait for the other IOs. This holds because we ensure
+ * the sequential IO submissions using the ZONE_APPEND command for data
+ * and block_group->meta_write_pointer for metadata.
*/
- if (!block_group->zone_is_active) {
+ if (!fully_written) {
spin_unlock(&block_group->lock);
- btrfs_dec_block_group_ro(block_group);
- return 0;
- }
- if (block_group->reserved) {
- spin_unlock(&block_group->lock);
- btrfs_dec_block_group_ro(block_group);
- return -EAGAIN;
+ ret = btrfs_inc_block_group_ro(block_group, false);
+ if (ret)
+ return ret;
+
+ /* Ensure all writes in this block group finish */
+ btrfs_wait_block_group_reservations(block_group);
+ /* No need to wait for NOCOW writers. Zoned mode does not allow that */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+ block_group->length);
+
+ spin_lock(&block_group->lock);
+
+ /*
+ * Bail out if someone already deactivated the block group, or
+ * allocated space is left in the block group.
+ */
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return 0;
+ }
+
+ if (block_group->reserved) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return -EAGAIN;
+ }
}
+ /*
+ * The block group is not fully allocated, so not fully written yet. We
+ * need to send ZONE_FINISH command to free up an active zone.
+ */
+ need_zone_finish = !btrfs_zoned_bg_is_full(block_group);
+
block_group->zone_is_active = 0;
block_group->alloc_offset = block_group->zone_capacity;
block_group->free_space_ctl->free_space = 0;
@@ -1936,24 +1955,29 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
btrfs_clear_data_reloc_bg(block_group);
spin_unlock(&block_group->lock);
+ map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
- device = map->stripes[i].dev;
- physical = map->stripes[i].physical;
+ struct btrfs_device *device = map->stripes[i].dev;
+ const u64 physical = map->stripes[i].physical;
if (device->zone_info->max_active_zones == 0)
continue;
- ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
- physical >> SECTOR_SHIFT,
- device->zone_info->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
+ if (need_zone_finish) {
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ device->zone_info->zone_size >> SECTOR_SHIFT,
+ GFP_NOFS);
- if (ret)
- return ret;
+ if (ret)
+ return ret;
+ }
btrfs_dev_clear_active_zone(device, physical);
}
- btrfs_dec_block_group_ro(block_group);
+
+ if (!fully_written)
+ btrfs_dec_block_group_ro(block_group);
spin_lock(&fs_info->zone_active_bgs_lock);
ASSERT(!list_empty(&block_group->active_bg_list));
@@ -1966,6 +1990,14 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
return 0;
}
+int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return 0;
+
+ return do_zone_finish(block_group, false);
+}
+
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{
struct btrfs_fs_info *fs_info = fs_devices->fs_info;
@@ -1997,9 +2029,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
struct btrfs_block_group *block_group;
- struct map_lookup *map;
- struct btrfs_device *device;
- u64 physical;
+ u64 min_alloc_bytes;
if (!btrfs_is_zoned(fs_info))
return;
@@ -2007,42 +2037,52 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
block_group = btrfs_lookup_block_group(fs_info, logical);
ASSERT(block_group);
- if (logical + length < block_group->start + block_group->zone_capacity)
- goto out;
-
- spin_lock(&block_group->lock);
+ /* No MIXED_BG on zoned btrfs. */
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+ min_alloc_bytes = fs_info->sectorsize;
+ else
+ min_alloc_bytes = fs_info->nodesize;
- if (!block_group->zone_is_active) {
- spin_unlock(&block_group->lock);
+ /* Bail out if we can allocate more data from this block group. */
+ if (logical + length + min_alloc_bytes <=
+ block_group->start + block_group->zone_capacity)
goto out;
- }
- block_group->zone_is_active = 0;
- /* We should have consumed all the free space */
- ASSERT(block_group->alloc_offset == block_group->zone_capacity);
- ASSERT(block_group->free_space_ctl->free_space == 0);
- btrfs_clear_treelog_bg(block_group);
- btrfs_clear_data_reloc_bg(block_group);
- spin_unlock(&block_group->lock);
+ do_zone_finish(block_group, true);
- map = block_group->physical_map;
- device = map->stripes[0].dev;
- physical = map->stripes[0].physical;
+out:
+ btrfs_put_block_group(block_group);
+}
- if (!device->zone_info->max_active_zones)
- goto out;
+static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
+{
+ struct btrfs_block_group *bg =
+ container_of(work, struct btrfs_block_group, zone_finish_work);
- btrfs_dev_clear_active_zone(device, physical);
+ wait_on_extent_buffer_writeback(bg->last_eb);
+ free_extent_buffer(bg->last_eb);
+ btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+ btrfs_put_block_group(bg);
+}
- spin_lock(&fs_info->zone_active_bgs_lock);
- ASSERT(!list_empty(&block_group->active_bg_list));
- list_del_init(&block_group->active_bg_list);
- spin_unlock(&fs_info->zone_active_bgs_lock);
+void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb)
+{
+ if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
+ return;
- btrfs_put_block_group(block_group);
+ if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
+ btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
+ bg->start);
+ return;
+ }
-out:
- btrfs_put_block_group(block_group);
+ /* For the work */
+ btrfs_get_block_group(bg);
+ atomic_inc(&eb->refs);
+ bg->last_eb = eb;
+ INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
+ queue_work(system_unbound_wq, &bg->zone_finish_work);
}
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
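
btrfs_schedule_zone_finish_bg() above arms the finish work only once the extent buffer just written leaves less than one more buffer's worth of room before the zone capacity, i.e. when eb->start + eb->len * 2 crosses the end of the usable range. A standalone sketch of that test with illustrative numbers:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t bg_start = 0, zone_capacity = 256 * 1024;
		uint64_t eb_len = 16 * 1024;
		uint64_t eb_start = 232 * 1024;   /* last extent buffer written */

		/* Schedule once no second buffer of this size would still fit. */
		int schedule = eb_start + eb_len * 2 > bg_start + zone_capacity;

		printf("%s\n", schedule ? "schedule zone finish" : "room left");
		return 0;
	}
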
@@ -2072,3 +2112,30 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
}
mutex_unlock(&fs_devices->device_list_mutex);
}
+
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 used = 0;
+ u64 total = 0;
+ u64 factor;
+
+ ASSERT(btrfs_is_zoned(fs_info));
+
+ if (fs_info->bg_reclaim_threshold == 0)
+ return false;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->bdev)
+ continue;
+
+ total += device->disk_total_bytes;
+ used += device->bytes_used;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ factor = div64_u64(used * 100, total);
+ return factor >= fs_info->bg_reclaim_threshold;
+}
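
btrfs_zoned_should_reclaim() sums disk_total_bytes and bytes_used across all devices and triggers reclaim once the used percentage reaches bg_reclaim_threshold (div64_u64() being the kernel's 64-by-64-bit division helper). The same arithmetic as a standalone sketch, including the boundary case where usage sits exactly at the threshold:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t total = 2048ULL << 30;   /* 2TiB of zoned devices */
		uint64_t used  = 1536ULL << 30;   /* 1.5TiB allocated */
		uint64_t threshold = 75;          /* sysfs bg_reclaim_threshold */

		uint64_t factor = used * 100 / total;   /* div64_u64() in the kernel */

		printf("used=%llu%% -> %s\n", (unsigned long long)factor,
		       factor >= threshold ? "reclaim" : "keep");
		return 0;
	}
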
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 6dee76248cb4..bb1a189e11f9 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -10,11 +10,7 @@
#include "block-group.h"
#include "btrfs_inode.h"
-/*
- * Block groups with more than this value (percents) of unusable space will be
- * scheduled for background reclaim.
- */
-#define BTRFS_DEFAULT_RECLAIM_THRESH 75
+#define BTRFS_DEFAULT_RECLAIM_THRESH (75)
struct btrfs_zoned_device_info {
/*
@@ -76,8 +72,11 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group);
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
+void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -233,9 +232,17 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
u64 logical, u64 length) { }
+static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb) { }
+
static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
+
+static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ return false;
+}
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
@@ -370,4 +377,10 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
}
+static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg)
+{
+ ASSERT(btrfs_is_zoned(bg->fs_info));
+ return (bg->alloc_offset == bg->zone_capacity);
+}
+
#endif
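
The zoned.h hunks follow the usual config-stub convention: when CONFIG_BLK_DEV_ZONED is off, static inline no-ops (btrfs_zoned_should_reclaim() simply returns false) keep every call site free of #ifdefs. A generic sketch of the pattern with hypothetical names:

	#include <stdbool.h>
	#include <stdio.h>

	#ifdef CONFIG_MY_FEATURE
	bool my_should_act(int load);   /* real version lives in a .c file */
	#else
	/* Compiled-out stub: call sites build and behave sanely either way. */
	static inline bool my_should_act(int load)
	{
		(void)load;
		return false;
	}
	#endif

	int main(void)
	{
		printf("%s\n", my_should_act(42) ? "act" : "skip");
		return 0;
	}
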
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index fc42dd0badd7..0fe31a6f6e68 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -93,22 +93,26 @@ static inline struct workspace *list_to_workspace(struct list_head *list)
void zstd_free_workspace(struct list_head *ws);
struct list_head *zstd_alloc_workspace(unsigned int level);
-/*
- * zstd_reclaim_timer_fn - reclaim timer
+
+/**
+ * Timer callback to free unused workspaces.
+ *
* @t: timer
*
* This scans the lru_list and attempts to reclaim any workspace that hasn't
* been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
+ *
+ * The context is softirq and does not need the _bh locking primitives.
*/
static void zstd_reclaim_timer_fn(struct timer_list *timer)
{
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
- spin_lock_bh(&wsm.lock);
+ spin_lock(&wsm.lock);
if (list_empty(&wsm.lru_list)) {
- spin_unlock_bh(&wsm.lock);
+ spin_unlock(&wsm.lock);
return;
}
@@ -137,7 +141,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
if (!list_empty(&wsm.lru_list))
mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
- spin_unlock_bh(&wsm.lock);
+ spin_unlock(&wsm.lock);
}
/*
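
The zstd reclaim timer keeps itself alive only while the LRU has entries: each firing frees workspaces idle past the threshold and calls mod_timer() again only if the list is still non-empty. A standalone sketch of that self-rearming pattern, with a plain loop standing in for the timer:

	#include <stdio.h>

	int main(void)
	{
		int lru[] = { 5, 12, 30 };   /* idle times of cached workspaces */
		int n = 3, threshold = 10;
		int armed = 1;

		while (armed) {              /* one iteration per timer firing */
			int kept = 0;

			for (int i = 0; i < n; i++) {
				if (lru[i] >= threshold)
					printf("reclaim workspace idle=%d\n", lru[i]);
				else
					lru[kept++] = lru[i] + 10; /* ages until next firing */
			}
			n = kept;
			armed = n > 0;   /* mod_timer() only if the list is non-empty */
		}
		return 0;
	}
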
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index bb9c1fd48c19..252f4ee977d5 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -399,7 +399,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (!err)
return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
- NULL, 0, 0);
+ NULL, 0, NULL, 0);
if (err < 0)
return err;
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6feb07e3e1eb..109d07629f81 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -76,7 +76,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}
- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0);
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
@@ -565,7 +565,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
(unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
- 0);
+ NULL, 0);
if (ret == -ENOTBLK)
ret = 0;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 35b6c720c2bc..100637b1adb3 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -4308,7 +4308,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
*/
inc_page_count(sbi, F2FS_DIO_READ);
dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops,
- &f2fs_iomap_dio_read_ops, 0, 0);
+ &f2fs_iomap_dio_read_ops, 0, NULL, 0);
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
if (ret != -EIOCBQUEUED)
@@ -4526,7 +4526,7 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
if (pos + count > inode->i_size)
dio_flags |= IOMAP_DIO_FORCE_WAIT;
dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops,
- &f2fs_iomap_dio_write_ops, dio_flags, 0);
+ &f2fs_iomap_dio_write_ops, dio_flags, NULL, 0);
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
if (ret == -ENOTBLK)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 2556ae1f92ea..0b07d5a7bb81 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -835,7 +835,7 @@ retry:
pagefault_disable();
to->nofault = true;
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
- IOMAP_DIO_PARTIAL, read);
+ IOMAP_DIO_PARTIAL, NULL, read);
to->nofault = false;
pagefault_enable();
if (ret <= 0 && ret != -EFAULT)
@@ -898,7 +898,7 @@ retry:
from->nofault = true;
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
- IOMAP_DIO_PARTIAL, written);
+ IOMAP_DIO_PARTIAL, NULL, written);
from->nofault = false;
if (ret <= 0) {
if (ret == -ENOTBLK)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 80f9b047aa1b..370c3241618a 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -51,6 +51,15 @@ struct iomap_dio {
};
};
+static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
+ struct iomap_dio *dio, unsigned short nr_vecs, unsigned int opf)
+{
+ if (dio->dops && dio->dops->bio_set)
+ return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf,
+ GFP_KERNEL, dio->dops->bio_set);
+ return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);
+}
+
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
@@ -145,7 +154,7 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
cmpxchg(&dio->error, 0, ret);
}
-static void iomap_dio_bio_end_io(struct bio *bio)
+void iomap_dio_bio_end_io(struct bio *bio)
{
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
@@ -177,16 +186,16 @@ static void iomap_dio_bio_end_io(struct bio *bio)
bio_put(bio);
}
}
+EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
struct inode *inode = file_inode(dio->iocb->ki_filp);
struct page *page = ZERO_PAGE(0);
- int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
- bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL);
+ bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
@@ -311,7 +320,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
- bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL);
+ bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
@@ -474,7 +483,7 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter,
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags, size_t done_before)
+ unsigned int dio_flags, void *private, size_t done_before)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
@@ -483,6 +492,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
.pos = iocb->ki_pos,
.len = iov_iter_count(iter),
.flags = IOMAP_DIRECT,
+ .private = private,
};
loff_t end = iomi.pos + iomi.len - 1, ret = 0;
bool wait_for_completion =
@@ -672,11 +682,12 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags, size_t done_before)
+ unsigned int dio_flags, void *private, size_t done_before)
{
struct iomap_dio *dio;
- dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
+ dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private,
+ done_before);
if (IS_ERR_OR_NULL(dio))
return PTR_ERR_OR_ZERO(dio);
return iomap_dio_complete(dio);
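
A hedged sketch of the new 'private' plumbing: the value handed to iomap_dio_rw() is stored in iomap_iter.private for the duration of the call, so a filesystem can carry per-operation state without a side channel. All my_* names below are hypothetical, and this is kernel-context code, not a buildable module:

	#include <linux/iomap.h>

	extern const struct iomap_ops my_iomap_ops;   /* hypothetical */

	struct my_dio_state {
		unsigned int nr_extents;   /* per-operation bookkeeping */
	};

	static ssize_t my_dio_write(struct kiocb *iocb, struct iov_iter *from)
	{
		struct my_dio_state state = { 0 };

		/* 'state' rides along as iomap_iter.private for this call only. */
		return iomap_dio_rw(iocb, from, &my_iomap_ops, NULL, 0, &state, 0);
	}
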
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5bddb1e9e0b3..85c412107a10 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -225,7 +225,7 @@ xfs_file_dio_read(
ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
if (ret)
return ret;
- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0);
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -534,7 +534,7 @@ xfs_file_dio_write_aligned(
}
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, 0, 0);
+ &xfs_dio_write_ops, 0, NULL, 0);
out_unlock:
if (iolock)
xfs_iunlock(ip, iolock);
@@ -612,7 +612,7 @@ retry_exclusive:
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, flags, 0);
+ &xfs_dio_write_ops, flags, NULL, 0);
/*
* Retry unaligned I/O with exclusive blocking semantics if the DIO
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 652752df1a2f..8f306485c953 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -900,7 +900,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = zonefs_file_dio_append(iocb, from);
else
ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
- &zonefs_write_dio_ops, 0, 0);
+ &zonefs_write_dio_ops, 0, NULL, 0);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
@@ -1042,7 +1042,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
file_accessed(iocb->ki_filp);
ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
- &zonefs_read_dio_ops, 0, 0);
+ &zonefs_read_dio_ops, 0, NULL, 0);
} else {
ret = generic_file_read_iter(iocb, to);
if (ret == -EIO)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 87b5af1d9fbe..02e7f60638b8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1708,6 +1708,11 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
#define __sb_writers_release(sb, lev) \
percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
+static inline bool sb_write_started(const struct super_block *sb)
+{
+ return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1);
+}
+
/**
* sb_end_write - drop write access to a superblock
* @sb: the super we wrote to
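
sb_write_started() is built on lockdep_is_held_type(), so it reports whether the current context holds the SB_FREEZE_WRITE level of s_writers; that makes it suited to assertions rather than control flow. A hedged sketch of a hypothetical call site:

	#include <linux/fs.h>

	/* Hypothetical helper that must only run inside sb_start_write(). */
	static void my_update_ondisk_state(struct inode *inode)
	{
		WARN_ON_ONCE(!sb_write_started(inode->i_sb));

		/* ... modify state that requires an active write section ... */
	}
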
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index b76f0dd149fb..5b6f64f4d771 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -188,6 +188,7 @@ struct iomap_iter {
unsigned flags;
struct iomap iomap;
struct iomap srcmap;
+ void *private;
};
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);
@@ -320,6 +321,16 @@ struct iomap_dio_ops {
unsigned flags);
void (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
loff_t file_offset);
+
+ /*
+ * Filesystems wishing to attach private information to a direct io bio
+ * must provide a ->submit_io method that attaches the additional
+ * information to the bio and changes the ->bi_end_io callback to a
+ * custom function. This function should, at a minimum, perform any
+ * relevant post-processing of the bio and end with a call to
+ * iomap_dio_bio_end_io.
+ */
+ struct bio_set *bio_set;
};
/*
@@ -344,11 +355,12 @@ struct iomap_dio_ops {
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags, size_t done_before);
+ unsigned int dio_flags, void *private, size_t done_before);
struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags, size_t done_before);
+ unsigned int dio_flags, void *private, size_t done_before);
ssize_t iomap_dio_complete(struct iomap_dio *dio);
+void iomap_dio_bio_end_io(struct bio *bio);
#ifdef CONFIG_SWAP
struct file;
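
The comment added to struct iomap_dio_ops spells out the contract for the new bio_set member: the filesystem front-pads each direct IO bio with private state, its ->submit_io redirects bi_end_io to a custom handler, and that handler chains back into iomap_dio_bio_end_io() so iomap still completes the dio. A hedged kernel-context sketch of a filesystem using it (all my_* names hypothetical):

	#include <linux/bio.h>
	#include <linux/iomap.h>

	/* Per-bio private state, allocated in front of the bio by the bio_set. */
	struct my_dio_bio {
		u32 checksum;
		struct bio bio;   /* must be last */
	};

	static struct bio_set my_bioset;   /* bioset_init()ed with front_pad at mount */

	static void my_dio_end_io(struct bio *bio)
	{
		struct my_dio_bio *mb = container_of(bio, struct my_dio_bio, bio);

		/* ... post-process using mb->checksum ... */
		iomap_dio_bio_end_io(bio);   /* hand completion back to iomap */
	}

	static void my_submit_io(const struct iomap_iter *iter, struct bio *bio,
				 loff_t file_offset)
	{
		struct my_dio_bio *mb = container_of(bio, struct my_dio_bio, bio);

		mb->checksum = 0;                /* attach private info here */
		bio->bi_end_io = my_dio_end_io;  /* override iomap's default */
		submit_bio(bio);
	}

	static const struct iomap_dio_ops my_dio_ops = {
		.submit_io = my_submit_io,
		.bio_set   = &my_bioset,
	};
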
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index f068ff30d654..290f07eb050a 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -24,7 +24,7 @@ struct btrfs_free_cluster;
struct map_lookup;
struct extent_buffer;
struct btrfs_work;
-struct __btrfs_workqueue;
+struct btrfs_workqueue;
struct btrfs_qgroup_extent_record;
struct btrfs_qgroup;
struct extent_io_tree;
@@ -1457,42 +1457,36 @@ DEFINE_EVENT(btrfs__work, btrfs_ordered_sched,
TP_ARGS(work)
);
-DECLARE_EVENT_CLASS(btrfs__workqueue,
+DECLARE_EVENT_CLASS(btrfs_workqueue,
- TP_PROTO(const struct __btrfs_workqueue *wq,
- const char *name, int high),
+ TP_PROTO(const struct btrfs_workqueue *wq, const char *name),
- TP_ARGS(wq, name, high),
+ TP_ARGS(wq, name),
TP_STRUCT__entry_btrfs(
__field( const void *, wq )
__string( name, name )
- __field( int , high )
),
TP_fast_assign_btrfs(btrfs_workqueue_owner(wq),
__entry->wq = wq;
__assign_str(name, name);
- __entry->high = high;
),
- TP_printk_btrfs("name=%s%s wq=%p", __get_str(name),
- __print_flags(__entry->high, "",
- {(WQ_HIGHPRI), "-high"}),
+ TP_printk_btrfs("name=%s wq=%p", __get_str(name),
__entry->wq)
);
-DEFINE_EVENT(btrfs__workqueue, btrfs_workqueue_alloc,
+DEFINE_EVENT(btrfs_workqueue, btrfs_workqueue_alloc,
- TP_PROTO(const struct __btrfs_workqueue *wq,
- const char *name, int high),
+ TP_PROTO(const struct btrfs_workqueue *wq, const char *name),
- TP_ARGS(wq, name, high)
+ TP_ARGS(wq, name)
);
-DECLARE_EVENT_CLASS(btrfs__workqueue_done,
+DECLARE_EVENT_CLASS(btrfs_workqueue_done,
- TP_PROTO(const struct __btrfs_workqueue *wq),
+ TP_PROTO(const struct btrfs_workqueue *wq),
TP_ARGS(wq),
@@ -1507,9 +1501,9 @@ DECLARE_EVENT_CLASS(btrfs__workqueue_done,
TP_printk_btrfs("wq=%p", __entry->wq)
);
-DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
+DEFINE_EVENT(btrfs_workqueue_done, btrfs_workqueue_destroy,
- TP_PROTO(const struct __btrfs_workqueue *wq),
+ TP_PROTO(const struct btrfs_workqueue *wq),
TP_ARGS(wq)
);
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index b069752a8ecf..d4117152d907 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -880,19 +880,6 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
BTRFS_SPACE_INFO_GLOBAL_RSV)
-enum btrfs_raid_types {
- BTRFS_RAID_RAID10,
- BTRFS_RAID_RAID1,
- BTRFS_RAID_DUP,
- BTRFS_RAID_RAID0,
- BTRFS_RAID_SINGLE,
- BTRFS_RAID_RAID5,
- BTRFS_RAID_RAID6,
- BTRFS_RAID_RAID1C3,
- BTRFS_RAID_RAID1C4,
- BTRFS_NR_RAID_TYPES
-};
-
#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
BTRFS_BLOCK_GROUP_SYSTEM | \
BTRFS_BLOCK_GROUP_METADATA)