diff options
Diffstat (limited to 'fs')
139 files changed, 5065 insertions, 2488 deletions
diff --git a/fs/Makefile b/fs/Makefile index 293733f61594..23fcd8c164a3 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -12,7 +12,8 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ - stack.o fs_struct.o statfs.o fs_pin.o nsfs.o + stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ + fs_types.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 3b66c957ea6f..5810463dc6d2 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -9,6 +9,7 @@ #include <linux/posix_acl_xattr.h> #include <linux/posix_acl.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/slab.h> #include "ctree.h" @@ -72,8 +73,16 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, } if (acl) { + unsigned int nofs_flag; + size = posix_acl_xattr_size(acl->a_count); + /* + * We're holding a transaction handle, so use a NOFS memory + * allocation context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); value = kmalloc(size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); if (!value) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index d522494698fa..122cb97c7909 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -139,13 +139,11 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, } if (flags & WQ_HIGHPRI) - ret->normal_wq = alloc_workqueue("%s-%s-high", flags, - ret->current_active, "btrfs", - name); + ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags, + ret->current_active, name); else - ret->normal_wq = alloc_workqueue("%s-%s", flags, - ret->current_active, "btrfs", - name); + ret->normal_wq = alloc_workqueue("btrfs-%s", flags, + ret->current_active, name); if (!ret->normal_wq) { kfree(ret); return NULL; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 78556447e1d5..11459fe84a29 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -712,7 +712,7 @@ out: * read tree blocks and add keys where required. */ static int add_missing_keys(struct btrfs_fs_info *fs_info, - struct preftrees *preftrees) + struct preftrees *preftrees, bool lock) { struct prelim_ref *ref; struct extent_buffer *eb; @@ -737,12 +737,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, free_extent_buffer(eb); return -EIO; } - btrfs_tree_read_lock(eb); + if (lock) + btrfs_tree_read_lock(eb); if (btrfs_header_level(eb) == 0) btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); else btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); - btrfs_tree_read_unlock(eb); + if (lock) + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL); cond_resched(); @@ -1227,7 +1229,7 @@ again: btrfs_release_path(path); - ret = add_missing_keys(fs_info, &preftrees); + ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0); if (ret) goto out; @@ -1288,11 +1290,15 @@ again: ret = -EIO; goto out; } - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + + if (!path->skip_locking) { + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_read(eb); + } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie, ignore_offset); - btrfs_tree_read_unlock_blocking(eb); + if (!path->skip_locking) + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); if (ret < 0) goto out; @@ -1650,7 +1656,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, /* make sure we can use eb after releasing the path */ if (eb != eb_in) { if (!path->skip_locking) - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); path->nodes[0] = NULL; path->locks[0] = 0; } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 548057630b69..eb8e20b740d6 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -730,6 +730,28 @@ struct heuristic_ws { struct list_head list; }; +static struct workspace_manager heuristic_wsm; + +static void heuristic_init_workspace_manager(void) +{ + btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress); +} + +static void heuristic_cleanup_workspace_manager(void) +{ + btrfs_cleanup_workspace_manager(&heuristic_wsm); +} + +static struct list_head *heuristic_get_workspace(unsigned int level) +{ + return btrfs_get_workspace(&heuristic_wsm, level); +} + +static void heuristic_put_workspace(struct list_head *ws) +{ + btrfs_put_workspace(&heuristic_wsm, ws); +} + static void free_heuristic_ws(struct list_head *ws) { struct heuristic_ws *workspace; @@ -742,7 +764,7 @@ static void free_heuristic_ws(struct list_head *ws) kfree(workspace); } -static struct list_head *alloc_heuristic_ws(void) +static struct list_head *alloc_heuristic_ws(unsigned int level) { struct heuristic_ws *ws; @@ -769,65 +791,59 @@ fail: return ERR_PTR(-ENOMEM); } -struct workspaces_list { - struct list_head idle_ws; - spinlock_t ws_lock; - /* Number of free workspaces */ - int free_ws; - /* Total number of allocated workspaces */ - atomic_t total_ws; - /* Waiters for a free workspace */ - wait_queue_head_t ws_wait; +const struct btrfs_compress_op btrfs_heuristic_compress = { + .init_workspace_manager = heuristic_init_workspace_manager, + .cleanup_workspace_manager = heuristic_cleanup_workspace_manager, + .get_workspace = heuristic_get_workspace, + .put_workspace = heuristic_put_workspace, + .alloc_workspace = alloc_heuristic_ws, + .free_workspace = free_heuristic_ws, }; -static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; - -static struct workspaces_list btrfs_heuristic_ws; - static const struct btrfs_compress_op * const btrfs_compress_op[] = { + /* The heuristic is represented as compression type 0 */ + &btrfs_heuristic_compress, &btrfs_zlib_compress, &btrfs_lzo_compress, &btrfs_zstd_compress, }; -void __init btrfs_init_compress(void) +void btrfs_init_workspace_manager(struct workspace_manager *wsm, + const struct btrfs_compress_op *ops) { struct list_head *workspace; - int i; - INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws); - spin_lock_init(&btrfs_heuristic_ws.ws_lock); - atomic_set(&btrfs_heuristic_ws.total_ws, 0); - init_waitqueue_head(&btrfs_heuristic_ws.ws_wait); + wsm->ops = ops; - workspace = alloc_heuristic_ws(); + INIT_LIST_HEAD(&wsm->idle_ws); + spin_lock_init(&wsm->ws_lock); + atomic_set(&wsm->total_ws, 0); + init_waitqueue_head(&wsm->ws_wait); + + /* + * Preallocate one workspace for each compression type so we can + * guarantee forward progress in the worst case + */ + workspace = wsm->ops->alloc_workspace(0); if (IS_ERR(workspace)) { pr_warn( - "BTRFS: cannot preallocate heuristic workspace, will try later\n"); + "BTRFS: cannot preallocate compression workspace, will try later\n"); } else { - atomic_set(&btrfs_heuristic_ws.total_ws, 1); - btrfs_heuristic_ws.free_ws = 1; - list_add(workspace, &btrfs_heuristic_ws.idle_ws); + atomic_set(&wsm->total_ws, 1); + wsm->free_ws = 1; + list_add(workspace, &wsm->idle_ws); } +} - for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); - spin_lock_init(&btrfs_comp_ws[i].ws_lock); - atomic_set(&btrfs_comp_ws[i].total_ws, 0); - init_waitqueue_head(&btrfs_comp_ws[i].ws_wait); +void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman) +{ + struct list_head *ws; - /* - * Preallocate one workspace for each compression type so - * we can guarantee forward progress in the worst case - */ - workspace = btrfs_compress_op[i]->alloc_workspace(); - if (IS_ERR(workspace)) { - pr_warn("BTRFS: cannot preallocate compression workspace, will try later\n"); - } else { - atomic_set(&btrfs_comp_ws[i].total_ws, 1); - btrfs_comp_ws[i].free_ws = 1; - list_add(workspace, &btrfs_comp_ws[i].idle_ws); - } + while (!list_empty(&wsman->idle_ws)) { + ws = wsman->idle_ws.next; + list_del(ws); + wsman->ops->free_workspace(ws); + atomic_dec(&wsman->total_ws); } } @@ -837,11 +853,11 @@ void __init btrfs_init_compress(void) * Preallocation makes a forward progress guarantees and we do not return * errors. */ -static struct list_head *__find_workspace(int type, bool heuristic) +struct list_head *btrfs_get_workspace(struct workspace_manager *wsm, + unsigned int level) { struct list_head *workspace; int cpus = num_online_cpus(); - int idx = type - 1; unsigned nofs_flag; struct list_head *idle_ws; spinlock_t *ws_lock; @@ -849,19 +865,11 @@ static struct list_head *__find_workspace(int type, bool heuristic) wait_queue_head_t *ws_wait; int *free_ws; - if (heuristic) { - idle_ws = &btrfs_heuristic_ws.idle_ws; - ws_lock = &btrfs_heuristic_ws.ws_lock; - total_ws = &btrfs_heuristic_ws.total_ws; - ws_wait = &btrfs_heuristic_ws.ws_wait; - free_ws = &btrfs_heuristic_ws.free_ws; - } else { - idle_ws = &btrfs_comp_ws[idx].idle_ws; - ws_lock = &btrfs_comp_ws[idx].ws_lock; - total_ws = &btrfs_comp_ws[idx].total_ws; - ws_wait = &btrfs_comp_ws[idx].ws_wait; - free_ws = &btrfs_comp_ws[idx].free_ws; - } + idle_ws = &wsm->idle_ws; + ws_lock = &wsm->ws_lock; + total_ws = &wsm->total_ws; + ws_wait = &wsm->ws_wait; + free_ws = &wsm->free_ws; again: spin_lock(ws_lock); @@ -892,10 +900,7 @@ again: * context of btrfs_compress_bio/btrfs_compress_pages */ nofs_flag = memalloc_nofs_save(); - if (heuristic) - workspace = alloc_heuristic_ws(); - else - workspace = btrfs_compress_op[idx]->alloc_workspace(); + workspace = wsm->ops->alloc_workspace(level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(workspace)) { @@ -926,85 +931,47 @@ again: return workspace; } -static struct list_head *find_workspace(int type) +static struct list_head *get_workspace(int type, int level) { - return __find_workspace(type, false); + return btrfs_compress_op[type]->get_workspace(level); } /* * put a workspace struct back on the list or free it if we have enough * idle ones sitting around */ -static void __free_workspace(int type, struct list_head *workspace, - bool heuristic) +void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws) { - int idx = type - 1; struct list_head *idle_ws; spinlock_t *ws_lock; atomic_t *total_ws; wait_queue_head_t *ws_wait; int *free_ws; - if (heuristic) { - idle_ws = &btrfs_heuristic_ws.idle_ws; - ws_lock = &btrfs_heuristic_ws.ws_lock; - total_ws = &btrfs_heuristic_ws.total_ws; - ws_wait = &btrfs_heuristic_ws.ws_wait; - free_ws = &btrfs_heuristic_ws.free_ws; - } else { - idle_ws = &btrfs_comp_ws[idx].idle_ws; - ws_lock = &btrfs_comp_ws[idx].ws_lock; - total_ws = &btrfs_comp_ws[idx].total_ws; - ws_wait = &btrfs_comp_ws[idx].ws_wait; - free_ws = &btrfs_comp_ws[idx].free_ws; - } + idle_ws = &wsm->idle_ws; + ws_lock = &wsm->ws_lock; + total_ws = &wsm->total_ws; + ws_wait = &wsm->ws_wait; + free_ws = &wsm->free_ws; spin_lock(ws_lock); if (*free_ws <= num_online_cpus()) { - list_add(workspace, idle_ws); + list_add(ws, idle_ws); (*free_ws)++; spin_unlock(ws_lock); goto wake; } spin_unlock(ws_lock); - if (heuristic) - free_heuristic_ws(workspace); - else - btrfs_compress_op[idx]->free_workspace(workspace); + wsm->ops->free_workspace(ws); atomic_dec(total_ws); wake: cond_wake_up(ws_wait); } -static void free_workspace(int type, struct list_head *ws) +static void put_workspace(int type, struct list_head *ws) { - return __free_workspace(type, ws, false); -} - -/* - * cleanup function for module exit - */ -static void free_workspaces(void) -{ - struct list_head *workspace; - int i; - - while (!list_empty(&btrfs_heuristic_ws.idle_ws)) { - workspace = btrfs_heuristic_ws.idle_ws.next; - list_del(workspace); - free_heuristic_ws(workspace); - atomic_dec(&btrfs_heuristic_ws.total_ws); - } - - for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - while (!list_empty(&btrfs_comp_ws[i].idle_ws)) { - workspace = btrfs_comp_ws[i].idle_ws.next; - list_del(workspace); - btrfs_compress_op[i]->free_workspace(workspace); - atomic_dec(&btrfs_comp_ws[i].total_ws); - } - } + return btrfs_compress_op[type]->put_workspace(ws); } /* @@ -1036,18 +1003,17 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out) { + int type = btrfs_compress_type(type_level); + int level = btrfs_compress_level(type_level); struct list_head *workspace; int ret; - int type = type_level & 0xF; - - workspace = find_workspace(type); - btrfs_compress_op[type - 1]->set_level(workspace, type_level); - ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, + workspace = get_workspace(type, level); + ret = btrfs_compress_op[type]->compress_pages(workspace, mapping, start, pages, out_pages, total_in, total_out); - free_workspace(type, workspace); + put_workspace(type, workspace); return ret; } @@ -1071,9 +1037,9 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) int ret; int type = cb->compress_type; - workspace = find_workspace(type); - ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb); - free_workspace(type, workspace); + workspace = get_workspace(type, 0); + ret = btrfs_compress_op[type]->decompress_bio(workspace, cb); + put_workspace(type, workspace); return ret; } @@ -1089,19 +1055,29 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, struct list_head *workspace; int ret; - workspace = find_workspace(type); - - ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, + workspace = get_workspace(type, 0); + ret = btrfs_compress_op[type]->decompress(workspace, data_in, dest_page, start_byte, srclen, destlen); + put_workspace(type, workspace); - free_workspace(type, workspace); return ret; } +void __init btrfs_init_compress(void) +{ + int i; + + for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++) + btrfs_compress_op[i]->init_workspace_manager(); +} + void __cold btrfs_exit_compress(void) { - free_workspaces(); + int i; + + for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++) + btrfs_compress_op[i]->cleanup_workspace_manager(); } /* @@ -1512,7 +1488,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, */ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) { - struct list_head *ws_list = __find_workspace(0, true); + struct list_head *ws_list = get_workspace(0, 0); struct heuristic_ws *ws; u32 i; u8 byte; @@ -1581,18 +1557,29 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) } out: - __free_workspace(0, ws_list, true); + put_workspace(0, ws_list); return ret; } -unsigned int btrfs_compress_str2level(const char *str) +/* + * Convert the compression suffix (eg. after "zlib" starting with ":") to + * level, unrecognized string will set the default level + */ +unsigned int btrfs_compress_str2level(unsigned int type, const char *str) { - if (strncmp(str, "zlib", 4) != 0) + unsigned int level = 0; + int ret; + + if (!type) return 0; - /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */ - if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0) - return str[5] - '0'; + if (str[0] == ':') { + ret = kstrtouint(str + 1, 10, &level); + if (ret) + level = 0; + } + + level = btrfs_compress_op[type]->set_level(level); - return BTRFS_ZLIB_DEFAULT_LEVEL; + return level; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index ddda9b80bf20..9976fe0f7526 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -64,6 +64,16 @@ struct compressed_bio { u32 sums; }; +static inline unsigned int btrfs_compress_type(unsigned int type_level) +{ + return (type_level & 0xF); +} + +static inline unsigned int btrfs_compress_level(unsigned int type_level) +{ + return ((type_level & 0xF0) >> 4); +} + void __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); @@ -87,7 +97,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -unsigned btrfs_compress_str2level(const char *str); +unsigned int btrfs_compress_str2level(unsigned int type, const char *str); enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, @@ -97,8 +107,35 @@ enum btrfs_compression_type { BTRFS_COMPRESS_TYPES = 3, }; +struct workspace_manager { + const struct btrfs_compress_op *ops; + struct list_head idle_ws; + spinlock_t ws_lock; + /* Number of free workspaces */ + int free_ws; + /* Total number of allocated workspaces */ + atomic_t total_ws; + /* Waiters for a free workspace */ + wait_queue_head_t ws_wait; +}; + +void btrfs_init_workspace_manager(struct workspace_manager *wsm, + const struct btrfs_compress_op *ops); +struct list_head *btrfs_get_workspace(struct workspace_manager *wsm, + unsigned int level); +void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws); +void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm); + struct btrfs_compress_op { - struct list_head *(*alloc_workspace)(void); + void (*init_workspace_manager)(void); + + void (*cleanup_workspace_manager)(void); + + struct list_head *(*get_workspace)(unsigned int level); + + void (*put_workspace)(struct list_head *ws); + + struct list_head *(*alloc_workspace)(unsigned int level); void (*free_workspace)(struct list_head *workspace); @@ -119,9 +156,18 @@ struct btrfs_compress_op { unsigned long start_byte, size_t srclen, size_t destlen); - void (*set_level)(struct list_head *ws, unsigned int type); + /* + * This bounds the level set by the user to be within range of a + * particular compression type. It returns the level that will be used + * if the level is out of bounds or the default if 0 is passed in. + */ + unsigned int (*set_level)(unsigned int level); }; +/* The heuristic workspaces are managed via the 0th workspace manager */ +#define BTRFS_NR_WORKSPACE_MANAGERS (BTRFS_COMPRESS_TYPES + 1) + +extern const struct btrfs_compress_op btrfs_heuristic_compress; extern const struct btrfs_compress_op btrfs_zlib_compress; extern const struct btrfs_compress_op btrfs_lzo_compress; extern const struct btrfs_compress_op btrfs_zstd_compress; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5a6c39b44c84..324df36d28bf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -13,6 +13,7 @@ #include "print-tree.h" #include "locking.h" #include "volumes.h" +#include "qgroup.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -45,11 +46,18 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) for (i = 0; i < BTRFS_MAX_LEVEL; i++) { if (!p->nodes[i] || !p->locks[i]) continue; - btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]); - if (p->locks[i] == BTRFS_READ_LOCK) + /* + * If we currently have a spinning reader or writer lock this + * will bump the count of blocking holders and drop the + * spinlock. + */ + if (p->locks[i] == BTRFS_READ_LOCK) { + btrfs_set_lock_blocking_read(p->nodes[i]); p->locks[i] = BTRFS_READ_LOCK_BLOCKING; - else if (p->locks[i] == BTRFS_WRITE_LOCK) + } else if (p->locks[i] == BTRFS_WRITE_LOCK) { + btrfs_set_lock_blocking_write(p->nodes[i]); p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; + } } } @@ -1288,7 +1296,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, return eb; btrfs_set_path_blocking(path); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); @@ -1378,7 +1386,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(eb_root); eb = alloc_dummy_extent_buffer(fs_info, logical); } else { - btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb_root); eb = btrfs_clone_extent_buffer(eb_root); btrfs_tree_read_unlock_blocking(eb_root); free_extent_buffer(eb_root); @@ -1486,9 +1494,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, search_start = buf->start & ~((u64)SZ_1G - 1); if (parent) - btrfs_set_lock_blocking(parent); - btrfs_set_lock_blocking(buf); + btrfs_set_lock_blocking_write(parent); + btrfs_set_lock_blocking_write(buf); + /* + * Before CoWing this block for later modification, check if it's + * the subtree root and do the delayed subtree trace if needed. + * + * Also We don't care about the error, as it's handled internally. + */ + btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); ret = __btrfs_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0); @@ -1582,7 +1597,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (parent_nritems <= 1) return 0; - btrfs_set_lock_blocking(parent); + btrfs_set_lock_blocking_write(parent); for (i = start_slot; i <= end_slot; i++) { struct btrfs_key first_key; @@ -1641,7 +1656,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, search_start = last_block; btrfs_tree_lock(cur); - btrfs_set_lock_blocking(cur); + btrfs_set_lock_blocking_write(cur); err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, @@ -1856,7 +1871,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } btrfs_tree_lock(child); - btrfs_set_lock_blocking(child); + btrfs_set_lock_blocking_write(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child); if (ret) { btrfs_tree_unlock(child); @@ -1894,7 +1909,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (left) { btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); + btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left); if (wret) { @@ -1909,7 +1924,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (right) { btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); + btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right); if (wret) { @@ -2072,7 +2087,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 left_nr; btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); + btrfs_set_lock_blocking_write(left); left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -2127,7 +2142,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 right_nr; btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); + btrfs_set_lock_blocking_write(right); right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -2529,26 +2544,6 @@ done: return ret; } -static void key_search_validate(struct extent_buffer *b, - const struct btrfs_key *key, - int level) -{ -#ifdef CONFIG_BTRFS_ASSERT - struct btrfs_disk_key disk_key; - - btrfs_cpu_key_to_disk(&disk_key, key); - - if (level == 0) - ASSERT(!memcmp_extent_buffer(b, &disk_key, - offsetof(struct btrfs_leaf, items[0].key), - sizeof(disk_key))); - else - ASSERT(!memcmp_extent_buffer(b, &disk_key, - offsetof(struct btrfs_node, ptrs[0].key), - sizeof(disk_key))); -#endif -} - static int key_search(struct extent_buffer *b, const struct btrfs_key *key, int level, int *prev_cmp, int *slot) { @@ -2557,7 +2552,6 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key, return *prev_cmp; } - key_search_validate(b, key, level); *slot = 0; return 0; @@ -3005,6 +2999,8 @@ again: */ prev_cmp = -1; ret = key_search(b, key, level, &prev_cmp, &slot); + if (ret < 0) + goto done; if (level != 0) { int dec = 0; @@ -3771,7 +3767,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 1; btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); + btrfs_set_lock_blocking_write(right); free_space = btrfs_leaf_free_space(fs_info, right); if (free_space < data_size) @@ -4005,7 +4001,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root return 1; btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); + btrfs_set_lock_blocking_write(left); free_space = btrfs_leaf_free_space(fs_info, left); if (free_space < data_size) { @@ -5156,6 +5152,10 @@ again: nritems = btrfs_header_nritems(cur); level = btrfs_header_level(cur); sret = btrfs_bin_search(cur, min_key, level, &slot); + if (sret < 0) { + ret = sret; + goto out; + } /* at the lowest level, we're done, setup the path and exit */ if (level == path->lowest_level) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 94618a028730..129d26226e70 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -934,7 +934,8 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; - struct mutex cleaner_delayed_iput_mutex; + atomic_t nr_delayed_iputs; + wait_queue_head_t delayed_iputs_wait; /* this protects tree_mod_seq_list */ spinlock_t tree_mod_seq_lock; @@ -1074,10 +1075,13 @@ struct btrfs_fs_info { atomic_t scrubs_paused; atomic_t scrub_cancel_req; wait_queue_head_t scrub_pause_wait; - int scrub_workers_refcnt; + /* + * The worker pointers are NULL iff the refcount is 0, ie. scrub is not + * running. + */ + refcount_t scrub_workers_refcnt; struct btrfs_workqueue *scrub_workers; struct btrfs_workqueue *scrub_wr_completion_workers; - struct btrfs_workqueue *scrub_nocow_workers; struct btrfs_workqueue *scrub_parity_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY @@ -1199,6 +1203,24 @@ enum { BTRFS_ROOT_MULTI_LOG_TASKS, BTRFS_ROOT_DIRTY, BTRFS_ROOT_DELETING, + + /* + * Reloc tree is orphan, only kept here for qgroup delayed subtree scan + * + * Set for the subvolume tree owning the reloc tree. + */ + BTRFS_ROOT_DEAD_RELOC_TREE, +}; + +/* + * Record swapped tree blocks of a subvolume tree for delayed subtree trace + * code. For detail check comment in fs/btrfs/qgroup.c. + */ +struct btrfs_qgroup_swapped_blocks { + spinlock_t lock; + /* RM_EMPTY_ROOT() of above blocks[] */ + bool swapped; + struct rb_root blocks[BTRFS_MAX_LEVEL]; }; /* @@ -1312,6 +1334,14 @@ struct btrfs_root { u64 nr_ordered_extents; /* + * Not empty if this subvolume root has gone through tree block swap + * (relocation) + * + * Will be used by reloc_control::dirty_subvol_roots. + */ + struct list_head reloc_dirty_list; + + /* * Number of currently running SEND ioctls to prevent * manipulation with the read-only status via SUBVOL_SETFLAGS */ @@ -1328,6 +1358,9 @@ struct btrfs_root { /* Number of active swapfiles */ atomic_t nr_swapfiles; + /* Record pairs of swapped blocks for qgroup */ + struct btrfs_qgroup_swapped_blocks swapped_blocks; + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif @@ -2775,7 +2808,8 @@ enum btrfs_flush_state { FLUSH_DELALLOC = 5, FLUSH_DELALLOC_WAIT = 6, ALLOC_CHUNK = 7, - COMMIT_TRANS = 8, + ALLOC_CHUNK_FORCE = 8, + COMMIT_TRANS = 9, }; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); @@ -3181,8 +3215,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, /* inode.c */ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, u64 start, - u64 len, int create); + u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes); @@ -3254,6 +3287,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); @@ -3261,7 +3295,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, +int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); @@ -3476,21 +3510,18 @@ do { \ rcu_read_unlock(); \ } while (0) -#ifdef CONFIG_BTRFS_ASSERT - __cold static inline void assfail(const char *expr, const char *file, int line) { - pr_err("assertion failed: %s, file: %s, line: %d\n", - expr, file, line); - BUG(); + if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { + pr_err("assertion failed: %s, file: %s, line: %d\n", + expr, file, line); + BUG(); + } } #define ASSERT(expr) \ (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) -#else -#define ASSERT(expr) ((void)0) -#endif /* * Use that for functions that are conditionally exported for sanity tests but diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index cad36c99a483..7d2a413df90d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -602,17 +602,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, RB_CLEAR_NODE(&head_ref->href_node); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; - head_ref->qgroup_reserved = 0; - head_ref->qgroup_ref_root = 0; spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); if (qrecord) { if (ref_root && reserved) { - head_ref->qgroup_ref_root = ref_root; - head_ref->qgroup_reserved = reserved; + qrecord->data_rsv = reserved; + qrecord->data_rsv_refroot = ref_root; } - qrecord->bytenr = bytenr; qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; @@ -651,10 +648,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { - WARN_ON(qrecord && head_ref->qgroup_ref_root - && head_ref->qgroup_reserved - && existing->qgroup_ref_root - && existing->qgroup_reserved); update_existing_head_ref(trans, existing, head_ref, old_ref_mod); /* @@ -770,7 +763,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && is_fstree(ref_root)) { - record = kmalloc(sizeof(*record), GFP_NOFS); + record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); @@ -867,7 +860,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && is_fstree(ref_root)) { - record = kmalloc(sizeof(*record), GFP_NOFS); + record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); kmem_cache_free(btrfs_delayed_ref_head_cachep, diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d2af974f68a1..70606da440aa 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -103,17 +103,6 @@ struct btrfs_delayed_ref_head { int ref_mod; /* - * For qgroup reserved space freeing. - * - * ref_root and reserved will be recorded after - * BTRFS_ADD_DELAYED_EXTENT is called. - * And will be used to free reserved qgroup space at - * run_delayed_refs() time. - */ - u64 qgroup_ref_root; - u64 qgroup_reserved; - - /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree * until the delayed ref is processed. must_insert_reserved is diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 8750c835f535..ee193c5222b2 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -111,11 +111,11 @@ no_valid_dev_replace_entry_found: break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, - NULL, NULL); - dev_replace->tgtdev = btrfs_find_device(fs_info, + dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, + src_devid, NULL, NULL, true); + dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, BTRFS_DEV_REPLACE_DEVID, - NULL, NULL); + NULL, NULL, true); /* * allow 'btrfs dev replace_cancel' if src/tgt device is * missing @@ -862,6 +862,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) btrfs_destroy_dev_replace_tgtdev(tgt_device); break; default: + up_write(&dev_replace->rwsem); result = -EINVAL; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6a2a2a951705..5216e7b3f9ad 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -17,6 +17,7 @@ #include <linux/semaphore.h> #include <linux/error-injection.h> #include <linux/crc32c.h> +#include <linux/sched/mm.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" @@ -341,7 +342,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (need_lock) { btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); } lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, @@ -1120,7 +1121,7 @@ void clean_tree_block(struct btrfs_fs_info *fs_info, -buf->len, fs_info->dirty_metadata_batch); /* ugh, clear_extent_buffer_dirty needs to lock the page */ - btrfs_set_lock_blocking(buf); + btrfs_set_lock_blocking_write(buf); clear_extent_buffer_dirty(buf); } } @@ -1175,6 +1176,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&root->delalloc_root); INIT_LIST_HEAD(&root->ordered_extents); INIT_LIST_HEAD(&root->ordered_root); + INIT_LIST_HEAD(&root->reloc_dirty_list); INIT_LIST_HEAD(&root->logged_list[0]); INIT_LIST_HEAD(&root->logged_list[1]); spin_lock_init(&root->inode_lock); @@ -1218,6 +1220,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->anon_dev = 0; spin_lock_init(&root->root_item_lock); + btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); } static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, @@ -1258,10 +1261,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; struct btrfs_key key; + unsigned int nofs_flag; int ret = 0; uuid_le uuid = NULL_UUID_LE; + /* + * We're holding a transaction handle, so use a NOFS memory allocation + * context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); root = btrfs_alloc_root(fs_info, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); if (!root) return ERR_PTR(-ENOMEM); @@ -1707,9 +1717,7 @@ static int cleaner_kthread(void *arg) goto sleep; } - mutex_lock(&fs_info->cleaner_delayed_iput_mutex); btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&fs_info->cleaner_mutex); @@ -2101,7 +2109,7 @@ static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->scrubs_paused, 0); atomic_set(&fs_info->scrub_cancel_req, 0); init_waitqueue_head(&fs_info->scrub_pause_wait); - fs_info->scrub_workers_refcnt = 0; + refcount_set(&fs_info->scrub_workers_refcnt, 0); } static void btrfs_init_balance(struct btrfs_fs_info *fs_info) @@ -2666,7 +2674,6 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); - mutex_init(&fs_info->cleaner_delayed_iput_mutex); seqlock_init(&fs_info->profiles_lock); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -2688,6 +2695,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); + atomic_set(&fs_info->nr_delayed_iputs, 0); atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; @@ -2765,6 +2773,7 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_wait); init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); + init_waitqueue_head(&fs_info->delayed_iputs_wait); INIT_LIST_HEAD(&fs_info->pinned_chunks); @@ -4238,16 +4247,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); - if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->refs); - spin_unlock(&delayed_refs->lock); - - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref_head(head); - spin_lock(&delayed_refs->lock); + if (btrfs_delayed_ref_lock(delayed_refs, head)) continue; - } + spin_lock(&head->lock); while ((n = rb_first_cached(&head->ref_tree)) != NULL) { ref = rb_entry(n, struct btrfs_delayed_ref_node, @@ -4263,12 +4265,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (head->must_insert_reserved) pin_bytes = true; btrfs_free_delayed_extent_op(head->extent_op); - delayed_refs->num_heads--; - if (head->processing == 0) - delayed_refs->num_heads_ready--; - atomic_dec(&delayed_refs->num_entries); - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); + btrfs_delete_ref_head(delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d81035b7ea7d..994f0cc41799 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2492,9 +2492,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, } } - /* Also free its reserved qgroup space */ - btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, - head->qgroup_reserved); btrfs_delayed_refs_rsv_release(fs_info, nr_items); } @@ -3013,8 +3010,7 @@ again: } if (run_all) { - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans); + btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); node = rb_first_cached(&delayed_refs->href_root); @@ -4280,10 +4276,14 @@ commit_trans: /* * The cleaner kthread might still be doing iput * operations. Wait for it to finish so that - * more space is released. + * more space is released. We don't need to + * explicitly run the delayed iputs here because + * the commit_transaction would have woken up + * the cleaner. */ - mutex_lock(&fs_info->cleaner_delayed_iput_mutex); - mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); + ret = btrfs_wait_on_delayed_iputs(fs_info); + if (ret) + return ret; goto again; } else { btrfs_end_transaction(trans); @@ -4396,7 +4396,6 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) static int should_alloc_chunk(struct btrfs_fs_info *fs_info, struct btrfs_space_info *sinfo, int force) { - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 bytes_used = btrfs_space_info_used(sinfo, false); u64 thresh; @@ -4404,14 +4403,6 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info, return 1; /* - * We need to take into account the global rsv because for all intents - * and purposes it's used space. Don't worry about locking the - * global_rsv, it doesn't change except when the transaction commits. - */ - if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) - bytes_used += calc_global_rsv_need_space(global_rsv); - - /* * in limited mode, we want to have some free space up to * about 1% of the FS size. */ @@ -4741,7 +4732,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; - u64 max_reclaim; + u64 async_pages; u64 items; long time_left; unsigned long nr_pages; @@ -4766,25 +4757,36 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, loops = 0; while (delalloc_bytes && loops < 3) { - max_reclaim = min(delalloc_bytes, to_reclaim); - nr_pages = max_reclaim >> PAGE_SHIFT; + nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + + /* + * Triggers inode writeback for up to nr_pages. This will invoke + * ->writepages callback and trigger delalloc filling + * (btrfs_run_delalloc_range()). + */ btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); + /* - * We need to wait for the async pages to actually start before - * we do anything. + * We need to wait for the compressed pages to start before + * we continue. */ - max_reclaim = atomic_read(&fs_info->async_delalloc_pages); - if (!max_reclaim) + async_pages = atomic_read(&fs_info->async_delalloc_pages); + if (!async_pages) goto skip_async; - if (max_reclaim <= nr_pages) - max_reclaim = 0; + /* + * Calculate how many compressed pages we want to be written + * before we continue. I.e if there are more async pages than we + * require wait_event will wait until nr_pages are written. + */ + if (async_pages <= nr_pages) + async_pages = 0; else - max_reclaim -= nr_pages; + async_pages -= nr_pages; wait_event(fs_info->async_submit_wait, atomic_read(&fs_info->async_delalloc_pages) <= - (int)max_reclaim); + (int)async_pages); skip_async: spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && @@ -4808,6 +4810,7 @@ skip_async: } struct reserve_ticket { + u64 orig_bytes; u64 bytes; int error; struct list_head list; @@ -4851,10 +4854,19 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, if (!bytes_needed) return 0; - /* See if there is enough pinned space to make this reservation */ - if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * See if there is enough pinned space to make this reservation, or if + * we have block groups that are going to be freed, allowing us to + * possibly do a chunk allocation the next loop through. + */ + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || + __percpu_counter_compare(&space_info->total_bytes_pinned, + bytes_needed, + BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) goto commit; /* @@ -4862,7 +4874,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, * this reservation. */ if (space_info != delayed_rsv->space_info) - return -ENOSPC; + goto enospc; spin_lock(&delayed_rsv->lock); reclaim_bytes += delayed_rsv->reserved; @@ -4877,16 +4889,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, if (__percpu_counter_compare(&space_info->total_bytes_pinned, bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) { - return -ENOSPC; - } + BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) + goto enospc; commit: - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return -ENOSPC; - return btrfs_commit_transaction(trans); +enospc: + btrfs_end_transaction(trans); + return -ENOSPC; } /* @@ -4939,6 +4949,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, btrfs_end_transaction(trans); break; case ALLOC_CHUNK: + case ALLOC_CHUNK_FORCE: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -4946,7 +4957,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, } ret = do_chunk_alloc(trans, btrfs_metadata_alloc_profile(fs_info), - CHUNK_ALLOC_NO_FORCE); + (state == ALLOC_CHUNK) ? + CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); if (ret > 0 || ret == -ENOSPC) ret = 0; @@ -4957,9 +4969,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, * bunch of pinned space, so make sure we run the iputs before * we do our pinned bytes check below. */ - mutex_lock(&fs_info->cleaner_delayed_iput_mutex); btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); + btrfs_wait_on_delayed_iputs(fs_info); ret = may_commit_transaction(fs_info, space_info); break; @@ -5030,7 +5041,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } -static void wake_all_tickets(struct list_head *head) +static bool wake_all_tickets(struct list_head *head) { struct reserve_ticket *ticket; @@ -5039,7 +5050,10 @@ static void wake_all_tickets(struct list_head *head) list_del_init(&ticket->list); ticket->error = -ENOSPC; wake_up(&ticket->wait); + if (ticket->bytes != ticket->orig_bytes) + return true; } + return false; } /* @@ -5091,11 +5105,28 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) commit_cycles--; } + /* + * We don't want to force a chunk allocation until we've tried + * pretty hard to reclaim space. Think of the case where we + * freed up a bunch of space and so have a lot of pinned space + * to reclaim. We would rather use that than possibly create a + * underutilized metadata chunk. So if this is our first run + * through the flushing state machine skip ALLOC_CHUNK_FORCE and + * commit the transaction. If nothing has changed the next go + * around then we can force a chunk allocation. + */ + if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) + flush_state++; + if (flush_state > COMMIT_TRANS) { commit_cycles++; if (commit_cycles > 2) { - wake_all_tickets(&space_info->tickets); - space_info->flush = 0; + if (wake_all_tickets(&space_info->tickets)) { + flush_state = FLUSH_DELAYED_ITEMS_NR; + commit_cycles--; + } else { + space_info->flush = 0; + } } else { flush_state = FLUSH_DELAYED_ITEMS_NR; } @@ -5109,12 +5140,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work) INIT_WORK(work, btrfs_async_reclaim_metadata_space); } +static const enum btrfs_flush_state priority_flush_states[] = { + FLUSH_DELAYED_ITEMS_NR, + FLUSH_DELAYED_ITEMS, + ALLOC_CHUNK, +}; + static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { u64 to_reclaim; - int flush_state = FLUSH_DELAYED_ITEMS_NR; + int flush_state; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, @@ -5125,8 +5162,10 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, } spin_unlock(&space_info->lock); + flush_state = 0; do { - flush_space(fs_info, space_info, to_reclaim, flush_state); + flush_space(fs_info, space_info, to_reclaim, + priority_flush_states[flush_state]); flush_state++; spin_lock(&space_info->lock); if (ticket->bytes == 0) { @@ -5134,23 +5173,16 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, return; } spin_unlock(&space_info->lock); - - /* - * Priority flushers can't wait on delalloc without - * deadlocking. - */ - if (flush_state == FLUSH_DELALLOC || - flush_state == FLUSH_DELALLOC_WAIT) - flush_state = ALLOC_CHUNK; - } while (flush_state < COMMIT_TRANS); + } while (flush_state < ARRAY_SIZE(priority_flush_states)); } static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, - struct reserve_ticket *ticket, u64 orig_bytes) + struct reserve_ticket *ticket) { DEFINE_WAIT(wait); + u64 reclaim_bytes = 0; int ret = 0; spin_lock(&space_info->lock); @@ -5171,14 +5203,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, ret = ticket->error; if (!list_empty(&ticket->list)) list_del_init(&ticket->list); - if (ticket->bytes && ticket->bytes < orig_bytes) { - u64 num_bytes = orig_bytes - ticket->bytes; - update_bytes_may_use(space_info, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, num_bytes, 0); - } + if (ticket->bytes && ticket->bytes < ticket->orig_bytes) + reclaim_bytes = ticket->orig_bytes - ticket->bytes; spin_unlock(&space_info->lock); + if (reclaim_bytes) + space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); return ret; } @@ -5204,6 +5234,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, { struct reserve_ticket ticket; u64 used; + u64 reclaim_bytes = 0; int ret = 0; ASSERT(orig_bytes); @@ -5239,6 +5270,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * the list and we will do our own flushing further down. */ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + ticket.orig_bytes = orig_bytes; ticket.bytes = orig_bytes; ticket.error = 0; init_waitqueue_head(&ticket.wait); @@ -5279,25 +5311,21 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, return ret; if (flush == BTRFS_RESERVE_FLUSH_ALL) - return wait_reserve_ticket(fs_info, space_info, &ticket, - orig_bytes); + return wait_reserve_ticket(fs_info, space_info, &ticket); ret = 0; priority_reclaim_metadata_space(fs_info, space_info, &ticket); spin_lock(&space_info->lock); if (ticket.bytes) { - if (ticket.bytes < orig_bytes) { - u64 num_bytes = orig_bytes - ticket.bytes; - update_bytes_may_use(space_info, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, - num_bytes, 0); - - } + if (ticket.bytes < orig_bytes) + reclaim_bytes = orig_bytes - ticket.bytes; list_del_init(&ticket.list); ret = -ENOSPC; } spin_unlock(&space_info->lock); + + if (reclaim_bytes) + space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); ASSERT(list_empty(&ticket.list)); return ret; } @@ -5775,6 +5803,21 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } +static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv, + u64 *metadata_bytes, u64 *qgroup_bytes) +{ + *metadata_bytes = 0; + *qgroup_bytes = 0; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) + *metadata_bytes = block_rsv->size - block_rsv->reserved; + if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size) + *qgroup_bytes = block_rsv->qgroup_rsv_size - + block_rsv->qgroup_rsv_reserved; + spin_unlock(&block_rsv->lock); +} + /** * btrfs_inode_rsv_refill - refill the inode block rsv. * @inode - the inode we are refilling. @@ -5790,25 +5833,42 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, { struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 num_bytes = 0; - u64 qgroup_num_bytes = 0; + u64 num_bytes, last = 0; + u64 qgroup_num_bytes; int ret = -ENOSPC; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) - num_bytes = block_rsv->size - block_rsv->reserved; - if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size) - qgroup_num_bytes = block_rsv->qgroup_rsv_size - - block_rsv->qgroup_rsv_reserved; - spin_unlock(&block_rsv->lock); - + calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes); if (num_bytes == 0) return 0; - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true); - if (ret) - return ret; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + do { + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, + true); + if (ret) + return ret; + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (ret) { + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + last = num_bytes; + /* + * If we are fragmented we can end up with a lot of + * outstanding extents which will make our size be much + * larger than our reserved amount. + * + * If the reservation happens here, it might be very + * big though not needed in the end, if the delalloc + * flushing happens. + * + * If this is the case try and do the reserve again. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL) + calc_refill_bytes(block_rsv, &num_bytes, + &qgroup_num_bytes); + if (num_bytes == 0) + return 0; + } + } while (ret && last != num_bytes); + if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, false); trace_btrfs_space_reservation(root->fs_info, "delalloc", @@ -5818,8 +5878,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, spin_lock(&block_rsv->lock); block_rsv->qgroup_rsv_reserved += qgroup_num_bytes; spin_unlock(&block_rsv->lock); - } else - btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + } return ret; } @@ -8066,6 +8125,15 @@ loop: return ret; } +#define DUMP_BLOCK_RSV(fs_info, rsv_name) \ +do { \ + struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ + spin_lock(&__rsv->lock); \ + btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ + __rsv->size, __rsv->reserved); \ + spin_unlock(&__rsv->lock); \ +} while (0) + static void dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups) @@ -8085,6 +8153,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info, info->bytes_readonly); spin_unlock(&info->lock); + DUMP_BLOCK_RSV(fs_info, global_block_rsv); + DUMP_BLOCK_RSV(fs_info, trans_block_rsv); + DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); + if (!dump_block_groups) return; @@ -8492,7 +8566,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, clean_tree_block(fs_info, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); - btrfs_set_lock_blocking(buf); + btrfs_set_lock_blocking_write(buf); set_extent_buffer_uptodate(buf); memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); @@ -8917,7 +8991,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, reada = 1; } btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + btrfs_set_lock_blocking_write(next); ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], @@ -8977,7 +9051,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, return -EIO; } btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + btrfs_set_lock_blocking_write(next); } level--; @@ -9089,7 +9163,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (!path->locks[level]) { BUG_ON(level == 0); btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); + btrfs_set_lock_blocking_write(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, fs_info, @@ -9131,7 +9205,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (!path->locks[level] && btrfs_header_generation(eb) == trans->transid) { btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); + btrfs_set_lock_blocking_write(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } clean_tree_block(fs_info, eb); @@ -9298,7 +9372,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); - btrfs_set_lock_blocking(path->nodes[level]); + btrfs_set_lock_blocking_write(path->nodes[level]); path->slots[level] = 0; path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; memset(&wc->update_progress, 0, @@ -9328,7 +9402,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, level = btrfs_header_level(root->node); while (1) { btrfs_tree_lock(path->nodes[level]); - btrfs_set_lock_blocking(path->nodes[level]); + btrfs_set_lock_blocking_write(path->nodes[level]); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, fs_info, @@ -9595,6 +9669,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; + u64 sinfo_used; u64 min_allocable_bytes; int ret = -ENOSPC; @@ -9621,9 +9696,10 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) num_bytes = cache->key.offset - cache->reserved - cache->pinned - cache->bytes_super - btrfs_block_group_used(&cache->item); + sinfo_used = btrfs_space_info_used(sinfo, true); - if (btrfs_space_info_used(sinfo, true) + num_bytes + - min_allocable_bytes <= sinfo->total_bytes) { + if (sinfo_used + num_bytes + min_allocable_bytes <= + sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); @@ -9632,6 +9708,15 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) out: spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); + if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { + btrfs_info(cache->fs_info, + "unable to make block group %llu ro", + cache->key.objectid); + btrfs_info(cache->fs_info, + "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", + sinfo_used, num_bytes, min_allocable_bytes); + dump_space_info(cache->fs_info, cache->space_info, 0, 0); + } return ret; } @@ -10781,13 +10866,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, } spin_lock(&trans->transaction->dirty_bgs_lock); - if (!list_empty(&block_group->dirty_list)) { - WARN_ON(1); - } - if (!list_empty(&block_group->io_list)) { - WARN_ON(1); - } + WARN_ON(!list_empty(&block_group->dirty_list)); + WARN_ON(!list_empty(&block_group->io_list)); spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 52abe4082680..ca259c75bbcd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -147,7 +147,38 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits, return ret; } -static void flush_write_bio(struct extent_page_data *epd); +static int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + blk_status_t ret = 0; + struct bio_vec *bvec = bio_last_bvec_all(bio); + struct page *page = bvec->bv_page; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + + start = page_offset(page) + bvec->bv_offset; + + bio->bi_private = NULL; + + if (tree->ops) + ret = tree->ops->submit_bio_hook(tree->private_data, bio, + mirror_num, bio_flags, start); + else + btrfsic_submit_bio(bio); + + return blk_status_to_errno(ret); +} + +static void flush_write_bio(struct extent_page_data *epd) +{ + if (epd->bio) { + int ret; + + ret = submit_one_bio(epd->bio, 0, 0); + BUG_ON(ret < 0); /* -ENOMEM */ + epd->bio = NULL; + } +} int __init extent_io_init(void) { @@ -281,8 +312,8 @@ do_insert: } static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **prev_ret, struct rb_node **next_ret, + struct rb_node **prev_ret, struct rb_node ***p_ret, struct rb_node **parent_ret) { @@ -311,23 +342,23 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, if (parent_ret) *parent_ret = prev; - if (prev_ret) { + if (next_ret) { orig_prev = prev; while (prev && offset > prev_entry->end) { prev = rb_next(prev); prev_entry = rb_entry(prev, struct tree_entry, rb_node); } - *prev_ret = prev; + *next_ret = prev; prev = orig_prev; } - if (next_ret) { + if (prev_ret) { prev_entry = rb_entry(prev, struct tree_entry, rb_node); while (prev && offset < prev_entry->start) { prev = rb_prev(prev); prev_entry = rb_entry(prev, struct tree_entry, rb_node); } - *next_ret = prev; + *prev_ret = prev; } return NULL; } @@ -338,12 +369,12 @@ tree_search_for_insert(struct extent_io_tree *tree, struct rb_node ***p_ret, struct rb_node **parent_ret) { - struct rb_node *prev = NULL; + struct rb_node *next= NULL; struct rb_node *ret; - ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); + ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret); if (!ret) - return prev; + return next; return ret; } @@ -585,7 +616,6 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (delete) bits |= ~EXTENT_CTLBITS; - bits |= EXTENT_FIRST_DELALLOC; if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) clear = 1; @@ -850,7 +880,6 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); - bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && gfpflags_allow_blocking(mask)) { /* @@ -2692,28 +2721,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) return bio; } -static int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) -{ - blk_status_t ret = 0; - struct bio_vec *bvec = bio_last_bvec_all(bio); - struct page *page = bvec->bv_page; - struct extent_io_tree *tree = bio->bi_private; - u64 start; - - start = page_offset(page) + bvec->bv_offset; - - bio->bi_private = NULL; - - if (tree->ops) - ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags, start); - else - btrfsic_submit_bio(bio); - - return blk_status_to_errno(ret); -} - /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @tree: tree so we can call our merge_bio hook @@ -4007,17 +4014,6 @@ retry: return ret; } -static void flush_write_bio(struct extent_page_data *epd) -{ - if (epd->bio) { - int ret; - - ret = submit_one_bio(epd->bio, 0, 0); - BUG_ON(ret < 0); /* -ENOMEM */ - epd->bio = NULL; - } -} - int extent_write_full_page(struct page *page, struct writeback_control *wbc) { int ret; @@ -4259,8 +4255,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (len == 0) break; len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset, - len, 0); + em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len); if (IS_ERR_OR_NULL(em)) return em; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 9673be3f3d1f..08749e0b9c32 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -18,17 +18,16 @@ #define EXTENT_BOUNDARY (1U << 9) #define EXTENT_NODATASUM (1U << 10) #define EXTENT_CLEAR_META_RESV (1U << 11) -#define EXTENT_FIRST_DELALLOC (1U << 12) -#define EXTENT_NEED_WAIT (1U << 13) -#define EXTENT_DAMAGED (1U << 14) -#define EXTENT_NORESERVE (1U << 15) -#define EXTENT_QGROUP_RESERVED (1U << 16) -#define EXTENT_CLEAR_DATA_RESV (1U << 17) -#define EXTENT_DELALLOC_NEW (1U << 18) +#define EXTENT_NEED_WAIT (1U << 12) +#define EXTENT_DAMAGED (1U << 13) +#define EXTENT_NORESERVE (1U << 14) +#define EXTENT_QGROUP_RESERVED (1U << 15) +#define EXTENT_CLEAR_DATA_RESV (1U << 16) +#define EXTENT_DELALLOC_NEW (1U << 17) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) -#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING) /* * flags for bio submission. The high bits indicate the compression diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a042a193c120..928f729c55ba 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -210,6 +210,9 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (!list_empty(&prev->list) || !list_empty(&next->list)) return 0; + ASSERT(next->block_start != EXTENT_MAP_DELALLOC && + prev->block_start != EXTENT_MAP_DELALLOC); + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -217,8 +220,6 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) prev->block_start == EXTENT_MAP_HOLE) || (next->block_start == EXTENT_MAP_INLINE && prev->block_start == EXTENT_MAP_INLINE) || - (next->block_start == EXTENT_MAP_DELALLOC && - prev->block_start == EXTENT_MAP_DELALLOC) || (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && next->block_start == extent_map_block_end(prev)))) { return 1; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ef05a0121652..473f039fcd7c 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -9,6 +9,7 @@ #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) #define EXTENT_MAP_INLINE ((u64)-2) +/* used only during fiemap calls */ #define EXTENT_MAP_DELALLOC ((u64)-1) /* bits for the extent_map::flags field */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d38dc8c31533..34fe8a58b0e9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3218,8 +3218,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) &cached_state); while (start < inode->i_size) { - em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, - start, len, 0); + em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); em = NULL; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5c349667c761..3f180b857e20 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -453,7 +453,6 @@ static noinline void compress_file_range(struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 blocksize = fs_info->sectorsize; u64 actual_end; - u64 isize = i_size_read(inode); int ret = 0; struct page **pages = NULL; unsigned long nr_pages; @@ -467,7 +466,7 @@ static noinline void compress_file_range(struct inode *inode, inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, SZ_16K); - actual_end = min_t(u64, isize, end + 1); + actual_end = min_t(u64, i_size_read(inode), end + 1); again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; @@ -714,9 +713,9 @@ static void free_async_extent_pages(struct async_extent *async_extent) * queued. We walk all the async extents created by compress_file_range * and send them down to the disk. */ -static noinline void submit_compressed_extents(struct inode *inode, - struct async_cow *async_cow) +static noinline void submit_compressed_extents(struct async_cow *async_cow) { + struct inode *inode = async_cow->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_extent *async_extent; u64 alloc_hint = 0; @@ -1166,8 +1165,14 @@ static noinline void async_cow_submit(struct btrfs_work *work) 5 * SZ_1M) cond_wake_up_nomb(&fs_info->async_submit_wait); + /* + * ->inode could be NULL if async_cow_start has failed to compress, + * in which case we don't have anything to submit, yet we need to + * always adjust ->async_delalloc_pages as its paired with the init + * happening in cow_file_range_async + */ if (async_cow->inode) - submit_compressed_extents(async_cow->inode, async_cow); + submit_compressed_extents(async_cow); } static noinline void async_cow_free(struct btrfs_work *work) @@ -1194,7 +1199,12 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); BUG_ON(!async_cow); /* -ENOMEM */ - async_cow->inode = igrab(inode); + /* + * igrab is called higher up in the call chain, take only the + * lightweight reference for the callback lifetime + */ + ihold(inode); + async_cow->inode = inode; async_cow->fs_info = fs_info; async_cow->locked_page = locked_page; async_cow->start = start; @@ -1586,11 +1596,10 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ -int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, +int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc) { - struct inode *inode = private_data; int ret; int force_cow = need_force_cow(inode, start, end); unsigned int write_flags = wbc_to_write_flags(wbc); @@ -3247,6 +3256,7 @@ void btrfs_add_delayed_iput(struct inode *inode) if (atomic_add_unless(&inode->i_count, -1, 1)) return; + atomic_inc(&fs_info->nr_delayed_iputs); spin_lock(&fs_info->delayed_iput_lock); ASSERT(list_empty(&binode->delayed_iput)); list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); @@ -3267,11 +3277,32 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) list_del_init(&inode->delayed_iput); spin_unlock(&fs_info->delayed_iput_lock); iput(&inode->vfs_inode); + if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) + wake_up(&fs_info->delayed_iputs_wait); spin_lock(&fs_info->delayed_iput_lock); } spin_unlock(&fs_info->delayed_iput_lock); } +/** + * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running + * @fs_info - the fs_info for this fs + * @return - EINTR if we were killed, 0 if nothing's pending + * + * This will wait on any delayed iputs that are currently running with KILLABLE + * set. Once they are all done running we will return, unless we are killed in + * which case we return EINTR. This helps in user operations like fallocate etc + * that might get blocked on the iputs. + */ +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) +{ + int ret = wait_event_killable(fs_info->delayed_iputs_wait, + atomic_read(&fs_info->nr_delayed_iputs) == 0); + if (ret) + return -EINTR; + return 0; +} + /* * This creates an orphan entry for the given inode in case something goes wrong * in the middle of an unlink. @@ -5262,13 +5293,15 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1); int failures = 0; for (;;) { struct btrfs_trans_handle *trans; int ret; - ret = btrfs_block_rsv_refill(root, rsv, rsv->size, + ret = btrfs_block_rsv_refill(root, rsv, + rsv->size + delayed_refs_extra, BTRFS_RESERVE_FLUSH_LIMIT); if (ret && ++failures > 2) { @@ -5277,9 +5310,28 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, return ERR_PTR(-ENOSPC); } + /* + * Evict can generate a large amount of delayed refs without + * having a way to add space back since we exhaust our temporary + * block rsv. We aren't allowed to do FLUSH_ALL in this case + * because we could deadlock with so many things in the flushing + * code, so we have to try and hold some extra space to + * compensate for our delayed ref generation. If we can't get + * that space then we need see if we can steal our minimum from + * the global reserve. We will be ratelimited by the amount of + * space we have for the delayed refs rsv, so we'll end up + * committing and trying again. + */ trans = btrfs_join_transaction(root); - if (IS_ERR(trans) || !ret) + if (IS_ERR(trans) || !ret) { + if (!IS_ERR(trans)) { + trans->block_rsv = &fs_info->trans_block_rsv; + trans->bytes_reserved = delayed_refs_extra; + btrfs_block_rsv_migrate(rsv, trans->block_rsv, + delayed_refs_extra, 1); + } return trans; + } /* * Try to steal from the global reserve if there is space for @@ -6731,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); - u32 found_type; + u8 extent_type; struct btrfs_path *path = NULL; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *item; @@ -6786,9 +6838,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (ret < 0) { err = ret; goto out; - } - - if (ret != 0) { + } else if (ret > 0) { if (path->slots[0] == 0) goto not_found; path->slots[0]--; @@ -6797,11 +6847,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - /* are we inside the extent that was found? */ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = found_key.type; if (found_key.objectid != objectid || - found_type != BTRFS_EXTENT_DATA_KEY) { + found_key.type != BTRFS_EXTENT_DATA_KEY) { /* * If we backup past the first extent we want to move forward * and see if there is an extent in front of us, otherwise we'll @@ -6812,16 +6860,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, goto next; } - found_type = btrfs_file_extent_type(leaf, item); + extent_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, extent_start); - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; size = btrfs_file_extent_ram_bytes(leaf, item); @@ -6840,9 +6888,9 @@ next: if (ret < 0) { err = ret; goto out; - } - if (ret > 0) + } else if (ret > 0) { goto not_found; + } leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); @@ -6853,19 +6901,22 @@ next: goto not_found; if (start > found_key.offset) goto next; + + /* New extent overlaps with existing one */ em->start = start; em->orig_start = start; em->len = found_key.offset - start; - goto not_found_em; + em->block_start = EXTENT_MAP_HOLE; + goto insert; } btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert; - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { unsigned long ptr; char *map; size_t size; @@ -6916,7 +6967,6 @@ not_found: em->start = start; em->orig_start = start; em->len = len; -not_found_em: em->block_start = EXTENT_MAP_HOLE; insert: btrfs_release_path(path); @@ -6946,19 +6996,17 @@ out: } struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) + u64 start, u64 len) { struct extent_map *em; struct extent_map *hole_em = NULL; - u64 range_start = start; + u64 delalloc_start = start; u64 end; - u64 found; - u64 found_end; + u64 delalloc_len; + u64 delalloc_end; int err = 0; - em = btrfs_get_extent(inode, page, pg_offset, start, len, create); + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) return em; /* @@ -6983,80 +7031,84 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, em = NULL; /* ok, we didn't find anything, lets look for delalloc */ - found = count_range_bits(&inode->io_tree, &range_start, + delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start, end, len, EXTENT_DELALLOC, 1); - found_end = range_start + found; - if (found_end < range_start) - found_end = (u64)-1; + delalloc_end = delalloc_start + delalloc_len; + if (delalloc_end < delalloc_start) + delalloc_end = (u64)-1; /* - * we didn't find anything useful, return - * the original results from get_extent() + * We didn't find anything useful, return the original results from + * get_extent() */ - if (range_start > end || found_end <= start) { + if (delalloc_start > end || delalloc_end <= start) { em = hole_em; hole_em = NULL; goto out; } - /* adjust the range_start to make sure it doesn't - * go backwards from the start they passed in + /* + * Adjust the delalloc_start to make sure it doesn't go backwards from + * the start they passed in */ - range_start = max(start, range_start); - found = found_end - range_start; + delalloc_start = max(start, delalloc_start); + delalloc_len = delalloc_end - delalloc_start; - if (found > 0) { - u64 hole_start = start; - u64 hole_len = len; + if (delalloc_len > 0) { + u64 hole_start; + u64 hole_len; + const u64 hole_end = extent_map_end(hole_em); em = alloc_extent_map(); if (!em) { err = -ENOMEM; goto out; } + em->bdev = NULL; + + ASSERT(hole_em); /* - * when btrfs_get_extent can't find anything it - * returns one huge hole + * When btrfs_get_extent can't find anything it returns one + * huge hole * - * make sure what it found really fits our range, and - * adjust to make sure it is based on the start from - * the caller + * Make sure what it found really fits our range, and adjust to + * make sure it is based on the start from the caller */ - if (hole_em) { - u64 calc_end = extent_map_end(hole_em); - - if (calc_end <= start || (hole_em->start > end)) { - free_extent_map(hole_em); - hole_em = NULL; - } else { - hole_start = max(hole_em->start, start); - hole_len = calc_end - hole_start; - } + if (hole_end <= start || hole_em->start > end) { + free_extent_map(hole_em); + hole_em = NULL; + } else { + hole_start = max(hole_em->start, start); + hole_len = hole_end - hole_start; } - em->bdev = NULL; - if (hole_em && range_start > hole_start) { - /* our hole starts before our delalloc, so we - * have to return just the parts of the hole - * that go until the delalloc starts + + if (hole_em && delalloc_start > hole_start) { + /* + * Our hole starts before our delalloc, so we have to + * return just the parts of the hole that go until the + * delalloc starts */ - em->len = min(hole_len, - range_start - hole_start); + em->len = min(hole_len, delalloc_start - hole_start); em->start = hole_start; em->orig_start = hole_start; /* - * don't adjust block start at all, - * it is fixed at EXTENT_MAP_HOLE + * Don't adjust block start at all, it is fixed at + * EXTENT_MAP_HOLE */ em->block_start = hole_em->block_start; em->block_len = hole_len; if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) set_bit(EXTENT_FLAG_PREALLOC, &em->flags); } else { - em->start = range_start; - em->len = found; - em->orig_start = range_start; + /* + * Hole is out of passed range or it starts after + * delalloc range + */ + em->start = delalloc_start; + em->len = delalloc_len; + em->orig_start = delalloc_start; em->block_start = EXTENT_MAP_DELALLOC; - em->block_len = found; + em->block_len = delalloc_len; } } else { return hole_em; @@ -9910,7 +9962,6 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - WARN_ON_ONCE(!inode); btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, btrfs_run_delalloc_work, NULL, NULL); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9c8e1734429c..494f0f10d70e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1642,7 +1642,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, btrfs_info(fs_info, "resizing devid %llu", devid); } - device = btrfs_find_device(fs_info, devid, NULL, NULL); + device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); if (!device) { btrfs_info(fs_info, "resizer unable to find device %llu", devid); @@ -3178,7 +3178,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, s_uuid = di_args->uuid; rcu_read_lock(); - dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL); + dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid, + NULL, true); if (!dev) { ret = -ENODEV; @@ -3241,32 +3242,17 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); } -static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, +static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { - u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; int ret; - u64 len = olen; - - if (loff + len == src->i_size) - len = ALIGN(src->i_size, bs) - loff; - /* - * For same inode case we don't want our length pushed out past i_size - * as comparing that data range makes no sense. - * - * This effectively means we require aligned extents for the single - * inode case, whereas the other cases allow an unaligned length so long - * as it ends at i_size. - */ - if (dst == src && len != olen) - return -EINVAL; /* * Lock destination range to serialize with concurrent readpages() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, loff, dst, dst_loff, len); - ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); + ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1); btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); return ret; @@ -3278,21 +3264,10 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret; - int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT; u64 i, tail_len, chunk_count; - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) - return -EINVAL; - - if (IS_SWAPFILE(src) || IS_SWAPFILE(dst)) - return -ETXTBSY; - tail_len = olen % BTRFS_MAX_DEDUPE_LEN; chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); - if (chunk_count == 0) - num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT; for (i = 0; i < chunk_count; i++) { ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, @@ -3908,14 +3883,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * be either compressed or non-compressed. */ - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - return -EINVAL; - - if (IS_SWAPFILE(src) || IS_SWAPFILE(inode)) - return -ETXTBSY; - /* * VFS's generic_remap_file_range_prep() protects us from cloning the * eof block into the middle of a file, which would result in corruption @@ -3991,6 +3958,13 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, else btrfs_double_inode_lock(inode_in, inode_out); + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { + ret = -EINVAL; + goto out_unlock; + } + /* * Now that the inodes are locked, we need to start writeback ourselves * and can not rely on the writeback from the VFS's generic helper @@ -4381,7 +4355,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 0); - if (copy_to_user(arg, sa, sizeof(*sa))) + if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; if (!(sa->flags & BTRFS_SCRUB_READONLY)) @@ -4414,7 +4388,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info, ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress); - if (copy_to_user(arg, sa, sizeof(*sa))) + if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; kfree(sa); @@ -4438,7 +4412,7 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info, ret = btrfs_get_dev_stats(fs_info, sa); - if (copy_to_user(arg, sa, sizeof(*sa))) + if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; kfree(sa); @@ -4484,7 +4458,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, break; } - if (copy_to_user(arg, p, sizeof(*p))) + if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p))) ret = -EFAULT; out: kfree(p); @@ -4790,7 +4764,7 @@ do_balance: ret = btrfs_balance(fs_info, bctl, bargs); bctl = NULL; - if (arg) { + if ((ret == 0 || ret == -ECANCELED) && arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; } diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 1da768e5ef75..82b84e4daad1 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -14,43 +14,58 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb); -/* - * if we currently have a spinning reader or writer lock - * (indicated by the rw flag) this will bump the count - * of blocking holders and drop the spinlock. - */ -void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) +void btrfs_set_lock_blocking_read(struct extent_buffer *eb) { /* - * no lock is required. The lock owner may change if - * we have a read lock, but it won't change to or away - * from us. If we have the write lock, we are the owner - * and it'll never change. + * No lock is required. The lock owner may change if we have a read + * lock, but it won't change to or away from us. If we have the write + * lock, we are the owner and it'll never change. */ if (eb->lock_nested && current->pid == eb->lock_owner) return; - if (rw == BTRFS_WRITE_LOCK) { - if (atomic_read(&eb->blocking_writers) == 0) { - WARN_ON(atomic_read(&eb->spinning_writers) != 1); - atomic_dec(&eb->spinning_writers); - btrfs_assert_tree_locked(eb); - atomic_inc(&eb->blocking_writers); - write_unlock(&eb->lock); - } - } else if (rw == BTRFS_READ_LOCK) { - btrfs_assert_tree_read_locked(eb); - atomic_inc(&eb->blocking_readers); - WARN_ON(atomic_read(&eb->spinning_readers) == 0); - atomic_dec(&eb->spinning_readers); - read_unlock(&eb->lock); + btrfs_assert_tree_read_locked(eb); + atomic_inc(&eb->blocking_readers); + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); + read_unlock(&eb->lock); +} + +void btrfs_set_lock_blocking_write(struct extent_buffer *eb) +{ + /* + * No lock is required. The lock owner may change if we have a read + * lock, but it won't change to or away from us. If we have the write + * lock, we are the owner and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; + if (atomic_read(&eb->blocking_writers) == 0) { + WARN_ON(atomic_read(&eb->spinning_writers) != 1); + atomic_dec(&eb->spinning_writers); + btrfs_assert_tree_locked(eb); + atomic_inc(&eb->blocking_writers); + write_unlock(&eb->lock); } } -/* - * if we currently have a blocking lock, take the spinlock - * and drop our blocking count - */ -void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) +void btrfs_clear_lock_blocking_read(struct extent_buffer *eb) +{ + /* + * No lock is required. The lock owner may change if we have a read + * lock, but it won't change to or away from us. If we have the write + * lock, we are the owner and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; + BUG_ON(atomic_read(&eb->blocking_readers) == 0); + read_lock(&eb->lock); + atomic_inc(&eb->spinning_readers); + /* atomic_dec_and_test implies a barrier */ + if (atomic_dec_and_test(&eb->blocking_readers)) + cond_wake_up_nomb(&eb->read_lock_wq); +} + +void btrfs_clear_lock_blocking_write(struct extent_buffer *eb) { /* * no lock is required. The lock owner may change if @@ -60,23 +75,13 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) */ if (eb->lock_nested && current->pid == eb->lock_owner) return; - - if (rw == BTRFS_WRITE_LOCK_BLOCKING) { - BUG_ON(atomic_read(&eb->blocking_writers) != 1); - write_lock(&eb->lock); - WARN_ON(atomic_read(&eb->spinning_writers)); - atomic_inc(&eb->spinning_writers); - /* atomic_dec_and_test implies a barrier */ - if (atomic_dec_and_test(&eb->blocking_writers)) - cond_wake_up_nomb(&eb->write_lock_wq); - } else if (rw == BTRFS_READ_LOCK_BLOCKING) { - BUG_ON(atomic_read(&eb->blocking_readers) == 0); - read_lock(&eb->lock); - atomic_inc(&eb->spinning_readers); - /* atomic_dec_and_test implies a barrier */ - if (atomic_dec_and_test(&eb->blocking_readers)) - cond_wake_up_nomb(&eb->read_lock_wq); - } + BUG_ON(atomic_read(&eb->blocking_writers) != 1); + write_lock(&eb->lock); + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_inc(&eb->spinning_writers); + /* atomic_dec_and_test implies a barrier */ + if (atomic_dec_and_test(&eb->blocking_writers)) + cond_wake_up_nomb(&eb->write_lock_wq); } /* @@ -232,16 +237,9 @@ again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); write_lock(&eb->lock); - if (atomic_read(&eb->blocking_readers)) { + if (atomic_read(&eb->blocking_readers) || + atomic_read(&eb->blocking_writers)) { write_unlock(&eb->lock); - wait_event(eb->read_lock_wq, - atomic_read(&eb->blocking_readers) == 0); - goto again; - } - if (atomic_read(&eb->blocking_writers)) { - write_unlock(&eb->lock); - wait_event(eb->write_lock_wq, - atomic_read(&eb->blocking_writers) == 0); goto again; } WARN_ON(atomic_read(&eb->spinning_writers)); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 29135def468e..595014f64830 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -17,8 +17,10 @@ void btrfs_tree_unlock(struct extent_buffer *eb); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); -void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw); -void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); +void btrfs_set_lock_blocking_read(struct extent_buffer *eb); +void btrfs_set_lock_blocking_write(struct extent_buffer *eb); +void btrfs_clear_lock_blocking_read(struct extent_buffer *eb); +void btrfs_clear_lock_blocking_write(struct extent_buffer *eb); void btrfs_assert_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); @@ -37,13 +39,4 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) BUG(); } -static inline void btrfs_set_lock_blocking(struct extent_buffer *eb) -{ - btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK); -} - -static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb) -{ - btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING); -} #endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 90639140439f..579d53ae256f 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -61,6 +61,28 @@ struct workspace { struct list_head list; }; +static struct workspace_manager wsm; + +static void lzo_init_workspace_manager(void) +{ + btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress); +} + +static void lzo_cleanup_workspace_manager(void) +{ + btrfs_cleanup_workspace_manager(&wsm); +} + +static struct list_head *lzo_get_workspace(unsigned int level) +{ + return btrfs_get_workspace(&wsm, level); +} + +static void lzo_put_workspace(struct list_head *ws) +{ + btrfs_put_workspace(&wsm, ws); +} + static void lzo_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -71,7 +93,7 @@ static void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -static struct list_head *lzo_alloc_workspace(void) +static struct list_head *lzo_alloc_workspace(unsigned int level) { struct workspace *workspace; @@ -485,11 +507,16 @@ out: return ret; } -static void lzo_set_level(struct list_head *ws, unsigned int type) +static unsigned int lzo_set_level(unsigned int level) { + return 0; } const struct btrfs_compress_op btrfs_lzo_compress = { + .init_workspace_manager = lzo_init_workspace_manager, + .cleanup_workspace_manager = lzo_cleanup_workspace_manager, + .get_workspace = lzo_get_workspace, + .put_workspace = lzo_put_workspace, .alloc_workspace = lzo_alloc_workspace, .free_workspace = lzo_free_workspace, .compress_pages = lzo_compress_pages, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4e473a998219..c1cd5558a646 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1546,12 +1546,18 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, parent_node = *p; entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, node); - if (bytenr < entry->bytenr) + if (bytenr < entry->bytenr) { p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) + } else if (bytenr > entry->bytenr) { p = &(*p)->rb_right; - else + } else { + if (record->data_rsv && !entry->data_rsv) { + entry->data_rsv = record->data_rsv; + entry->data_rsv_refroot = + record->data_rsv_refroot; + } return 1; + } } rb_link_node(&record->node, parent_node, p); @@ -1597,7 +1603,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || bytenr == 0 || num_bytes == 0) return 0; - record = kmalloc(sizeof(*record), gfp_flag); + record = kzalloc(sizeof(*record), gfp_flag); if (!record) return -ENOMEM; @@ -1832,7 +1838,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, src_path->nodes[cur_level] = eb; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; } @@ -1973,7 +1979,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, dst_path->slots[cur_level] = 0; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; need_cleanup = true; } @@ -2017,86 +2023,30 @@ out: return ret; } -/* - * Inform qgroup to trace subtree swap used in balance. - * - * Unlike btrfs_qgroup_trace_subtree(), this function will only trace - * new tree blocks whose generation is equal to (or larger than) @last_snapshot. - * - * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and - * @dst_slot), and find any tree blocks whose generation is at @last_snapshot, - * and then go down @src_eb (pointed by @src_parent and @src_slot) to find - * the counterpart of the tree block, then mark both tree blocks as qgroup dirty, - * and skip all tree blocks whose generation is smaller than last_snapshot. - * - * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(), - * which could be the cause of very slow balance if the file tree is large. - * - * @src_parent, @src_slot: pointer to src (file tree) eb. - * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb. - */ -int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *bg_cache, - struct extent_buffer *src_parent, int src_slot, - struct extent_buffer *dst_parent, int dst_slot, - u64 last_snapshot) +static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, + struct extent_buffer *src_eb, + struct extent_buffer *dst_eb, + u64 last_snapshot, bool trace_leaf) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *dst_path = NULL; - struct btrfs_key first_key; - struct extent_buffer *src_eb = NULL; - struct extent_buffer *dst_eb = NULL; - bool trace_leaf = false; - u64 child_gen; - u64 child_bytenr; int level; int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return 0; - /* Check parameter order */ - if (btrfs_node_ptr_generation(src_parent, src_slot) > - btrfs_node_ptr_generation(dst_parent, dst_slot)) { + /* Wrong parameter order */ + if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { btrfs_err_rl(fs_info, "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, - btrfs_node_ptr_generation(src_parent, src_slot), - btrfs_node_ptr_generation(dst_parent, dst_slot)); + btrfs_header_generation(src_eb), + btrfs_header_generation(dst_eb)); return -EUCLEAN; } - /* - * Only trace leaf if we're relocating data block groups, this could - * reduce tons of data extents tracing for meta/sys bg relocation. - */ - if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA) - trace_leaf = true; - /* Read out real @src_eb, pointed by @src_parent and @src_slot */ - child_bytenr = btrfs_node_blockptr(src_parent, src_slot); - child_gen = btrfs_node_ptr_generation(src_parent, src_slot); - btrfs_node_key_to_cpu(src_parent, &first_key, src_slot); - - src_eb = read_tree_block(fs_info, child_bytenr, child_gen, - btrfs_header_level(src_parent) - 1, &first_key); - if (IS_ERR(src_eb)) { - ret = PTR_ERR(src_eb); - goto out; - } - - /* Read out real @dst_eb, pointed by @src_parent and @src_slot */ - child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot); - child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot); - btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot); - - dst_eb = read_tree_block(fs_info, child_bytenr, child_gen, - btrfs_header_level(dst_parent) - 1, &first_key); - if (IS_ERR(dst_eb)) { - ret = PTR_ERR(dst_eb); - goto out; - } - if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { - ret = -EINVAL; + ret = -EIO; goto out; } @@ -2106,14 +2056,13 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, ret = -ENOMEM; goto out; } - /* For dst_path */ extent_buffer_get(dst_eb); dst_path->nodes[level] = dst_eb; dst_path->slots[level] = 0; dst_path->locks[level] = 0; - /* Do the generation-aware breadth-first search */ + /* Do the generation aware breadth-first search */ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, level, last_snapshot, trace_leaf); if (ret < 0) @@ -2121,8 +2070,6 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, ret = 0; out: - free_extent_buffer(src_eb); - free_extent_buffer(dst_eb); btrfs_free_path(dst_path); if (ret < 0) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -2207,7 +2154,7 @@ walk_down: path->slots[level] = 0; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; ret = btrfs_qgroup_trace_extent(trans, child_bytenr, @@ -2576,6 +2523,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) goto cleanup; } + /* Free the reserved data space */ + btrfs_qgroup_free_refroot(fs_info, + record->data_rsv_refroot, + record->data_rsv, + BTRFS_QGROUP_RSV_DATA); /* * Use SEQ_LAST as time_seq to do special search, which * doesn't lock tree or delayed_refs and search current @@ -2842,16 +2794,15 @@ out: /* * Two limits to commit transaction in advance. * - * For RATIO, it will be 1/RATIO of the remaining limit - * (excluding data and prealloc meta) as threshold. + * For RATIO, it will be 1/RATIO of the remaining limit as threshold. * For SIZE, it will be in byte unit as threshold. */ -#define QGROUP_PERTRANS_RATIO 32 -#define QGROUP_PERTRANS_SIZE SZ_32M +#define QGROUP_FREE_RATIO 32 +#define QGROUP_FREE_SIZE SZ_32M static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, const struct btrfs_qgroup *qg, u64 num_bytes) { - u64 limit; + u64 free; u64 threshold; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && @@ -2870,20 +2821,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, */ if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER | BTRFS_QGROUP_LIMIT_MAX_EXCL))) { - if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) - limit = qg->max_excl; - else - limit = qg->max_rfer; - threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] - - qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) / - QGROUP_PERTRANS_RATIO; - threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE); + if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { + free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl; + threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO, + QGROUP_FREE_SIZE); + } else { + free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer; + threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO, + QGROUP_FREE_SIZE); + } /* * Use transaction_kthread to commit transaction, so we no * longer need to bother nested transaction nor lock context. */ - if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold) + if (free < threshold) btrfs_commit_transaction_locksafe(fs_info); } @@ -2959,7 +2911,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, qg = unode_aux_to_qgroup(unode); - trace_qgroup_update_reserve(fs_info, qg, num_bytes, type); qgroup_rsv_add(fs_info, qg, num_bytes, type); } @@ -3026,7 +2977,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); - trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type); qgroup_rsv_release(fs_info, qg, num_bytes, type); list_for_each_entry(glist, &qg->groups, next_group) { @@ -3783,3 +3733,241 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) } extent_changeset_release(&changeset); } + +void btrfs_qgroup_init_swapped_blocks( + struct btrfs_qgroup_swapped_blocks *swapped_blocks) +{ + int i; + + spin_lock_init(&swapped_blocks->lock); + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + swapped_blocks->blocks[i] = RB_ROOT; + swapped_blocks->swapped = false; +} + +/* + * Delete all swapped blocks record of @root. + * Every record here means we skipped a full subtree scan for qgroup. + * + * Gets called when committing one transaction. + */ +void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root) +{ + struct btrfs_qgroup_swapped_blocks *swapped_blocks; + int i; + + swapped_blocks = &root->swapped_blocks; + + spin_lock(&swapped_blocks->lock); + if (!swapped_blocks->swapped) + goto out; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + struct rb_root *cur_root = &swapped_blocks->blocks[i]; + struct btrfs_qgroup_swapped_block *entry; + struct btrfs_qgroup_swapped_block *next; + + rbtree_postorder_for_each_entry_safe(entry, next, cur_root, + node) + kfree(entry); + swapped_blocks->blocks[i] = RB_ROOT; + } + swapped_blocks->swapped = false; +out: + spin_unlock(&swapped_blocks->lock); +} + +/* + * Add subtree roots record into @subvol_root. + * + * @subvol_root: tree root of the subvolume tree get swapped + * @bg: block group under balance + * @subvol_parent/slot: pointer to the subtree root in subvolume tree + * @reloc_parent/slot: pointer to the subtree root in reloc tree + * BOTH POINTERS ARE BEFORE TREE SWAP + * @last_snapshot: last snapshot generation of the subvolume tree + */ +int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *subvol_root, + struct btrfs_block_group_cache *bg, + struct extent_buffer *subvol_parent, int subvol_slot, + struct extent_buffer *reloc_parent, int reloc_slot, + u64 last_snapshot) +{ + struct btrfs_fs_info *fs_info = subvol_root->fs_info; + struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; + struct btrfs_qgroup_swapped_block *block; + struct rb_node **cur; + struct rb_node *parent = NULL; + int level = btrfs_header_level(subvol_parent) - 1; + int ret = 0; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > + btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { + btrfs_err_rl(fs_info, + "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", + __func__, + btrfs_node_ptr_generation(subvol_parent, subvol_slot), + btrfs_node_ptr_generation(reloc_parent, reloc_slot)); + return -EUCLEAN; + } + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + ret = -ENOMEM; + goto out; + } + + /* + * @reloc_parent/slot is still before swap, while @block is going to + * record the bytenr after swap, so we do the swap here. + */ + block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot); + block->subvol_generation = btrfs_node_ptr_generation(reloc_parent, + reloc_slot); + block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot); + block->reloc_generation = btrfs_node_ptr_generation(subvol_parent, + subvol_slot); + block->last_snapshot = last_snapshot; + block->level = level; + if (bg->flags & BTRFS_BLOCK_GROUP_DATA) + block->trace_leaf = true; + else + block->trace_leaf = false; + btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot); + + /* Insert @block into @blocks */ + spin_lock(&blocks->lock); + cur = &blocks->blocks[level].rb_node; + while (*cur) { + struct btrfs_qgroup_swapped_block *entry; + + parent = *cur; + entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, + node); + + if (entry->subvol_bytenr < block->subvol_bytenr) { + cur = &(*cur)->rb_left; + } else if (entry->subvol_bytenr > block->subvol_bytenr) { + cur = &(*cur)->rb_right; + } else { + if (entry->subvol_generation != + block->subvol_generation || + entry->reloc_bytenr != block->reloc_bytenr || + entry->reloc_generation != + block->reloc_generation) { + /* + * Duplicated but mismatch entry found. + * Shouldn't happen. + * + * Marking qgroup inconsistent should be enough + * for end users. + */ + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + ret = -EEXIST; + } + kfree(block); + goto out_unlock; + } + } + rb_link_node(&block->node, parent, cur); + rb_insert_color(&block->node, &blocks->blocks[level]); + blocks->swapped = true; +out_unlock: + spin_unlock(&blocks->lock); +out: + if (ret < 0) + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + return ret; +} + +/* + * Check if the tree block is a subtree root, and if so do the needed + * delayed subtree trace for qgroup. + * + * This is called during btrfs_cow_block(). + */ +int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *subvol_eb) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; + struct btrfs_qgroup_swapped_block *block; + struct extent_buffer *reloc_eb = NULL; + struct rb_node *node; + bool found = false; + bool swapped = false; + int level = btrfs_header_level(subvol_eb); + int ret = 0; + int i; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + if (!is_fstree(root->root_key.objectid) || !root->reloc_root) + return 0; + + spin_lock(&blocks->lock); + if (!blocks->swapped) { + spin_unlock(&blocks->lock); + return 0; + } + node = blocks->blocks[level].rb_node; + + while (node) { + block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); + if (block->subvol_bytenr < subvol_eb->start) { + node = node->rb_left; + } else if (block->subvol_bytenr > subvol_eb->start) { + node = node->rb_right; + } else { + found = true; + break; + } + } + if (!found) { + spin_unlock(&blocks->lock); + goto out; + } + /* Found one, remove it from @blocks first and update blocks->swapped */ + rb_erase(&block->node, &blocks->blocks[level]); + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (RB_EMPTY_ROOT(&blocks->blocks[i])) { + swapped = true; + break; + } + } + blocks->swapped = swapped; + spin_unlock(&blocks->lock); + + /* Read out reloc subtree root */ + reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, + block->reloc_generation, block->level, + &block->first_key); + if (IS_ERR(reloc_eb)) { + ret = PTR_ERR(reloc_eb); + reloc_eb = NULL; + goto free_out; + } + if (!extent_buffer_uptodate(reloc_eb)) { + ret = -EIO; + goto free_out; + } + + ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, + block->last_snapshot, block->trace_leaf); +free_out: + kfree(block); + free_extent_buffer(reloc_eb); +out: + if (ret < 0) { + btrfs_err_rl(fs_info, + "failed to account subtree at bytenr %llu: %d", + subvol_eb->start, ret); + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } + return ret; +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 20c6bd5fa701..46ba7bd2961c 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -6,6 +6,8 @@ #ifndef BTRFS_QGROUP_H #define BTRFS_QGROUP_H +#include <linux/spinlock.h> +#include <linux/rbtree.h> #include "ulist.h" #include "delayed-ref.h" @@ -38,6 +40,66 @@ */ /* + * Special performance optimization for balance. + * + * For balance, we need to swap subtree of subvolume and reloc trees. + * In theory, we need to trace all subtree blocks of both subvolume and reloc + * trees, since their owner has changed during such swap. + * + * However since balance has ensured that both subtrees are containing the + * same contents and have the same tree structures, such swap won't cause + * qgroup number change. + * + * But there is a race window between subtree swap and transaction commit, + * during that window, if we increase/decrease tree level or merge/split tree + * blocks, we still need to trace the original subtrees. + * + * So for balance, we use a delayed subtree tracing, whose workflow is: + * + * 1) Record the subtree root block get swapped. + * + * During subtree swap: + * O = Old tree blocks + * N = New tree blocks + * reloc tree subvolume tree X + * Root Root + * / \ / \ + * NA OB OA OB + * / | | \ / | | \ + * NC ND OE OF OC OD OE OF + * + * In this case, NA and OA are going to be swapped, record (NA, OA) into + * subvolume tree X. + * + * 2) After subtree swap. + * reloc tree subvolume tree X + * Root Root + * / \ / \ + * OA OB NA OB + * / | | \ / | | \ + * OC OD OE OF NC ND OE OF + * + * 3a) COW happens for OB + * If we are going to COW tree block OB, we check OB's bytenr against + * tree X's swapped_blocks structure. + * If it doesn't fit any, nothing will happen. + * + * 3b) COW happens for NA + * Check NA's bytenr against tree X's swapped_blocks, and get a hit. + * Then we do subtree scan on both subtrees OA and NA. + * Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND). + * + * Then no matter what we do to subvolume tree X, qgroup numbers will + * still be correct. + * Then NA's record gets removed from X's swapped_blocks. + * + * 4) Transaction commit + * Any record in X's swapped_blocks gets removed, since there is no + * modification to the swapped subtrees, no need to trigger heavy qgroup + * subtree rescan for them. + */ + +/* * Record a dirty extent, and info qgroup to update quota on it * TODO: Use kmem cache to alloc it. */ @@ -45,9 +107,38 @@ struct btrfs_qgroup_extent_record { struct rb_node node; u64 bytenr; u64 num_bytes; + + /* + * For qgroup reserved data space freeing. + * + * @data_rsv_refroot and @data_rsv will be recorded after + * BTRFS_ADD_DELAYED_EXTENT is called. + * And will be used to free reserved qgroup space at + * transaction commit time. + */ + u32 data_rsv; /* reserved data space needs to be freed */ + u64 data_rsv_refroot; /* which root the reserved data belongs to */ struct ulist *old_roots; }; +struct btrfs_qgroup_swapped_block { + struct rb_node node; + + int level; + bool trace_leaf; + + /* bytenr/generation of the tree block in subvolume tree after swap */ + u64 subvol_bytenr; + u64 subvol_generation; + + /* bytenr/generation of the tree block in reloc tree after swap */ + u64 reloc_bytenr; + u64 reloc_generation; + + u64 last_snapshot; + struct btrfs_key first_key; +}; + /* * Qgroup reservation types: * @@ -236,12 +327,6 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level); - -int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *bg_cache, - struct extent_buffer *src_parent, int src_slot, - struct extent_buffer *dst_parent, int dst_slot, - u64 last_snapshot); int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct ulist *old_roots, struct ulist *new_roots); @@ -252,15 +337,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type); -static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, - u64 ref_root, u64 num_bytes) -{ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) - return; - trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); - btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes, - BTRFS_QGROUP_RSV_DATA); -} #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, @@ -325,4 +401,18 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); void btrfs_qgroup_check_reserved_leak(struct inode *inode); +/* btrfs_qgroup_swapped_blocks related functions */ +void btrfs_qgroup_init_swapped_blocks( + struct btrfs_qgroup_swapped_blocks *swapped_blocks); + +void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); +int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *subvol_root, + struct btrfs_block_group_cache *bg, + struct extent_buffer *subvol_parent, int subvol_slot, + struct extent_buffer *reloc_parent, int reloc_slot, + u64 last_snapshot); +int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *eb); + #endif diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index c3557c12656b..d09b6cdb785a 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -583,7 +583,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, return -EIO; } btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); path->nodes[level-1] = eb; path->slots[level-1] = 0; path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING; @@ -987,7 +987,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) return -ENOMEM; eb = btrfs_read_lock_root_node(fs_info->extent_root); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); level = btrfs_header_level(eb); path->nodes[level] = eb; path->slots[level] = 0; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 272b287f8cf0..ddf028509931 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -162,6 +162,8 @@ struct reloc_control { struct mapping_tree reloc_root_tree; /* list of reloc trees */ struct list_head reloc_roots; + /* list of subvolume trees that get relocated */ + struct list_head dirty_subvol_roots; /* size of metadata reservation for merging reloc trees */ u64 merging_rsv_size; /* size of relocated tree nodes */ @@ -1467,15 +1469,17 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root_item *root_item; int ret; - if (!root->reloc_root) + if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) || + !root->reloc_root) goto out; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; + /* root->reloc_root will stay until current relocation finished */ if (fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { - root->reloc_root = NULL; + set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); __del_reloc_root(reloc_root); } @@ -1773,7 +1777,7 @@ again: btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); eb = btrfs_lock_root_node(dest); - btrfs_set_lock_blocking(eb); + btrfs_set_lock_blocking_write(eb); level = btrfs_header_level(eb); if (level < lowest_level) { @@ -1786,7 +1790,7 @@ again: ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); BUG_ON(ret); } - btrfs_set_lock_blocking(eb); + btrfs_set_lock_blocking_write(eb); if (next_key) { next_key->objectid = (u64)-1; @@ -1802,6 +1806,8 @@ again: BUG_ON(level < lowest_level); ret = btrfs_bin_search(parent, &key, level, &slot); + if (ret < 0) + break; if (ret && slot > 0) slot--; @@ -1852,7 +1858,7 @@ again: slot, &eb); BUG_ON(ret); } - btrfs_set_lock_blocking(eb); + btrfs_set_lock_blocking_write(eb); btrfs_tree_unlock(parent); free_extent_buffer(parent); @@ -1885,15 +1891,18 @@ again: * If not traced, we will leak data numbers * 2) Fs subtree * If not traced, we will double count old data - * and tree block numbers, if current trans doesn't free - * data reloc tree inode. + * + * We don't scan the subtree right now, but only record + * the swapped tree blocks. + * The real subtree rescan is delayed until we have new + * CoW on the subtree root node before transaction commit. */ - ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group, - parent, slot, path->nodes[level], - path->slots[level], last_snapshot); + ret = btrfs_qgroup_add_swapped_blocks(trans, dest, + rc->block_group, parent, slot, + path->nodes[level], path->slots[level], + last_snapshot); if (ret < 0) break; - /* * swap blocks in fs tree and reloc tree. */ @@ -2121,6 +2130,58 @@ static int find_next_key(struct btrfs_path *path, int level, } /* + * Insert current subvolume into reloc_control::dirty_subvol_roots + */ +static void insert_dirty_subvol(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root = root->reloc_root; + struct btrfs_root_item *reloc_root_item; + + /* @root must be a subvolume tree root with a valid reloc tree */ + ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + ASSERT(reloc_root); + + reloc_root_item = &reloc_root->root_item; + memset(&reloc_root_item->drop_progress, 0, + sizeof(reloc_root_item->drop_progress)); + reloc_root_item->drop_level = 0; + btrfs_set_root_refs(reloc_root_item, 0); + btrfs_update_reloc_root(trans, root); + + if (list_empty(&root->reloc_dirty_list)) { + btrfs_grab_fs_root(root); + list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); + } +} + +static int clean_dirty_subvols(struct reloc_control *rc) +{ + struct btrfs_root *root; + struct btrfs_root *next; + int ret = 0; + + list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, + reloc_dirty_list) { + struct btrfs_root *reloc_root = root->reloc_root; + + clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); + list_del_init(&root->reloc_dirty_list); + root->reloc_root = NULL; + if (reloc_root) { + int ret2; + + ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1); + if (ret2 < 0 && !ret) + ret = ret2; + } + btrfs_put_fs_root(root); + } + return ret; +} + +/* * merge the relocated tree blocks in reloc tree with corresponding * fs tree. */ @@ -2128,7 +2189,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - LIST_HEAD(inode_list); struct btrfs_key key; struct btrfs_key next_key; struct btrfs_trans_handle *trans = NULL; @@ -2259,13 +2319,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, out: btrfs_free_path(path); - if (err == 0) { - memset(&root_item->drop_progress, 0, - sizeof(root_item->drop_progress)); - root_item->drop_level = 0; - btrfs_set_root_refs(root_item, 0); - btrfs_update_reloc_root(trans, root); - } + if (err == 0) + insert_dirty_subvol(trans, rc, root); if (trans) btrfs_end_transaction_throttle(trans); @@ -2410,14 +2465,6 @@ again: } else { list_del_init(&reloc_root->root_list); } - - ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); - if (ret < 0) { - if (list_empty(&reloc_root->root_list)) - list_add_tail(&reloc_root->root_list, - &reloc_roots); - goto out; - } } if (found) { @@ -2685,6 +2732,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!lowest) { ret = btrfs_bin_search(upper->eb, key, upper->level, &slot); + if (ret < 0) { + err = ret; + goto next; + } BUG_ON(ret); bytenr = btrfs_node_blockptr(upper->eb, slot); if (node->eb->start == bytenr) @@ -2720,6 +2771,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, } else { ret = btrfs_bin_search(upper->eb, key, upper->level, &slot); + if (ret < 0) { + err = ret; + goto next; + } BUG_ON(ret); } @@ -2752,7 +2807,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, goto next; } btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); + btrfs_set_lock_blocking_write(eb); if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, @@ -4079,6 +4134,9 @@ restart: goto out_free; } btrfs_commit_transaction(trans); + ret = clean_dirty_subvols(rc); + if (ret < 0 && !err) + err = ret; out_free: btrfs_free_block_rsv(fs_info, rc->block_rsv); btrfs_free_path(path); @@ -4173,6 +4231,7 @@ static struct reloc_control *alloc_reloc_control(void) return NULL; INIT_LIST_HEAD(&rc->reloc_roots); + INIT_LIST_HEAD(&rc->dirty_subvol_roots); backref_cache_init(&rc->backref_cache); mapping_tree_init(&rc->reloc_root_tree); extent_io_tree_init(&rc->processed_blocks, NULL); @@ -4468,6 +4527,10 @@ int btrfs_recover_relocation(struct btrfs_root *root) goto out_free; } err = btrfs_commit_transaction(trans); + + ret = clean_dirty_subvols(rc); + if (ret < 0 && !err) + err = ret; out_free: kfree(rc); out: diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 65bda0682928..0d2b957ca3a3 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -21,12 +21,12 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, struct btrfs_root_item *item) { uuid_le uuid; - int len; + u32 len; int need_reset = 0; len = btrfs_item_size_nr(eb, slot); read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot), - min_t(int, len, (int)sizeof(*item))); + min_t(u32, len, sizeof(*item))); if (len < sizeof(*item)) need_reset = 1; if (!need_reset && btrfs_root_generation(item) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6dcd36d7b849..a99588536c79 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -584,6 +584,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; sctx->fs_info = fs_info; + INIT_LIST_HEAD(&sctx->csum_list); for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio; @@ -608,7 +609,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); - INIT_LIST_HEAD(&sctx->csum_list); spin_lock_init(&sctx->list_lock); spin_lock_init(&sctx->stat_lock); @@ -3741,25 +3741,33 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; - if (fs_info->scrub_workers_refcnt == 0) { + lockdep_assert_held(&fs_info->scrub_lock); + + if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { + ASSERT(fs_info->scrub_workers == NULL); fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags, is_dev_replace ? 1 : max_active, 4); if (!fs_info->scrub_workers) goto fail_scrub_workers; + ASSERT(fs_info->scrub_wr_completion_workers == NULL); fs_info->scrub_wr_completion_workers = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, max_active, 2); if (!fs_info->scrub_wr_completion_workers) goto fail_scrub_wr_completion_workers; + ASSERT(fs_info->scrub_parity_workers == NULL); fs_info->scrub_parity_workers = btrfs_alloc_workqueue(fs_info, "scrubparity", flags, max_active, 2); if (!fs_info->scrub_parity_workers) goto fail_scrub_parity_workers; + + refcount_set(&fs_info->scrub_workers_refcnt, 1); + } else { + refcount_inc(&fs_info->scrub_workers_refcnt); } - ++fs_info->scrub_workers_refcnt; return 0; fail_scrub_parity_workers: @@ -3770,16 +3778,6 @@ fail_scrub_workers: return -ENOMEM; } -static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) -{ - if (--fs_info->scrub_workers_refcnt == 0) { - btrfs_destroy_workqueue(fs_info->scrub_workers); - btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); - btrfs_destroy_workqueue(fs_info->scrub_parity_workers); - } - WARN_ON(fs_info->scrub_workers_refcnt < 0); -} - int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace) @@ -3788,6 +3786,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, int ret; struct btrfs_device *dev; unsigned int nofs_flag; + struct btrfs_workqueue *scrub_workers = NULL; + struct btrfs_workqueue *scrub_wr_comp = NULL; + struct btrfs_workqueue *scrub_parity = NULL; if (btrfs_fs_closing(fs_info)) return -EINVAL; @@ -3835,7 +3836,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return PTR_ERR(sctx); mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -3903,6 +3904,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, */ nofs_flag = memalloc_nofs_save(); if (!is_dev_replace) { + btrfs_info(fs_info, "scrub: started on devid %llu", devid); /* * by holding device list mutex, we can * kick off writing super in log tree sync. @@ -3925,11 +3927,26 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (progress) memcpy(progress, &sctx->stat, sizeof(*progress)); + if (!is_dev_replace) + btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d", + ret ? "not finished" : "finished", devid, ret); + mutex_lock(&fs_info->scrub_lock); dev->scrub_ctx = NULL; - scrub_workers_put(fs_info); + if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) { + scrub_workers = fs_info->scrub_workers; + scrub_wr_comp = fs_info->scrub_wr_completion_workers; + scrub_parity = fs_info->scrub_parity_workers; + + fs_info->scrub_workers = NULL; + fs_info->scrub_wr_completion_workers = NULL; + fs_info->scrub_parity_workers = NULL; + } mutex_unlock(&fs_info->scrub_lock); + btrfs_destroy_workqueue(scrub_workers); + btrfs_destroy_workqueue(scrub_wr_comp); + btrfs_destroy_workqueue(scrub_parity); scrub_put_ctx(sctx); return ret; @@ -4012,7 +4029,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct scrub_ctx *sctx = NULL; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); if (dev) sctx = dev->scrub_ctx; if (sctx) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0a3f122dd61f..120e4340792a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -529,7 +529,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, if (token != Opt_compress && token != Opt_compress_force) info->compress_level = - btrfs_compress_str2level(args[0].from); + btrfs_compress_str2level( + BTRFS_COMPRESS_ZLIB, + args[0].from + 4); btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); @@ -542,9 +544,13 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); no_compress = 0; - } else if (strcmp(args[0].from, "zstd") == 0) { + } else if (strncmp(args[0].from, "zstd", 4) == 0) { compress_type = "zstd"; info->compress_type = BTRFS_COMPRESS_ZSTD; + info->compress_level = + btrfs_compress_str2level( + BTRFS_COMPRESS_ZSTD, + args[0].from + 4); btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); @@ -2190,6 +2196,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; + case BTRFS_IOC_FORGET_DEV: + ret = btrfs_forget_devices(vol->name); + break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); device = btrfs_scan_one_device(vol->name, FMODE_READ, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4ec2b660d014..acdad6d658f5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -122,6 +122,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans) if (is_fstree(root->root_key.objectid)) btrfs_unpin_free_ino(root); clear_btree_io_tree(&root->dirty_log_pages); + btrfs_qgroup_clean_swapped_blocks(root); } /* We can free old roots now. */ @@ -845,8 +846,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans); + btrfs_create_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); @@ -1532,7 +1532,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - btrfs_set_lock_blocking(old); + btrfs_set_lock_blocking_write(old); ret = btrfs_copy_root(trans, root, old, &tmp, objectid); /* clean up in any case */ @@ -1943,8 +1943,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) cur_trans->delayed_refs.flushing = 1; smp_wmb(); - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans); + btrfs_create_pending_block_groups(trans); ret = btrfs_run_delayed_refs(trans, 0); if (ret) { diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 3c0987ab587d..5f9e2dd413af 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -52,7 +52,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, u32 nritems; root_node = btrfs_lock_root_node(root); - btrfs_set_lock_blocking(root_node); + btrfs_set_lock_blocking_write(root_node); nritems = btrfs_header_nritems(root_node); root->defrag_max.objectid = 0; /* from above we know this is not a leaf */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ac232b3d6d7e..f06454a55e00 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #define LOG_INODE_ALL 0 #define LOG_INODE_EXISTS 1 #define LOG_OTHER_INODE 2 +#define LOG_OTHER_INODE_ALL 3 /* * directory trouble cases @@ -1330,6 +1331,67 @@ out: return ret; } +static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct inode *dir, struct inode *inode, const char *name, + int namelen, u64 ref_index) +{ + struct btrfs_dir_item *dir_item; + struct btrfs_key key; + struct btrfs_path *path; + struct inode *other_inode = NULL; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + dir_item = btrfs_lookup_dir_item(NULL, root, path, + btrfs_ino(BTRFS_I(dir)), + name, namelen, 0); + if (!dir_item) { + btrfs_release_path(path); + goto add_link; + } else if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + goto out; + } + + /* + * Our inode's dentry collides with the dentry of another inode which is + * in the log but not yet processed since it has a higher inode number. + * So delete that other dentry. + */ + btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key); + btrfs_release_path(path); + other_inode = read_one_inode(root, key.objectid); + if (!other_inode) { + ret = -ENOENT; + goto out; + } + ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode), + name, namelen); + if (ret) + goto out; + /* + * If we dropped the link count to 0, bump it so that later the iput() + * on the inode will not free it. We will fixup the link count later. + */ + if (other_inode->i_nlink == 0) + inc_nlink(other_inode); + + ret = btrfs_run_delayed_items(trans); + if (ret) + goto out; +add_link: + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + name, namelen, 0, ref_index); +out: + iput(other_inode); + btrfs_free_path(path); + + return ret; +} + /* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. @@ -1466,9 +1528,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; /* insert our name */ - ret = btrfs_add_link(trans, BTRFS_I(dir), - BTRFS_I(inode), - name, namelen, 0, ref_index); + ret = add_link(trans, root, dir, inode, name, namelen, + ref_index); if (ret) goto out; @@ -2663,7 +2724,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + btrfs_set_lock_blocking_write(next); clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -2747,7 +2808,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + btrfs_set_lock_blocking_write(next); clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -2829,7 +2890,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + btrfs_set_lock_blocking_write(next); clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -3706,6 +3767,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, found_key.type = 0; ret = btrfs_bin_search(path->nodes[0], &found_key, 0, &start_slot); + if (ret < 0) + break; ret = btrfs_del_items(trans, log, path, start_slot, path->slots[0] - start_slot + 1); @@ -4717,7 +4780,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, const int slot, const struct btrfs_key *key, struct btrfs_inode *inode, - u64 *other_ino) + u64 *other_ino, u64 *other_parent) { int ret; struct btrfs_path *search_path; @@ -4780,8 +4843,13 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, btrfs_dir_item_key_to_cpu(search_path->nodes[0], di, &di_key); if (di_key.type == BTRFS_INODE_ITEM_KEY) { - ret = 1; - *other_ino = di_key.objectid; + if (di_key.objectid != key->objectid) { + ret = 1; + *other_ino = di_key.objectid; + *other_parent = parent; + } else { + ret = 0; + } } else { ret = -EAGAIN; } @@ -4801,6 +4869,144 @@ out: return ret; } +struct btrfs_ino_list { + u64 ino; + u64 parent; + struct list_head list; +}; + +static int log_conflicting_inodes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx, + u64 ino, u64 parent) +{ + struct btrfs_ino_list *ino_elem; + LIST_HEAD(inode_list); + int ret = 0; + + ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); + if (!ino_elem) + return -ENOMEM; + ino_elem->ino = ino; + ino_elem->parent = parent; + list_add_tail(&ino_elem->list, &inode_list); + + while (!list_empty(&inode_list)) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + struct inode *inode; + + ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, + list); + ino = ino_elem->ino; + parent = ino_elem->parent; + list_del(&ino_elem->list); + kfree(ino_elem); + if (ret) + continue; + + btrfs_release_path(path); + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + /* + * If the other inode that had a conflicting dir entry was + * deleted in the current transaction, we need to log its parent + * directory. + */ + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + if (ret == -ENOENT) { + key.objectid = parent; + inode = btrfs_iget(fs_info->sb, &key, root, + NULL); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + } else { + ret = btrfs_log_inode(trans, root, + BTRFS_I(inode), + LOG_OTHER_INODE_ALL, + 0, LLONG_MAX, ctx); + iput(inode); + } + } + continue; + } + /* + * We are safe logging the other inode without acquiring its + * lock as long as we log with the LOG_INODE_EXISTS mode. We + * are safe against concurrent renames of the other inode as + * well because during a rename we pin the log and update the + * log with the new name before we unpin it. + */ + ret = btrfs_log_inode(trans, root, BTRFS_I(inode), + LOG_OTHER_INODE, 0, LLONG_MAX, ctx); + if (ret) { + iput(inode); + continue; + } + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + iput(inode); + continue; + } + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u64 other_ino = 0; + u64 other_parent = 0; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + break; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || + (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY)) { + ret = 0; + break; + } + + ret = btrfs_check_ref_name_override(leaf, slot, &key, + BTRFS_I(inode), &other_ino, + &other_parent); + if (ret < 0) + break; + if (ret > 0) { + ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); + if (!ino_elem) { + ret = -ENOMEM; + break; + } + ino_elem->ino = other_ino; + ino_elem->parent = other_parent; + list_add_tail(&ino_elem->list, &inode_list); + ret = 0; + } + path->slots[0]++; + } + iput(inode); + } + + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4840,6 +5046,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, u64 logged_isize = 0; bool need_log_inode_item = true; bool xattrs_logged = false; + bool recursive_logging = false; path = btrfs_alloc_path(); if (!path) @@ -4885,8 +5092,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, return ret; } - if (inode_only == LOG_OTHER_INODE) { - inode_only = LOG_INODE_EXISTS; + if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { + recursive_logging = true; + if (inode_only == LOG_OTHER_INODE) + inode_only = LOG_INODE_EXISTS; + else + inode_only = LOG_INODE_ALL; mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); } else { mutex_lock(&inode->log_mutex); @@ -4981,20 +5192,19 @@ again: if ((min_key.type == BTRFS_INODE_REF_KEY || min_key.type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid) { + inode->generation == trans->transid && + !recursive_logging) { u64 other_ino = 0; + u64 other_parent = 0; ret = btrfs_check_ref_name_override(path->nodes[0], path->slots[0], &min_key, inode, - &other_ino); + &other_ino, &other_parent); if (ret < 0) { err = ret; goto out_unlock; } else if (ret > 0 && ctx && other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { - struct btrfs_key inode_key; - struct inode *other_inode; - if (ins_nr > 0) { ins_nr++; } else { @@ -5010,43 +5220,13 @@ again: goto out_unlock; } ins_nr = 0; - btrfs_release_path(path); - inode_key.objectid = other_ino; - inode_key.type = BTRFS_INODE_ITEM_KEY; - inode_key.offset = 0; - other_inode = btrfs_iget(fs_info->sb, - &inode_key, root, - NULL); - /* - * If the other inode that had a conflicting dir - * entry was deleted in the current transaction, - * we don't need to do more work nor fallback to - * a transaction commit. - */ - if (other_inode == ERR_PTR(-ENOENT)) { - goto next_key; - } else if (IS_ERR(other_inode)) { - err = PTR_ERR(other_inode); - goto out_unlock; - } - /* - * We are safe logging the other inode without - * acquiring its i_mutex as long as we log with - * the LOG_INODE_EXISTS mode. We're safe against - * concurrent renames of the other inode as well - * because during a rename we pin the log and - * update the log with the new name before we - * unpin it. - */ - err = btrfs_log_inode(trans, root, - BTRFS_I(other_inode), - LOG_OTHER_INODE, 0, LLONG_MAX, - ctx); - iput(other_inode); + + err = log_conflicting_inodes(trans, root, path, + ctx, other_ino, other_parent); if (err) goto out_unlock; - else - goto next_key; + btrfs_release_path(path); + goto next_key; } } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 15561926ab32..9024eee889b9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -415,27 +415,6 @@ static struct btrfs_device *__alloc_device(void) return dev; } -/* - * Find a device specified by @devid or @uuid in the list of @fs_devices, or - * return NULL. - * - * If devid and uuid are both specified, the match must be exact, otherwise - * only devid is used. - */ -static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, const u8 *uuid) -{ - struct btrfs_device *dev; - - list_for_each_entry(dev, &fs_devices->devices, dev_list) { - if (dev->devid == devid && - (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { - return dev; - } - } - return NULL; -} - static noinline struct btrfs_fs_devices *find_fsid( const u8 *fsid, const u8 *metadata_fsid) { @@ -734,6 +713,17 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } +static bool device_path_matched(const char *path, struct btrfs_device *device) +{ + int found; + + rcu_read_lock(); + found = strcmp(rcu_str_deref(device->name), path); + rcu_read_unlock(); + + return found == 0; +} + /* * Search and remove all stale (devices which are not mounted) devices. * When both inputs are NULL, it will search and release all stale devices. @@ -741,52 +731,57 @@ static void pending_bios_fn(struct btrfs_work *work) * matching this path only. * skip_dev: Optional. Will skip this device when searching for the stale * devices. + * Return: 0 for success or if @path is NULL. + * -EBUSY if @path is a mounted device. + * -ENOENT if @path does not match any device in the list. */ -static void btrfs_free_stale_devices(const char *path, +static int btrfs_free_stale_devices(const char *path, struct btrfs_device *skip_device) { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; + int ret = 0; + + if (path) + ret = -ENOENT; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { - mutex_lock(&fs_devices->device_list_mutex); - if (fs_devices->opened) { - mutex_unlock(&fs_devices->device_list_mutex); - continue; - } + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { - int not_found = 0; - if (skip_device && skip_device == device) continue; if (path && !device->name) continue; - - rcu_read_lock(); - if (path) - not_found = strcmp(rcu_str_deref(device->name), - path); - rcu_read_unlock(); - if (not_found) + if (path && !device_path_matched(path, device)) continue; + if (fs_devices->opened) { + /* for an already deleted device return 0 */ + if (path && ret != 0) + ret = -EBUSY; + break; + } /* delete the stale device */ fs_devices->num_devices--; list_del(&device->dev_list); btrfs_free_device(device); + ret = 0; if (fs_devices->num_devices == 0) break; } mutex_unlock(&fs_devices->device_list_mutex); + if (fs_devices->num_devices == 0) { btrfs_sysfs_remove_fsid(fs_devices); list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } + + return ret; } static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, @@ -968,8 +963,8 @@ static noinline struct btrfs_device *device_list_add(const char *path, device = NULL; } else { mutex_lock(&fs_devices->device_list_mutex); - device = find_device(fs_devices, devid, - disk_super->dev_item.uuid); + device = btrfs_find_device(fs_devices, devid, + disk_super->dev_item.uuid, NULL, false); /* * If this disk has been pulled into an fs devices created by @@ -1134,7 +1129,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) mutex_lock(&orig->device_list_mutex); fs_devices->total_devices = orig->total_devices; - /* We have held the volume lock, it is safe to get the devices. */ list_for_each_entry(orig_dev, &orig->devices, dev_list) { struct rcu_string *name; @@ -1451,6 +1445,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, return 0; } +int btrfs_forget_devices(const char *path) +{ + int ret; + + mutex_lock(&uuid_mutex); + ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); + mutex_unlock(&uuid_mutex); + + return ret; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock @@ -2385,11 +2390,11 @@ static struct btrfs_device *btrfs_find_device_by_path( devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; if (btrfs_fs_incompat(fs_info, METADATA_UUID)) - device = btrfs_find_device(fs_info, devid, dev_uuid, - disk_super->metadata_uuid); + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, + disk_super->metadata_uuid, true); else - device = btrfs_find_device(fs_info, devid, - dev_uuid, disk_super->fsid); + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, + disk_super->fsid, true); brelse(bh); if (!device) @@ -2398,50 +2403,38 @@ static struct btrfs_device *btrfs_find_device_by_path( return device; } -static struct btrfs_device *btrfs_find_device_missing_or_by_path( - struct btrfs_fs_info *fs_info, const char *device_path) -{ - struct btrfs_device *device = NULL; - if (strcmp(device_path, "missing") == 0) { - struct list_head *devices; - struct btrfs_device *tmp; - - devices = &fs_info->fs_devices->devices; - list_for_each_entry(tmp, devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &tmp->dev_state) && !tmp->bdev) { - device = tmp; - break; - } - } - - if (!device) - return ERR_PTR(-ENOENT); - } else { - device = btrfs_find_device_by_path(fs_info, device_path); - } - - return device; -} - /* * Lookup a device given by device id, or the path if the id is 0. */ struct btrfs_device *btrfs_find_device_by_devspec( - struct btrfs_fs_info *fs_info, u64 devid, const char *devpath) + struct btrfs_fs_info *fs_info, u64 devid, + const char *device_path) { struct btrfs_device *device; if (devid) { - device = btrfs_find_device(fs_info, devid, NULL, NULL); + device = btrfs_find_device(fs_info->fs_devices, devid, NULL, + NULL, true); if (!device) return ERR_PTR(-ENOENT); - } else { - if (!devpath || !devpath[0]) - return ERR_PTR(-EINVAL); - device = btrfs_find_device_missing_or_by_path(fs_info, devpath); + return device; } - return device; + + if (!device_path || !device_path[0]) + return ERR_PTR(-EINVAL); + + if (strcmp(device_path, "missing") == 0) { + /* Find first missing device */ + list_for_each_entry(device, &fs_info->fs_devices->devices, + dev_list) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state) && !device->bdev) + return device; + } + return ERR_PTR(-ENOENT); + } + + return btrfs_find_device_by_path(fs_info, device_path); } /* @@ -2563,7 +2556,8 @@ next_slot: BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); - device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, + fs_uuid, true); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@ -6616,21 +6610,36 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, return BLK_STS_OK; } -struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, - u8 *uuid, u8 *fsid) +/* + * Find a device specified by @devid or @uuid in the list of @fs_devices, or + * return NULL. + * + * If devid and uuid are both specified, the match must be exact, otherwise + * only devid is used. + * + * If @seed is true, traverse through the seed devices. + */ +struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, + u64 devid, u8 *uuid, u8 *fsid, + bool seed) { struct btrfs_device *device; - struct btrfs_fs_devices *cur_devices; - cur_devices = fs_info->fs_devices; - while (cur_devices) { + while (fs_devices) { if (!fsid || - !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { - device = find_device(cur_devices, devid, uuid); - if (device) - return device; + !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &fs_devices->devices, + dev_list) { + if (device->devid == devid && + (!uuid || memcmp(device->uuid, uuid, + BTRFS_UUID_SIZE) == 0)) + return device; + } } - cur_devices = cur_devices->seed; + if (seed) + fs_devices = fs_devices->seed; + else + return NULL; } return NULL; } @@ -6782,10 +6791,10 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, } if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || - (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) { btrfs_err(fs_info, @@ -6875,8 +6884,8 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, read_extent_buffer(leaf, uuid, (unsigned long) btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); - map->stripes[i].dev = btrfs_find_device(fs_info, devid, - uuid, NULL); + map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, + devid, uuid, NULL, true); if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); @@ -7015,7 +7024,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, return PTR_ERR(fs_devices); } - device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, + fs_uuid, true); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, @@ -7605,7 +7615,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL, + true); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@ -7819,7 +7830,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } /* Make sure no dev extent is beyond device bondary */ - dev = btrfs_find_device(fs_info, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; @@ -7828,7 +7839,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, /* It's possible this device is a dummy for seed device */ if (dev->disk_total_bytes == 0) { - dev = find_device(fs_info->fs_devices->seed, devid, NULL); + dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL, + NULL, false); if (!dev) { btrfs_err(fs_info, "failed to find seed devid %llu", devid); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ed806649a473..3ad9d58d1b66 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -416,6 +416,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); +int btrfs_forget_devices(const char *path); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, @@ -433,8 +434,8 @@ void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); -struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, - u8 *uuid, u8 *fsid); +struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, + u64 devid, u8 *uuid, u8 *fsid, bool seed); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_balance(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 970ff3e35bb3..b86b7ad6b900 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -27,6 +27,33 @@ struct workspace { int level; }; +static struct workspace_manager wsm; + +static void zlib_init_workspace_manager(void) +{ + btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress); +} + +static void zlib_cleanup_workspace_manager(void) +{ + btrfs_cleanup_workspace_manager(&wsm); +} + +static struct list_head *zlib_get_workspace(unsigned int level) +{ + struct list_head *ws = btrfs_get_workspace(&wsm, level); + struct workspace *workspace = list_entry(ws, struct workspace, list); + + workspace->level = level; + + return ws; +} + +static void zlib_put_workspace(struct list_head *ws) +{ + btrfs_put_workspace(&wsm, ws); +} + static void zlib_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -36,7 +63,7 @@ static void zlib_free_workspace(struct list_head *ws) kfree(workspace); } -static struct list_head *zlib_alloc_workspace(void) +static struct list_head *zlib_alloc_workspace(unsigned int level) { struct workspace *workspace; int workspacesize; @@ -48,6 +75,7 @@ static struct list_head *zlib_alloc_workspace(void) workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); + workspace->level = level; workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -390,18 +418,19 @@ next: return ret; } -static void zlib_set_level(struct list_head *ws, unsigned int type) +static unsigned int zlib_set_level(unsigned int level) { - struct workspace *workspace = list_entry(ws, struct workspace, list); - unsigned level = (type & 0xF0) >> 4; - - if (level > 9) - level = 9; + if (!level) + return BTRFS_ZLIB_DEFAULT_LEVEL; - workspace->level = level > 0 ? level : 3; + return min_t(unsigned int, level, 9); } const struct btrfs_compress_op btrfs_zlib_compress = { + .init_workspace_manager = zlib_init_workspace_manager, + .cleanup_workspace_manager = zlib_cleanup_workspace_manager, + .get_workspace = zlib_get_workspace, + .put_workspace = zlib_put_workspace, .alloc_workspace = zlib_alloc_workspace, .free_workspace = zlib_free_workspace, .compress_pages = zlib_compress_pages, diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index af6ec59972f5..3e418a3aeb11 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -6,25 +6,31 @@ */ #include <linux/bio.h> +#include <linux/bitmap.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/sched/mm.h> #include <linux/pagemap.h> #include <linux/refcount.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/zstd.h> #include "compression.h" +#include "ctree.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) #define ZSTD_BTRFS_DEFAULT_LEVEL 3 +#define ZSTD_BTRFS_MAX_LEVEL 15 +/* 307s to avoid pathologically clashing with transaction commit */ +#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ) -static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len) +static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level, + size_t src_len) { - ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL, - src_len, 0); + ZSTD_parameters params = ZSTD_getParams(level, src_len, 0); if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG; @@ -36,11 +42,290 @@ struct workspace { void *mem; size_t size; char *buf; + unsigned int level; + unsigned int req_level; + unsigned long last_used; /* jiffies */ struct list_head list; + struct list_head lru_list; ZSTD_inBuffer in_buf; ZSTD_outBuffer out_buf; }; +/* + * Zstd Workspace Management + * + * Zstd workspaces have different memory requirements depending on the level. + * The zstd workspaces are managed by having individual lists for each level + * and a global lru. Forward progress is maintained by protecting a max level + * workspace. + * + * Getting a workspace is done by using the bitmap to identify the levels that + * have available workspaces and scans up. This lets us recycle higher level + * workspaces because of the monotonic memory guarantee. A workspace's + * last_used is only updated if it is being used by the corresponding memory + * level. Putting a workspace involves adding it back to the appropriate places + * and adding it back to the lru if necessary. + * + * A timer is used to reclaim workspaces if they have not been used for + * ZSTD_BTRFS_RECLAIM_JIFFIES. This helps keep only active workspaces around. + * The upper bound is provided by the workqueue limit which is 2 (percpu limit). + */ + +struct zstd_workspace_manager { + const struct btrfs_compress_op *ops; + spinlock_t lock; + struct list_head lru_list; + struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL]; + unsigned long active_map; + wait_queue_head_t wait; + struct timer_list timer; +}; + +static struct zstd_workspace_manager wsm; + +static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL]; + +static inline struct workspace *list_to_workspace(struct list_head *list) +{ + return container_of(list, struct workspace, list); +} + +/* + * zstd_reclaim_timer_fn - reclaim timer + * @t: timer + * + * This scans the lru_list and attempts to reclaim any workspace that hasn't + * been used for ZSTD_BTRFS_RECLAIM_JIFFIES. + */ +static void zstd_reclaim_timer_fn(struct timer_list *timer) +{ + unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; + struct list_head *pos, *next; + + spin_lock(&wsm.lock); + + if (list_empty(&wsm.lru_list)) { + spin_unlock(&wsm.lock); + return; + } + + list_for_each_prev_safe(pos, next, &wsm.lru_list) { + struct workspace *victim = container_of(pos, struct workspace, + lru_list); + unsigned int level; + + if (time_after(victim->last_used, reclaim_threshold)) + break; + + /* workspace is in use */ + if (victim->req_level) + continue; + + level = victim->level; + list_del(&victim->lru_list); + list_del(&victim->list); + wsm.ops->free_workspace(&victim->list); + + if (list_empty(&wsm.idle_ws[level - 1])) + clear_bit(level - 1, &wsm.active_map); + + } + + if (!list_empty(&wsm.lru_list)) + mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); + + spin_unlock(&wsm.lock); +} + +/* + * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds + * + * It is possible based on the level configurations that a higher level + * workspace uses less memory than a lower level workspace. In order to reuse + * workspaces, this must be made a monotonic relationship. This precomputes + * the required memory for each level and enforces the monotonicity between + * level and memory required. + */ +static void zstd_calc_ws_mem_sizes(void) +{ + size_t max_size = 0; + unsigned int level; + + for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + ZSTD_parameters params = + zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT); + size_t level_size = + max_t(size_t, + ZSTD_CStreamWorkspaceBound(params.cParams), + ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT)); + + max_size = max_t(size_t, max_size, level_size); + zstd_ws_mem_sizes[level - 1] = max_size; + } +} + +static void zstd_init_workspace_manager(void) +{ + struct list_head *ws; + int i; + + zstd_calc_ws_mem_sizes(); + + wsm.ops = &btrfs_zstd_compress; + spin_lock_init(&wsm.lock); + init_waitqueue_head(&wsm.wait); + timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0); + + INIT_LIST_HEAD(&wsm.lru_list); + for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&wsm.idle_ws[i]); + + ws = wsm.ops->alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); + if (IS_ERR(ws)) { + pr_warn( + "BTRFS: cannot preallocate zstd compression workspace\n"); + } else { + set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map); + list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); + } +} + +static void zstd_cleanup_workspace_manager(void) +{ + struct workspace *workspace; + int i; + + del_timer(&wsm.timer); + + for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { + while (!list_empty(&wsm.idle_ws[i])) { + workspace = container_of(wsm.idle_ws[i].next, + struct workspace, list); + list_del(&workspace->list); + list_del(&workspace->lru_list); + wsm.ops->free_workspace(&workspace->list); + } + } +} + +/* + * zstd_find_workspace - find workspace + * @level: compression level + * + * This iterates over the set bits in the active_map beginning at the requested + * compression level. This lets us utilize already allocated workspaces before + * allocating a new one. If the workspace is of a larger size, it is used, but + * the place in the lru_list and last_used times are not updated. This is to + * offer the opportunity to reclaim the workspace in favor of allocating an + * appropriately sized one in the future. + */ +static struct list_head *zstd_find_workspace(unsigned int level) +{ + struct list_head *ws; + struct workspace *workspace; + int i = level - 1; + + spin_lock(&wsm.lock); + for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { + if (!list_empty(&wsm.idle_ws[i])) { + ws = wsm.idle_ws[i].next; + workspace = list_to_workspace(ws); + list_del_init(ws); + /* keep its place if it's a lower level using this */ + workspace->req_level = level; + if (level == workspace->level) + list_del(&workspace->lru_list); + if (list_empty(&wsm.idle_ws[i])) + clear_bit(i, &wsm.active_map); + spin_unlock(&wsm.lock); + return ws; + } + } + spin_unlock(&wsm.lock); + + return NULL; +} + +/* + * zstd_get_workspace - zstd's get_workspace + * @level: compression level + * + * If @level is 0, then any compression level can be used. Therefore, we begin + * scanning from 1. We first scan through possible workspaces and then after + * attempt to allocate a new workspace. If we fail to allocate one due to + * memory pressure, go to sleep waiting for the max level workspace to free up. + */ +static struct list_head *zstd_get_workspace(unsigned int level) +{ + struct list_head *ws; + unsigned int nofs_flag; + + /* level == 0 means we can use any workspace */ + if (!level) + level = 1; + +again: + ws = zstd_find_workspace(level); + if (ws) + return ws; + + nofs_flag = memalloc_nofs_save(); + ws = wsm.ops->alloc_workspace(level); + memalloc_nofs_restore(nofs_flag); + + if (IS_ERR(ws)) { + DEFINE_WAIT(wait); + + prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE); + schedule(); + finish_wait(&wsm.wait, &wait); + + goto again; + } + + return ws; +} + +/* + * zstd_put_workspace - zstd put_workspace + * @ws: list_head for the workspace + * + * When putting back a workspace, we only need to update the LRU if we are of + * the requested compression level. Here is where we continue to protect the + * max level workspace or update last_used accordingly. If the reclaim timer + * isn't set, it is also set here. Only the max level workspace tries and wakes + * up waiting workspaces. + */ +static void zstd_put_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_to_workspace(ws); + + spin_lock(&wsm.lock); + + /* A node is only taken off the lru if we are the corresponding level */ + if (workspace->req_level == workspace->level) { + /* Hide a max level workspace from reclaim */ + if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { + INIT_LIST_HEAD(&workspace->lru_list); + } else { + workspace->last_used = jiffies; + list_add(&workspace->lru_list, &wsm.lru_list); + if (!timer_pending(&wsm.timer)) + mod_timer(&wsm.timer, + jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); + } + } + + set_bit(workspace->level - 1, &wsm.active_map); + list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]); + workspace->req_level = 0; + + spin_unlock(&wsm.lock); + + if (workspace->level == ZSTD_BTRFS_MAX_LEVEL) + cond_wake_up(&wsm.wait); +} + static void zstd_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -50,25 +335,25 @@ static void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -static struct list_head *zstd_alloc_workspace(void) +static struct list_head *zstd_alloc_workspace(unsigned int level) { - ZSTD_parameters params = - zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT); struct workspace *workspace; workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); if (!workspace) return ERR_PTR(-ENOMEM); - workspace->size = max_t(size_t, - ZSTD_CStreamWorkspaceBound(params.cParams), - ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT)); + workspace->size = zstd_ws_mem_sizes[level - 1]; + workspace->level = level; + workspace->req_level = level; + workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL); workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!workspace->mem || !workspace->buf) goto fail; INIT_LIST_HEAD(&workspace->list); + INIT_LIST_HEAD(&workspace->lru_list); return &workspace->list; fail: @@ -95,7 +380,8 @@ static int zstd_compress_pages(struct list_head *ws, unsigned long len = *total_out; const unsigned long nr_dest_pages = *out_pages; unsigned long max_out = nr_dest_pages * PAGE_SIZE; - ZSTD_parameters params = zstd_get_btrfs_parameters(len); + ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level, + len); *out_pages = 0; *total_out = 0; @@ -419,11 +705,19 @@ finish: return ret; } -static void zstd_set_level(struct list_head *ws, unsigned int type) +static unsigned int zstd_set_level(unsigned int level) { + if (!level) + return ZSTD_BTRFS_DEFAULT_LEVEL; + + return min_t(unsigned int, level, ZSTD_BTRFS_MAX_LEVEL); } const struct btrfs_compress_op btrfs_zstd_compress = { + .init_workspace_manager = zstd_init_workspace_manager, + .cleanup_workspace_manager = zstd_cleanup_workspace_manager, + .get_workspace = zstd_get_workspace, + .put_workspace = zstd_put_workspace, .alloc_workspace = zstd_alloc_workspace, .free_workspace = zstd_free_workspace, .compress_pages = zstd_compress_pages, diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 3b8114def693..13318e255ebf 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -252,33 +252,10 @@ ext2_validate_entry(char *base, unsigned offset, unsigned mask) return (char *)p - base; } -static unsigned char ext2_filetype_table[EXT2_FT_MAX] = { - [EXT2_FT_UNKNOWN] = DT_UNKNOWN, - [EXT2_FT_REG_FILE] = DT_REG, - [EXT2_FT_DIR] = DT_DIR, - [EXT2_FT_CHRDEV] = DT_CHR, - [EXT2_FT_BLKDEV] = DT_BLK, - [EXT2_FT_FIFO] = DT_FIFO, - [EXT2_FT_SOCK] = DT_SOCK, - [EXT2_FT_SYMLINK] = DT_LNK, -}; - -#define S_SHIFT 12 -static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = EXT2_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = EXT2_FT_DIR, - [S_IFCHR >> S_SHIFT] = EXT2_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = EXT2_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = EXT2_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = EXT2_FT_SOCK, - [S_IFLNK >> S_SHIFT] = EXT2_FT_SYMLINK, -}; - static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode) { - umode_t mode = inode->i_mode; if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) - de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + de->file_type = fs_umode_to_ftype(inode->i_mode); else de->file_type = 0; } @@ -293,14 +270,14 @@ ext2_readdir(struct file *file, struct dir_context *ctx) unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); - unsigned char *types = NULL; bool need_revalidate = !inode_eq_iversion(inode, file->f_version); + bool has_filetype; if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) return 0; - if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) - types = ext2_filetype_table; + has_filetype = + EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE); for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; @@ -335,8 +312,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx) if (de->inode) { unsigned char d_type = DT_UNKNOWN; - if (types && de->file_type < EXT2_FT_MAX) - d_type = types[de->file_type]; + if (has_filetype) + d_type = fs_ftype_to_dtype(de->file_type); if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index e770cd100a6a..10ab238de9a6 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -604,22 +604,6 @@ struct ext2_dir_entry_2 { }; /* - * Ext2 directory file types. Only the low 3 bits are used. The - * other bits are reserved for now. - */ -enum { - EXT2_FT_UNKNOWN = 0, - EXT2_FT_REG_FILE = 1, - EXT2_FT_DIR = 2, - EXT2_FT_CHRDEV = 3, - EXT2_FT_BLKDEV = 4, - EXT2_FT_FIFO = 5, - EXT2_FT_SOCK = 6, - EXT2_FT_SYMLINK = 7, - EXT2_FT_MAX -}; - -/* * EXT2_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 4 @@ -774,6 +758,7 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_evict_inode(struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int ext2_setattr (struct dentry *, struct iattr *); +extern int ext2_getattr (const struct path *, struct kstat *, u32, unsigned int); extern void ext2_set_inode_flags(struct inode *inode); extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 28b2609f25c1..39c4772e96c9 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -199,6 +199,7 @@ const struct inode_operations ext2_file_inode_operations = { #ifdef CONFIG_EXT2_FS_XATTR .listxattr = ext2_listxattr, #endif + .getattr = ext2_getattr, .setattr = ext2_setattr, .get_acl = ext2_get_acl, .set_acl = ext2_set_acl, diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 5c3d7b7e4975..a0c5ea91fcd4 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -222,8 +222,6 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) best_desc = desc; } } - if (!best_desc) - return -1; return best_group; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e4bb9386c045..c27c27300d95 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -717,7 +717,7 @@ static int ext2_get_blocks(struct inode *inode, /* the number of blocks need to allocate for [d,t]indirect blocks */ indirect_blks = (chain + depth) - partial - 1; /* - * Next look up the indirect map to count the totoal number of + * Next look up the indirect map to count the total number of * direct blocks to allocate for this branch. */ count = ext2_blks_to_allocate(partial, indirect_blks, @@ -1239,6 +1239,7 @@ do_indirects: mark_inode_dirty(inode); ext2_free_branches(inode, &nr, &nr+1, 1); } + /* fall through */ case EXT2_IND_BLOCK: nr = i_data[EXT2_DIND_BLOCK]; if (nr) { @@ -1246,6 +1247,7 @@ do_indirects: mark_inode_dirty(inode); ext2_free_branches(inode, &nr, &nr+1, 2); } + /* fall through */ case EXT2_DIND_BLOCK: nr = i_data[EXT2_TIND_BLOCK]; if (nr) { @@ -1635,6 +1637,32 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } +int ext2_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_falgs) +{ + struct inode *inode = d_inode(path->dentry); + struct ext2_inode_info *ei = EXT2_I(inode); + unsigned int flags; + + flags = ei->i_flags & EXT2_FL_USER_VISIBLE; + if (flags & EXT2_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (flags & EXT2_COMPR_FL) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (flags & EXT2_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (flags & EXT2_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + + generic_fillattr(inode, stat); + return 0; +} + int ext2_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 0c26dcc5d850..ccfbbf59e2fc 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -416,6 +416,7 @@ const struct inode_operations ext2_dir_inode_operations = { #ifdef CONFIG_EXT2_FS_XATTR .listxattr = ext2_listxattr, #endif + .getattr = ext2_getattr, .setattr = ext2_setattr, .get_acl = ext2_get_acl, .set_acl = ext2_set_acl, @@ -426,6 +427,7 @@ const struct inode_operations ext2_special_inode_operations = { #ifdef CONFIG_EXT2_FS_XATTR .listxattr = ext2_listxattr, #endif + .getattr = ext2_getattr, .setattr = ext2_setattr, .get_acl = ext2_get_acl, .set_acl = ext2_set_acl, diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 73b2d528237f..0128010a0874 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -757,7 +757,8 @@ static loff_t ext2_max_size(int bits) { loff_t res = EXT2_NDIR_BLOCKS; int meta_blocks; - loff_t upper_limit; + unsigned int upper_limit; + unsigned int ppb = 1 << (bits-2); /* This is calculated to be the largest file size for a * dense, file such that the total number of @@ -771,24 +772,34 @@ static loff_t ext2_max_size(int bits) /* total blocks in file system block size */ upper_limit >>= (bits - 9); + /* Compute how many blocks we can address by block tree */ + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + /* Does block tree limit file size? */ + if (res < upper_limit) + goto check_lfs; + res = upper_limit; + /* How many metadata blocks are needed for addressing upper_limit? */ + upper_limit -= EXT2_NDIR_BLOCKS; /* indirect blocks */ meta_blocks = 1; + upper_limit -= ppb; /* double indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)); - /* tripple indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); - - upper_limit -= meta_blocks; - upper_limit <<= bits; - - res += 1LL << (bits-2); - res += 1LL << (2*(bits-2)); - res += 1LL << (3*(bits-2)); + if (upper_limit < ppb * ppb) { + meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb); + res -= meta_blocks; + goto check_lfs; + } + meta_blocks += 1 + ppb; + upper_limit -= ppb * ppb; + /* tripple indirect blocks for the rest */ + meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) + + DIV_ROUND_UP(upper_limit, ppb*ppb); + res -= meta_blocks; +check_lfs: res <<= bits; - if (res > upper_limit) - res = upper_limit; - if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; @@ -1024,8 +1035,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - if (EXT2_INODE_SIZE(sb) == 0) - goto cantfind_ext2; sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0) goto cantfind_ext2; @@ -1087,12 +1096,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { + ret = -ENOMEM; ext2_msg(sb, KERN_ERR, "error: not enough memory"); goto failed_mount; } bgl_lock_init(sbi->s_blockgroup_lock); sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); if (!sbi->s_debts) { + ret = -ENOMEM; ext2_msg(sb, KERN_ERR, "error: not enough memory"); goto failed_mount_group_desc; } @@ -1148,6 +1159,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_EXT2_FS_XATTR sbi->s_ea_block_cache = ext2_xattr_create_cache(); if (!sbi->s_ea_block_cache) { + ret = -ENOMEM; ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); goto failed_mount3; } diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index d5589ddcc281..00cdb8679486 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -23,6 +23,7 @@ const struct inode_operations ext2_symlink_inode_operations = { .get_link = page_get_link, + .getattr = ext2_getattr, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .listxattr = ext2_listxattr, @@ -31,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = { const struct inode_operations ext2_fast_symlink_inode_operations = { .get_link = simple_get_link, + .getattr = ext2_getattr, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .listxattr = ext2_listxattr, diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 4f30876ee325..1e33e0ac8cf1 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -342,6 +342,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb) return; spin_lock(&EXT2_SB(sb)->s_lock); + ext2_update_dynamic_rev(sb); EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR); spin_unlock(&EXT2_SB(sb)->s_lock); mark_buffer_dirty(EXT2_SB(sb)->s_sbh); diff --git a/fs/fs_types.c b/fs/fs_types.c new file mode 100644 index 000000000000..78365e5dc08c --- /dev/null +++ b/fs/fs_types.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fs.h> +#include <linux/export.h> + +/* + * fs on-disk file type to dirent file type conversion + */ +static const unsigned char fs_dtype_by_ftype[FT_MAX] = { + [FT_UNKNOWN] = DT_UNKNOWN, + [FT_REG_FILE] = DT_REG, + [FT_DIR] = DT_DIR, + [FT_CHRDEV] = DT_CHR, + [FT_BLKDEV] = DT_BLK, + [FT_FIFO] = DT_FIFO, + [FT_SOCK] = DT_SOCK, + [FT_SYMLINK] = DT_LNK +}; + +/** + * fs_ftype_to_dtype() - fs on-disk file type to dirent type. + * @filetype: The on-disk file type to convert. + * + * This function converts the on-disk file type value (FT_*) to the directory + * entry type (DT_*). + * + * Context: Any context. + * Return: + * * DT_UNKNOWN - Unknown type + * * DT_FIFO - FIFO + * * DT_CHR - Character device + * * DT_DIR - Directory + * * DT_BLK - Block device + * * DT_REG - Regular file + * * DT_LNK - Symbolic link + * * DT_SOCK - Local-domain socket + */ +unsigned char fs_ftype_to_dtype(unsigned int filetype) +{ + if (filetype >= FT_MAX) + return DT_UNKNOWN; + + return fs_dtype_by_ftype[filetype]; +} +EXPORT_SYMBOL_GPL(fs_ftype_to_dtype); + +/* + * dirent file type to fs on-disk file type conversion + * Values not initialized explicitly are FT_UNKNOWN (0). + */ +static const unsigned char fs_ftype_by_dtype[DT_MAX] = { + [DT_REG] = FT_REG_FILE, + [DT_DIR] = FT_DIR, + [DT_LNK] = FT_SYMLINK, + [DT_CHR] = FT_CHRDEV, + [DT_BLK] = FT_BLKDEV, + [DT_FIFO] = FT_FIFO, + [DT_SOCK] = FT_SOCK, +}; + +/** + * fs_umode_to_ftype() - file mode to on-disk file type. + * @mode: The file mode to convert. + * + * This function converts the file mode value to the on-disk file type (FT_*). + * + * Context: Any context. + * Return: + * * FT_UNKNOWN - Unknown type + * * FT_REG_FILE - Regular file + * * FT_DIR - Directory + * * FT_CHRDEV - Character device + * * FT_BLKDEV - Block device + * * FT_FIFO - FIFO + * * FT_SOCK - Local-domain socket + * * FT_SYMLINK - Symbolic link + */ +unsigned char fs_umode_to_ftype(umode_t mode) +{ + return fs_ftype_by_dtype[S_DT(mode)]; +} +EXPORT_SYMBOL_GPL(fs_umode_to_ftype); + +/** + * fs_umode_to_dtype() - file mode to dirent file type. + * @mode: The file mode to convert. + * + * This function converts the file mode value to the directory + * entry type (DT_*). + * + * Context: Any context. + * Return: + * * DT_UNKNOWN - Unknown type + * * DT_FIFO - FIFO + * * DT_CHR - Character device + * * DT_DIR - Directory + * * DT_BLK - Block device + * * DT_REG - Regular file + * * DT_LNK - Symbolic link + * * DT_SOCK - Local-domain socket + */ +unsigned char fs_umode_to_dtype(umode_t mode) +{ + return fs_ftype_to_dtype(fs_umode_to_ftype(mode)); +} +EXPORT_SYMBOL_GPL(fs_umode_to_dtype); diff --git a/fs/namei.c b/fs/namei.c index d604f6b3bcc3..0a8c5c27f90e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2718,7 +2718,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path, if (unlikely(error == -ESTALE)) error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path); if (likely(!error)) - audit_inode(name, path->dentry, 0); + audit_inode(name, path->dentry, flags & LOOKUP_NO_EVAL); restore_nameidata(); putname(name); return error; diff --git a/fs/namespace.c b/fs/namespace.c index c4e83d94840c..98a8c182af4f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1640,6 +1640,8 @@ int ksys_umount(char __user *name, int flags) if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + lookup_flags |= LOOKUP_NO_EVAL; + retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index 41355ce74ac0..735bfb2e9190 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -2,6 +2,7 @@ config FANOTIFY bool "Filesystem wide access notification" select FSNOTIFY select ANON_INODES + select EXPORTFS default n ---help--- Say Y here to enable fanotify support. fanotify is a file access diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 3723f3d18d20..6b9c27548997 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -13,22 +13,40 @@ #include <linux/wait.h> #include <linux/audit.h> #include <linux/sched/mm.h> +#include <linux/statfs.h> #include "fanotify.h" static bool should_merge(struct fsnotify_event *old_fsn, struct fsnotify_event *new_fsn) { - struct fanotify_event_info *old, *new; + struct fanotify_event *old, *new; pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); old = FANOTIFY_E(old_fsn); new = FANOTIFY_E(new_fsn); - if (old_fsn->inode == new_fsn->inode && old->pid == new->pid && - old->path.mnt == new->path.mnt && - old->path.dentry == new->path.dentry) - return true; + if (old_fsn->inode != new_fsn->inode || old->pid != new->pid || + old->fh_type != new->fh_type || old->fh_len != new->fh_len) + return false; + + if (fanotify_event_has_path(old)) { + return old->path.mnt == new->path.mnt && + old->path.dentry == new->path.dentry; + } else if (fanotify_event_has_fid(old)) { + /* + * We want to merge many dirent events in the same dir (i.e. + * creates/unlinks/renames), but we do not want to merge dirent + * events referring to subdirs with dirent events referring to + * non subdirs, otherwise, user won't be able to tell from a + * mask FAN_CREATE|FAN_DELETE|FAN_ONDIR if it describes mkdir+ + * unlink pair or rmdir+create pair of events. + */ + return (old->mask & FS_ISDIR) == (new->mask & FS_ISDIR) && + fanotify_fid_equal(&old->fid, &new->fid, old->fh_len); + } + + /* Do not merge events if we failed to encode fid */ return false; } @@ -36,20 +54,22 @@ static bool should_merge(struct fsnotify_event *old_fsn, static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) { struct fsnotify_event *test_event; + struct fanotify_event *new; pr_debug("%s: list=%p event=%p\n", __func__, list, event); + new = FANOTIFY_E(event); /* * Don't merge a permission event with any other event so that we know * the event structure we have created in fanotify_handle_event() is the * one we should check for permission response. */ - if (fanotify_is_perm_event(event->mask)) + if (fanotify_is_perm_event(new->mask)) return 0; list_for_each_entry_reverse(test_event, list, list) { if (should_merge(test_event, event)) { - test_event->mask |= event->mask; + FANOTIFY_E(test_event)->mask |= new->mask; return 1; } } @@ -57,15 +77,44 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) return 0; } +/* + * Wait for response to permission event. The function also takes care of + * freeing the permission event (or offloads that in case the wait is canceled + * by a signal). The function returns 0 in case access got allowed by userspace, + * -EPERM in case userspace disallowed the access, and -ERESTARTSYS in case + * the wait got interrupted by a signal. + */ static int fanotify_get_response(struct fsnotify_group *group, - struct fanotify_perm_event_info *event, + struct fanotify_perm_event *event, struct fsnotify_iter_info *iter_info) { int ret; pr_debug("%s: group=%p event=%p\n", __func__, group, event); - wait_event(group->fanotify_data.access_waitq, event->response); + ret = wait_event_killable(group->fanotify_data.access_waitq, + event->state == FAN_EVENT_ANSWERED); + /* Signal pending? */ + if (ret < 0) { + spin_lock(&group->notification_lock); + /* Event reported to userspace and no answer yet? */ + if (event->state == FAN_EVENT_REPORTED) { + /* Event will get freed once userspace answers to it */ + event->state = FAN_EVENT_CANCELED; + spin_unlock(&group->notification_lock); + return ret; + } + /* Event not yet reported? Just remove it. */ + if (event->state == FAN_EVENT_INIT) + fsnotify_remove_queued_event(group, &event->fae.fse); + /* + * Event may be also answered in case signal delivery raced + * with wakeup. In that case we have nothing to do besides + * freeing the event and reporting error. + */ + spin_unlock(&group->notification_lock); + goto out; + } /* userspace responded, convert to something usable */ switch (event->response & ~FAN_AUDIT) { @@ -81,11 +130,11 @@ static int fanotify_get_response(struct fsnotify_group *group, if (event->response & FAN_AUDIT) audit_fanotify(event->response & ~FAN_AUDIT); - event->response = 0; - pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); - +out: + fsnotify_destroy_event(group, &event->fae.fse); + return ret; } @@ -95,11 +144,13 @@ static int fanotify_get_response(struct fsnotify_group *group, * been included within the event mask, but have not been explicitly * requested by the user, will not be present in the returned mask. */ -static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info, - u32 event_mask, const void *data, - int data_type) +static u32 fanotify_group_event_mask(struct fsnotify_group *group, + struct fsnotify_iter_info *iter_info, + u32 event_mask, const void *data, + int data_type) { __u32 marks_mask = 0, marks_ignored_mask = 0; + __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS; const struct path *path = data; struct fsnotify_mark *mark; int type; @@ -107,14 +158,14 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info, pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", __func__, iter_info->report_mask, event_mask, data, data_type); - /* If we don't have enough info to send an event to userspace say no */ - if (data_type != FSNOTIFY_EVENT_PATH) - return 0; - - /* Sorry, fanotify only gives a damn about files and dirs */ - if (!d_is_reg(path->dentry) && - !d_can_lookup(path->dentry)) - return 0; + if (!FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + /* Do we have path to open a file descriptor? */ + if (data_type != FSNOTIFY_EVENT_PATH) + return 0; + /* Path type events are only relevant for files and dirs */ + if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry)) + return 0; + } fsnotify_foreach_obj_type(type) { if (!fsnotify_iter_should_report_type(iter_info, type)) @@ -133,20 +184,106 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info, marks_ignored_mask |= mark->ignored_mask; } - if (d_is_dir(path->dentry) && + test_mask = event_mask & marks_mask & ~marks_ignored_mask; + + /* + * dirent modification events (create/delete/move) do not carry the + * child entry name/inode information. Instead, we report FAN_ONDIR + * for mkdir/rmdir so user can differentiate them from creat/unlink. + * + * For backward compatibility and consistency, do not report FAN_ONDIR + * to user in legacy fanotify mode (reporting fd) and report FAN_ONDIR + * to user in FAN_REPORT_FID mode for all event types. + */ + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + /* Do not report FAN_ONDIR without any event */ + if (!(test_mask & ~FAN_ONDIR)) + return 0; + } else { + user_mask &= ~FAN_ONDIR; + } + + if (event_mask & FS_ISDIR && !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) return 0; - return event_mask & FANOTIFY_OUTGOING_EVENTS & marks_mask & - ~marks_ignored_mask; + return test_mask & user_mask; +} + +static int fanotify_encode_fid(struct fanotify_event *event, + struct inode *inode, gfp_t gfp, + __kernel_fsid_t *fsid) +{ + struct fanotify_fid *fid = &event->fid; + int dwords, bytes = 0; + int err, type; + + fid->ext_fh = NULL; + dwords = 0; + err = -ENOENT; + type = exportfs_encode_inode_fh(inode, NULL, &dwords, NULL); + if (!dwords) + goto out_err; + + bytes = dwords << 2; + if (bytes > FANOTIFY_INLINE_FH_LEN) { + /* Treat failure to allocate fh as failure to allocate event */ + err = -ENOMEM; + fid->ext_fh = kmalloc(bytes, gfp); + if (!fid->ext_fh) + goto out_err; + } + + type = exportfs_encode_inode_fh(inode, fanotify_fid_fh(fid, bytes), + &dwords, NULL); + err = -EINVAL; + if (!type || type == FILEID_INVALID || bytes != dwords << 2) + goto out_err; + + fid->fsid = *fsid; + event->fh_len = bytes; + + return type; + +out_err: + pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, " + "type=%d, bytes=%d, err=%i)\n", + fsid->val[0], fsid->val[1], type, bytes, err); + kfree(fid->ext_fh); + fid->ext_fh = NULL; + event->fh_len = 0; + + return FILEID_INVALID; } -struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, - struct inode *inode, u32 mask, - const struct path *path) +/* + * The inode to use as identifier when reporting fid depends on the event. + * Report the modified directory inode on dirent modification events. + * Report the "victim" inode otherwise. + * For example: + * FS_ATTRIB reports the child inode even if reported on a watched parent. + * FS_CREATE reports the modified dir inode and not the created inode. + */ +static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask, + const void *data, int data_type) { - struct fanotify_event_info *event = NULL; + if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) + return to_tell; + else if (data_type == FSNOTIFY_EVENT_INODE) + return (struct inode *)data; + else if (data_type == FSNOTIFY_EVENT_PATH) + return d_inode(((struct path *)data)->dentry); + return NULL; +} + +struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, + struct inode *inode, u32 mask, + const void *data, int data_type, + __kernel_fsid_t *fsid) +{ + struct fanotify_event *event = NULL; gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct inode *id = fanotify_fid_inode(inode, mask, data, data_type); /* * For queues with unlimited length lost events are not expected and @@ -160,28 +297,36 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, memalloc_use_memcg(group->memcg); if (fanotify_is_perm_event(mask)) { - struct fanotify_perm_event_info *pevent; + struct fanotify_perm_event *pevent; pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); if (!pevent) goto out; event = &pevent->fae; pevent->response = 0; + pevent->state = FAN_EVENT_INIT; goto init; } event = kmem_cache_alloc(fanotify_event_cachep, gfp); if (!event) goto out; init: __maybe_unused - fsnotify_init_event(&event->fse, inode, mask); + fsnotify_init_event(&event->fse, inode); + event->mask = mask; if (FAN_GROUP_FLAG(group, FAN_REPORT_TID)) event->pid = get_pid(task_pid(current)); else event->pid = get_pid(task_tgid(current)); - if (path) { - event->path = *path; + event->fh_len = 0; + if (id && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + /* Report the event without a file identifier on encode error */ + event->fh_type = fanotify_encode_fid(event, id, gfp, fsid); + } else if (data_type == FSNOTIFY_EVENT_PATH) { + event->fh_type = FILEID_ROOT; + event->path = *((struct path *)data); path_get(&event->path); } else { + event->fh_type = FILEID_INVALID; event->path.mnt = NULL; event->path.dentry = NULL; } @@ -190,6 +335,29 @@ out: return event; } +/* + * Get cached fsid of the filesystem containing the object from any connector. + * All connectors are supposed to have the same fsid, but we do not verify that + * here. + */ +static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) +{ + int type; + __kernel_fsid_t fsid = {}; + + fsnotify_foreach_obj_type(type) { + if (!fsnotify_iter_should_report_type(iter_info, type)) + continue; + + fsid = iter_info->marks[type]->connector->fsid; + if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) + continue; + return fsid; + } + + return fsid; +} + static int fanotify_handle_event(struct fsnotify_group *group, struct inode *inode, u32 mask, const void *data, int data_type, @@ -197,14 +365,22 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_iter_info *iter_info) { int ret = 0; - struct fanotify_event_info *event; + struct fanotify_event *event; struct fsnotify_event *fsn_event; + __kernel_fsid_t fsid = {}; BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_ATTRIB != FS_ATTRIB); BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); BUILD_BUG_ON(FAN_OPEN != FS_OPEN); + BUILD_BUG_ON(FAN_MOVED_TO != FS_MOVED_TO); + BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM); + BUILD_BUG_ON(FAN_CREATE != FS_CREATE); + BUILD_BUG_ON(FAN_DELETE != FS_DELETE); + BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF); + BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF); BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); @@ -213,9 +389,10 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 12); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); - mask = fanotify_group_event_mask(iter_info, mask, data, data_type); + mask = fanotify_group_event_mask(group, iter_info, mask, data, + data_type); if (!mask) return 0; @@ -231,7 +408,11 @@ static int fanotify_handle_event(struct fsnotify_group *group, return 0; } - event = fanotify_alloc_event(group, inode, mask, data); + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) + fsid = fanotify_get_fsid(iter_info); + + event = fanotify_alloc_event(group, inode, mask, data, data_type, + &fsid); ret = -ENOMEM; if (unlikely(!event)) { /* @@ -255,7 +436,6 @@ static int fanotify_handle_event(struct fsnotify_group *group, } else if (fanotify_is_perm_event(mask)) { ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event), iter_info); - fsnotify_destroy_event(group, fsn_event); } finish: if (fanotify_is_perm_event(mask)) @@ -275,12 +455,15 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) static void fanotify_free_event(struct fsnotify_event *fsn_event) { - struct fanotify_event_info *event; + struct fanotify_event *event; event = FANOTIFY_E(fsn_event); - path_put(&event->path); + if (fanotify_event_has_path(event)) + path_put(&event->path); + else if (fanotify_event_has_ext_fh(event)) + kfree(event->fid.ext_fh); put_pid(event->pid); - if (fanotify_is_perm_event(fsn_event->mask)) { + if (fanotify_is_perm_event(event->mask)) { kmem_cache_free(fanotify_perm_event_cachep, FANOTIFY_PE(fsn_event)); return; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index ea05b8a401e7..68b30504284c 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -2,26 +2,112 @@ #include <linux/fsnotify_backend.h> #include <linux/path.h> #include <linux/slab.h> +#include <linux/exportfs.h> extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; +/* Possible states of the permission event */ +enum { + FAN_EVENT_INIT, + FAN_EVENT_REPORTED, + FAN_EVENT_ANSWERED, + FAN_EVENT_CANCELED, +}; + +/* + * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation). + * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and + * fh_* fields increase the size of fanotify_event by another 4 bytes. + * For 64bit arch, fid increases the size of fanotify_fid by 8 bytes and + * fh_* fields are packed in a hole after mask. + */ +#if BITS_PER_LONG == 32 +#define FANOTIFY_INLINE_FH_LEN (3 << 2) +#else +#define FANOTIFY_INLINE_FH_LEN (4 << 2) +#endif + +struct fanotify_fid { + __kernel_fsid_t fsid; + union { + unsigned char fh[FANOTIFY_INLINE_FH_LEN]; + unsigned char *ext_fh; + }; +}; + +static inline void *fanotify_fid_fh(struct fanotify_fid *fid, + unsigned int fh_len) +{ + return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh; +} + +static inline bool fanotify_fid_equal(struct fanotify_fid *fid1, + struct fanotify_fid *fid2, + unsigned int fh_len) +{ + return fid1->fsid.val[0] == fid2->fsid.val[0] && + fid1->fsid.val[1] == fid2->fsid.val[1] && + !memcmp(fanotify_fid_fh(fid1, fh_len), + fanotify_fid_fh(fid2, fh_len), fh_len); +} + /* * Structure for normal fanotify events. It gets allocated in * fanotify_handle_event() and freed when the information is retrieved by * userspace */ -struct fanotify_event_info { +struct fanotify_event { struct fsnotify_event fse; + u32 mask; /* - * We hold ref to this path so it may be dereferenced at any point - * during this object's lifetime + * Those fields are outside fanotify_fid to pack fanotify_event nicely + * on 64bit arch and to use fh_type as an indication of whether path + * or fid are used in the union: + * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither. */ - struct path path; + u8 fh_type; + u8 fh_len; + u16 pad; + union { + /* + * We hold ref to this path so it may be dereferenced at any + * point during this object's lifetime + */ + struct path path; + /* + * With FAN_REPORT_FID, we do not hold any reference on the + * victim object. Instead we store its NFS file handle and its + * filesystem's fsid as a unique identifier. + */ + struct fanotify_fid fid; + }; struct pid *pid; }; +static inline bool fanotify_event_has_path(struct fanotify_event *event) +{ + return event->fh_type == FILEID_ROOT; +} + +static inline bool fanotify_event_has_fid(struct fanotify_event *event) +{ + return event->fh_type != FILEID_ROOT && + event->fh_type != FILEID_INVALID; +} + +static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event) +{ + return fanotify_event_has_fid(event) && + event->fh_len > FANOTIFY_INLINE_FH_LEN; +} + +static inline void *fanotify_event_fh(struct fanotify_event *event) +{ + return fanotify_fid_fh(&event->fid, event->fh_len); +} + /* * Structure for permission fanotify events. It gets allocated and freed in * fanotify_handle_event() since we wait there for user response. When the @@ -29,16 +115,17 @@ struct fanotify_event_info { * group->notification_list to group->fanotify_data.access_list to wait for * user response. */ -struct fanotify_perm_event_info { - struct fanotify_event_info fae; - int response; /* userspace answer to question */ +struct fanotify_perm_event { + struct fanotify_event fae; + unsigned short response; /* userspace answer to the event */ + unsigned short state; /* state of the event */ int fd; /* fd we passed to userspace for this event */ }; -static inline struct fanotify_perm_event_info * +static inline struct fanotify_perm_event * FANOTIFY_PE(struct fsnotify_event *fse) { - return container_of(fse, struct fanotify_perm_event_info, fae.fse); + return container_of(fse, struct fanotify_perm_event, fae.fse); } static inline bool fanotify_is_perm_event(u32 mask) @@ -47,11 +134,12 @@ static inline bool fanotify_is_perm_event(u32 mask) mask & FANOTIFY_PERM_EVENTS; } -static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) +static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse) { - return container_of(fse, struct fanotify_event_info, fse); + return container_of(fse, struct fanotify_event, fse); } -struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, - struct inode *inode, u32 mask, - const struct path *path); +struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, + struct inode *inode, u32 mask, + const void *data, int data_type, + __kernel_fsid_t *fsid); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9c870b0d2b56..56992b32c6bb 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -17,6 +17,8 @@ #include <linux/compat.h> #include <linux/sched/signal.h> #include <linux/memcontrol.h> +#include <linux/statfs.h> +#include <linux/exportfs.h> #include <asm/ioctls.h> @@ -47,33 +49,55 @@ struct kmem_cache *fanotify_mark_cache __read_mostly; struct kmem_cache *fanotify_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; +#define FANOTIFY_EVENT_ALIGN 4 + +static int fanotify_event_info_len(struct fanotify_event *event) +{ + if (!fanotify_event_has_fid(event)) + return 0; + + return roundup(sizeof(struct fanotify_event_info_fid) + + sizeof(struct file_handle) + event->fh_len, + FANOTIFY_EVENT_ALIGN); +} + /* * Get an fsnotify notification event if one exists and is small * enough to fit in "count". Return an error pointer if the count - * is not large enough. - * - * Called with the group->notification_lock held. + * is not large enough. When permission event is dequeued, its state is + * updated accordingly. */ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) { - assert_spin_locked(&group->notification_lock); + size_t event_size = FAN_EVENT_METADATA_LEN; + struct fsnotify_event *fsn_event = NULL; pr_debug("%s: group=%p count=%zd\n", __func__, group, count); + spin_lock(&group->notification_lock); if (fsnotify_notify_queue_is_empty(group)) - return NULL; + goto out; - if (FAN_EVENT_METADATA_LEN > count) - return ERR_PTR(-EINVAL); + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + event_size += fanotify_event_info_len( + FANOTIFY_E(fsnotify_peek_first_event(group))); + } - /* held the notification_lock the whole time, so this is the - * same event we peeked above */ - return fsnotify_remove_first_event(group); + if (event_size > count) { + fsn_event = ERR_PTR(-EINVAL); + goto out; + } + fsn_event = fsnotify_remove_first_event(group); + if (fanotify_is_perm_event(FANOTIFY_E(fsn_event)->mask)) + FANOTIFY_PE(fsn_event)->state = FAN_EVENT_REPORTED; +out: + spin_unlock(&group->notification_lock); + return fsn_event; } static int create_fd(struct fsnotify_group *group, - struct fanotify_event_info *event, + struct fanotify_event *event, struct file **file) { int client_fd; @@ -114,62 +138,32 @@ static int create_fd(struct fsnotify_group *group, return client_fd; } -static int fill_event_metadata(struct fsnotify_group *group, - struct fanotify_event_metadata *metadata, - struct fsnotify_event *fsn_event, - struct file **file) -{ - int ret = 0; - struct fanotify_event_info *event; - - pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, - group, metadata, fsn_event); - - *file = NULL; - event = container_of(fsn_event, struct fanotify_event_info, fse); - metadata->event_len = FAN_EVENT_METADATA_LEN; - metadata->metadata_len = FAN_EVENT_METADATA_LEN; - metadata->vers = FANOTIFY_METADATA_VERSION; - metadata->reserved = 0; - metadata->mask = fsn_event->mask & FANOTIFY_OUTGOING_EVENTS; - metadata->pid = pid_vnr(event->pid); - if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) - metadata->fd = FAN_NOFD; - else { - metadata->fd = create_fd(group, event, file); - if (metadata->fd < 0) - ret = metadata->fd; - } - - return ret; -} - -static struct fanotify_perm_event_info *dequeue_event( - struct fsnotify_group *group, int fd) +/* + * Finish processing of permission event by setting it to ANSWERED state and + * drop group->notification_lock. + */ +static void finish_permission_event(struct fsnotify_group *group, + struct fanotify_perm_event *event, + unsigned int response) + __releases(&group->notification_lock) { - struct fanotify_perm_event_info *event, *return_e = NULL; - - spin_lock(&group->notification_lock); - list_for_each_entry(event, &group->fanotify_data.access_list, - fae.fse.list) { - if (event->fd != fd) - continue; + bool destroy = false; - list_del_init(&event->fae.fse.list); - return_e = event; - break; - } + assert_spin_locked(&group->notification_lock); + event->response = response; + if (event->state == FAN_EVENT_CANCELED) + destroy = true; + else + event->state = FAN_EVENT_ANSWERED; spin_unlock(&group->notification_lock); - - pr_debug("%s: found return_re=%p\n", __func__, return_e); - - return return_e; + if (destroy) + fsnotify_destroy_event(group, &event->fae.fse); } static int process_access_response(struct fsnotify_group *group, struct fanotify_response *response_struct) { - struct fanotify_perm_event_info *event; + struct fanotify_perm_event *event; int fd = response_struct->fd; int response = response_struct->response; @@ -194,48 +188,115 @@ static int process_access_response(struct fsnotify_group *group, if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) return -EINVAL; - event = dequeue_event(group, fd); - if (!event) - return -ENOENT; + spin_lock(&group->notification_lock); + list_for_each_entry(event, &group->fanotify_data.access_list, + fae.fse.list) { + if (event->fd != fd) + continue; - event->response = response; - wake_up(&group->fanotify_data.access_waitq); + list_del_init(&event->fae.fse.list); + finish_permission_event(group, event, response); + wake_up(&group->fanotify_data.access_waitq); + return 0; + } + spin_unlock(&group->notification_lock); + + return -ENOENT; +} + +static int copy_fid_to_user(struct fanotify_event *event, char __user *buf) +{ + struct fanotify_event_info_fid info = { }; + struct file_handle handle = { }; + size_t fh_len = event->fh_len; + size_t len = fanotify_event_info_len(event); + + if (!len) + return 0; + + if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len)) + return -EFAULT; + + /* Copy event info fid header followed by vaiable sized file handle */ + info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID; + info.hdr.len = len; + info.fsid = event->fid.fsid; + if (copy_to_user(buf, &info, sizeof(info))) + return -EFAULT; + + buf += sizeof(info); + len -= sizeof(info); + handle.handle_type = event->fh_type; + handle.handle_bytes = fh_len; + if (copy_to_user(buf, &handle, sizeof(handle))) + return -EFAULT; + + buf += sizeof(handle); + len -= sizeof(handle); + if (copy_to_user(buf, fanotify_event_fh(event), fh_len)) + return -EFAULT; + + /* Pad with 0's */ + buf += fh_len; + len -= fh_len; + WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN); + if (len > 0 && clear_user(buf, len)) + return -EFAULT; return 0; } static ssize_t copy_event_to_user(struct fsnotify_group *group, - struct fsnotify_event *event, + struct fsnotify_event *fsn_event, char __user *buf, size_t count) { - struct fanotify_event_metadata fanotify_event_metadata; - struct file *f; - int fd, ret; - - pr_debug("%s: group=%p event=%p\n", __func__, group, event); - - ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f); - if (ret < 0) - return ret; + struct fanotify_event_metadata metadata; + struct fanotify_event *event; + struct file *f = NULL; + int ret, fd = FAN_NOFD; + + pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); + + event = container_of(fsn_event, struct fanotify_event, fse); + metadata.event_len = FAN_EVENT_METADATA_LEN; + metadata.metadata_len = FAN_EVENT_METADATA_LEN; + metadata.vers = FANOTIFY_METADATA_VERSION; + metadata.reserved = 0; + metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; + metadata.pid = pid_vnr(event->pid); + + if (fanotify_event_has_path(event)) { + fd = create_fd(group, event, &f); + if (fd < 0) + return fd; + } else if (fanotify_event_has_fid(event)) { + metadata.event_len += fanotify_event_info_len(event); + } + metadata.fd = fd; - fd = fanotify_event_metadata.fd; ret = -EFAULT; /* * Sanity check copy size in case get_one_event() and * fill_event_metadata() event_len sizes ever get out of sync. */ - if (WARN_ON_ONCE(fanotify_event_metadata.event_len > count)) + if (WARN_ON_ONCE(metadata.event_len > count)) goto out_close_fd; - if (copy_to_user(buf, &fanotify_event_metadata, - fanotify_event_metadata.event_len)) + + if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN)) goto out_close_fd; if (fanotify_is_perm_event(event->mask)) - FANOTIFY_PE(event)->fd = fd; + FANOTIFY_PE(fsn_event)->fd = fd; - if (fd != FAN_NOFD) + if (fanotify_event_has_path(event)) { fd_install(fd, f); - return fanotify_event_metadata.event_len; + } else if (fanotify_event_has_fid(event)) { + ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN); + if (ret < 0) + return ret; + } + + return metadata.event_len; out_close_fd: if (fd != FAN_NOFD) { @@ -276,10 +337,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, add_wait_queue(&group->notification_waitq, &wait); while (1) { - spin_lock(&group->notification_lock); kevent = get_one_event(group, count); - spin_unlock(&group->notification_lock); - if (IS_ERR(kevent)) { ret = PTR_ERR(kevent); break; @@ -316,11 +374,13 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, * Permission events get queued to wait for response. Other * events can be destroyed now. */ - if (!fanotify_is_perm_event(kevent->mask)) { + if (!fanotify_is_perm_event(FANOTIFY_E(kevent)->mask)) { fsnotify_destroy_event(group, kevent); } else { if (ret <= 0) { - FANOTIFY_PE(kevent)->response = FAN_DENY; + spin_lock(&group->notification_lock); + finish_permission_event(group, + FANOTIFY_PE(kevent), FAN_DENY); wake_up(&group->fanotify_data.access_waitq); } else { spin_lock(&group->notification_lock); @@ -370,7 +430,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; - struct fanotify_perm_event_info *event, *next; + struct fanotify_perm_event *event; struct fsnotify_event *fsn_event; /* @@ -385,13 +445,12 @@ static int fanotify_release(struct inode *ignored, struct file *file) * and simulate reply from userspace. */ spin_lock(&group->notification_lock); - list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, - fae.fse.list) { - pr_debug("%s: found group=%p event=%p\n", __func__, group, - event); - + while (!list_empty(&group->fanotify_data.access_list)) { + event = list_first_entry(&group->fanotify_data.access_list, + struct fanotify_perm_event, fae.fse.list); list_del_init(&event->fae.fse.list); - event->response = FAN_ALLOW; + finish_permission_event(group, event, FAN_ALLOW); + spin_lock(&group->notification_lock); } /* @@ -401,13 +460,14 @@ static int fanotify_release(struct inode *ignored, struct file *file) */ while (!fsnotify_notify_queue_is_empty(group)) { fsn_event = fsnotify_remove_first_event(group); - if (!(fsn_event->mask & FANOTIFY_PERM_EVENTS)) { + if (!(FANOTIFY_E(fsn_event)->mask & FANOTIFY_PERM_EVENTS)) { spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, fsn_event); - spin_lock(&group->notification_lock); } else { - FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; + finish_permission_event(group, FANOTIFY_PE(fsn_event), + FAN_ALLOW); } + spin_lock(&group->notification_lock); } spin_unlock(&group->notification_lock); @@ -598,7 +658,8 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, - unsigned int type) + unsigned int type, + __kernel_fsid_t *fsid) { struct fsnotify_mark *mark; int ret; @@ -611,7 +672,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, return ERR_PTR(-ENOMEM); fsnotify_init_mark(mark, group); - ret = fsnotify_add_mark_locked(mark, connp, type, 0); + ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid); if (ret) { fsnotify_put_mark(mark); return ERR_PTR(ret); @@ -623,7 +684,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int type, - __u32 mask, unsigned int flags) + __u32 mask, unsigned int flags, + __kernel_fsid_t *fsid) { struct fsnotify_mark *fsn_mark; __u32 added; @@ -631,7 +693,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_mark(connp, group); if (!fsn_mark) { - fsn_mark = fanotify_add_new_mark(group, connp, type); + fsn_mark = fanotify_add_new_mark(group, connp, type, fsid); if (IS_ERR(fsn_mark)) { mutex_unlock(&group->mark_mutex); return PTR_ERR(fsn_mark); @@ -648,23 +710,23 @@ static int fanotify_add_mark(struct fsnotify_group *group, static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt, __u32 mask, - unsigned int flags) + unsigned int flags, __kernel_fsid_t *fsid) { return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags); + FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid); } static int fanotify_add_sb_mark(struct fsnotify_group *group, - struct super_block *sb, __u32 mask, - unsigned int flags) + struct super_block *sb, __u32 mask, + unsigned int flags, __kernel_fsid_t *fsid) { return fanotify_add_mark(group, &sb->s_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_SB, mask, flags); + FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid); } static int fanotify_add_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, - unsigned int flags) + unsigned int flags, __kernel_fsid_t *fsid) { pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); @@ -679,7 +741,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, return 0; return fanotify_add_mark(group, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, mask, flags); + FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid); } /* fanotify syscalls */ @@ -688,7 +750,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) struct fsnotify_group *group; int f_flags, fd; struct user_struct *user; - struct fanotify_event_info *oevent; + struct fanotify_event *oevent; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); @@ -715,6 +777,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) return -EINVAL; } + if ((flags & FAN_REPORT_FID) && + (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF) + return -EINVAL; + user = get_current_user(); if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { free_uid(user); @@ -739,7 +805,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) atomic_inc(&user->fanotify_listeners); group->memcg = get_mem_cgroup_from_mm(current->mm); - oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL); + oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL, + FSNOTIFY_EVENT_NONE, NULL); if (unlikely(!oevent)) { fd = -ENOMEM; goto out_destroy_group; @@ -801,6 +868,48 @@ out_destroy_group: return fd; } +/* Check if filesystem can encode a unique fid */ +static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) +{ + __kernel_fsid_t root_fsid; + int err; + + /* + * Make sure path is not in filesystem with zero fsid (e.g. tmpfs). + */ + err = vfs_get_fsid(path->dentry, fsid); + if (err) + return err; + + if (!fsid->val[0] && !fsid->val[1]) + return -ENODEV; + + /* + * Make sure path is not inside a filesystem subvolume (e.g. btrfs) + * which uses a different fsid than sb root. + */ + err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid); + if (err) + return err; + + if (root_fsid.val[0] != fsid->val[0] || + root_fsid.val[1] != fsid->val[1]) + return -EXDEV; + + /* + * We need to make sure that the file system supports at least + * encoding a file handle so user can use name_to_handle_at() to + * compare fid returned with event to the file handle of watched + * objects. However, name_to_handle_at() requires that the + * filesystem also supports decoding file handles. + */ + if (!path->dentry->d_sb->s_export_op || + !path->dentry->d_sb->s_export_op->fh_to_dentry) + return -EOPNOTSUPP; + + return 0; +} + static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, int dfd, const char __user *pathname) { @@ -809,6 +918,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, struct fsnotify_group *group; struct fd f; struct path path; + __kernel_fsid_t __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; int ret; @@ -871,6 +981,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, group->priority == FS_PRIO_0) goto fput_and_out; + /* + * Events with data type inode do not carry enough information to report + * event->fd, so we do not allow setting a mask for inode events unless + * group supports reporting fid. + * inode events are not supported on a mount mark, because they do not + * carry enough information (i.e. path) to be filtered by mount point. + */ + if (mask & FANOTIFY_INODE_EVENTS && + (!FAN_GROUP_FLAG(group, FAN_REPORT_FID) || + mark_type == FAN_MARK_MOUNT)) + goto fput_and_out; + if (flags & FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) @@ -886,6 +1008,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (ret) goto fput_and_out; + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + ret = fanotify_test_fid(&path, &__fsid); + if (ret) + goto path_put_and_out; + + fsid = &__fsid; + } + /* inode held in place by reference to path; group by fget on fd */ if (mark_type == FAN_MARK_INODE) inode = path.dentry->d_inode; @@ -896,24 +1026,31 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { case FAN_MARK_ADD: if (mark_type == FAN_MARK_MOUNT) - ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); + ret = fanotify_add_vfsmount_mark(group, mnt, mask, + flags, fsid); else if (mark_type == FAN_MARK_FILESYSTEM) - ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, flags); + ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, + flags, fsid); else - ret = fanotify_add_inode_mark(group, inode, mask, flags); + ret = fanotify_add_inode_mark(group, inode, mask, + flags, fsid); break; case FAN_MARK_REMOVE: if (mark_type == FAN_MARK_MOUNT) - ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags); + ret = fanotify_remove_vfsmount_mark(group, mnt, mask, + flags); else if (mark_type == FAN_MARK_FILESYSTEM) - ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, flags); + ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, + flags); else - ret = fanotify_remove_inode_mark(group, inode, mask, flags); + ret = fanotify_remove_inode_mark(group, inode, mask, + flags); break; default: ret = -EINVAL; } +path_put_and_out: path_put(&path); fput_and_out: fdput(f); @@ -950,15 +1087,15 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 7); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); - fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); + fanotify_event_cachep = KMEM_CACHE(fanotify_event, SLAB_PANIC); if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { fanotify_perm_event_cachep = - KMEM_CACHE(fanotify_perm_event_info, SLAB_PANIC); + KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); } return 0; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ecf09b6243d9..df06f3da166c 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -328,16 +328,15 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const unsigned char *file_name, u32 cookie) { struct fsnotify_iter_info iter_info = {}; - struct super_block *sb = NULL; + struct super_block *sb = to_tell->i_sb; struct mount *mnt = NULL; - __u32 mnt_or_sb_mask = 0; + __u32 mnt_or_sb_mask = sb->s_fsnotify_mask; int ret = 0; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); if (data_is == FSNOTIFY_EVENT_PATH) { mnt = real_mount(((const struct path *)data)->mnt); - sb = mnt->mnt.mnt_sb; - mnt_or_sb_mask = mnt->mnt_fsnotify_mask | sb->s_fsnotify_mask; + mnt_or_sb_mask |= mnt->mnt_fsnotify_mask; } /* An event "on child" is not intended for a mount/sb mark */ if (mask & FS_EVENT_ON_CHILD) @@ -350,8 +349,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ - if (!to_tell->i_fsnotify_marks && - (!mnt || (!mnt->mnt_fsnotify_marks && !sb->s_fsnotify_marks))) + if (!to_tell->i_fsnotify_marks && !sb->s_fsnotify_marks && + (!mnt || !mnt->mnt_fsnotify_marks)) return 0; /* * if this is a modify event we may need to clear the ignored masks @@ -366,11 +365,11 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] = fsnotify_first_mark(&to_tell->i_fsnotify_marks); + iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] = + fsnotify_first_mark(&sb->s_fsnotify_marks); if (mnt) { iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); - iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] = - fsnotify_first_mark(&sb->s_fsnotify_marks); } /* diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 7e4578d35b61..74ae60305189 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -5,6 +5,7 @@ struct inotify_event_info { struct fsnotify_event fse; + u32 mask; int wd; u32 sync_cookie; int name_len; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index f4184b4f3815..ff30abd6a49b 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -43,11 +43,11 @@ static bool event_compare(struct fsnotify_event *old_fsn, { struct inotify_event_info *old, *new; - if (old_fsn->mask & FS_IN_IGNORED) - return false; old = INOTIFY_E(old_fsn); new = INOTIFY_E(new_fsn); - if ((old_fsn->mask == new_fsn->mask) && + if (old->mask & FS_IN_IGNORED) + return false; + if ((old->mask == new->mask) && (old_fsn->inode == new_fsn->inode) && (old->name_len == new->name_len) && (!old->name_len || !strcmp(old->name, new->name))) @@ -113,8 +113,18 @@ int inotify_handle_event(struct fsnotify_group *group, return -ENOMEM; } + /* + * We now report FS_ISDIR flag with MOVE_SELF and DELETE_SELF events + * for fanotify. inotify never reported IN_ISDIR with those events. + * It looks like an oversight, but to avoid the risk of breaking + * existing inotify programs, mask the flag out from those events. + */ + if (mask & (IN_MOVE_SELF | IN_DELETE_SELF)) + mask &= ~IN_ISDIR; + fsn_event = &event->fse; - fsnotify_init_event(fsn_event, inode, mask); + fsnotify_init_event(fsn_event, inode); + event->mask = mask; event->wd = i_mark->wd; event->sync_cookie = cookie; event->name_len = len; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 798f1253141a..e2901fbb9f76 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -189,7 +189,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, */ pad_name_len = round_event_name_len(fsn_event); inotify_event.len = pad_name_len; - inotify_event.mask = inotify_mask_to_arg(fsn_event->mask); + inotify_event.mask = inotify_mask_to_arg(event->mask); inotify_event.wd = event->wd; inotify_event.cookie = event->sync_cookie; @@ -634,7 +634,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) return ERR_PTR(-ENOMEM); } group->overflow_event = &oevent->fse; - fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW); + fsnotify_init_event(group->overflow_event, NULL); + oevent->mask = FS_Q_OVERFLOW; oevent->wd = -1; oevent->sync_cookie = 0; oevent->name_len = 0; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d2dd16cb5989..d593d4269561 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -82,6 +82,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/srcu.h> +#include <linux/ratelimit.h> #include <linux/atomic.h> @@ -481,7 +482,8 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, - unsigned int type) + unsigned int type, + __kernel_fsid_t *fsid) { struct inode *inode = NULL; struct fsnotify_mark_connector *conn; @@ -493,6 +495,11 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, INIT_HLIST_HEAD(&conn->list); conn->type = type; conn->obj = connp; + /* Cache fsid of filesystem containing the object */ + if (fsid) + conn->fsid = *fsid; + else + conn->fsid.val[0] = conn->fsid.val[1] = 0; if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) inode = igrab(fsnotify_conn_inode(conn)); /* @@ -544,7 +551,7 @@ out: */ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int type, - int allow_dups) + int allow_dups, __kernel_fsid_t *fsid) { struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; @@ -553,15 +560,36 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, if (WARN_ON(!fsnotify_valid_obj_type(type))) return -EINVAL; + + /* Backend is expected to check for zero fsid (e.g. tmpfs) */ + if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1])) + return -ENODEV; + restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); - err = fsnotify_attach_connector_to_object(connp, type); + err = fsnotify_attach_connector_to_object(connp, type, fsid); if (err) return err; goto restart; + } else if (fsid && (conn->fsid.val[0] || conn->fsid.val[1]) && + (fsid->val[0] != conn->fsid.val[0] || + fsid->val[1] != conn->fsid.val[1])) { + /* + * Backend is expected to check for non uniform fsid + * (e.g. btrfs), but maybe we missed something? + * Only allow setting conn->fsid once to non zero fsid. + * inotify and non-fid fanotify groups do not set nor test + * conn->fsid. + */ + pr_warn_ratelimited("%s: fsid mismatch on object of type %u: " + "%x.%x != %x.%x\n", __func__, conn->type, + fsid->val[0], fsid->val[1], + conn->fsid.val[0], conn->fsid.val[1]); + err = -EXDEV; + goto out_err; } /* is mark the first mark? */ @@ -606,7 +634,7 @@ out_err: */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int type, - int allow_dups) + int allow_dups, __kernel_fsid_t *fsid) { struct fsnotify_group *group = mark->group; int ret = 0; @@ -627,7 +655,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - ret = fsnotify_add_mark_list(mark, connp, type, allow_dups); + ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid); if (ret) goto err; @@ -648,13 +676,13 @@ err: } int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int type, int allow_dups) + unsigned int type, int allow_dups, __kernel_fsid_t *fsid) { int ret; struct fsnotify_group *group = mark->group; mutex_lock(&group->mark_mutex); - ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups); + ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid); mutex_unlock(&group->mark_mutex); return ret; } diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 3c3e36745f59..5f3a54d444b5 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -71,7 +71,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event) { /* Overflow events are per-group and we don't want to free them */ - if (!event || event->mask == FS_Q_OVERFLOW) + if (!event || event == group->overflow_event) return; /* * If the event is still queued, we have a problem... Do an unreliable @@ -141,6 +141,18 @@ queue: return ret; } +void fsnotify_remove_queued_event(struct fsnotify_group *group, + struct fsnotify_event *event) +{ + assert_spin_locked(&group->notification_lock); + /* + * We need to init list head for the case of overflow event so that + * check in fsnotify_add_event() works + */ + list_del_init(&event->list); + group->q_len--; +} + /* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event @@ -155,13 +167,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) event = list_first_entry(&group->notification_list, struct fsnotify_event, list); - /* - * We need to init list head for the case of overflow event so that - * check in fsnotify_add_event() works - */ - list_del_init(&event->list); - group->q_len--; - + fsnotify_remove_queued_event(group, event); return event; } @@ -194,23 +200,3 @@ void fsnotify_flush_notify(struct fsnotify_group *group) } spin_unlock(&group->notification_lock); } - -/* - * fsnotify_create_event - Allocate a new event which will be sent to each - * group's handle_event function if the group was interested in this - * particular event. - * - * @inode the inode which is supposed to receive the event (sometimes a - * parent of the inode to which the event happened. - * @mask what actually happened. - * @data pointer to the object which was actually affected - * @data_type flag indication if the data is a file, path, inode, nothing... - * @name the filename, if available - */ -void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode, - u32 mask) -{ - INIT_LIST_HEAD(&event->list); - event->inode = inode; - event->mask = mask; -} diff --git a/fs/proc/base.c b/fs/proc/base.c index 511b279ec69c..5ab1849971b4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -140,9 +140,13 @@ struct pid_entry { #define REG(NAME, MODE, fops) \ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) #define ONE(NAME, MODE, show) \ - NOD(NAME, (S_IFREG|(MODE)), \ + NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ { .proc_show = show } ) +#define ATTR(LSM, NAME, MODE) \ + NOD(NAME, (S_IFREG|(MODE)), \ + NULL, &proc_pid_attr_operations, \ + { .lsm = LSM }) /* * Count the number of hardlinks for the pid_entry table, excluding the . @@ -1206,7 +1210,7 @@ static const struct file_operations proc_oom_score_adj_operations = { .llseek = default_llseek, }; -#ifdef CONFIG_AUDITSYSCALL +#ifdef CONFIG_AUDIT #define TMPBUFLEN 11 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) @@ -2521,7 +2525,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (!task) return -ESRCH; - length = security_getprocattr(task, + length = security_getprocattr(task, PROC_I(inode)->op.lsm, (char*)file->f_path.dentry->d_name.name, &p); put_task_struct(task); @@ -2570,7 +2574,9 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (rv < 0) goto out_free; - rv = security_setprocattr(file->f_path.dentry->d_name.name, page, count); + rv = security_setprocattr(PROC_I(inode)->op.lsm, + file->f_path.dentry->d_name.name, page, + count); mutex_unlock(¤t->signal->cred_guard_mutex); out_free: kfree(page); @@ -2584,13 +2590,53 @@ static const struct file_operations proc_pid_attr_operations = { .llseek = generic_file_llseek, }; +#define LSM_DIR_OPS(LSM) \ +static int proc_##LSM##_attr_dir_iterate(struct file *filp, \ + struct dir_context *ctx) \ +{ \ + return proc_pident_readdir(filp, ctx, \ + LSM##_attr_dir_stuff, \ + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ +} \ +\ +static const struct file_operations proc_##LSM##_attr_dir_ops = { \ + .read = generic_read_dir, \ + .iterate = proc_##LSM##_attr_dir_iterate, \ + .llseek = default_llseek, \ +}; \ +\ +static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \ + struct dentry *dentry, unsigned int flags) \ +{ \ + return proc_pident_lookup(dir, dentry, \ + LSM##_attr_dir_stuff, \ + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ +} \ +\ +static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ + .lookup = proc_##LSM##_attr_dir_lookup, \ + .getattr = pid_getattr, \ + .setattr = proc_setattr, \ +} + +#ifdef CONFIG_SECURITY_SMACK +static const struct pid_entry smack_attr_dir_stuff[] = { + ATTR("smack", "current", 0666), +}; +LSM_DIR_OPS(smack); +#endif + static const struct pid_entry attr_dir_stuff[] = { - REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations), - REG("prev", S_IRUGO, proc_pid_attr_operations), - REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations), - REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), - REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), - REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + ATTR(NULL, "current", 0666), + ATTR(NULL, "prev", 0444), + ATTR(NULL, "exec", 0666), + ATTR(NULL, "fscreate", 0666), + ATTR(NULL, "keycreate", 0666), + ATTR(NULL, "sockcreate", 0666), +#ifdef CONFIG_SECURITY_SMACK + DIR("smack", 0555, + proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops), +#endif }; static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) @@ -2998,7 +3044,7 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), -#ifdef CONFIG_AUDITSYSCALL +#ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif @@ -3386,7 +3432,7 @@ static const struct pid_entry tid_base_stuff[] = { ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), -#ifdef CONFIG_AUDITSYSCALL +#ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 4fc5a9b68f76..ea575375f210 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -82,6 +82,7 @@ union proc_op { int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); + const char *lsm; }; struct proc_inode { diff --git a/fs/statfs.c b/fs/statfs.c index f0216629621d..eea7af6f2f22 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -67,6 +67,20 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) return retval; } +int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) +{ + struct kstatfs st; + int error; + + error = statfs_by_dentry(dentry, &st); + if (error) + return error; + + *fsid = st.f_fsid; + return 0; +} +EXPORT_SYMBOL(vfs_get_fsid); + int vfs_statfs(const struct path *path, struct kstatfs *buf) { int error; diff --git a/fs/udf/super.c b/fs/udf/super.c index e3d684ea3203..ffd8038ff728 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1474,6 +1474,17 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, if (lvd->integritySeqExt.extLength) udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt)); ret = 0; + + if (!sbi->s_lvid_bh) { + /* We can't generate unique IDs without a valid LVID */ + if (sb_rdonly(sb)) { + UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); + } else { + udf_warn(sb, "Damaged or missing LVID, forcing " + "readonly mount\n"); + ret = -EACCES; + } + } out_bh: brelse(bh); return ret; @@ -1943,13 +1954,24 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, return 0; } +static void udf_finalize_lvid(struct logicalVolIntegrityDesc *lvid) +{ + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts); + lvid->descTag.descCRC = cpu_to_le16( + crc_itu_t(0, (char *)lvid + sizeof(struct tag), + le16_to_cpu(lvid->descTag.descCRCLength))); + lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); +} + static void udf_open_lvid(struct super_block *sb) { struct udf_sb_info *sbi = UDF_SB(sb); struct buffer_head *bh = sbi->s_lvid_bh; struct logicalVolIntegrityDesc *lvid; struct logicalVolIntegrityDescImpUse *lvidiu; - struct timespec64 ts; if (!bh) return; @@ -1961,18 +1983,12 @@ static void udf_open_lvid(struct super_block *sb) mutex_lock(&sbi->s_alloc_mutex); lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; - ktime_get_real_ts64(&ts); - udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts); if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE) lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); else UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT); - lvid->descTag.descCRC = cpu_to_le16( - crc_itu_t(0, (char *)lvid + sizeof(struct tag), - le16_to_cpu(lvid->descTag.descCRCLength))); - - lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + udf_finalize_lvid(lvid); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); @@ -1986,7 +2002,6 @@ static void udf_close_lvid(struct super_block *sb) struct buffer_head *bh = sbi->s_lvid_bh; struct logicalVolIntegrityDesc *lvid; struct logicalVolIntegrityDescImpUse *lvidiu; - struct timespec64 ts; if (!bh) return; @@ -1998,8 +2013,6 @@ static void udf_close_lvid(struct super_block *sb) mutex_lock(&sbi->s_alloc_mutex); lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; - ktime_get_real_ts64(&ts); - udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts); if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev)) lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION); if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev)) @@ -2009,17 +2022,13 @@ static void udf_close_lvid(struct super_block *sb) if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT)) lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); - lvid->descTag.descCRC = cpu_to_le16( - crc_itu_t(0, (char *)lvid + sizeof(struct tag), - le16_to_cpu(lvid->descTag.descCRCLength))); - - lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); /* * We set buffer uptodate unconditionally here to avoid spurious * warnings from mark_buffer_dirty() when previous EIO has marked * the buffer as !uptodate */ set_buffer_uptodate(bh); + udf_finalize_lvid(lvid); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); @@ -2048,8 +2057,8 @@ u64 lvid_get_unique_id(struct super_block *sb) if (!(++uniqueID & 0xFFFFFFFF)) uniqueID += 16; lvhd->uniqueID = cpu_to_le64(uniqueID); + udf_updated_lvid(sb); mutex_unlock(&sbi->s_alloc_mutex); - mark_buffer_dirty(bh); return ret; } @@ -2320,11 +2329,17 @@ static int udf_sync_fs(struct super_block *sb, int wait) mutex_lock(&sbi->s_alloc_mutex); if (sbi->s_lvid_dirty) { + struct buffer_head *bh = sbi->s_lvid_bh; + struct logicalVolIntegrityDesc *lvid; + + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + udf_finalize_lvid(lvid); + /* * Blockdevice will be synced later so we don't have to submit * the buffer for IO */ - mark_buffer_dirty(sbi->s_lvid_bh); + mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; } mutex_unlock(&sbi->s_alloc_mutex); diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 999ad8d00d43..1ef8acf35e7d 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -339,14 +339,14 @@ xfs_ag_init_headers( { /* BNO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_allocbt_buf_ops, + .ops = &xfs_bnobt_buf_ops, .work = &xfs_bnoroot_init, .need_init = true }, { /* CNT root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_allocbt_buf_ops, + .ops = &xfs_cntbt_buf_ops, .work = &xfs_cntroot_init, .need_init = true }, @@ -361,7 +361,7 @@ xfs_ag_init_headers( { /* FINO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_inobt_buf_ops, + .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_FINO, .need_init = xfs_sb_version_hasfinobt(&mp->m_sb) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e701ebc36c06..e2ba2a3b63b2 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -281,7 +281,7 @@ xfs_ag_resv_init( */ ask = used = 0; - mp->m_inotbt_nores = true; + mp->m_finobt_nores = true; error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index b715668886a4..bc3367b8b7bb 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -568,9 +568,9 @@ xfs_agfl_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return NULL; - if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) + if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) return __this_address; - if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; /* * during growfs operations, the perag is not fully initialised, @@ -643,6 +643,7 @@ xfs_agfl_write_verify( const struct xfs_buf_ops xfs_agfl_buf_ops = { .name = "xfs_agfl", + .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) }, .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, .verify_struct = xfs_agfl_verify, @@ -2587,8 +2588,10 @@ xfs_agf_verify( return __this_address; } - if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + if (!xfs_verify_magic(bp, agf->agf_magicnum)) + return __this_address; + + if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && @@ -2670,6 +2673,7 @@ xfs_agf_write_verify( const struct xfs_buf_ops xfs_agf_buf_ops = { .name = "xfs_agf", + .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, .verify_struct = xfs_agf_verify, diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 4e59cc8a2802..9fe949f6055e 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -297,48 +297,34 @@ xfs_allocbt_verify( struct xfs_perag *pag = bp->b_pag; xfs_failaddr_t fa; unsigned int level; + xfs_btnum_t btnum = XFS_BTNUM_BNOi; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; + } /* - * magic number and level verification - * - * During growfs operations, we can't verify the exact level or owner as - * the perag is not fully initialised and hence not attached to the - * buffer. In this case, check against the maximum tree depth. + * The perag may not be attached during grow operations or fully + * initialized from the AGF during log recovery. Therefore we can only + * check against maximum tree depth from those contexts. * - * Similarly, during log recovery we will have a perag structure - * attached, but the agf information will not yet have been initialised - * from the on disk AGF. Again, we can only check against maximum limits - * in this case. + * Otherwise check against the per-tree limit. Peek at one of the + * verifier magic values to determine the type of tree we're verifying + * against. */ level = be16_to_cpu(block->bb_level); - switch (block->bb_magic) { - case cpu_to_be32(XFS_ABTB_CRC_MAGIC): - fa = xfs_btree_sblock_v5hdr_verify(bp); - if (fa) - return fa; - /* fall through */ - case cpu_to_be32(XFS_ABTB_MAGIC): - if (pag && pag->pagf_init) { - if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) - return __this_address; - } else if (level >= mp->m_ag_maxlevels) + if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) + btnum = XFS_BTNUM_CNTi; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[btnum]) return __this_address; - break; - case cpu_to_be32(XFS_ABTC_CRC_MAGIC): - fa = xfs_btree_sblock_v5hdr_verify(bp); - if (fa) - return fa; - /* fall through */ - case cpu_to_be32(XFS_ABTC_MAGIC): - if (pag && pag->pagf_init) { - if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) - return __this_address; - } else if (level >= mp->m_ag_maxlevels) - return __this_address; - break; - default: + } else if (level >= mp->m_ag_maxlevels) return __this_address; - } return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); } @@ -377,13 +363,23 @@ xfs_allocbt_write_verify( } -const struct xfs_buf_ops xfs_allocbt_buf_ops = { - .name = "xfs_allocbt", +const struct xfs_buf_ops xfs_bnobt_buf_ops = { + .name = "xfs_bnobt", + .magic = { cpu_to_be32(XFS_ABTB_MAGIC), + cpu_to_be32(XFS_ABTB_CRC_MAGIC) }, .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, .verify_struct = xfs_allocbt_verify, }; +const struct xfs_buf_ops xfs_cntbt_buf_ops = { + .name = "xfs_cntbt", + .magic = { cpu_to_be32(XFS_ABTC_MAGIC), + cpu_to_be32(XFS_ABTC_CRC_MAGIC) }, + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, + .verify_struct = xfs_allocbt_verify, +}; STATIC int xfs_bnobt_keys_inorder( @@ -448,7 +444,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_bnobt_key_diff, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_bnobt_buf_ops, .diff_two_keys = xfs_bnobt_diff_two_keys, .keys_inorder = xfs_bnobt_keys_inorder, .recs_inorder = xfs_bnobt_recs_inorder, @@ -470,7 +466,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_cntbt_key_diff, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_cntbt_buf_ops, .diff_two_keys = xfs_cntbt_diff_two_keys, .keys_inorder = xfs_cntbt_keys_inorder, .recs_inorder = xfs_cntbt_recs_inorder, diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 844ed87b1900..2dd9ee2a2e08 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1336,3 +1336,20 @@ xfs_attr_node_get(xfs_da_args_t *args) xfs_da_state_free(state); return retval; } + +/* Returns true if the attribute entry name is valid. */ +bool +xfs_attr_namecheck( + const void *name, + size_t length) +{ + /* + * MAXNAMELEN includes the trailing null, but (name/length) leave it + * out, so use >= for the length check. + */ + if (length >= MAXNAMELEN) + return false; + + /* There shouldn't be any nulls here */ + return !memchr(name, 0, length); +} diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index bdf52a333f3f..2297d8467666 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -145,6 +145,6 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); - +bool xfs_attr_namecheck(const void *name, size_t length); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2652d00842d6..1f6e3965ff74 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -245,25 +245,14 @@ xfs_attr3_leaf_verify( struct xfs_attr_leaf_entry *entries; uint32_t end; /* must be 32bit - see below */ int i; + xfs_failaddr_t fa; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - - if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) - return __this_address; + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return __this_address; - } else { - if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) - return __this_address; - } /* * In recovery there is a transient state where count == 0 is valid * because we may have transitioned an empty shortform attr to a leaf @@ -369,6 +358,8 @@ xfs_attr3_leaf_read_verify( const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { .name = "xfs_attr3_leaf", + .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC), + cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) }, .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, .verify_struct = xfs_attr3_leaf_verify, diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index d89363c6b523..65ff600a8067 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -79,6 +79,7 @@ xfs_attr3_rmt_hdr_ok( static xfs_failaddr_t xfs_attr3_rmt_verify( struct xfs_mount *mp, + struct xfs_buf *bp, void *ptr, int fsbsize, xfs_daddr_t bno) @@ -87,7 +88,7 @@ xfs_attr3_rmt_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return __this_address; - if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) + if (!xfs_verify_magic(bp, rmt->rm_magic)) return __this_address; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -131,7 +132,7 @@ __xfs_attr3_rmt_read_verify( *failaddr = __this_address; return -EFSBADCRC; } - *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); if (*failaddr) return -EFSCORRUPTED; len -= blksize; @@ -193,7 +194,7 @@ xfs_attr3_rmt_write_verify( while (len > 0) { struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; - fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -220,6 +221,7 @@ xfs_attr3_rmt_write_verify( const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { .name = "xfs_attr3_rmt", + .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) }, .verify_read = xfs_attr3_rmt_read_verify, .verify_write = xfs_attr3_rmt_write_verify, .verify_struct = xfs_attr3_rmt_verify_struct, diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 332eefa2700b..48502cb9990f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -577,42 +577,44 @@ __xfs_bmap_add_free( */ /* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. + * Convert the inode format to extent format if it currently is in btree format, + * but the extent list is small enough that it fits into the extent format. + * + * Since the extents are already in-core, all we have to do is give up the space + * for the btree root and pitch the leaf block. */ STATIC int /* error */ xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode pointer */ + struct xfs_btree_cur *cur, /* btree cursor */ int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { - /* REFERENCED */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *rblock = ifp->if_broot; struct xfs_btree_block *cblock;/* child btree block */ xfs_fsblock_t cbno; /* child block number */ xfs_buf_t *cbp; /* child block's buffer */ int error; /* error return value */ - struct xfs_ifork *ifp; /* inode fork data */ - xfs_mount_t *mp; /* mount point structure */ __be64 *pp; /* ptr to block address */ - struct xfs_btree_block *rblock;/* root btree block */ struct xfs_owner_info oinfo; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); + /* check if we actually need the extent format first: */ + if (!xfs_bmap_wants_extents(ip, whichfork)) + return 0; + + ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_flags & XFS_IFEXTENTS); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); - rblock = ifp->if_broot; ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); - *logflagsp = 0; #ifdef DEBUG XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, xfs_btree_check_lptr(cur, cbno, 1)); @@ -635,7 +637,7 @@ xfs_bmap_btree_to_extents( ASSERT(ifp->if_broot == NULL); ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; } @@ -2029,7 +2031,7 @@ done: /* * Convert an unwritten allocation to a real allocation or vice versa. */ -STATIC int /* error */ +int /* error */ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ @@ -3685,17 +3687,6 @@ xfs_trim_extent( } } -/* trim extent to within eof */ -void -xfs_trim_extent_eof( - struct xfs_bmbt_irec *irec, - struct xfs_inode *ip) - -{ - xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount, - i_size_read(VFS_I(ip)))); -} - /* * Trim the returned map to the required bounds */ @@ -4203,6 +4194,44 @@ xfs_bmapi_convert_unwritten( return 0; } +static inline xfs_extlen_t +xfs_bmapi_minleft( + struct xfs_trans *tp, + struct xfs_inode *ip, + int fork) +{ + if (tp && tp->t_firstblock != NULLFSBLOCK) + return 0; + if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE) + return 1; + return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1; +} + +/* + * Log whatever the flags say, even if error. Otherwise we might miss detecting + * a case where the data is changed, there's an error, and it's not logged so we + * don't shutdown when we should. Don't bother logging extents/btree changes if + * we converted to the other format. + */ +static void +xfs_bmapi_finish( + struct xfs_bmalloca *bma, + int whichfork, + int error) +{ + if ((bma->logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + bma->logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE) + bma->logflags &= ~xfs_ilog_fbroot(whichfork); + + if (bma->logflags) + xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags); + if (bma->cur) + xfs_btree_del_cursor(bma->cur, error); +} + /* * Map file blocks to filesystem blocks, and allocate blocks or convert the * extent state if necessary. Details behaviour is controlled by the flags @@ -4247,9 +4276,7 @@ xfs_bmapi_write( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); - ASSERT(tp != NULL || - (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) == - (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)); + ASSERT(tp != NULL); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -4282,25 +4309,12 @@ xfs_bmapi_write( XFS_STATS_INC(mp, xs_blk_mapw); - if (!tp || tp->t_firstblock == NULLFSBLOCK) { - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) - bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; - else - bma.minleft = 1; - } else { - bma.minleft = 0; - } - if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); if (error) goto error0; } - n = 0; - end = bno + len; - obno = bno; - if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got)) eof = true; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) @@ -4309,7 +4323,11 @@ xfs_bmapi_write( bma.ip = ip; bma.total = total; bma.datatype = 0; + bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + n = 0; + end = bno + len; + obno = bno; while (bno < end && n < *nmap) { bool need_alloc = false, wasdelay = false; @@ -4323,26 +4341,7 @@ xfs_bmapi_write( ASSERT(!((flags & XFS_BMAPI_CONVERT) && (flags & XFS_BMAPI_COWFORK))); - if (flags & XFS_BMAPI_DELALLOC) { - /* - * For the COW fork we can reasonably get a - * request for converting an extent that races - * with other threads already having converted - * part of it, as there converting COW to - * regular blocks is not protected using the - * IOLOCK. - */ - ASSERT(flags & XFS_BMAPI_COWFORK); - if (!(flags & XFS_BMAPI_COWFORK)) { - error = -EIO; - goto error0; - } - - if (eof || bno >= end) - break; - } else { - need_alloc = true; - } + need_alloc = true; } else if (isnullstartblock(bma.got.br_startblock)) { wasdelay = true; } @@ -4351,8 +4350,7 @@ xfs_bmapi_write( * First, deal with the hole before the allocated space * that we found, if any. */ - if ((need_alloc || wasdelay) && - !(flags & XFS_BMAPI_CONVERT_ONLY)) { + if (need_alloc || wasdelay) { bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; @@ -4420,49 +4418,130 @@ xfs_bmapi_write( } *nmap = n; - /* - * Transform from btree to extents, give it cur. - */ - if (xfs_bmap_wants_extents(ip, whichfork)) { - int tmp_logflags = 0; - - ASSERT(bma.cur); - error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, - &tmp_logflags, whichfork); - bma.logflags |= tmp_logflags; - if (error) - goto error0; - } + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, + whichfork); + if (error) + goto error0; ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || XFS_IFORK_NEXTENTS(ip, whichfork) > XFS_IFORK_MAXEXT(ip, whichfork)); - error = 0; + xfs_bmapi_finish(&bma, whichfork, 0); + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, + orig_nmap, *nmap); + return 0; error0: + xfs_bmapi_finish(&bma, whichfork, error); + return error; +} + +/* + * Convert an existing delalloc extent to real blocks based on file offset. This + * attempts to allocate the entire delalloc extent and may require multiple + * invocations to allocate the target offset if a large enough physical extent + * is not available. + */ +int +xfs_bmapi_convert_delalloc( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap, + unsigned int *seq) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmalloca bma = { NULL }; + struct xfs_trans *tp; + int error; + /* - * Log everything. Do this after conversion, there's no point in - * logging the extent records if we've converted to btree format. + * Space for the extent and indirect blocks was reserved when the + * delalloc extent was created so there's no need to do so here. */ - if ((bma.logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - bma.logflags &= ~xfs_ilog_fext(whichfork); - else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - bma.logflags &= ~xfs_ilog_fbroot(whichfork); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || + bma.got.br_startoff > offset_fsb) { + /* + * No extent found in the range we are trying to convert. This + * should only happen for the COW fork, where another thread + * might have moved the extent to the data fork in the meantime. + */ + WARN_ON_ONCE(whichfork != XFS_COW_FORK); + error = -EAGAIN; + goto out_trans_cancel; + } + /* - * Log whatever the flags say, even if error. Otherwise we might miss - * detecting a case where the data is changed, there's an error, - * and it's not logged so we don't shutdown when we should. + * If we find a real extent here we raced with another thread converting + * the extent. Just return the real extent at this offset. */ - if (bma.logflags) - xfs_trans_log_inode(tp, ip, bma.logflags); + if (!isnullstartblock(bma.got.br_startblock)) { + *imap = bma.got; + *seq = READ_ONCE(ifp->if_seq); + goto out_trans_cancel; + } + + bma.tp = tp; + bma.ip = ip; + bma.wasdel = true; + bma.offset = bma.got.br_startoff; + bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); + bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + if (whichfork == XFS_COW_FORK) + bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; - if (bma.cur) { - xfs_btree_del_cursor(bma.cur, error); + if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) + bma.prev.br_startoff = NULLFILEOFF; + + error = xfs_bmapi_allocate(&bma); + if (error) + goto out_finish; + + error = -ENOSPC; + if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) + goto out_finish; + error = -EFSCORRUPTED; + if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) + goto out_finish; + + XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); + XFS_STATS_INC(mp, xs_xstrat_quick); + + ASSERT(!isnullstartblock(bma.got.br_startblock)); + *imap = bma.got; + *seq = READ_ONCE(ifp->if_seq); + + if (whichfork == XFS_COW_FORK) { + error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, + bma.length); + if (error) + goto out_finish; } - if (!error) - xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, - orig_nmap, *nmap); + + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, + whichfork); + if (error) + goto out_finish; + + xfs_bmapi_finish(&bma, whichfork, 0); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +out_finish: + xfs_bmapi_finish(&bma, whichfork, error); +out_trans_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -4536,13 +4615,7 @@ xfs_bmapi_remap( if (error) goto error0; - if (xfs_bmap_wants_extents(ip, whichfork)) { - int tmp_logflags = 0; - - error = xfs_bmap_btree_to_extents(tp, ip, cur, - &tmp_logflags, whichfork); - logflags |= tmp_logflags; - } + error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); error0: if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) @@ -5406,24 +5479,11 @@ nodelete: error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, &tmp_logflags, whichfork); logflags |= tmp_logflags; - if (error) - goto error0; - } - /* - * transform from btree to extents, give it cur - */ - else if (xfs_bmap_wants_extents(ip, whichfork)) { - ASSERT(cur != NULL); - error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, + } else { + error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); - logflags |= tmp_logflags; - if (error) - goto error0; } - /* - * transform from extents to local? - */ - error = 0; + error0: /* * Log everything. Do this after conversion, there's no point in diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 09d3ea97cc15..8f597f9abdbe 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -95,12 +95,6 @@ struct xfs_extent_free_item /* Map something in the CoW fork. */ #define XFS_BMAPI_COWFORK 0x200 -/* Only convert delalloc space, don't allocate entirely new extents */ -#define XFS_BMAPI_DELALLOC 0x400 - -/* Only convert unwritten extents, don't allocate new blocks */ -#define XFS_BMAPI_CONVERT_ONLY 0x800 - /* Skip online discard of freed extents */ #define XFS_BMAPI_NODISCARD 0x1000 @@ -117,8 +111,6 @@ struct xfs_extent_free_item { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ - { XFS_BMAPI_DELALLOC, "DELALLOC" }, \ - { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ { XFS_BMAPI_NORMAP, "NORMAP" } @@ -181,7 +173,6 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); -void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); @@ -228,6 +219,13 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); +int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, + xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap, + unsigned int *seq); +int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, + struct xfs_inode *ip, int whichfork, + struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, + struct xfs_bmbt_irec *new, int *logflagsp); static inline void xfs_bmap_add_free( diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index cdb74d2e2a43..aff82ed112c9 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -416,8 +416,10 @@ xfs_bmbt_verify( xfs_failaddr_t fa; unsigned int level; - switch (block->bb_magic) { - case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { /* * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. @@ -425,11 +427,6 @@ xfs_bmbt_verify( fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); if (fa) return fa; - /* fall through */ - case cpu_to_be32(XFS_BMAP_MAGIC): - break; - default: - return __this_address; } /* @@ -481,6 +478,8 @@ xfs_bmbt_write_verify( const struct xfs_buf_ops xfs_bmbt_buf_ops = { .name = "xfs_bmbt", + .magic = { cpu_to_be32(XFS_BMAP_MAGIC), + cpu_to_be32(XFS_BMAP_CRC_MAGIC) }, .verify_read = xfs_bmbt_read_verify, .verify_write = xfs_bmbt_write_verify, .verify_struct = xfs_bmbt_verify, diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 376bee94b5dd..e2737e2ac2ae 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -116,6 +116,34 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_zone_free(xfs_da_state_zone, state); } +/* + * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only + * accessible on v5 filesystems. This header format is common across da node, + * attr leaf and dir leaf blocks. + */ +xfs_failaddr_t +xfs_da3_blkinfo_verify( + struct xfs_buf *bp, + struct xfs_da3_blkinfo *hdr3) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_blkinfo *hdr = &hdr3->hdr; + + if (!xfs_verify_magic16(bp, hdr->magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return __this_address; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) + return __this_address; + } + + return NULL; +} + static xfs_failaddr_t xfs_da3_node_verify( struct xfs_buf *bp) @@ -124,27 +152,16 @@ xfs_da3_node_verify( struct xfs_da_intnode *hdr = bp->b_addr; struct xfs_da3_icnode_hdr ichdr; const struct xfs_dir_ops *ops; + xfs_failaddr_t fa; ops = xfs_dir_get_ops(mp, NULL); ops->node_hdr_from_disk(&ichdr, hdr); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - - if (ichdr.magic != XFS_DA3_NODE_MAGIC) - return __this_address; + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return __this_address; - } else { - if (ichdr.magic != XFS_DA_NODE_MAGIC) - return __this_address; - } if (ichdr.level == 0) return __this_address; if (ichdr.level > XFS_DA_NODE_MAXDEPTH) @@ -257,6 +274,8 @@ xfs_da3_node_verify_struct( const struct xfs_buf_ops xfs_da3_node_buf_ops = { .name = "xfs_da3_node", + .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC), + cpu_to_be16(XFS_DA3_NODE_MAGIC) }, .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, .verify_struct = xfs_da3_node_verify_struct, diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 5d5bf3bffc78..ae654e06b2fb 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog); } +xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp, + struct xfs_da3_blkinfo *hdr3); + #endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 229152cd1a24..156ce95c9c45 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -703,3 +703,20 @@ xfs_dir2_shrink_inode( xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); return 0; } + +/* Returns true if the directory entry name is valid. */ +bool +xfs_dir2_namecheck( + const void *name, + size_t length) +{ + /* + * MAXNAMELEN includes the trailing null, but (name/length) leave it + * out, so use >= for the length check. + */ + if (length >= MAXNAMELEN) + return false; + + /* There shouldn't be any slashes or nulls here */ + return !memchr(name, '/', length) && !memchr(name, 0, length); +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index c3e3f6b813d8..f54244779492 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype); void *xfs_dir3_data_endp(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr); +bool xfs_dir2_namecheck(const void *name, size_t length); #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 30ed5919da72..b7d6d78f4ce2 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -53,18 +53,16 @@ xfs_dir3_block_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + if (!xfs_verify_magic(bp, hdr3->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) - return __this_address; } return __xfs_dir3_data_check(NULL, bp); } @@ -112,6 +110,8 @@ xfs_dir3_block_write_verify( const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .name = "xfs_dir3_block", + .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC), + cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) }, .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, .verify_struct = xfs_dir3_block_verify, diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 01162c62ec8f..b7b9ce002cb9 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -252,18 +252,16 @@ xfs_dir3_data_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + if (!xfs_verify_magic(bp, hdr3->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) - return __this_address; } return __xfs_dir3_data_check(NULL, bp); } @@ -339,6 +337,8 @@ xfs_dir3_data_write_verify( const struct xfs_buf_ops xfs_dir3_data_buf_ops = { .name = "xfs_dir3_data", + .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC), + cpu_to_be32(XFS_DIR3_DATA_MAGIC) }, .verify_read = xfs_dir3_data_read_verify, .verify_write = xfs_dir3_data_write_verify, .verify_struct = xfs_dir3_data_verify, @@ -346,6 +346,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = { static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .name = "xfs_dir3_data_reada", + .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC), + cpu_to_be32(XFS_DIR3_DATA_MAGIC) }, .verify_read = xfs_dir3_data_reada_verify, .verify_write = xfs_dir3_data_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 1728a3e6f5cf..9a3767818c50 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -142,41 +142,22 @@ xfs_dir3_leaf_check_int( */ static xfs_failaddr_t xfs_dir3_leaf_verify( - struct xfs_buf *bp, - uint16_t magic) + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_leaf *leaf = bp->b_addr; + xfs_failaddr_t fa; - ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); - - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - uint16_t magic3; - - magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC - : XFS_DIR3_LEAFN_MAGIC; - - if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) - return __this_address; - if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn))) - return __this_address; - } else { - if (leaf->hdr.info.magic != cpu_to_be16(magic)) - return __this_address; - } + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); } static void -__read_verify( - struct xfs_buf *bp, - uint16_t magic) +xfs_dir3_leaf_read_verify( + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; xfs_failaddr_t fa; @@ -185,23 +166,22 @@ __read_verify( !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { - fa = xfs_dir3_leaf_verify(bp, magic); + fa = xfs_dir3_leaf_verify(bp); if (fa) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } static void -__write_verify( - struct xfs_buf *bp, - uint16_t magic) +xfs_dir3_leaf_write_verify( + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; - fa = xfs_dir3_leaf_verify(bp, magic); + fa = xfs_dir3_leaf_verify(bp); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -216,60 +196,22 @@ __write_verify( xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); } -static xfs_failaddr_t -xfs_dir3_leaf1_verify( - struct xfs_buf *bp) -{ - return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static void -xfs_dir3_leaf1_read_verify( - struct xfs_buf *bp) -{ - __read_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static void -xfs_dir3_leaf1_write_verify( - struct xfs_buf *bp) -{ - __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static xfs_failaddr_t -xfs_dir3_leafn_verify( - struct xfs_buf *bp) -{ - return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir3_leafn_read_verify( - struct xfs_buf *bp) -{ - __read_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir3_leafn_write_verify( - struct xfs_buf *bp) -{ - __write_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { .name = "xfs_dir3_leaf1", - .verify_read = xfs_dir3_leaf1_read_verify, - .verify_write = xfs_dir3_leaf1_write_verify, - .verify_struct = xfs_dir3_leaf1_verify, + .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC), + cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) }, + .verify_read = xfs_dir3_leaf_read_verify, + .verify_write = xfs_dir3_leaf_write_verify, + .verify_struct = xfs_dir3_leaf_verify, }; const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { .name = "xfs_dir3_leafn", - .verify_read = xfs_dir3_leafn_read_verify, - .verify_write = xfs_dir3_leafn_write_verify, - .verify_struct = xfs_dir3_leafn_verify, + .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC), + cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) }, + .verify_read = xfs_dir3_leaf_read_verify, + .verify_write = xfs_dir3_leaf_write_verify, + .verify_struct = xfs_dir3_leaf_verify, }; int diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index f1bb3434f51c..3b03703c5c3d 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -87,20 +87,18 @@ xfs_dir3_free_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_free_hdr *hdr = bp->b_addr; + if (!xfs_verify_magic(bp, hdr->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) - return __this_address; } /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ @@ -151,6 +149,8 @@ xfs_dir3_free_write_verify( const struct xfs_buf_ops xfs_dir3_free_buf_ops = { .name = "xfs_dir3_free", + .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC), + cpu_to_be32(XFS_DIR3_FREE_MAGIC) }, .verify_read = xfs_dir3_free_read_verify, .verify_write = xfs_dir3_free_write_verify, .verify_struct = xfs_dir3_free_verify, diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index d293f371dd54..fb5bd9a804f6 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -277,6 +277,8 @@ xfs_dquot_buf_write_verify( const struct xfs_buf_ops xfs_dquot_buf_ops = { .name = "xfs_dquot", + .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, .verify_struct = xfs_dquot_buf_verify_struct, @@ -284,6 +286,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = { const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { .name = "xfs_dquot_ra", + .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_readahead_verify, .verify_write = xfs_dquot_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 66077a105cbb..79e6c4fb1d8a 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -54,7 +54,8 @@ #define XFS_ERRTAG_BUF_LRU_REF 31 #define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33 -#define XFS_ERRTAG_MAX 34 +#define XFS_ERRTAG_IUNLINK_FALLBACK 34 +#define XFS_ERRTAG_MAX 35 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -93,5 +94,6 @@ #define XFS_RANDOM_BUF_LRU_REF 2 #define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 #define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 +#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index d32152fc8a6c..fe9898875097 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2508,7 +2508,7 @@ xfs_agi_verify( /* * Validate the magic number of the agi block. */ - if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) + if (!xfs_verify_magic(bp, agi->agi_magicnum)) return __this_address; if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return __this_address; @@ -2582,6 +2582,7 @@ xfs_agi_write_verify( const struct xfs_buf_ops xfs_agi_buf_ops = { .name = "xfs_agi", + .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) }, .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, .verify_struct = xfs_agi_verify, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9b25e7a0df47..1080381ff243 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -124,7 +124,7 @@ xfs_finobt_alloc_block( union xfs_btree_ptr *new, int *stat) { - if (cur->bc_mp->m_inotbt_nores) + if (cur->bc_mp->m_finobt_nores) return xfs_inobt_alloc_block(cur, start, new, stat); return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_METADATA); @@ -154,7 +154,7 @@ xfs_finobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - if (cur->bc_mp->m_inotbt_nores) + if (cur->bc_mp->m_finobt_nores) return xfs_inobt_free_block(cur, bp); return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA); } @@ -260,6 +260,9 @@ xfs_inobt_verify( xfs_failaddr_t fa; unsigned int level; + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + /* * During growfs operations, we can't verify the exact owner as the * perag is not fully initialised and hence not attached to the buffer. @@ -270,18 +273,10 @@ xfs_inobt_verify( * but beware of the landmine (i.e. need to check pag->pagi_init) if we * ever do. */ - switch (block->bb_magic) { - case cpu_to_be32(XFS_IBT_CRC_MAGIC): - case cpu_to_be32(XFS_FIBT_CRC_MAGIC): + if (xfs_sb_version_hascrc(&mp->m_sb)) { fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) return fa; - /* fall through */ - case cpu_to_be32(XFS_IBT_MAGIC): - case cpu_to_be32(XFS_FIBT_MAGIC): - break; - default: - return __this_address; } /* level verification */ @@ -328,6 +323,16 @@ xfs_inobt_write_verify( const struct xfs_buf_ops xfs_inobt_buf_ops = { .name = "xfs_inobt", + .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) }, + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, + .verify_struct = xfs_inobt_verify, +}; + +const struct xfs_buf_ops xfs_finobt_buf_ops = { + .name = "xfs_finobt", + .magic = { cpu_to_be32(XFS_FIBT_MAGIC), + cpu_to_be32(XFS_FIBT_CRC_MAGIC) }, .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, .verify_struct = xfs_inobt_verify, @@ -389,7 +394,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, - .buf_ops = &xfs_inobt_buf_ops, + .buf_ops = &xfs_finobt_buf_ops, .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 771dd072015d..bc690f2409fa 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -614,16 +614,15 @@ xfs_iext_realloc_root( } /* - * Increment the sequence counter if we are on a COW fork. This allows - * the writeback code to skip looking for a COW extent if the COW fork - * hasn't changed. We use WRITE_ONCE here to ensure the update to the - * sequence counter is seen before the modifications to the extent - * tree itself take effect. + * Increment the sequence counter on extent tree changes. If we are on a COW + * fork, this allows the writeback code to skip looking for a COW extent if the + * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the + * sequence counter is seen before the modifications to the extent tree itself + * take effect. */ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) { - if (state & BMAP_COWFORK) - WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); + WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); } void diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 09d9c8cfa4a0..e021d5133ccb 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -97,10 +97,9 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); - di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && - (unlinked_ino == NULLAGINO || - xfs_verify_agino(mp, agno, unlinked_ino)); + xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { @@ -147,12 +146,16 @@ xfs_inode_buf_write_verify( const struct xfs_buf_ops xfs_inode_buf_ops = { .name = "xfs_inode", + .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_read_verify, .verify_write = xfs_inode_buf_write_verify, }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { - .name = "xxfs_inode_ra", + .name = "xfs_inode_ra", + .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 60361d2d74a1..00c62ce170d0 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -14,7 +14,7 @@ struct xfs_dinode; */ struct xfs_ifork { int if_bytes; /* bytes in if_u1 */ - unsigned int if_seq; /* cow fork mod counter */ + unsigned int if_seq; /* fork mod counter */ struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index d9eab657b63e..6f47ab876d90 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -209,7 +209,7 @@ xfs_refcountbt_verify( xfs_failaddr_t fa; unsigned int level; - if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) + if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (!xfs_sb_version_hasreflink(&mp->m_sb)) @@ -264,6 +264,7 @@ xfs_refcountbt_write_verify( const struct xfs_buf_ops xfs_refcountbt_buf_ops = { .name = "xfs_refcountbt", + .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) }, .verify_read = xfs_refcountbt_read_verify, .verify_write = xfs_refcountbt_write_verify, .verify_struct = xfs_refcountbt_verify, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index f79cf040d745..5738e11055e6 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -310,7 +310,7 @@ xfs_rmapbt_verify( * from the on disk AGF. Again, we can only check against maximum limits * in this case. */ - if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC)) + if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) @@ -365,6 +365,7 @@ xfs_rmapbt_write_verify( const struct xfs_buf_ops xfs_rmapbt_buf_ops = { .name = "xfs_rmapbt", + .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) }, .verify_read = xfs_rmapbt_read_verify, .verify_write = xfs_rmapbt_write_verify, .verify_struct = xfs_rmapbt_verify, diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index b5a82acd7dfe..77a3a4085de3 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -225,10 +225,11 @@ xfs_validate_sb_common( struct xfs_buf *bp, struct xfs_sb *sbp) { + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); uint32_t agcount = 0; uint32_t rem; - if (sbp->sb_magicnum != XFS_SB_MAGIC) { + if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, "bad magic number"); return -EWRONGFS; } @@ -781,12 +782,14 @@ out_error: const struct xfs_buf_ops xfs_sb_buf_ops = { .name = "xfs_sb", + .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .name = "xfs_sb_quiet", + .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; @@ -874,7 +877,7 @@ xfs_initialize_perag_data( uint64_t bfreelst = 0; uint64_t btree = 0; uint64_t fdblocks; - int error; + int error = 0; for (index = 0; index < agcount; index++) { /* diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 1c5debe748f0..4e909791aeac 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; -extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_bnobt_buf_ops; +extern const struct xfs_buf_ops xfs_cntbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; @@ -36,6 +37,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_finobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 77d80106f989..a0ccc253c43d 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -95,7 +95,7 @@ xfs_symlink_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return __this_address; - if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -159,6 +159,7 @@ xfs_symlink_write_verify( const struct xfs_buf_ops xfs_symlink_buf_ops = { .name = "xfs_symlink", + .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) }, .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, .verify_struct = xfs_symlink_verify, diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 3306fc42cfad..de310712dd6d 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -116,6 +116,19 @@ xfs_verify_agino( } /* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata, or is NULLAGINO. + */ +bool +xfs_verify_agino_or_null( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino); +} + +/* * Verify that an FS inode number pointer neither points outside the * filesystem nor points at static AG metadata. */ @@ -204,3 +217,14 @@ xfs_verify_icount( xfs_icount_range(mp, &min, &max); return icount >= min && icount <= max; } + +/* Sanity-checking of dir/attr block offsets. */ +bool +xfs_verify_dablk( + struct xfs_mount *mp, + xfs_fileoff_t dabno) +{ + xfs_dablk_t max_dablk = -1U; + + return dabno <= max_dablk; +} diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 8f02855a019a..c5a25403b4db 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -183,10 +183,13 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t *first, xfs_agino_t *last); bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino); +bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t agino); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount); +bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off); #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 90955ab1e895..ddf06bfaa29d 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -399,7 +399,7 @@ xchk_agf_xref_cntbt( if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur)) return; if (!have) { - if (agf->agf_freeblks != be32_to_cpu(0)) + if (agf->agf_freeblks != cpu_to_be32(0)) xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp); return; } @@ -864,19 +864,17 @@ xchk_agi( /* Check inode pointers */ agino = be32_to_cpu(agi->agi_newino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); agino = be32_to_cpu(agi->agi_dirino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check unlinked inode buckets */ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { agino = be32_to_cpu(agi->agi_unlinked[i]); - if (agino == NULLAGINO) - continue; - if (!xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 03d1e15cceba..64e31f87d490 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -341,23 +341,19 @@ xrep_agf( struct xrep_find_ag_btree fab[XREP_AGF_MAX] = { [XREP_AGF_BNOBT] = { .rmap_owner = XFS_RMAP_OWN_AG, - .buf_ops = &xfs_allocbt_buf_ops, - .magic = XFS_ABTB_CRC_MAGIC, + .buf_ops = &xfs_bnobt_buf_ops, }, [XREP_AGF_CNTBT] = { .rmap_owner = XFS_RMAP_OWN_AG, - .buf_ops = &xfs_allocbt_buf_ops, - .magic = XFS_ABTC_CRC_MAGIC, + .buf_ops = &xfs_cntbt_buf_ops, }, [XREP_AGF_RMAPBT] = { .rmap_owner = XFS_RMAP_OWN_AG, .buf_ops = &xfs_rmapbt_buf_ops, - .magic = XFS_RMAP_CRC_MAGIC, }, [XREP_AGF_REFCOUNTBT] = { .rmap_owner = XFS_RMAP_OWN_REFC, .buf_ops = &xfs_refcountbt_buf_ops, - .magic = XFS_REFC_CRC_MAGIC, }, [XREP_AGF_END] = { .buf_ops = NULL, @@ -875,12 +871,10 @@ xrep_agi( [XREP_AGI_INOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, .buf_ops = &xfs_inobt_buf_ops, - .magic = XFS_IBT_CRC_MAGIC, }, [XREP_AGI_FINOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_inobt_buf_ops, - .magic = XFS_FIBT_CRC_MAGIC, + .buf_ops = &xfs_finobt_buf_ops, }, [XREP_AGI_END] = { .buf_ops = NULL diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 81d5e90547a1..dce74ec57038 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -82,12 +82,23 @@ xchk_xattr_listent( sx = container_of(context, struct xchk_xattr, context); + if (xchk_should_terminate(sx->sc, &error)) { + context->seen_enough = 1; + return; + } + if (flags & XFS_ATTR_INCOMPLETE) { /* Incomplete attr key, just mark the inode for preening. */ xchk_ino_set_preen(sx->sc, context->dp->i_ino); return; } + /* Does this name make sense? */ + if (!xfs_attr_namecheck(name, namelen)) { + xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); + return; + } + args.flags = ATTR_KERNOTIME; if (flags & XFS_ATTR_ROOT) args.flags |= ATTR_ROOT; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index e1d11f3223e3..a703cd58a90e 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -281,6 +281,31 @@ xchk_bmap_extent_xref( xchk_ag_free(info->sc, &info->sc->sa); } +/* + * Directories and attr forks should never have blocks that can't be addressed + * by a xfs_dablk_t. + */ +STATIC void +xchk_bmap_dirattr_extent( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t off; + + if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK) + return; + + if (!xfs_verify_dablk(mp, irec->br_startoff)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + off = irec->br_startoff + irec->br_blockcount - 1; + if (!xfs_verify_dablk(mp, off)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, off); +} + /* Scrub a single extent record. */ STATIC int xchk_bmap_extent( @@ -305,6 +330,8 @@ xchk_bmap_extent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + xchk_bmap_dirattr_extent(ip, info, irec); + /* There should never be a "hole" extent in either extent list. */ if (irec->br_startblock == HOLESTARTBLOCK) xchk_fblock_set_corrupt(info->sc, info->whichfork, diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index cd3e4d768a18..a38a22785a1a 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -129,6 +129,12 @@ xchk_dir_actor( goto out; } + /* Does this name make sense? */ + if (!xfs_dir2_namecheck(name, namelen)) { + xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); + goto out; + } + if (!strncmp(".", name, namelen)) { /* If this is "." then check that the inum matches the dir. */ if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 882dc56c5c21..700114f79a7d 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -47,6 +47,12 @@ xchk_setup_ag_iallocbt( struct xchk_iallocbt { /* Number of inodes we see while scanning inobt. */ unsigned long long inodes; + + /* Expected next startino, for big block filesystems. */ + xfs_agino_t next_startino; + + /* Expected end of the current inode cluster. */ + xfs_agino_t next_cluster_ino; }; /* @@ -128,41 +134,57 @@ xchk_iallocbt_freecount( return hweight64(freemask); } -/* Check a particular inode with ir_free. */ +/* + * Check that an inode's allocation status matches ir_free in the inobt + * record. First we try querying the in-core inode state, and if the inode + * isn't loaded we examine the on-disk inode directly. + * + * Since there can be 1:M and M:1 mappings between inobt records and inode + * clusters, we pass in the inode location information as an inobt record; + * the index of an inode cluster within the inobt record (as well as the + * cluster buffer itself); and the index of the inode within the cluster. + * + * @irec is the inobt record. + * @irec_ino is the inode offset from the start of the record. + * @dip is the on-disk inode. + */ STATIC int -xchk_iallocbt_check_cluster_freemask( +xchk_iallocbt_check_cluster_ifree( struct xchk_btree *bs, - xfs_ino_t fsino, - xfs_agino_t chunkino, - xfs_agino_t clusterino, struct xfs_inobt_rec_incore *irec, - struct xfs_buf *bp) + unsigned int irec_ino, + struct xfs_dinode *dip) { - struct xfs_dinode *dip; struct xfs_mount *mp = bs->cur->bc_mp; - bool inode_is_free = false; + xfs_ino_t fsino; + xfs_agino_t agino; + bool irec_free; + bool ino_inuse; bool freemask_ok; - bool inuse; int error = 0; if (xchk_should_terminate(bs->sc, &error)) return error; - dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize); + /* + * Given an inobt record and the offset of an inode from the start of + * the record, compute which fs inode we're talking about. + */ + agino = irec->ir_startino + irec_ino; + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || - (dip->di_version >= 3 && - be64_to_cpu(dip->di_ino) != fsino + clusterino)) { + (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); goto out; } - if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino)) - inode_is_free = true; - error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, - fsino + clusterino, &inuse); + error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino, + &ino_inuse); if (error == -ENODATA) { /* Not cached, just read the disk buffer */ - freemask_ok = inode_is_free ^ !!(dip->di_mode); + freemask_ok = irec_free ^ !!(dip->di_mode); if (!bs->sc->try_harder && !freemask_ok) return -EDEADLOCK; } else if (error < 0) { @@ -174,7 +196,7 @@ xchk_iallocbt_check_cluster_freemask( goto out; } else { /* Inode is all there. */ - freemask_ok = inode_is_free ^ inuse; + freemask_ok = irec_free ^ ino_inuse; } if (!freemask_ok) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); @@ -182,86 +204,221 @@ out: return 0; } -/* Make sure the free mask is consistent with what the inodes think. */ +/* + * Check that the holemask and freemask of a hypothetical inode cluster match + * what's actually on disk. If sparse inodes are enabled, the cluster does + * not actually have to map to inodes if the corresponding holemask bit is set. + * + * @cluster_base is the first inode in the cluster within the @irec. + */ STATIC int -xchk_iallocbt_check_freemask( +xchk_iallocbt_check_cluster( struct xchk_btree *bs, - struct xfs_inobt_rec_incore *irec) + struct xfs_inobt_rec_incore *irec, + unsigned int cluster_base) { struct xfs_imap imap; struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_dinode *dip; - struct xfs_buf *bp; - xfs_ino_t fsino; - xfs_agino_t nr_inodes; - xfs_agino_t agino; - xfs_agino_t chunkino; - xfs_agino_t clusterino; + struct xfs_buf *cluster_bp; + unsigned int nr_inodes; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t agbno; - uint16_t holemask; + unsigned int cluster_index; + uint16_t cluster_mask = 0; uint16_t ir_holemask; int error = 0; - /* Make sure the freemask matches the inode records. */ - nr_inodes = mp->m_inodes_per_cluster; - - for (agino = irec->ir_startino; - agino < irec->ir_startino + XFS_INODES_PER_CHUNK; - agino += mp->m_inodes_per_cluster) { - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); - chunkino = agino - irec->ir_startino; - agbno = XFS_AGINO_TO_AGBNO(mp, agino); - - /* Compute the holemask mask for this cluster. */ - for (clusterino = 0, holemask = 0; clusterino < nr_inodes; - clusterino += XFS_INODES_PER_HOLEMASK_BIT) - holemask |= XFS_INOBT_MASK((chunkino + clusterino) / - XFS_INODES_PER_HOLEMASK_BIT); - - /* The whole cluster must be a hole or not a hole. */ - ir_holemask = (irec->ir_holemask & holemask); - if (ir_holemask != holemask && ir_holemask != 0) { + nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK, + mp->m_inodes_per_cluster); + + /* Map this inode cluster */ + agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base); + + /* Compute a bitmask for this cluster that can be used for holemask. */ + for (cluster_index = 0; + cluster_index < nr_inodes; + cluster_index += XFS_INODES_PER_HOLEMASK_BIT) + cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) / + XFS_INODES_PER_HOLEMASK_BIT); + + /* + * Map the first inode of this cluster to a buffer and offset. + * Be careful about inobt records that don't align with the start of + * the inode buffer when block sizes are large enough to hold multiple + * inode chunks. When this happens, cluster_base will be zero but + * ir_startino can be large enough to make im_boffset nonzero. + */ + ir_holemask = (irec->ir_holemask & cluster_mask); + imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); + imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino); + + if (imap.im_boffset != 0 && cluster_base != 0) { + ASSERT(imap.im_boffset == 0 || cluster_base == 0); + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino, + imap.im_blkno, imap.im_len, cluster_base, nr_inodes, + cluster_mask, ir_holemask, + XFS_INO_TO_OFFSET(mp, irec->ir_startino + + cluster_base)); + + /* The whole cluster must be a hole or not a hole. */ + if (ir_holemask != cluster_mask && ir_holemask != 0) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + /* If any part of this is a hole, skip it. */ + if (ir_holemask) { + xchk_xref_is_not_owned_by(bs->sc, agbno, + mp->m_blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + return 0; + } + + xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + + /* Grab the inode cluster buffer. */ + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, + 0, 0); + if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error)) + return error; + + /* Check free status of each inode within this cluster. */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + struct xfs_dinode *dip; + + if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - continue; + break; } - /* If any part of this is a hole, skip it. */ - if (ir_holemask) { - xchk_xref_is_not_owned_by(bs->sc, agbno, - mp->m_blocks_per_cluster, - &XFS_RMAP_OINFO_INODES); - continue; + dip = xfs_buf_offset(cluster_bp, imap.im_boffset); + error = xchk_iallocbt_check_cluster_ifree(bs, irec, + cluster_base + cluster_index, dip); + if (error) + break; + imap.im_boffset += mp->m_sb.sb_inodesize; + } + + xfs_trans_brelse(bs->cur->bc_tp, cluster_bp); + return error; +} + +/* + * For all the inode clusters that could map to this inobt record, make sure + * that the holemask makes sense and that the allocation status of each inode + * matches the freemask. + */ +STATIC int +xchk_iallocbt_check_clusters( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + unsigned int cluster_base; + int error = 0; + + /* + * For the common case where this inobt record maps to multiple inode + * clusters this will call _check_cluster for each cluster. + * + * For the case that multiple inobt records map to a single cluster, + * this will call _check_cluster once. + */ + for (cluster_base = 0; + cluster_base < XFS_INODES_PER_CHUNK; + cluster_base += bs->sc->mp->m_inodes_per_cluster) { + error = xchk_iallocbt_check_cluster(bs, irec, cluster_base); + if (error) + break; + } + + return error; +} + +/* + * Make sure this inode btree record is aligned properly. Because a fs block + * contains multiple inodes, we check that the inobt record is aligned to the + * correct inode, not just the correct block on disk. This results in a finer + * grained corruption check. + */ +STATIC void +xchk_iallocbt_rec_alignment( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + struct xfs_mount *mp = bs->sc->mp; + struct xchk_iallocbt *iabt = bs->private; + + /* + * finobt records have different positioning requirements than inobt + * records: each finobt record must have a corresponding inobt record. + * That is checked in the xref function, so for now we only catch the + * obvious case where the record isn't at all aligned properly. + * + * Note that if a fs block contains more than a single chunk of inodes, + * we will have finobt records only for those chunks containing free + * inodes, and therefore expect chunk alignment of finobt records. + * Otherwise, we expect that the finobt record is aligned to the + * cluster alignment as told by the superblock. + */ + if (bs->cur->bc_btnum == XFS_BTNUM_FINO) { + unsigned int imask; + + imask = min_t(unsigned int, XFS_INODES_PER_CHUNK, + mp->m_cluster_align_inodes) - 1; + if (irec->ir_startino & imask) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } + + if (iabt->next_startino != NULLAGINO) { + /* + * We're midway through a cluster of inodes that is mapped by + * multiple inobt records. Did we get the record for the next + * irec in the sequence? + */ + if (irec->ir_startino != iabt->next_startino) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; } - xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster, - &XFS_RMAP_OINFO_INODES); + iabt->next_startino += XFS_INODES_PER_CHUNK; - /* Grab the inode cluster buffer. */ - imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno, - agbno); - imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); - imap.im_boffset = 0; - - error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, - &dip, &bp, 0, 0); - if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, - &error)) - continue; - - /* Which inodes are free? */ - for (clusterino = 0; clusterino < nr_inodes; clusterino++) { - error = xchk_iallocbt_check_cluster_freemask(bs, - fsino, chunkino, clusterino, irec, bp); - if (error) { - xfs_trans_brelse(bs->cur->bc_tp, bp); - return error; - } + /* Are we done with the cluster? */ + if (iabt->next_startino >= iabt->next_cluster_ino) { + iabt->next_startino = NULLAGINO; + iabt->next_cluster_ino = NULLAGINO; } + return; + } + + /* inobt records must be aligned to cluster and inoalignmnt size. */ + if (irec->ir_startino & (mp->m_cluster_align_inodes - 1)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } - xfs_trans_brelse(bs->cur->bc_tp, bp); + if (irec->ir_startino & (mp->m_inodes_per_cluster - 1)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; } - return error; + if (mp->m_inodes_per_cluster <= XFS_INODES_PER_CHUNK) + return; + + /* + * If this is the start of an inode cluster that can be mapped by + * multiple inobt records, the next inobt record must follow exactly + * after this one. + */ + iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK; + iabt->next_cluster_ino = irec->ir_startino + mp->m_inodes_per_cluster; } /* Scrub an inobt/finobt record. */ @@ -276,7 +433,6 @@ xchk_iallocbt_rec( uint64_t holes; xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agino_t agino; - xfs_agblock_t agbno; xfs_extlen_t len; int holecount; int i; @@ -303,11 +459,9 @@ xchk_iallocbt_rec( goto out; } - /* Make sure this record is aligned to cluster and inoalignmnt size. */ - agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino); - if ((agbno & (mp->m_cluster_align - 1)) || - (agbno & (mp->m_blocks_per_cluster - 1))) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + xchk_iallocbt_rec_alignment(bs, &irec); + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; iabt->inodes += irec.ir_count; @@ -320,7 +474,7 @@ xchk_iallocbt_rec( if (!xchk_iallocbt_chunk(bs, &irec, agino, len)) goto out; - goto check_freemask; + goto check_clusters; } /* Check each chunk of a sparse inode cluster. */ @@ -346,8 +500,8 @@ xchk_iallocbt_rec( holecount + irec.ir_count != XFS_INODES_PER_CHUNK) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); -check_freemask: - error = xchk_iallocbt_check_freemask(bs, &irec); +check_clusters: + error = xchk_iallocbt_check_clusters(bs, &irec); if (error) goto out; @@ -429,6 +583,8 @@ xchk_iallocbt( struct xfs_btree_cur *cur; struct xchk_iallocbt iabt = { .inodes = 0, + .next_startino = NULLAGINO, + .next_cluster_ino = NULLAGINO, }; int error; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 6acf1bfa0bfe..f28f4bad317b 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -743,7 +743,8 @@ xrep_findroot_block( /* Ensure the block magic matches the btree type we're looking for. */ btblock = XFS_BUF_TO_BLOCK(bp); - if (be32_to_cpu(btblock->bb_magic) != fab->magic) + ASSERT(fab->buf_ops->magic[1] != 0); + if (btblock->bb_magic != fab->buf_ops->magic[1]) goto out; /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index f2fc18bb7605..d990314eb08b 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -42,9 +42,6 @@ struct xrep_find_ag_btree { /* in: buffer ops */ const struct xfs_buf_ops *buf_ops; - /* in: magic number of the btree */ - uint32_t magic; - /* out: the highest btree block found and the tree height */ xfs_agblock_t root; unsigned int height; diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 665d4bbb17cc..dbe115b075f7 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -141,9 +141,8 @@ xchk_xref_is_used_rt_space( startext = fsbno; endext = fsbno + len - 1; do_div(startext, sc->mp->m_sb.sb_rextsize); - if (do_div(endext, sc->mp->m_sb.sb_rextsize)) - endext++; - extcount = endext - startext; + do_div(endext, sc->mp->m_sb.sb_rextsize); + extcount = endext - startext + 1; xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, &is_free); diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 8344b14031ef..3c83e8b3b39c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -545,6 +545,51 @@ TRACE_EVENT(xchk_xref_error, __entry->ret_ip) ); +TRACE_EVENT(xchk_iallocbt_check_cluster, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino, xfs_daddr_t map_daddr, + unsigned short map_len, unsigned int chunk_ino, + unsigned int nr_inodes, uint16_t cluster_mask, + uint16_t holemask, unsigned int cluster_ino), + TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes, + cluster_mask, holemask, cluster_ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(xfs_daddr_t, map_daddr) + __field(unsigned short, map_len) + __field(unsigned int, chunk_ino) + __field(unsigned int, nr_inodes) + __field(unsigned int, cluster_ino) + __field(uint16_t, cluster_mask) + __field(uint16_t, holemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + __entry->map_daddr = map_daddr; + __entry->map_len = map_len; + __entry->chunk_ino = chunk_ino; + __entry->nr_inodes = nr_inodes; + __entry->cluster_mask = cluster_mask; + __entry->holemask = holemask; + __entry->cluster_ino = cluster_ino; + ), + TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->map_daddr, + __entry->map_len, + __entry->chunk_ino, + __entry->nr_inodes, + __entry->cluster_mask, + __entry->holemask, + __entry->cluster_ino) +) + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index d9048bcea49c..7b8bb6bde981 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -28,7 +28,8 @@ */ struct xfs_writepage_ctx { struct xfs_bmbt_irec imap; - unsigned int io_type; + int fork; + unsigned int data_seq; unsigned int cow_seq; struct xfs_ioend *ioend; }; @@ -255,30 +256,20 @@ xfs_end_io( */ error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { - switch (ioend->io_type) { - case XFS_IO_COW: + if (ioend->io_fork == XFS_COW_FORK) xfs_reflink_cancel_cow_range(ip, offset, size, true); - break; - } - goto done; } /* - * Success: commit the COW or unwritten blocks if needed. + * Success: commit the COW or unwritten blocks if needed. */ - switch (ioend->io_type) { - case XFS_IO_COW: + if (ioend->io_fork == XFS_COW_FORK) error = xfs_reflink_end_cow(ip, offset, size); - break; - case XFS_IO_UNWRITTEN: - /* writeback should never update isize */ + else if (ioend->io_state == XFS_EXT_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); - break; - default: + else ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); - break; - } done: if (ioend->io_append_trans) @@ -293,7 +284,8 @@ xfs_end_bio( struct xfs_ioend *ioend = bio->bi_private; struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW) + if (ioend->io_fork == XFS_COW_FORK || + ioend->io_state == XFS_EXT_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); @@ -301,6 +293,75 @@ xfs_end_bio( xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } +/* + * Fast revalidation of the cached writeback mapping. Return true if the current + * mapping is valid, false otherwise. + */ +static bool +xfs_imap_valid( + struct xfs_writepage_ctx *wpc, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb) +{ + if (offset_fsb < wpc->imap.br_startoff || + offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount) + return false; + /* + * If this is a COW mapping, it is sufficient to check that the mapping + * covers the offset. Be careful to check this first because the caller + * can revalidate a COW mapping without updating the data seqno. + */ + if (wpc->fork == XFS_COW_FORK) + return true; + + /* + * This is not a COW mapping. Check the sequence number of the data fork + * because concurrent changes could have invalidated the extent. Check + * the COW fork because concurrent changes since the last time we + * checked (and found nothing at this offset) could have added + * overlapping blocks. + */ + if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq)) + return false; + if (xfs_inode_has_cow_data(ip) && + wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + return false; + return true; +} + +/* + * Pass in a dellalloc extent and convert it to real extents, return the real + * extent that maps offset_fsb in wpc->imap. + * + * The current page is held locked so nothing could have removed the block + * backing offset_fsb, although it could have moved from the COW to the data + * fork by another thread. + */ +static int +xfs_convert_blocks( + struct xfs_writepage_ctx *wpc, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb) +{ + int error; + + /* + * Attempt to allocate whatever delalloc extent currently backs + * offset_fsb and put the result into wpc->imap. Allocate in a loop + * because it may take several attempts to allocate real blocks for a + * contiguous delalloc extent if free space is sufficiently fragmented. + */ + do { + error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb, + &wpc->imap, wpc->fork == XFS_COW_FORK ? + &wpc->cow_seq : &wpc->data_seq); + if (error) + return error; + } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb); + + return 0; +} + STATIC int xfs_map_blocks( struct xfs_writepage_ctx *wpc, @@ -310,26 +371,16 @@ xfs_map_blocks( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t count = i_blocksize(inode); - xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_fileoff_t cow_fsb = NULLFILEOFF; struct xfs_bmbt_irec imap; - int whichfork = XFS_DATA_FORK; struct xfs_iext_cursor icur; - bool imap_valid; + int retries = 0; int error = 0; - /* - * We have to make sure the cached mapping is within EOF to protect - * against eofblocks trimming on file release leaving us with a stale - * mapping. Otherwise, a page for a subsequent file extending buffered - * write could get picked up by this writeback cycle and written to the - * wrong blocks. - * - * Note that what we really want here is a generic mapping invalidation - * mechanism to protect us from arbitrary extent modifying contexts, not - * just eofblocks. - */ - xfs_trim_extent_eof(&wpc->imap, ip); + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; /* * COW fork blocks can overlap data fork blocks even if the blocks @@ -346,31 +397,19 @@ xfs_map_blocks( * against concurrent updates and provides a memory barrier on the way * out that ensures that we always see the current value. */ - imap_valid = offset_fsb >= wpc->imap.br_startoff && - offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount; - if (imap_valid && - (!xfs_inode_has_cow_data(ip) || - wpc->io_type == XFS_IO_COW || - wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq))) + if (xfs_imap_valid(wpc, ip, offset_fsb)) return 0; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* * If we don't have a valid map, now it's time to get a new one for this * offset. This will convert delayed allocations (including COW ones) * into real extents. If we return without a valid map, it means we * landed in a hole and we skip the block. */ +retry: xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); - ASSERT(offset <= mp->m_super->s_maxbytes); - - if (offset > mp->m_super->s_maxbytes - count) - count = mp->m_super->s_maxbytes - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); /* * Check if this is offset is covered by a COW extents, and if yes use @@ -382,30 +421,16 @@ xfs_map_blocks( if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); - /* - * Truncate can race with writeback since writeback doesn't - * take the iolock and truncate decreases the file size before - * it starts truncating the pages between new_size and old_size. - * Therefore, we can end up in the situation where writeback - * gets a CoW fork mapping but the truncate makes the mapping - * invalid and we end up in here trying to get a new mapping. - * bail out here so that we simply never get a valid mapping - * and so we drop the write altogether. The page truncation - * will kill the contents anyway. - */ - if (offset > i_size_read(inode)) { - wpc->io_type = XFS_IO_HOLE; - return 0; - } - whichfork = XFS_COW_FORK; - wpc->io_type = XFS_IO_COW; + + wpc->fork = XFS_COW_FORK; goto allocate_blocks; } /* - * Map valid and no COW extent in the way? We're done. + * No COW extent overlap. Revalidate now that we may have updated + * ->cow_seq. If the data mapping is still valid, we're done. */ - if (imap_valid) { + if (xfs_imap_valid(wpc, ip, offset_fsb)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return 0; } @@ -417,51 +442,65 @@ xfs_map_blocks( */ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) imap.br_startoff = end_fsb; /* fake a hole past EOF */ + wpc->data_seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); + wpc->fork = XFS_DATA_FORK; + + /* landed in a hole or beyond EOF? */ if (imap.br_startoff > offset_fsb) { - /* landed in a hole or beyond EOF */ imap.br_blockcount = imap.br_startoff - offset_fsb; imap.br_startoff = offset_fsb; imap.br_startblock = HOLESTARTBLOCK; - wpc->io_type = XFS_IO_HOLE; - } else { - /* - * Truncate to the next COW extent if there is one. This is the - * only opportunity to do this because we can skip COW fork - * lookups for the subsequent blocks in the mapping; however, - * the requirement to treat the COW range separately remains. - */ - if (cow_fsb != NULLFILEOFF && - cow_fsb < imap.br_startoff + imap.br_blockcount) - imap.br_blockcount = cow_fsb - imap.br_startoff; - - if (isnullstartblock(imap.br_startblock)) { - /* got a delalloc extent */ - wpc->io_type = XFS_IO_DELALLOC; - goto allocate_blocks; - } - - if (imap.br_state == XFS_EXT_UNWRITTEN) - wpc->io_type = XFS_IO_UNWRITTEN; - else - wpc->io_type = XFS_IO_OVERWRITE; + imap.br_state = XFS_EXT_NORM; } + /* + * Truncate to the next COW extent if there is one. This is the only + * opportunity to do this because we can skip COW fork lookups for the + * subsequent blocks in the mapping; however, the requirement to treat + * the COW range separately remains. + */ + if (cow_fsb != NULLFILEOFF && + cow_fsb < imap.br_startoff + imap.br_blockcount) + imap.br_blockcount = cow_fsb - imap.br_startoff; + + /* got a delalloc extent? */ + if (imap.br_startblock != HOLESTARTBLOCK && + isnullstartblock(imap.br_startblock)) + goto allocate_blocks; + wpc->imap = imap; - xfs_trim_extent_eof(&wpc->imap, ip); - trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); + trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap); return 0; allocate_blocks: - error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap, - &wpc->cow_seq); - if (error) + error = xfs_convert_blocks(wpc, ip, offset_fsb); + if (error) { + /* + * If we failed to find the extent in the COW fork we might have + * raced with a COW to data fork conversion or truncate. + * Restart the lookup to catch the extent in the data fork for + * the former case, but prevent additional retries to avoid + * looping forever for the latter case. + */ + if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++) + goto retry; + ASSERT(error != -EAGAIN); return error; - ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || - imap.br_startoff + imap.br_blockcount <= cow_fsb); - wpc->imap = imap; - xfs_trim_extent_eof(&wpc->imap, ip); - trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); + } + + /* + * Due to merging the return real extent might be larger than the + * original delalloc one. Trim the return extent to the next COW + * boundary again to force a re-lookup. + */ + if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF && + cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount) + wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff; + + ASSERT(wpc->imap.br_startoff <= offset_fsb); + ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb); + trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap); return 0; } @@ -486,7 +525,7 @@ xfs_submit_ioend( int status) { /* Convert CoW extents to regular */ - if (!status && ioend->io_type == XFS_IO_COW) { + if (!status && ioend->io_fork == XFS_COW_FORK) { /* * Yuk. This can do memory allocation, but is not a * transactional operation so everything is done in GFP_KERNEL @@ -504,7 +543,8 @@ xfs_submit_ioend( /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && - ioend->io_type != XFS_IO_UNWRITTEN && + (ioend->io_fork == XFS_COW_FORK || + ioend->io_state != XFS_EXT_UNWRITTEN) && xfs_ioend_is_append(ioend) && !ioend->io_append_trans) status = xfs_setfilesize_trans_alloc(ioend); @@ -533,7 +573,8 @@ xfs_submit_ioend( static struct xfs_ioend * xfs_alloc_ioend( struct inode *inode, - unsigned int type, + int fork, + xfs_exntst_t state, xfs_off_t offset, struct block_device *bdev, sector_t sector) @@ -547,7 +588,8 @@ xfs_alloc_ioend( ioend = container_of(bio, struct xfs_ioend, io_inline_bio); INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = type; + ioend->io_fork = fork; + ioend->io_state = state; ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = offset; @@ -608,13 +650,15 @@ xfs_add_to_ioend( sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) + ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9); - if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || + if (!wpc->ioend || + wpc->fork != wpc->ioend->io_fork || + wpc->imap.br_state != wpc->ioend->io_state || sector != bio_end_sector(wpc->ioend->io_bio) || offset != wpc->ioend->io_offset + wpc->ioend->io_size) { if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, - bdev, sector); + wpc->ioend = xfs_alloc_ioend(inode, wpc->fork, + wpc->imap.br_state, offset, bdev, sector); } if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) { @@ -723,7 +767,7 @@ xfs_writepage_map( error = xfs_map_blocks(wpc, inode, file_offset); if (error) break; - if (wpc->io_type == XFS_IO_HOLE) + if (wpc->imap.br_startblock == HOLESTARTBLOCK) continue; xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, &submit_list); @@ -918,9 +962,7 @@ xfs_vm_writepage( struct page *page, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_HOLE, - }; + struct xfs_writepage_ctx wpc = { }; int ret; ret = xfs_do_writepage(page, wbc, &wpc); @@ -934,9 +976,7 @@ xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_HOLE, - }; + struct xfs_writepage_ctx wpc = { }; int ret; xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); @@ -983,7 +1023,7 @@ xfs_vm_bmap( * Since we don't pass back blockdev info, we can't return bmap * information for rt files either. */ - if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) + if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; return iomap_bmap(mapping, block, &xfs_iomap_ops); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index e5c23948a8ab..6c2615b83c5d 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -9,32 +9,12 @@ extern struct bio_set xfs_ioend_bioset; /* - * Types of I/O for bmap clustering and I/O completion tracking. - * - * This enum is used in string mapping in xfs_trace.h; please keep the - * TRACE_DEFINE_ENUMs for it up to date. - */ -enum { - XFS_IO_HOLE, /* covers region without any block allocation */ - XFS_IO_DELALLOC, /* covers delalloc region */ - XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ - XFS_IO_OVERWRITE, /* covers already allocated extent */ - XFS_IO_COW, /* covers copy-on-write extent */ -}; - -#define XFS_IO_TYPES \ - { XFS_IO_HOLE, "hole" }, \ - { XFS_IO_DELALLOC, "delalloc" }, \ - { XFS_IO_UNWRITTEN, "unwritten" }, \ - { XFS_IO_OVERWRITE, "overwrite" }, \ - { XFS_IO_COW, "CoW" } - -/* * Structure for buffered I/O completions. */ struct xfs_ioend { struct list_head io_list; /* next ioend in chain */ - unsigned int io_type; /* delalloc / unwritten */ + int io_fork; /* inode fork written back */ + xfs_exntst_t io_state; /* extent state */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index a58034049995..3d213a7394c5 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -555,6 +555,7 @@ xfs_attr_put_listent( attrlist_ent_t *aep; int arraytop; + ASSERT(!context->seen_enough); ASSERT(!(context->flags & ATTR_KERNOVAL)); ASSERT(context->count >= 0); ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 1ee8c5539fa4..2db43ff4f8b5 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1162,16 +1162,13 @@ xfs_zero_file_space( * by virtue of the hole punch. */ error = xfs_free_file_space(ip, offset, len); - if (error) - goto out; + if (error || xfs_is_always_cow_inode(ip)) + return error; - error = xfs_alloc_file_space(ip, round_down(offset, blksize), + return xfs_alloc_file_space(ip, round_down(offset, blksize), round_up(offset + len, blksize) - round_down(offset, blksize), XFS_BMAPI_PREALLOC); -out: - return error; - } static int diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4f5f2ff3f70f..548344e25128 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -776,29 +776,24 @@ _xfs_buf_read( } /* - * Set buffer ops on an unchecked buffer and validate it, if possible. + * Reverify a buffer found in cache without an attached ->b_ops. * - * If the caller passed in an ops structure and the buffer doesn't have ops - * assigned, set the ops and use them to verify the contents. If the contents - * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no - * recorded errors and is already in XBF_DONE state. + * If the caller passed an ops structure and the buffer doesn't have ops + * assigned, set the ops and use it to verify the contents. If verification + * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is + * already in XBF_DONE state on entry. * - * Under normal operations, every in-core buffer must have buffer ops assigned - * to them when the buffer is read in from disk so that we can validate the - * metadata. - * - * However, there are two scenarios where one can encounter in-core buffers - * that don't have buffer ops. The first is during log recovery of buffers on - * a V4 filesystem, though these buffers are purged at the end of recovery. - * - * The other is online repair, which tries to match arbitrary metadata blocks - * with btree types in order to find the root. If online repair doesn't match - * the buffer with /any/ btree type, the buffer remains in memory in DONE state - * with no ops, and a subsequent read_buf call from elsewhere will not set the - * ops. This function helps us fix this situation. + * Under normal operations, every in-core buffer is verified on read I/O + * completion. There are two scenarios that can lead to in-core buffers without + * an assigned ->b_ops. The first is during log recovery of buffers on a V4 + * filesystem, though these buffers are purged at the end of recovery. The + * other is online repair, which intentionally reads with a NULL buffer ops to + * run several verifiers across an in-core buffer in order to establish buffer + * type. If repair can't establish that, the buffer will be left in memory + * with NULL buffer ops. */ int -xfs_buf_ensure_ops( +xfs_buf_reverify( struct xfs_buf *bp, const struct xfs_buf_ops *ops) { @@ -840,7 +835,7 @@ xfs_buf_read_map( return bp; } - xfs_buf_ensure_ops(bp, ops); + xfs_buf_reverify(bp, ops); if (flags & XBF_ASYNC) { /* @@ -2209,3 +2204,40 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) atomic_set(&bp->b_lru_ref, lru_ref); } + +/* + * Verify an on-disk magic value against the magic value specified in the + * verifier structure. The verifier magic is in disk byte order so the caller is + * expected to pass the value directly from disk. + */ +bool +xfs_verify_magic( + struct xfs_buf *bp, + __be32 dmagic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int idx; + + idx = xfs_sb_version_hascrc(&mp->m_sb); + if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))) + return false; + return dmagic == bp->b_ops->magic[idx]; +} +/* + * Verify an on-disk magic value against the magic value specified in the + * verifier structure. The verifier magic is in disk byte order so the caller is + * expected to pass the value directly from disk. + */ +bool +xfs_verify_magic16( + struct xfs_buf *bp, + __be16 dmagic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int idx; + + idx = xfs_sb_version_hascrc(&mp->m_sb); + if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))) + return false; + return dmagic == bp->b_ops->magic16[idx]; +} diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b9f5511ea998..d0b96e071cec 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -125,6 +125,10 @@ struct xfs_buf_map { struct xfs_buf_ops { char *name; + union { + __be32 magic[2]; /* v4 and v5 on disk magic values */ + __be16 magic16[2]; /* v4 and v5 on disk magic values */ + }; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp); @@ -385,6 +389,8 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); +bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 9866f542e77b..a1e177f66404 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -51,6 +51,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_BUF_LRU_REF, XFS_RANDOM_FORCE_SCRUB_REPAIR, XFS_RANDOM_FORCE_SUMMARY_RECALC, + XFS_RANDOM_IUNLINK_FALLBACK, }; struct xfs_errortag_attr { @@ -159,6 +160,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); +XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -195,6 +197,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), XFS_ERRORTAG_ATTR_LIST(force_repair), XFS_ERRORTAG_ATTR_LIST(bad_summary), + XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), NULL, }; @@ -357,7 +360,8 @@ xfs_buf_verifier_error( fa = failaddr ? failaddr : __return_address; __xfs_buf_ioerror(bp, error, fa); - xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s", + xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, + "Metadata %s detected at %pS, %s block 0x%llx %s", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", fa, bp->b_ops->name, bp->b_bn, name); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 246d3e989c6c..602aa7d62b66 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 #define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 #define XFS_PTAG_FSBLOCK_ZERO 0x00000080 +#define XFS_PTAG_VERIFIER_ERROR 0x00000100 #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e47425071e65..770cc2edf777 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -507,7 +507,7 @@ xfs_file_dio_aio_write( * We can't properly handle unaligned direct I/O to reflink * files yet, as we can't unshare a partial block. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_cow_inode(ip)) { trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); return -EREMCHG; } @@ -872,14 +872,27 @@ xfs_file_fallocate( goto out_unlock; } - if (mode & FALLOC_FL_ZERO_RANGE) + if (mode & FALLOC_FL_ZERO_RANGE) { error = xfs_zero_file_space(ip, offset, len); - else { - if (mode & FALLOC_FL_UNSHARE_RANGE) { - error = xfs_reflink_unshare(ip, offset, len); - if (error) - goto out_unlock; + } else if (mode & FALLOC_FL_UNSHARE_RANGE) { + error = xfs_reflink_unshare(ip, offset, len); + if (error) + goto out_unlock; + + if (!xfs_is_always_cow_inode(ip)) { + error = xfs_alloc_file_space(ip, offset, len, + XFS_BMAPI_PREALLOC); } + } else { + /* + * If always_cow mode we can't use preallocations and + * thus should not create them. + */ + if (xfs_is_always_cow_inode(ip)) { + error = -EOPNOTSUPP; + goto out_unlock; + } + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); } @@ -1068,10 +1081,10 @@ xfs_file_llseek( default: return generic_file_llseek(file, offset, whence); case SEEK_HOLE: - offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops); + offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops); break; case SEEK_DATA: - offset = iomap_seek_data(inode, offset, &xfs_iomap_ops); + offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops); break; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index f3ef70c542e1..584648582ba7 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -533,6 +533,7 @@ xfs_fs_reserve_ag_blocks( int error = 0; int err2; + mp->m_finobt_nores = false; for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { pag = xfs_perag_get(mp, agno); err2 = xfs_ag_resv_init(pag, NULL); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 5169e84ae382..d0d377384120 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -16,7 +16,7 @@ xfs_param_t xfs_params = { /* MIN DFLT MAX */ .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, - .panic_mask = { 0, 0, 255 }, + .panic_mask = { 0, 0, 256 }, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, .stats_clear = { 0, 0, 1 }, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ae667ba74a1c..f643a9295179 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1332,7 +1332,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip); + error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip); if (error) goto out_trans_cancel; @@ -1754,7 +1754,7 @@ xfs_inactive_ifree( * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - if (unlikely(mp->m_inotbt_nores)) { + if (unlikely(mp->m_finobt_nores)) { error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); @@ -1907,86 +1907,510 @@ xfs_inactive( } /* - * This is called when the inode's link count goes to 0 or we are creating a - * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be - * set to true as the link count is dropped to zero by the VFS after we've - * created the file successfully, so we have to add it to the unlinked list - * while the link count is non-zero. + * In-Core Unlinked List Lookups + * ============================= + * + * Every inode is supposed to be reachable from some other piece of metadata + * with the exception of the root directory. Inodes with a connection to a + * file descriptor but not linked from anywhere in the on-disk directory tree + * are collectively known as unlinked inodes, though the filesystem itself + * maintains links to these inodes so that on-disk metadata are consistent. + * + * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI + * header contains a number of buckets that point to an inode, and each inode + * record has a pointer to the next inode in the hash chain. This + * singly-linked list causes scaling problems in the iunlink remove function + * because we must walk that list to find the inode that points to the inode + * being removed from the unlinked hash bucket list. + * + * What if we modelled the unlinked list as a collection of records capturing + * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd + * have a fast way to look up unlinked list predecessors, which avoids the + * slow list walk. That's exactly what we do here (in-core) with a per-AG + * rhashtable. + * + * Because this is a backref cache, we ignore operational failures since the + * iunlink code can fall back to the slow bucket walk. The only errors that + * should bubble out are for obviously incorrect situations. + * + * All users of the backref cache MUST hold the AGI buffer lock to serialize + * access or have otherwise provided for concurrency control. + */ + +/* Capture a "X.next_unlinked = Y" relationship. */ +struct xfs_iunlink { + struct rhash_head iu_rhash_head; + xfs_agino_t iu_agino; /* X */ + xfs_agino_t iu_next_unlinked; /* Y */ +}; + +/* Unlinked list predecessor lookup hashtable construction */ +static int +xfs_iunlink_obj_cmpfn( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const xfs_agino_t *key = arg->key; + const struct xfs_iunlink *iu = obj; + + if (iu->iu_next_unlinked != *key) + return 1; + return 0; +} + +static const struct rhashtable_params xfs_iunlink_hash_params = { + .min_size = XFS_AGI_UNLINKED_BUCKETS, + .key_len = sizeof(xfs_agino_t), + .key_offset = offsetof(struct xfs_iunlink, + iu_next_unlinked), + .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), + .automatic_shrinking = true, + .obj_cmpfn = xfs_iunlink_obj_cmpfn, +}; + +/* + * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such + * relation is found. + */ +static xfs_agino_t +xfs_iunlink_lookup_backref( + struct xfs_perag *pag, + xfs_agino_t agino) +{ + struct xfs_iunlink *iu; + + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + return iu ? iu->iu_agino : NULLAGINO; +} + +/* + * Take ownership of an iunlink cache entry and insert it into the hash table. + * If successful, the entry will be owned by the cache; if not, it is freed. + * Either way, the caller does not own @iu after this call. + */ +static int +xfs_iunlink_insert_backref( + struct xfs_perag *pag, + struct xfs_iunlink *iu) +{ + int error; + + error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + /* + * Fail loudly if there already was an entry because that's a sign of + * corruption of in-memory data. Also fail loudly if we see an error + * code we didn't anticipate from the rhashtable code. Currently we + * only anticipate ENOMEM. + */ + if (error) { + WARN(error != -ENOMEM, "iunlink cache insert error %d", error); + kmem_free(iu); + } + /* + * Absorb any runtime errors that aren't a result of corruption because + * this is a cache and we can always fall back to bucket list scanning. + */ + if (error != 0 && error != -EEXIST) + error = 0; + return error; +} + +/* Remember that @prev_agino.next_unlinked = @this_agino. */ +static int +xfs_iunlink_add_backref( + struct xfs_perag *pag, + xfs_agino_t prev_agino, + xfs_agino_t this_agino) +{ + struct xfs_iunlink *iu; + + if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) + return 0; + + iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); + iu->iu_agino = prev_agino; + iu->iu_next_unlinked = this_agino; + + return xfs_iunlink_insert_backref(pag, iu); +} + +/* + * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. + * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there + * wasn't any such entry then we don't bother. + */ +static int +xfs_iunlink_change_backref( + struct xfs_perag *pag, + xfs_agino_t agino, + xfs_agino_t next_unlinked) +{ + struct xfs_iunlink *iu; + int error; + + /* Look up the old entry; if there wasn't one then exit. */ + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + if (!iu) + return 0; + + /* + * Remove the entry. This shouldn't ever return an error, but if we + * couldn't remove the old entry we don't want to add it again to the + * hash table, and if the entry disappeared on us then someone's + * violated the locking rules and we need to fail loudly. Either way + * we cannot remove the inode because internal state is or would have + * been corrupt. + */ + error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + if (error) + return error; + + /* If there is no new next entry just free our item and return. */ + if (next_unlinked == NULLAGINO) { + kmem_free(iu); + return 0; + } + + /* Update the entry and re-add it to the hash table. */ + iu->iu_next_unlinked = next_unlinked; + return xfs_iunlink_insert_backref(pag, iu); +} + +/* Set up the in-core predecessor structures. */ +int +xfs_iunlink_init( + struct xfs_perag *pag) +{ + return rhashtable_init(&pag->pagi_unlinked_hash, + &xfs_iunlink_hash_params); +} + +/* Free the in-core predecessor structures. */ +static void +xfs_iunlink_free_item( + void *ptr, + void *arg) +{ + struct xfs_iunlink *iu = ptr; + bool *freed_anything = arg; + + *freed_anything = true; + kmem_free(iu); +} + +void +xfs_iunlink_destroy( + struct xfs_perag *pag) +{ + bool freed_anything = false; + + rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, + xfs_iunlink_free_item, &freed_anything); + + ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount)); +} + +/* + * Point the AGI unlinked bucket at an inode and log the results. The caller + * is responsible for validating the old value. + */ +STATIC int +xfs_iunlink_update_bucket( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf *agibp, + unsigned int bucket_index, + xfs_agino_t new_agino) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + xfs_agino_t old_value; + int offset; + + ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino)); + + old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); + trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index, + old_value, new_agino); + + /* + * We should never find the head of the list already set to the value + * passed in because either we're adding or removing ourselves from the + * head of the list. + */ + if (old_value == new_agino) + return -EFSCORRUPTED; + + agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); + offset = offsetof(struct xfs_agi, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +} + +/* Set an on-disk inode's next_unlinked pointer. */ +STATIC void +xfs_iunlink_update_dinode( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_buf *ibp, + struct xfs_dinode *dip, + struct xfs_imap *imap, + xfs_agino_t next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + int offset; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + trace_xfs_iunlink_update_dinode(mp, agno, agino, + be32_to_cpu(dip->di_next_unlinked), next_agino); + + dip->di_next_unlinked = cpu_to_be32(next_agino); + offset = imap->im_boffset + + offsetof(struct xfs_dinode, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); + xfs_inobp_check(mp, ibp); +} + +/* Set an in-core inode's unlinked pointer and return the old value. */ +STATIC int +xfs_iunlink_update_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_agnumber_t agno, + xfs_agino_t next_agino, + xfs_agino_t *old_next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_dinode *dip; + struct xfs_buf *ibp; + xfs_agino_t old_value; + int error; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0); + if (error) + return error; + + /* Make sure the old pointer isn't garbage. */ + old_value = be32_to_cpu(dip->di_next_unlinked); + if (!xfs_verify_agino_or_null(mp, agno, old_value)) { + error = -EFSCORRUPTED; + goto out; + } + + /* + * Since we're updating a linked list, we should never find that the + * current pointer is the same as the new value, unless we're + * terminating the list. + */ + *old_next_agino = old_value; + if (old_value == next_agino) { + if (next_agino != NULLAGINO) + error = -EFSCORRUPTED; + goto out; + } + + /* Ok, update the new pointer. */ + xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), + ibp, dip, &ip->i_imap, next_agino); + return 0; +out: + xfs_trans_brelse(tp, ibp); + return error; +} + +/* + * This is called when the inode's link count has gone to 0 or we are creating + * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. * * We place the on-disk inode on a list in the AGI. It will be pulled from this * list when the inode is freed. */ STATIC int xfs_iunlink( - struct xfs_trans *tp, - struct xfs_inode *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_mount_t *mp = tp->t_mountp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agino_t agino; - short bucket_index; - int offset; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + xfs_agino_t next_agino; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; + ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); + trace_xfs_iunlink(ip); - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ - error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; agi = XFS_BUF_TO_AGI(agibp); /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the pointer isn't garbage and that this inode + * isn't already on the list. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - ASSERT(agino != 0); - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - ASSERT(agi->agi_unlinked[bucket_index]); - ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (next_agino == agino || + !xfs_verify_agino_or_null(mp, agno, next_agino)) + return -EFSCORRUPTED; + + if (next_agino != NULLAGINO) { + struct xfs_perag *pag; + xfs_agino_t old_agino; + + /* + * There is already another inode in the bucket, so point this + * inode to the current head of the list. + */ + error = xfs_iunlink_update_inode(tp, ip, agno, next_agino, + &old_agino); + if (error) + return error; + ASSERT(old_agino == NULLAGINO); - if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) { /* - * There is already another inode in the bucket we need - * to add ourselves to. Add us at the front of the list. - * Here we put the head pointer into our next pointer, - * and then we fall through to point the head at us. + * agino has been unlinked, add a backref from the next inode + * back to agino. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); + pag = xfs_perag_get(mp, agno); + error = xfs_iunlink_add_backref(pag, agino, next_agino); + xfs_perag_put(pag); if (error) return error; + } + + /* Point the head of the list to point to this inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); +} - ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO)); - dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); +/* Return the imap, dinode pointer, and buffer for an inode. */ +STATIC int +xfs_iunlink_map_ino( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = tp->t_mountp; + int error; - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); + imap->im_blkno = 0; + error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap returned error %d.", + __func__, error); + return error; + } - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); + error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", + __func__, error); + return error; + } + + return 0; +} + +/* + * Walk the unlinked chain from @head_agino until we find the inode that + * points to @target_agino. Return the inode number, map, dinode pointer, + * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. + * + * @tp, @pag, @head_agino, and @target_agino are input parameters. + * @agino, @imap, @dipp, and @bpp are all output parameters. + * + * Do not call this function if @target_agino is the head of the list. + */ +STATIC int +xfs_iunlink_map_prev( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t head_agino, + xfs_agino_t target_agino, + xfs_agino_t *agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp, + struct xfs_perag *pag) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_agino_t next_agino; + int error; + + ASSERT(head_agino != target_agino); + *bpp = NULL; + + /* See if our backref cache can find it faster. */ + *agino = xfs_iunlink_lookup_backref(pag, target_agino); + if (*agino != NULLAGINO) { + error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp); + if (error) + return error; + + if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino) + return 0; + + /* + * If we get here the cache contents were corrupt, so drop the + * buffer and fall back to walking the bucket list. + */ + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + WARN_ON_ONCE(1); + } + + trace_xfs_iunlink_map_prev_fallback(mp, agno); + + /* Otherwise, walk the entire bucket until we find it. */ + next_agino = head_agino; + while (next_agino != target_agino) { + xfs_agino_t unlinked_agino; + + if (*bpp) + xfs_trans_brelse(tp, *bpp); + + *agino = next_agino; + error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp, + bpp); + if (error) + return error; + + unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked); + /* + * Make sure this pointer is valid and isn't an obvious + * infinite loop. + */ + if (!xfs_verify_agino(mp, agno, unlinked_agino) || + next_agino == unlinked_agino) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, + *dipp, sizeof(**dipp)); + error = -EFSCORRUPTED; + return error; + } + next_agino = unlinked_agino; } - /* - * Point the bucket head pointer at the inode being inserted. - */ - ASSERT(agino != 0); - agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); return 0; } @@ -1995,181 +2419,106 @@ xfs_iunlink( */ STATIC int xfs_iunlink_remove( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_ino_t next_ino; - xfs_mount_t *mp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agnumber_t agno; - xfs_agino_t agino; - xfs_agino_t next_agino; - xfs_buf_t *last_ibp; - xfs_dinode_t *last_dip = NULL; - short bucket_index; - int offset, last_offset = 0; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + struct xfs_buf *last_ibp; + struct xfs_dinode *last_dip = NULL; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t next_agino; + xfs_agino_t head_agino; + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; - mp = tp->t_mountp; - agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + trace_xfs_iunlink_remove(ip); - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ + /* Get the agi buffer first. It ensures lock ordering on the list. */ error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the head pointer isn't garbage. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - if (!xfs_verify_agino(mp, agno, agino)) - return -EFSCORRUPTED; - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - if (!xfs_verify_agino(mp, agno, - be32_to_cpu(agi->agi_unlinked[bucket_index]))) { + head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (!xfs_verify_agino(mp, agno, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); return -EFSCORRUPTED; } - if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { - /* - * We're at the head of the list. Get the inode's on-disk - * buffer to see if there is anyone after us on the list. - * Only modify our next pointer if it is not already NULLAGINO. - * This saves us the overhead of dealing with the buffer when - * there is no need to change it. - */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } - /* - * Point the bucket head pointer at the next inode. - */ - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - } else { - /* - * We need to search the list for the inode being freed. - */ - next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - last_ibp = NULL; - while (next_agino != agino) { - struct xfs_imap imap; + /* + * Set our inode's next_unlinked pointer to NULL and then return + * the old pointer value so that we can update whatever was previous + * to us in the list to point to whatever was next in the list. + */ + error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino); + if (error) + return error; - if (last_ibp) - xfs_trans_brelse(tp, last_ibp); + /* + * If there was a backref pointing from the next inode back to this + * one, remove it because we've removed this inode from the list. + * + * Later, if this inode was in the middle of the list we'll update + * this inode's backref to point from the next inode. + */ + if (next_agino != NULLAGINO) { + pag = xfs_perag_get(mp, agno); + error = xfs_iunlink_change_backref(pag, next_agino, + NULLAGINO); + if (error) + goto out; + } - imap.im_blkno = 0; - next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); + if (head_agino == agino) { + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + next_agino); + if (error) + goto out; + } else { + struct xfs_imap imap; + xfs_agino_t prev_agino; - error = xfs_imap(mp, tp, next_ino, &imap, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap returned error %d.", - __func__, error); - return error; - } + if (!pag) + pag = xfs_perag_get(mp, agno); - error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, - &last_ibp, 0, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } + /* We need to search the list for the inode being freed. */ + error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, + &prev_agino, &imap, &last_dip, &last_ibp, + pag); + if (error) + goto out; - last_offset = imap.im_boffset; - next_agino = be32_to_cpu(last_dip->di_next_unlinked); - if (!xfs_verify_agino(mp, agno, next_agino)) { - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, mp, - last_dip, sizeof(*last_dip)); - return -EFSCORRUPTED; - } - } + /* Point the previous inode on the list to the next inode. */ + xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, + last_dip, &imap, next_agino); /* - * Now last_ibp points to the buffer previous to us on the - * unlinked list. Pull us from the list. + * Now we deal with the backref for this inode. If this inode + * pointed at a real inode, change the backref that pointed to + * us to point to our old next. If this inode was the end of + * the list, delete the backref that pointed to us. Note that + * change_backref takes care of deleting the backref if + * next_agino is NULLAGINO. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", - __func__, error); - return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } - /* - * Point the previous inode on the list to the next inode. - */ - last_dip->di_next_unlinked = cpu_to_be32(next_agino); - ASSERT(next_agino != 0); - offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, last_dip); - - xfs_trans_inode_buf(tp, last_ibp); - xfs_trans_log_buf(tp, last_ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, last_ibp); + error = xfs_iunlink_change_backref(pag, agino, next_agino); + if (error) + goto out; } - return 0; + +out: + if (pag) + xfs_perag_put(pag); + return error; } /* @@ -2833,11 +3182,9 @@ xfs_rename_alloc_whiteout( /* * Prepare the tmpfile inode as if it were created through the VFS. - * Otherwise, the link increment paths will complain about nlink 0->1. - * Drop the link count as done by d_tmpfile(), complete the inode setup - * and flag it as linkable. + * Complete the inode setup and flag it as linkable. nlink is already + * zero, so we can skip the drop_nlink. */ - drop_nlink(VFS_I(tmpfile)); xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); VFS_I(tmpfile)->i_state |= I_LINKABLE; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index be2014520155..e62074a5257c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -500,4 +500,7 @@ extern struct kmem_zone *xfs_inode_zone; bool xfs_inode_verify_forks(struct xfs_inode *ip); +int xfs_iunlink_init(struct xfs_perag *pag); +void xfs_iunlink_destroy(struct xfs_perag *pag); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 27c93b5f029d..63d323916bba 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -35,18 +35,40 @@ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) -void +static int +xfs_alert_fsblock_zero( + xfs_inode_t *ip, + xfs_bmbt_irec_t *imap) +{ + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x", + (unsigned long long)ip->i_ino, + (unsigned long long)imap->br_startblock, + (unsigned long long)imap->br_startoff, + (unsigned long long)imap->br_blockcount, + imap->br_state); + return -EFSCORRUPTED; +} + +int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + bool shared) { struct xfs_mount *mp = ip->i_mount; + if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, imap); + if (imap->br_startblock == HOLESTARTBLOCK) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { + } else if (imap->br_startblock == DELAYSTARTBLOCK || + isnullstartblock(imap->br_startblock)) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_DELALLOC; } else { @@ -60,6 +82,13 @@ xfs_bmbt_to_iomap( iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); + + if (xfs_ipincount(ip) && + (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + iomap->flags |= IOMAP_F_DIRTY; + if (shared) + iomap->flags |= IOMAP_F_SHARED; + return 0; } static void @@ -138,23 +167,6 @@ xfs_iomap_eof_align_last_fsb( return 0; } -STATIC int -xfs_alert_fsblock_zero( - xfs_inode_t *ip, - xfs_bmbt_irec_t *imap) -{ - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x", - (unsigned long long)ip->i_ino, - (unsigned long long)imap->br_startblock, - (unsigned long long)imap->br_startoff, - (unsigned long long)imap->br_blockcount, - imap->br_state); - return -EFSCORRUPTED; -} - int xfs_iomap_write_direct( xfs_inode_t *ip, @@ -383,12 +395,13 @@ xfs_quota_calc_throttle( STATIC xfs_fsblock_t xfs_iomap_prealloc_size( struct xfs_inode *ip, + int whichfork, loff_t offset, loff_t count, struct xfs_iext_cursor *icur) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmbt_irec prev; int shift = 0; @@ -522,15 +535,16 @@ xfs_file_iomap_begin_delay( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t maxbytes_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); xfs_fileoff_t end_fsb; - int error = 0, eof = 0; - struct xfs_bmbt_irec got; - struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap, cmap; + struct xfs_iext_cursor icur, ccur; xfs_fsblock_t prealloc_blocks = 0; + bool eof = false, cow_eof = false, shared = false; + int whichfork = XFS_DATA_FORK; + int error = 0; ASSERT(!XFS_IS_REALTIME_INODE(ip)); ASSERT(!xfs_get_extsz_hint(ip)); @@ -548,7 +562,7 @@ xfs_file_iomap_begin_delay( XFS_STATS_INC(mp, xs_blk_mapw); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); if (error) goto out_unlock; @@ -556,53 +570,101 @@ xfs_file_iomap_begin_delay( end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got); + /* + * Search the data fork fork first to look up our source mapping. We + * always need the data fork map, as we have to return it to the + * iomap code so that the higher level write code can read data in to + * perform read-modify-write cycles for unaligned writes. + */ + eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); if (eof) - got.br_startoff = end_fsb; /* fake hole until the end */ + imap.br_startoff = end_fsb; /* fake hole until the end */ + + /* We never need to allocate blocks for zeroing a hole. */ + if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); + goto out_unlock; + } - if (got.br_startoff <= offset_fsb) { + /* + * Search the COW fork extent list even if we did not find a data fork + * extent. This serves two purposes: first this implements the + * speculative preallocation using cowextsize, so that we also unshare + * block adjacent to shared blocks instead of just the shared blocks + * themselves. Second the lookup in the extent list is generally faster + * than going out to the shared extent tree. + */ + if (xfs_is_cow_inode(ip)) { + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, + &ccur, &cmap); + if (!cow_eof && cmap.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &cmap); + whichfork = XFS_COW_FORK; + goto done; + } + } + + if (imap.br_startoff <= offset_fsb) { /* * For reflink files we may need a delalloc reservation when * overwriting shared extents. This includes zeroing of * existing extents that contain data. */ - if (xfs_is_reflink_inode(ip) && - ((flags & IOMAP_WRITE) || - got.br_state != XFS_EXT_UNWRITTEN)) { - xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb); - error = xfs_reflink_reserve_cow(ip, &got); - if (error) - goto out_unlock; + if (!xfs_is_cow_inode(ip) || + ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto done; } - trace_xfs_iomap_found(ip, offset, count, 0, &got); - goto done; - } + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); - if (flags & IOMAP_ZERO) { - xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff); - goto out_unlock; + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_inode_need_cow(ip, &imap, &shared); + if (error) + goto out_unlock; + + /* Not shared? Just report the (potentially capped) extent. */ + if (!shared) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto done; + } + + /* + * Fork all the shared blocks from our write offset until the + * end of the extent. + */ + whichfork = XFS_COW_FORK; + end_fsb = imap.br_startoff + imap.br_blockcount; + } else { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat + * symmetric with the work writeback does. This is a completely + * arbitrary number pulled out of thin air. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. + */ + count = min_t(loff_t, count, 1024 * PAGE_SIZE); + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + + if (xfs_is_always_cow_inode(ip)) + whichfork = XFS_COW_FORK; } error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_unlock; - /* - * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages - * to keep the chunks of work done where somewhat symmetric with the - * work writeback does. This is a completely arbitrary number pulled - * out of thin air as a best guess for initial testing. - * - * Note that the values needs to be less than 32-bits wide until - * the lower level functions are updated. - */ - count = min_t(loff_t, count, 1024 * PAGE_SIZE); - end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - if (eof) { - prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, - &icur); + prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset, + count, &icur); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; @@ -623,9 +685,11 @@ xfs_file_iomap_begin_delay( } retry: - error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, &got, &icur, - eof); + error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, + whichfork == XFS_DATA_FORK ? &imap : &cmap, + whichfork == XFS_DATA_FORK ? &icur : &ccur, + whichfork == XFS_DATA_FORK ? eof : cow_eof); switch (error) { case 0: break; @@ -647,186 +711,22 @@ retry: * them out if the write happens to fail. */ iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, count, 0, &got); + trace_xfs_iomap_alloc(ip, offset, count, whichfork, + whichfork == XFS_DATA_FORK ? &imap : &cmap); done: - if (isnullstartblock(got.br_startblock)) - got.br_startblock = DELAYSTARTBLOCK; - - if (!got.br_startblock) { - error = xfs_alert_fsblock_zero(ip, &got); - if (error) + if (whichfork == XFS_COW_FORK) { + if (imap.br_startoff > offset_fsb) { + xfs_trim_extent(&cmap, offset_fsb, + imap.br_startoff - offset_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); goto out_unlock; - } - - xfs_bmbt_to_iomap(ip, iomap, &got); - -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; -} - -/* - * Pass in a delayed allocate extent, convert it to real extents; - * return to the caller the extent we create which maps on top of - * the originating callers request. - * - * Called without a lock on the inode. - * - * We no longer bother to look at the incoming map - all we have to - * guarantee is that whatever we allocate fills the required range. - */ -int -xfs_iomap_write_allocate( - xfs_inode_t *ip, - int whichfork, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - unsigned int *cow_seq) -{ - xfs_mount_t *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_fileoff_t offset_fsb, last_block; - xfs_fileoff_t end_fsb, map_start_fsb; - xfs_filblks_t count_fsb; - xfs_trans_t *tp; - int nimaps; - int error = 0; - int flags = XFS_BMAPI_DELALLOC; - int nres; - - if (whichfork == XFS_COW_FORK) - flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; - - /* - * Make sure that the dquots are there. - */ - error = xfs_qm_dqattach(ip); - if (error) - return error; - - offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = imap->br_blockcount; - map_start_fsb = imap->br_startoff; - - XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); - - while (count_fsb != 0) { - /* - * Set up a transaction with which to allocate the - * backing store for the file. Do allocations in a - * loop until we get some space in the range we are - * interested in. The other space that might be allocated - * is in the delayed allocation extent on which we sit - * but before our buffer starts. - */ - nimaps = 0; - while (nimaps == 0) { - nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - /* - * We have already reserved space for the extent and any - * indirect blocks when creating the delalloc extent, - * there is no need to reserve space in this transaction - * again. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, - 0, XFS_TRANS_RESERVE, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - /* - * it is possible that the extents have changed since - * we did the read call as we dropped the ilock for a - * while. We have to be careful about truncates or hole - * punchs here - we are not allowed to allocate - * non-delalloc blocks here. - * - * The only protection against truncation is the pages - * for the range we are being asked to convert are - * locked and hence a truncate will block on them - * first. - * - * As a result, if we go beyond the range we really - * need and hit an delalloc extent boundary followed by - * a hole while we have excess blocks in the map, we - * will fill the hole incorrectly and overrun the - * transaction reservation. - * - * Using a single map prevents this as we are forced to - * check each map we look for overlap with the desired - * range and abort as soon as we find it. Also, given - * that we only return a single map, having one beyond - * what we can return is probably a bit silly. - * - * We also need to check that we don't go beyond EOF; - * this is a truncate optimisation as a truncate sets - * the new file size before block on the pages we - * currently have locked under writeback. Because they - * are about to be tossed, we don't need to write them - * back.... - */ - nimaps = 1; - end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - error = xfs_bmap_last_offset(ip, &last_block, - XFS_DATA_FORK); - if (error) - goto trans_cancel; - - last_block = XFS_FILEOFF_MAX(last_block, end_fsb); - if ((map_start_fsb + count_fsb) > last_block) { - count_fsb = last_block - map_start_fsb; - if (count_fsb == 0) { - error = -EAGAIN; - goto trans_cancel; - } - } - - /* - * From this point onwards we overwrite the imap - * pointer that the caller gave to us. - */ - error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, flags, nres, imap, - &nimaps); - if (error) - goto trans_cancel; - - error = xfs_trans_commit(tp); - if (error) - goto error0; - - if (whichfork == XFS_COW_FORK) - *cow_seq = READ_ONCE(ifp->if_seq); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - /* - * See if we were able to allocate an extent that - * covers at least part of the callers request - */ - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) - return xfs_alert_fsblock_zero(ip, imap); - - if ((offset_fsb >= imap->br_startoff) && - (offset_fsb < (imap->br_startoff + - imap->br_blockcount))) { - XFS_STATS_INC(mp, xs_xstrat_quick); - return 0; } - - /* - * So far we have not mapped the requested part of the - * file, just surrounding data, try again. - */ - count_fsb -= imap->br_blockcount; - map_start_fsb = imap->br_startoff + imap->br_blockcount; + /* ensure we only report blocks we have a reservation for */ + xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount); + shared = true; } - -trans_cancel: - xfs_trans_cancel(tp); -error0: + error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared); +out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -975,7 +875,7 @@ xfs_ilock_for_iomap( * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_reflink_inode(ip) && is_write) { + if (xfs_is_cow_inode(ip) && is_write) { /* * FIXME: It could still overwrite on unshared extents and not * need allocation. @@ -1009,7 +909,7 @@ relock: * check, so if we got ILOCK_SHARED for a write and but we're now a * reflink inode we have to switch to ILOCK_EXCL and relock. */ - if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) { + if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) { xfs_iunlock(ip, mode); mode = XFS_ILOCK_EXCL; goto relock; @@ -1081,23 +981,33 @@ xfs_file_iomap_begin( * Break shared extents if necessary. Checks for non-blocking IO have * been done up front, so we don't need to do them here. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_cow_inode(ip)) { + struct xfs_bmbt_irec cmap; + bool directio = (flags & IOMAP_DIRECT); + /* if zeroing doesn't need COW allocation, then we are done. */ if ((flags & IOMAP_ZERO) && !needs_cow_for_zeroing(&imap, nimaps)) goto out_found; - if (flags & IOMAP_DIRECT) { - /* may drop and re-acquire the ilock */ - error = xfs_reflink_allocate_cow(ip, &imap, &shared, - &lockmode); - if (error) - goto out_unlock; - } else { - error = xfs_reflink_reserve_cow(ip, &imap); - if (error) - goto out_unlock; - } + /* may drop and re-acquire the ilock */ + cmap = imap; + error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode, + directio); + if (error) + goto out_unlock; + + /* + * For buffered writes we need to report the address of the + * previous block (if there was any) so that the higher level + * write code can perform read-modify-write operations; we + * won't need the CoW fork mapping until writeback. For direct + * I/O, which must be block aligned, we need to report the + * newly allocated address. If the data fork has a hole, copy + * the COW fork mapping to avoid allocating to the data fork. + */ + if (directio || imap.br_startblock == HOLESTARTBLOCK) + imap = cmap; end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; @@ -1139,23 +1049,15 @@ xfs_file_iomap_begin( return error; iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); out_finish: - if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields - & ~XFS_ILOG_TIMESTAMP)) - iomap->flags |= IOMAP_F_DIRTY; - - xfs_bmbt_to_iomap(ip, iomap, &imap); - - if (shared) - iomap->flags |= IOMAP_F_SHARED; - return 0; + return xfs_bmbt_to_iomap(ip, iomap, &imap, shared); out_found: ASSERT(nimaps); xfs_iunlock(ip, lockmode); - trace_xfs_iomap_found(ip, offset, length, 0, &imap); + trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); goto out_finish; out_unlock: @@ -1241,6 +1143,92 @@ const struct iomap_ops xfs_iomap_ops = { }; static int +xfs_seek_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); + xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap, cmap; + int error = 0; + unsigned lockmode; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + lockmode = xfs_ilock_data_map_shared(ip); + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + } + + if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) { + /* + * If we found a data extent we are done. + */ + if (imap.br_startoff <= offset_fsb) + goto done; + data_fsb = imap.br_startoff; + } else { + /* + * Fake a hole until the end of the file. + */ + data_fsb = min(XFS_B_TO_FSB(mp, offset + length), + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); + } + + /* + * If a COW fork extent covers the hole, report it - capped to the next + * data fork extent: + */ + if (xfs_inode_has_cow_data(ip) && + xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cow_fsb = cmap.br_startoff; + if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { + if (data_fsb < cow_fsb + cmap.br_blockcount) + end_fsb = min(end_fsb, data_fsb); + xfs_trim_extent(&cmap, offset_fsb, end_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); + /* + * This is a COW extent, so we must probe the page cache + * because there could be dirty page cache being backed + * by this extent. + */ + iomap->type = IOMAP_UNWRITTEN; + goto out_unlock; + } + + /* + * Else report a hole, capped to the next found data or COW extent. + */ + if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb) + imap.br_blockcount = cow_fsb - offset_fsb; + else + imap.br_blockcount = data_fsb - offset_fsb; + imap.br_startoff = offset_fsb; + imap.br_startblock = HOLESTARTBLOCK; + imap.br_state = XFS_EXT_NORM; +done: + xfs_trim_extent(&imap, offset_fsb, end_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); +out_unlock: + xfs_iunlock(ip, lockmode); + return error; +} + +const struct iomap_ops xfs_seek_iomap_ops = { + .iomap_begin = xfs_seek_iomap_begin, +}; + +static int xfs_xattr_iomap_begin( struct inode *inode, loff_t offset, @@ -1273,12 +1261,10 @@ xfs_xattr_iomap_begin( out_unlock: xfs_iunlock(ip, lockmode); - if (!error) { - ASSERT(nimaps); - xfs_bmbt_to_iomap(ip, iomap, &imap); - } - - return error; + if (error) + return error; + ASSERT(nimaps); + return xfs_bmbt_to_iomap(ip, iomap, &imap, false); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index c6170548831b..5c2f6aa6d78f 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -13,12 +13,10 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, - struct xfs_bmbt_irec *, unsigned int *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); -void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *); +int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, + struct xfs_bmbt_irec *, bool shared); xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); static inline xfs_filblks_t @@ -42,6 +40,7 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_iomap_ops; +extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f48ffd7a8d3e..74047bd0c1ae 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -191,9 +191,18 @@ xfs_generic_create( xfs_setup_iops(ip); - if (tmpfile) + if (tmpfile) { + /* + * The VFS requires that any inode fed to d_tmpfile must have + * nlink == 1 so that it can decrement the nlink in d_tmpfile. + * However, we created the temp file with nlink == 0 because + * we're not allowed to put an inode with nlink > 0 on the + * unlinked list. Therefore we have to set nlink to 1 so that + * d_tmpfile can immediately set it back to zero. + */ + set_nlink(inode, 1); d_tmpfile(dentry, inode); - else + } else d_instantiate(dentry, inode); xfs_finish_inode_setup(ip); @@ -522,6 +531,10 @@ xfs_vn_getattr( } } + /* + * Note: If you add another clause to set an attribute flag, please + * update attributes_mask below. + */ if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) @@ -529,6 +542,10 @@ xfs_vn_getattr( if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_IMMUTABLE | + STATX_ATTR_APPEND | + STATX_ATTR_NODUMP); + switch (inode->i_mode & S_IFMT) { case S_IFBLK: case S_IFCHR: diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9fe88d125f0a..3371d1ff27c4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2439,17 +2439,21 @@ xlog_recover_validate_buf_type( case XFS_BLFT_BTREE_BUF: switch (magic32) { case XFS_ABTB_CRC_MAGIC: - case XFS_ABTC_CRC_MAGIC: case XFS_ABTB_MAGIC: + bp->b_ops = &xfs_bnobt_buf_ops; + break; + case XFS_ABTC_CRC_MAGIC: case XFS_ABTC_MAGIC: - bp->b_ops = &xfs_allocbt_buf_ops; + bp->b_ops = &xfs_cntbt_buf_ops; break; case XFS_IBT_CRC_MAGIC: - case XFS_FIBT_CRC_MAGIC: case XFS_IBT_MAGIC: - case XFS_FIBT_MAGIC: bp->b_ops = &xfs_inobt_buf_ops; break; + case XFS_FIBT_CRC_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_finobt_buf_ops; + break; case XFS_BMAP_CRC_MAGIC: case XFS_BMAP_MAGIC: bp->b_ops = &xfs_bmbt_buf_ops; @@ -3045,7 +3049,7 @@ xlog_recover_inode_pass2( * Make sure the place we're flushing out to really looks * like an inode! */ - if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { + if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", __func__, dip, bp, in_f->ilf_ino); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b4d8c318be3c..fd63b0b1307c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -149,6 +149,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); mutex_destroy(&pag->pag_ici_reclaim_lock); call_rcu(&pag->rcu_head, __xfs_free_perag); @@ -227,6 +228,9 @@ xfs_initialize_perag( /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; + error = xfs_iunlink_init(pag); + if (error) + goto out_hash_destroy; } index = xfs_set_inode_alloc(mp, agcount); @@ -249,6 +253,7 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); + xfs_iunlink_destroy(pag); mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7daafe064af8..110f927cf943 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -138,7 +138,7 @@ typedef struct xfs_mount { struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint64_t m_flags; /* global mount flags */ - bool m_inotbt_nores; /* no per-AG finobt resv. */ + bool m_finobt_nores; /* no per-AG finobt resv. */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ int m_ialloc_min_blks;/* min blocks in sparse inode @@ -194,6 +194,7 @@ typedef struct xfs_mount { */ uint32_t m_generation; + bool m_always_cow; bool m_fail_unmount; #ifdef DEBUG /* @@ -396,6 +397,13 @@ typedef struct xfs_perag { /* reference count */ uint8_t pagf_refcount_level; + + /* + * Unlinked inode information. This incore information reflects + * data stored in the AGI, so callers must hold the AGI buffer lock + * or have some other means to control concurrency. + */ + struct rhashtable pagi_unlinked_hash; } xfs_perag_t; static inline struct xfs_ag_resv * diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index d3e04d20d8d4..c8ba98fae30a 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -125,6 +125,27 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); + + /* + * The v5 superblock format extended several v4 header structures with + * additional data. While new fields are only accessible on v5 + * superblocks, it's important that the v5 structures place original v4 + * fields/headers in the correct location on-disk. For example, we must + * be able to find magic values at the same location in certain blocks + * regardless of superblock version. + * + * The following checks ensure that various v5 data structures place the + * subset of v4 metadata associated with the same type of block at the + * start of the on-disk block. If there is no data structure definition + * for certain types of v4 blocks, traverse down to the first field of + * common metadata (e.g., magic value) and make sure it is at offset + * zero. + */ + XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0); + XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0); + XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0); + XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0); + XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index f44c3599527d..bde2c9f56a46 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -185,7 +185,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - xfs_bmbt_to_iomap(ip, iomap, &imap); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index c5b4fa004ca4..680ae7662a78 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) { + if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) { *shared = false; return 0; } @@ -234,93 +234,59 @@ xfs_reflink_trim_around_shared( } } -/* - * Trim the passed in imap to the next shared/unshared extent boundary, and - * if imap->br_startoff points to a shared extent reserve space for it in the - * COW fork. - * - * Note that imap will always contain the block numbers for the existing blocks - * in the data fork, as the upper layers need them for read-modify-write - * operations. - */ -int -xfs_reflink_reserve_cow( +bool +xfs_inode_need_cow( struct xfs_inode *ip, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + bool *shared) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got; - int error = 0; - bool eof = false; - struct xfs_iext_cursor icur; - bool shared; - - /* - * Search the COW fork extent list first. This serves two purposes: - * first this implement the speculative preallocation using cowextisze, - * so that we also unshared block adjacent to shared blocks instead - * of just the shared blocks themselves. Second the lookup in the - * extent list is generally faster than going out to the shared extent - * tree. - */ - - if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got)) - eof = true; - if (!eof && got.br_startoff <= imap->br_startoff) { - trace_xfs_reflink_cow_found(ip, imap); - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + /* We can't update any real extents in always COW mode. */ + if (xfs_is_always_cow_inode(ip) && + !isnullstartblock(imap->br_startblock)) { + *shared = true; return 0; } /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, imap, &shared); - if (error) - return error; - - /* Not shared? Just report the (potentially capped) extent. */ - if (!shared) - return 0; - - /* - * Fork all the shared blocks from our write offset until the end of - * the extent. - */ - error = xfs_qm_dqattach_locked(ip, false); - if (error) - return error; - - error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - imap->br_blockcount, 0, &got, &icur, eof); - if (error == -ENOSPC || error == -EDQUOT) - trace_xfs_reflink_cow_enospc(ip, imap); - if (error) - return error; - - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - trace_xfs_reflink_cow_alloc(ip, &got); - return 0; + return xfs_reflink_trim_around_shared(ip, imap, shared); } -/* Convert part of an unwritten CoW extent to a real one. */ -STATIC int -xfs_reflink_convert_cow_extent( - struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, - xfs_fileoff_t offset_fsb, - xfs_filblks_t count_fsb) +static int +xfs_reflink_convert_cow_locked( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb) { - int nimaps = 1; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_btree_cur *dummy_cur = NULL; + int dummy_logflags; + int error = 0; - if (imap->br_state == XFS_EXT_NORM) + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) return 0; - xfs_trim_extent(imap, offset_fsb, count_fsb); - trace_xfs_reflink_convert_cow(ip, imap); - if (imap->br_blockcount == 0) - return 0; - return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount, - XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap, - &nimaps); + do { + if (got.br_startoff >= offset_fsb + count_fsb) + break; + if (got.br_state == XFS_EXT_NORM) + continue; + if (WARN_ON_ONCE(isnullstartblock(got.br_startblock))) + return -EIO; + + xfs_trim_extent(&got, offset_fsb, count_fsb); + if (!got.br_blockcount) + continue; + + got.br_state = XFS_EXT_NORM; + error = xfs_bmap_add_extent_unwritten_real(NULL, ip, + XFS_COW_FORK, &icur, &dummy_cur, &got, + &dummy_logflags); + if (error) + return error; + } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got)); + + return error; } /* Convert all of the unwritten CoW extents in a file's range to real ones. */ @@ -334,15 +300,12 @@ xfs_reflink_convert_cow( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_filblks_t count_fsb = end_fsb - offset_fsb; - struct xfs_bmbt_irec imap; - int nimaps = 1, error = 0; + int error; ASSERT(count != 0); xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb, - XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT | - XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps); + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -375,7 +338,7 @@ xfs_find_trim_cow_extent( if (got.br_startoff > offset_fsb) { xfs_trim_extent(imap, imap->br_startoff, got.br_startoff - imap->br_startoff); - return xfs_reflink_trim_around_shared(ip, imap, shared); + return xfs_inode_need_cow(ip, imap, shared); } *shared = true; @@ -397,7 +360,8 @@ xfs_reflink_allocate_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, - uint *lockmode) + uint *lockmode, + bool convert_now) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; @@ -409,7 +373,10 @@ xfs_reflink_allocate_cow( xfs_extlen_t resblks = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_is_reflink_inode(ip)); + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } error = xfs_find_trim_cow_extent(ip, imap, shared, &found); if (error || !*shared) @@ -471,7 +438,16 @@ xfs_reflink_allocate_cow( if (nimaps == 0) return -ENOSPC; convert: - return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); + xfs_trim_extent(imap, offset_fsb, count_fsb); + /* + * COW fork extents are supposed to remain unwritten until we're ready + * to initiate a disk write. For direct I/O we are going to write the + * data and need the conversion, but for buffered writes we're done. + */ + if (!convert_now || imap->br_state == XFS_EXT_NORM) + return 0; + trace_xfs_reflink_convert_cow(ip, imap); + return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); out_unreserve: xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, @@ -586,7 +562,7 @@ xfs_reflink_cancel_cow_range( int error; trace_xfs_reflink_cancel_cow_range(ip, offset, count); - ASSERT(xfs_is_reflink_inode(ip)); + ASSERT(ip->i_cowfp); offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); if (count == NULLFILEOFF) @@ -1192,7 +1168,7 @@ xfs_reflink_remap_blocks( break; ASSERT(nimaps == 1); - trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, + trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK, &imap); /* Translate imap into the destination file. */ diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 6d73daef1f13..28a43b7f581d 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -6,16 +6,28 @@ #ifndef __XFS_REFLINK_H #define __XFS_REFLINK_H 1 +static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip) +{ + return ip->i_mount->m_always_cow && + xfs_sb_version_hasreflink(&ip->i_mount->m_sb); +} + +static inline bool xfs_is_cow_inode(struct xfs_inode *ip) +{ + return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); +} + extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); +bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, + bool *shared); -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode); + struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode, + bool convert_now); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c9097cb0b955..f093ea244849 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1594,6 +1594,13 @@ xfs_mount_alloc( INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); mp->m_kobj.kobject.kset = xfs_kset; + /* + * We don't create the finobt per-ag space reservation until after log + * recovery, so we must set this to true so that an ifree transaction + * started during log recovery will not depend on space reservations + * for finobt expansion. + */ + mp->m_finobt_nores = true; return mp; } @@ -1729,11 +1736,18 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) { - xfs_alert(mp, + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (mp->m_sb.sb_rblocks) { + xfs_alert(mp, "reflink not compatible with realtime device!"); - error = -EINVAL; - goto out_filestream_unmount; + error = -EINVAL; + goto out_filestream_unmount; + } + + if (xfs_globals.always_cow) { + xfs_info(mp, "using DEBUG-only always_cow mode."); + mp->m_always_cow = true; + } } if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 168488130a19..ad7f9be13087 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -85,6 +85,7 @@ struct xfs_globals { int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ + bool always_cow; /* use COW fork for all overwrites */ }; extern struct xfs_globals xfs_globals; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index cd6a994a7250..cabda13f3c64 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -183,10 +183,34 @@ mount_delay_show( } XFS_SYSFS_ATTR_RW(mount_delay); +static ssize_t +always_cow_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + ssize_t ret; + + ret = kstrtobool(buf, &xfs_globals.always_cow); + if (ret < 0) + return ret; + return count; +} + +static ssize_t +always_cow_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow); +} +XFS_SYSFS_ATTR_RW(always_cow); + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), ATTR_LIST(mount_delay), + ATTR_LIST(always_cow), NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 6fcc893dfc91..47fb07d86efd 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1218,23 +1218,17 @@ DEFINE_EVENT(xfs_readpage_class, name, \ DEFINE_READPAGE_EVENT(xfs_vm_readpage); DEFINE_READPAGE_EVENT(xfs_vm_readpages); -TRACE_DEFINE_ENUM(XFS_IO_HOLE); -TRACE_DEFINE_ENUM(XFS_IO_DELALLOC); -TRACE_DEFINE_ENUM(XFS_IO_UNWRITTEN); -TRACE_DEFINE_ENUM(XFS_IO_OVERWRITE); -TRACE_DEFINE_ENUM(XFS_IO_COW); - DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, - int type, struct xfs_bmbt_irec *irec), - TP_ARGS(ip, offset, count, type, irec), + int whichfork, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, whichfork, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, size) __field(loff_t, offset) __field(size_t, count) - __field(int, type) + __field(int, whichfork) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -1245,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->size = ip->i_d.di_size; __entry->offset = offset; __entry->count = count; - __entry->type = type; + __entry->whichfork = whichfork; __entry->startoff = irec ? irec->br_startoff : 0; __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? irec->br_blockcount : 0; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " - "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", + "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->count, - __print_symbolic(__entry->type, XFS_IO_TYPES), + __entry->whichfork == XFS_COW_FORK ? "cow" : "data", __entry->startoff, (int64_t)__entry->startblock, __entry->blockcount) ) -#define DEFINE_IOMAP_EVENT(name) \ +#define DEFINE_IMAP_EVENT(name) \ DEFINE_EVENT(xfs_imap_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ - int type, struct xfs_bmbt_irec *irec), \ - TP_ARGS(ip, offset, count, type, irec)) -DEFINE_IOMAP_EVENT(xfs_map_blocks_found); -DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); -DEFINE_IOMAP_EVENT(xfs_iomap_alloc); -DEFINE_IOMAP_EVENT(xfs_iomap_found); + int whichfork, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, whichfork, irec)) +DEFINE_IMAP_EVENT(xfs_map_blocks_found); +DEFINE_IMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IMAP_EVENT(xfs_iomap_alloc); +DEFINE_IMAP_EVENT(xfs_iomap_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -3078,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); -DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap); +DEFINE_IMAP_EVENT(xfs_reflink_remap_imap); TRACE_EVENT(xfs_reflink_remap_blocks_loop, TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, xfs_filblks_t len, struct xfs_inode *dest, @@ -3202,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); -DEFINE_RW_EVENT(xfs_reflink_reserve_cow); - DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); @@ -3371,6 +3362,84 @@ DEFINE_TRANS_EVENT(xfs_trans_roll); DEFINE_TRANS_EVENT(xfs_trans_add_item); DEFINE_TRANS_EVENT(xfs_trans_free_items); +TRACE_EVENT(xfs_iunlink_update_bucket, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket, + xfs_agino_t old_ptr, xfs_agino_t new_ptr), + TP_ARGS(mp, agno, bucket, old_ptr, new_ptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, old_ptr) + __field(xfs_agino_t, new_ptr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->bucket = bucket; + __entry->old_ptr = old_ptr; + __entry->new_ptr = new_ptr; + ), + TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->old_ptr, + __entry->new_ptr) +); + +TRACE_EVENT(xfs_iunlink_update_dinode, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + xfs_agino_t old_ptr, xfs_agino_t new_ptr), + TP_ARGS(mp, agno, agino, old_ptr, new_ptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, old_ptr) + __field(xfs_agino_t, new_ptr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->old_ptr = old_ptr; + __entry->new_ptr = new_ptr; + ), + TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->old_ptr, + __entry->new_ptr) +); + +DECLARE_EVENT_CLASS(xfs_ag_inode_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + ), + TP_printk("dev %d:%d agno %u agino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, __entry->agino) +) + +#define DEFINE_AGINODE_EVENT(name) \ +DEFINE_EVENT(xfs_ag_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_AGINODE_EVENT(xfs_iunlink); +DEFINE_AGINODE_EVENT(xfs_iunlink_remove); +DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c index 11cff449d055..e1c7d55b32c3 100644 --- a/fs/xfs/xfs_trans_bmap.c +++ b/fs/xfs/xfs_trans_bmap.c @@ -17,7 +17,6 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_inode.h" -#include "xfs_defer.h" /* * This routine is called to allocate a "bmap update done" diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 629f1479c9d2..7d65ebf1e847 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -277,7 +277,7 @@ xfs_trans_read_buf_map( * release this buffer when it kills the tranaction. */ ASSERT(bp->b_ops != NULL); - error = xfs_buf_ensure_ops(bp, ops); + error = xfs_buf_reverify(bp, ops); if (error) { xfs_buf_ioerror_alert(bp, __func__); diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 0710434eb240..8ee7a3f8bb20 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -18,7 +18,6 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_trace.h" -#include "xfs_defer.h" /* * This routine is called to allocate an "extent free done" diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c index 6c947ff4faf6..8d734728dd1b 100644 --- a/fs/xfs/xfs_trans_refcount.c +++ b/fs/xfs/xfs_trans_refcount.c @@ -16,7 +16,6 @@ #include "xfs_refcount_item.h" #include "xfs_alloc.h" #include "xfs_refcount.h" -#include "xfs_defer.h" /* * This routine is called to allocate a "refcount update done" diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index a42890931ecd..5c7936b1be13 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -16,7 +16,6 @@ #include "xfs_rmap_item.h" #include "xfs_alloc.h" #include "xfs_rmap.h" -#include "xfs_defer.h" /* Set the map extent flags for this reverse mapping. */ static void diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 63ee1d5bf1d7..9a63016009a1 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -129,6 +129,9 @@ __xfs_xattr_put_listent( char *offset; int arraytop; + if (context->count < 0 || context->seen_enough) + return; + if (!context->alist) goto compute_size; |