Diffstat (limited to 'fs')
146 files changed, 3272 insertions, 2058 deletions
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index a4e9e6e07e93..d3c6bb22c5f4 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -322,6 +322,8 @@ static int afs_deliver_cb_callback(struct afs_call *call) return ret; call->unmarshall++; + fallthrough; + case 5: break; } @@ -418,6 +420,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) r->node[loop] = ntohl(b[loop + 5]); call->unmarshall++; + fallthrough; case 2: break; @@ -530,6 +533,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) r->node[loop] = ntohl(b[loop + 5]); call->unmarshall++; + fallthrough; case 2: break; @@ -663,6 +667,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; + fallthrough; case 3: break; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 9fbe5a5ec9bd..78719f2f567e 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1919,7 +1919,9 @@ static void afs_rename_edit_dir(struct afs_operation *op) new_inode = d_inode(new_dentry); if (new_inode) { spin_lock(&new_inode->i_lock); - if (new_inode->i_nlink > 0) + if (S_ISDIR(new_inode->i_mode)) + clear_nlink(new_inode); + else if (new_inode->i_nlink > 0) drop_nlink(new_inode); spin_unlock(&new_inode->i_lock); } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 2f695a260442..dd3f45d906d2 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -388,6 +388,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) req->file_size = vp->scb.status.size; call->unmarshall++; + fallthrough; case 5: break; @@ -1408,6 +1409,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) _debug("motd '%s'", p); call->unmarshall++; + fallthrough; case 8: break; @@ -1845,6 +1847,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) xdr_decode_AFSVolSync(&bp, &op->volsync); call->unmarshall++; + fallthrough; case 6: break; @@ -1979,6 +1982,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) xdr_decode_AFSVolSync(&bp, &op->volsync); call->unmarshall++; + fallthrough; case 4: break; diff --git a/fs/afs/main.c b/fs/afs/main.c index b2975256dadb..179004b15566 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -203,8 +203,8 @@ static int __init afs_init(void) goto error_fs; afs_proc_symlink = proc_symlink("fs/afs", NULL, "../self/net/afs"); - if (IS_ERR(afs_proc_symlink)) { - ret = PTR_ERR(afs_proc_symlink); + if (!afs_proc_symlink) { + ret = -ENOMEM; goto error_proc; } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index dc9327332f06..00fca3c66ba6 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -593,6 +593,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) if (ret < 0) return ret; call->unmarshall = 6; + fallthrough; case 6: break; diff --git a/fs/afs/write.c b/fs/afs/write.c index 3edb6204b937..3104b62c2082 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -118,6 +118,15 @@ int afs_write_end(struct file *file, struct address_space *mapping, _enter("{%llx:%llu},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); + if (!PageUptodate(page)) { + if (copied < len) { + copied = 0; + goto out; + } + + SetPageUptodate(page); + } + if (copied == 0) goto out; @@ -132,8 +141,6 @@ int afs_write_end(struct file *file, struct address_space *mapping, write_sequnlock(&vnode->cb_lock); } - ASSERT(PageUptodate(page)); - if (PagePrivate(page)) { priv = page_private(page); f = afs_page_dirty_from(page, priv); @@ -730,7 +737,7 @@ static int afs_writepages_region(struct address_space *mapping, return ret; } - start 
+= ret * PAGE_SIZE; + start += ret; cond_resched(); } while (wbc->nr_to_write > 0); @@ -837,6 +844,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); unsigned long priv; + vm_fault_t ret = VM_FAULT_RETRY; _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); @@ -848,14 +856,14 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) #ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page) && wait_on_page_fscache_killable(page) < 0) - return VM_FAULT_RETRY; + goto out; #endif if (wait_on_page_writeback_killable(page)) - return VM_FAULT_RETRY; + goto out; if (lock_page_killable(page) < 0) - return VM_FAULT_RETRY; + goto out; /* We mustn't change page->private until writeback is complete as that * details the portion of the page we need to write back and we might @@ -863,7 +871,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) */ if (wait_on_page_writeback_killable(page) < 0) { unlock_page(page); - return VM_FAULT_RETRY; + goto out; } priv = afs_page_dirty(page, 0, thp_size(page)); @@ -877,8 +885,10 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) } file_update_time(file); + ret = VM_FAULT_LOCKED; +out: sb_end_pagefault(inode->i_sb); - return VM_FAULT_LOCKED; + return ret; } /* diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 187b3f2b9202..3d73cbb439fa 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1537,7 +1537,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, { const struct cred *cred; unsigned int i, len; - + unsigned int state; + /* first copy the parameters from user space */ memset(psinfo, 0, sizeof(struct elf_prpsinfo)); @@ -1559,7 +1560,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_pgrp = task_pgrp_vnr(p); psinfo->pr_sid = task_session_vnr(p); - i = p->state ? ffz(~p->state) + 1 : 0; + state = READ_ONCE(p->__state); + i = state ? ffz(~state) + 1 : 0; psinfo->pr_state = i; psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; psinfo->pr_zomb = psinfo->pr_sname == 'Z'; @@ -1571,7 +1573,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); - + return 0; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 2c99b102c860..ab9c31ddffda 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1331,6 +1331,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, { const struct cred *cred; unsigned int i, len; + unsigned int state; /* first copy the parameters from user space */ memset(psinfo, 0, sizeof(struct elf_prpsinfo)); @@ -1353,7 +1354,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_pgrp = task_pgrp_vnr(p); psinfo->pr_sid = task_session_vnr(p); - i = p->state ? ffz(~p->state) + 1 : 0; + state = READ_ONCE(p->__state); + i = state ? ffz(~state) + 1 : 0; psinfo->pr_state = i; psinfo->pr_sname = (i > 5) ? '.' 
: "RSDTZW"[i]; psinfo->pr_zomb = psinfo->pr_sname == 'Z'; diff --git a/fs/block_dev.c b/fs/block_dev.c index b8abccd03e5d..6cc4d4cfe0c2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1244,6 +1244,9 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) lockdep_assert_held(&bdev->bd_mutex); + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; + rescan: if (bdev->bd_part_count) return -EBUSY; @@ -1298,6 +1301,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) struct gendisk *disk = bdev->bd_disk; int ret = 0; + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; + if (!bdev->bd_openers) { if (!bdev_is_partition(bdev)) { ret = 0; @@ -1332,8 +1338,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) whole->bd_part_count++; mutex_unlock(&whole->bd_mutex); - if (!(disk->flags & GENHD_FL_UP) || - !bdev_nr_sectors(bdev)) { + if (!bdev_nr_sectors(bdev)) { __blkdev_put(whole, mode, 1); bdput(whole); return -ENXIO; @@ -1364,16 +1369,12 @@ struct block_device *blkdev_get_no_open(dev_t dev) struct block_device *bdev; struct gendisk *disk; - down_read(&bdev_lookup_sem); bdev = bdget(dev); if (!bdev) { - up_read(&bdev_lookup_sem); blk_request_module(dev); - down_read(&bdev_lookup_sem); - bdev = bdget(dev); if (!bdev) - goto unlock; + return NULL; } disk = bdev->bd_disk; @@ -1383,14 +1384,11 @@ struct block_device *blkdev_get_no_open(dev_t dev) goto put_disk; if (!try_module_get(bdev->bd_disk->fops->owner)) goto put_disk; - up_read(&bdev_lookup_sem); return bdev; put_disk: put_disk(disk); bdput: bdput(bdev); -unlock: - up_read(&bdev_lookup_sem); return NULL; } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 68b95ad82126..520a0f6a7d9e 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -18,6 +18,8 @@ config BTRFS_FS select RAID6_PQ select XOR_BLOCKS select SRCU + depends on !PPC_256K_PAGES # powerpc + depends on !PAGE_SIZE_256KB # hexagon help Btrfs is a general purpose copy-on-write filesystem with extents, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 117d423fdb93..7a8a2fc19533 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2675,7 +2675,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, * * @ref_key: The same as @ref_key in handle_direct_tree_backref() * @tree_key: The first key of this tree block. - * @path: A clean (released) path, to avoid allocating path everytime + * @path: A clean (released) path, to avoid allocating path every time * the function get called. 
*/ static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index aa57bdc8fc89..38b127b9edfc 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1399,7 +1399,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_space_info_update_bytes_pinned(fs_info, space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; - __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned); block_group->pinned = 0; spin_unlock(&block_group->lock); @@ -1491,7 +1490,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; - int ret; + LIST_HEAD(again_list); if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; @@ -1502,6 +1501,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) mutex_lock(&fs_info->reclaim_bgs_lock); spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->reclaim_bgs)) { + int ret = 0; + bg = list_first_entry(&fs_info->reclaim_bgs, struct btrfs_block_group, bg_list); @@ -1547,9 +1548,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) bg->start); next: - btrfs_put_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); + if (ret == -EAGAIN && list_empty(&bg->bg_list)) + list_add_tail(&bg->bg_list, &again_list); + else + btrfs_put_block_group(bg); } + list_splice_tail(&again_list, &fs_info->reclaim_bgs); spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); @@ -2442,16 +2447,16 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) spin_lock(&sinfo->lock); spin_lock(&cache->lock); if (!--cache->ro) { - num_bytes = cache->length - cache->reserved - - cache->pinned - cache->bytes_super - - cache->zone_unusable - cache->used; - sinfo->bytes_readonly -= num_bytes; if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes back */ cache->zone_unusable = cache->alloc_offset - cache->used; sinfo->bytes_zone_unusable += cache->zone_unusable; sinfo->bytes_readonly -= cache->zone_unusable; } + num_bytes = cache->length - cache->reserved - + cache->pinned - cache->bytes_super - + cache->zone_unusable - cache->used; + sinfo->bytes_readonly -= num_bytes; list_del_init(&cache->ro_list); } spin_unlock(&cache->lock); @@ -2505,7 +2510,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group, struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; - u64 num_pages = 0; + u64 cache_size = 0; int retries = 0; int ret = 0; @@ -2617,20 +2622,20 @@ again: * taking up quite a bit since it's not folded into the other space * cache. 
*/ - num_pages = div_u64(block_group->length, SZ_256M); - if (!num_pages) - num_pages = 1; + cache_size = div_u64(block_group->length, SZ_256M); + if (!cache_size) + cache_size = 1; - num_pages *= 16; - num_pages *= PAGE_SIZE; + cache_size *= 16; + cache_size *= fs_info->sectorsize; ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, - num_pages); + cache_size); if (ret) goto out_put; - ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, - num_pages, num_pages, + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size, + cache_size, cache_size, &alloc_hint); /* * Our cache requires contiguous chunks so that we don't modify a bunch @@ -3062,8 +3067,6 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - __btrfs_mod_total_bytes_pinned(cache->space_info, - num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2bea01d23a5b..9a023ae0f98b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -28,6 +28,7 @@ #include "compression.h" #include "extent_io.h" #include "extent_map.h" +#include "zoned.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; @@ -148,7 +149,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, const u32 csum_size = fs_info->csum_size; const u32 sectorsize = fs_info->sectorsize; struct page *page; - unsigned long i; + unsigned int i; char *kaddr; u8 csum[BTRFS_CSUM_SIZE]; struct compressed_bio *cb = bio->bi_private; @@ -207,7 +208,7 @@ static void end_compressed_bio_read(struct bio *bio) struct compressed_bio *cb = bio->bi_private; struct inode *inode; struct page *page; - unsigned long index; + unsigned int index; unsigned int mirror = btrfs_io_bio(bio)->mirror_num; int ret = 0; @@ -333,7 +334,7 @@ static void end_compressed_bio_write(struct bio *bio) struct compressed_bio *cb = bio->bi_private; struct inode *inode; struct page *page; - unsigned long index; + unsigned int index; if (bio->bi_status) cb->errors = 1; @@ -348,11 +349,10 @@ static void end_compressed_bio_write(struct bio *bio) * call back into the FS and do all the end_io operations */ inode = cb->inode; - cb->compressed_pages[0]->mapping = cb->inode->i_mapping; - btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], + btrfs_record_physical_zoned(inode, cb->start, bio); + btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, cb->start, cb->start + cb->len - 1, bio->bi_status == BLK_STS_OK); - cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); /* note, our inode could be gone now */ @@ -385,10 +385,10 @@ out: * the end io hooks. */ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned long len, u64 disk_start, - unsigned long compressed_len, + unsigned int len, u64 disk_start, + unsigned int compressed_len, struct page **compressed_pages, - unsigned long nr_pages, + unsigned int nr_pages, unsigned int write_flags, struct cgroup_subsys_state *blkcg_css) { @@ -401,6 +401,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, u64 first_byte = disk_start; blk_status_t ret; int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; + const bool use_append = btrfs_use_zone_append(inode, disk_start); + const unsigned int bio_op = use_append ? 
REQ_OP_ZONE_APPEND : REQ_OP_WRITE; WARN_ON(!PAGE_ALIGNED(start)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); @@ -418,10 +420,23 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb->nr_pages = nr_pages; bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = REQ_OP_WRITE | write_flags; + bio->bi_opf = bio_op | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; + if (use_append) { + struct btrfs_device *device; + + device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE); + if (IS_ERR(device)) { + kfree(cb); + bio_put(bio); + return BLK_STS_NOTSUPP; + } + + bio_set_dev(bio, device->bdev); + } + if (blkcg_css) { bio->bi_opf |= REQ_CGROUP_PUNT; kthread_associate_blkcg(blkcg_css); @@ -432,6 +447,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, bytes_left = compressed_len; for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { int submit = 0; + int len = 0; page = compressed_pages[pg_index]; page->mapping = inode->vfs_inode.i_mapping; @@ -439,9 +455,20 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, 0); + /* + * Page can only be added to bio if the current bio fits in + * stripe. + */ + if (!submit) { + if (pg_index == 0 && use_append) + len = bio_add_zone_append_page(bio, page, + PAGE_SIZE, 0); + else + len = bio_add_page(bio, page, PAGE_SIZE, 0); + } + page->mapping = NULL; - if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < - PAGE_SIZE) { + if (submit || len < PAGE_SIZE) { /* * inc the count before we submit the bio so * we know the end IO handler won't happen before @@ -465,16 +492,20 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, } bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = REQ_OP_WRITE | write_flags; + bio->bi_opf = bio_op | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; if (blkcg_css) bio->bi_opf |= REQ_CGROUP_PUNT; + /* + * Use bio_add_page() to ensure the bio has at least one + * page. + */ bio_add_page(bio, page, PAGE_SIZE, 0); } if (bytes_left < PAGE_SIZE) { btrfs_info(fs_info, - "bytes left %lu compress len %lu nr %lu", + "bytes left %lu compress len %u nr %u", bytes_left, cb->compressed_len, cb->nr_pages); } bytes_left -= PAGE_SIZE; @@ -636,9 +667,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree *em_tree; struct compressed_bio *cb; - unsigned long compressed_len; - unsigned long nr_pages; - unsigned long pg_index; + unsigned int compressed_len; + unsigned int nr_pages; + unsigned int pg_index; struct page *page; struct bio *comp_bio; u64 cur_disk_byte = bio->bi_iter.bi_sector << 9; @@ -1161,9 +1192,6 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level) * * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes - * - * @max_out tells us the max number of bytes that we're allowed to - * stuff into pages */ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, u64 start, struct page **pages, @@ -1184,20 +1212,6 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, return ret; } -/* - * pages_in is an array of pages with compressed data. 
- * - * disk_start is the starting logical offset of this array in the file - * - * orig_bio contains the pages from the file that we want to decompress into - * - * srclen is the number of bytes in pages_in - * - * The basic idea is that we have a bio that was created by readpages. - * The pages in the bio are for the uncompressed data, and they may not - * be contiguous. They all correspond to the range of bytes covered by - * the compressed extent. - */ static int btrfs_decompress_bio(struct compressed_bio *cb) { struct list_head *workspace; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 8001b700ea3a..c359f20920d0 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -31,6 +31,9 @@ struct compressed_bio { /* number of bios pending for this compressed extent */ refcount_t pending_bios; + /* Number of compressed pages in the array */ + unsigned int nr_pages; + /* the pages with the compressed data on them */ struct page **compressed_pages; @@ -40,20 +43,17 @@ struct compressed_bio { /* starting offset in the inode for our pages */ u64 start; - /* number of bytes in the inode we're working on */ - unsigned long len; - - /* number of bytes on disk */ - unsigned long compressed_len; + /* Number of bytes in the inode we're working on */ + unsigned int len; - /* the compression algorithm for this bio */ - int compress_type; + /* Number of bytes on disk */ + unsigned int compressed_len; - /* number of compressed pages in the array */ - unsigned long nr_pages; + /* The compression algorithm for this bio */ + u8 compress_type; /* IO errors */ - int errors; + u8 errors; int mirror_num; /* for reads, this is the bio we are copying the data into */ @@ -91,10 +91,10 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, struct bio *bio); blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned long len, u64 disk_start, - unsigned long compressed_len, + unsigned int len, u64 disk_start, + unsigned int compressed_len, struct page **compressed_pages, - unsigned long nr_pages, + unsigned int nr_pages, unsigned int write_flags, struct cgroup_subsys_state *blkcg_css); blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a484fb72a01f..4bc3ca2cbd7d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -596,7 +596,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid, fs_info->generation); if (!should_cow_block(trans, root, buf)) { - trans->dirty = true; *cow_ret = buf; return 0; } @@ -1788,10 +1787,8 @@ again: * then we don't want to set the path blocking, * so we test it here */ - if (!should_cow_block(trans, root, b)) { - trans->dirty = true; + if (!should_cow_block(trans, root, b)) goto cow_done; - } /* * must have write locks on this node and the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9fb76829a281..e5e53e592d4f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -561,10 +561,16 @@ enum { /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. - * Set and cleared while holding fs_info::balance_mutex. */ BTRFS_FS_BALANCE_RUNNING, + /* + * Indicate that relocation of a chunk has started, it's set per chunk + * and is toggled between chunks. + * Set, tested and cleared while holding fs_info::send_reloc_lock. + */ + BTRFS_FS_RELOC_RUNNING, + /* Indicate that the cleaner thread is awake and doing something. 
*/ BTRFS_FS_CLEANER_RUNNING, @@ -817,8 +823,6 @@ struct btrfs_fs_info { struct kobject *space_info_kobj; struct kobject *qgroups_kobj; - u64 total_pinned; - /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; @@ -871,6 +875,9 @@ struct btrfs_fs_info { struct btrfs_balance_control *balance_ctl; wait_queue_head_t balance_wait_q; + /* Cancellation requests for chunk relocation */ + atomic_t reloc_cancel_req; + u32 data_chunk_allocations; u32 metadata_ratio; @@ -986,14 +993,15 @@ struct btrfs_fs_info { struct crypto_shash *csum_shash; + spinlock_t send_reloc_lock; /* * Number of send operations in progress. - * Updated while holding fs_info::balance_mutex. + * Updated while holding fs_info::send_reloc_lock. */ int send_in_progress; - /* Type of exclusive operation running */ - unsigned long exclusive_operation; + /* Type of exclusive operation running, protected by super_lock */ + enum btrfs_exclusive_operation exclusive_operation; /* * Zone size > 0 when in ZONED mode, otherwise it's used for a check @@ -1375,38 +1383,39 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) * * Note: don't forget to add new options to btrfs_show_options() */ -#define BTRFS_MOUNT_NODATASUM (1 << 0) -#define BTRFS_MOUNT_NODATACOW (1 << 1) -#define BTRFS_MOUNT_NOBARRIER (1 << 2) -#define BTRFS_MOUNT_SSD (1 << 3) -#define BTRFS_MOUNT_DEGRADED (1 << 4) -#define BTRFS_MOUNT_COMPRESS (1 << 5) -#define BTRFS_MOUNT_NOTREELOG (1 << 6) -#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) -#define BTRFS_MOUNT_SSD_SPREAD (1 << 8) -#define BTRFS_MOUNT_NOSSD (1 << 9) -#define BTRFS_MOUNT_DISCARD_SYNC (1 << 10) -#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) -#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) -#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) -#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) -#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) -#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) -/* bit 17 is free */ -#define BTRFS_MOUNT_USEBACKUPROOT (1 << 18) -#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) -#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) -#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) -#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) -#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) -#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) -#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) -#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) -#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) -#define BTRFS_MOUNT_REF_VERIFY (1 << 28) -#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29) -#define BTRFS_MOUNT_IGNOREBADROOTS (1 << 30) -#define BTRFS_MOUNT_IGNOREDATACSUMS (1 << 31) +enum { + BTRFS_MOUNT_NODATASUM = (1UL << 0), + BTRFS_MOUNT_NODATACOW = (1UL << 1), + BTRFS_MOUNT_NOBARRIER = (1UL << 2), + BTRFS_MOUNT_SSD = (1UL << 3), + BTRFS_MOUNT_DEGRADED = (1UL << 4), + BTRFS_MOUNT_COMPRESS = (1UL << 5), + BTRFS_MOUNT_NOTREELOG = (1UL << 6), + BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), + BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), + BTRFS_MOUNT_NOSSD = (1UL << 9), + BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), + BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), + BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), + BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), + BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), + BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), + BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), + BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), + BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), + BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), + BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), + 
BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), + BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), + BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), + BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), + BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), + BTRFS_MOUNT_REF_VERIFY = (1UL << 27), + BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), + BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), + BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), +}; #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) @@ -2216,11 +2225,13 @@ BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, static inline bool btrfs_root_readonly(const struct btrfs_root *root) { + /* Byte-swap the constant at compile time, root_item::flags is LE */ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } static inline bool btrfs_root_dead(const struct btrfs_root *root) { + /* Byte-swap the constant at compile time, root_item::flags is LE */ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; } @@ -2746,9 +2757,9 @@ enum btrfs_reserve_flush_enum { /* * Flush space by above mentioned methods and by: * - Running delayed iputs - * - Commiting transaction + * - Committing transaction * - * Can be interruped by fatal signal. + * Can be interrupted by a fatal signal. */ BTRFS_RESERVE_FLUSH_DATA, BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, @@ -2758,7 +2769,7 @@ enum btrfs_reserve_flush_enum { * Pretty much the same as FLUSH_ALL, but can also steal space from * global rsv. * - * Can be interruped by fatal signal. + * Can be interrupted by a fatal signal. */ BTRFS_RESERVE_FLUSH_ALL_STEAL, }; @@ -2774,7 +2785,6 @@ enum btrfs_flush_state { ALLOC_CHUNK_FORCE = 8, RUN_DELAYED_IPUTS = 9, COMMIT_TRANS = 10, - FORCE_COMMIT_TRANS = 11, }; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, @@ -3100,8 +3110,8 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end); +unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, + struct page *page, u64 start, u64 end); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -3125,7 +3135,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, u64 new_size, - u32 min_type); + u32 min_type, u64 *extents_found); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, @@ -3146,9 +3156,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, unsigned long bio_flags); -bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, - unsigned int size); -void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end); +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); @@ -3187,7 
+3195,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); -void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; extern const struct iomap_ops btrfs_dio_iomap_ops; @@ -3222,6 +3231,9 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); /* file.c */ @@ -3786,4 +3798,14 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) return fs_info->zoned != 0; } +/* + * We use page status Private2 to indicate there is an ordered extent with + * unfinished IO. + * + * Rename the Private2 accessors to Ordered, to improve readability. + */ +#define PageOrdered(page) PagePrivate2(page) +#define SetPageOrdered(page) SetPagePrivate2(page) +#define ClearPageOrdered(page) ClearPagePrivate2(page) + #endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 56642ca7af10..2059d1504149 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -89,7 +89,7 @@ * ->outstanding_extents += 1 (current value is 1) * * -> set_delalloc - * ->outstanding_extents += 1 (currrent value is 2) + * ->outstanding_extents += 1 (current value is 2) * * -> btrfs_delalloc_release_extents() * ->outstanding_extents -= 1 (current value is 1) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1a88f6214ebc..257c1e18abd4 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -681,7 +681,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, { struct btrfs_delayed_item *curr, *next; int free_space; - int total_data_size = 0, total_size = 0; + int total_size = 0; struct extent_buffer *leaf; char *data_ptr; struct btrfs_key *keys; @@ -706,7 +706,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, */ while (total_size + next->data_len + sizeof(struct btrfs_item) <= free_space) { - total_data_size += next->data_len; total_size += next->data_len + sizeof(struct btrfs_item); list_add_tail(&next->tree_list, &head); nitems++; @@ -974,14 +973,16 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) { - struct btrfs_delayed_root *delayed_root; - ASSERT(delayed_node->root); - clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); - delayed_node->count--; + if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { + struct btrfs_delayed_root *delayed_root; - delayed_root = delayed_node->root->fs_info->delayed_root; - finish_one_item(delayed_root); + ASSERT(delayed_node->root); + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + finish_one_item(delayed_root); + } } static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, @@ -1009,12 +1010,10 @@ static int 
__btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, nofs_flag = memalloc_nofs_save(); ret = btrfs_lookup_inode(trans, root, path, &key, mod); memalloc_nofs_restore(nofs_flag); - if (ret > 0) { - btrfs_release_path(path); - return -ENOENT; - } else if (ret < 0) { - return ret; - } + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto out; leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], @@ -1024,7 +1023,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) - goto no_iref; + goto out; path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) @@ -1046,12 +1045,19 @@ again: btrfs_del_item(trans, root, path); out: btrfs_release_delayed_iref(node); -no_iref: btrfs_release_path(path); err_out: btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0)); btrfs_release_delayed_inode(node); + /* + * If we fail to update the delayed inode we need to abort the + * transaction, because we could leave the inode with the improper + * counts behind. + */ + if (ret && ret != -ENOENT) + btrfs_abort_transaction(trans, ret); + return ret; search: @@ -1898,8 +1904,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) btrfs_release_delayed_item(prev_item); } - if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) - btrfs_release_delayed_iref(delayed_node); + btrfs_release_delayed_iref(delayed_node); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index c92d9d4f5f46..06bc842ecdb3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -641,7 +641,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; - u64 flags = btrfs_ref_head_to_space_flags(existing); int old_ref_mod; BUG_ON(existing->is_data != update->is_data); @@ -711,26 +710,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, } } - /* - * This handles the following conditions: - * - * 1. We had a ref mod of 0 or more and went negative, indicating that - * we may be freeing space, so add our space to the - * total_bytes_pinned counter. - * 2. We were negative and went to 0 or positive, so no longer can say - * that the space would be pinned, decrement our counter from the - * total_bytes_pinned counter. - * 3. We are now at 0 and have ->must_insert_reserved set, which means - * this was a new allocation and then we dropped it, and thus must - * add our space to the total_bytes_pinned counter. 
- */ - if (existing->total_ref_mod < 0 && old_ref_mod >= 0) - btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); - else if (existing->total_ref_mod >= 0 && old_ref_mod < 0) - btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes); - else if (existing->total_ref_mod == 0 && existing->must_insert_reserved) - btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); - spin_unlock(&existing->lock); } @@ -835,17 +814,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { - u64 flags = btrfs_ref_head_to_space_flags(head_ref); - if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_updates += btrfs_csum_bytes_to_leaves(trans->fs_info, head_ref->num_bytes); } - if (head_ref->ref_mod < 0) - btrfs_mod_total_bytes_pinned(trans->fs_info, flags, - head_ref->num_bytes); delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index d05f73530af7..d029be40ea6f 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -37,7 +37,7 @@ * - Write duplication * * All new writes will be written to both target and source devices, so even - * if replace gets canceled, sources device still contans up-to-date data. + * if replace gets canceled, sources device still contains up-to-date data. * * Location: handle_ops_on_dev_replace() from __btrfs_map_block() * Start: btrfs_dev_replace_start() diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 306ff20af70f..e1b7bd927d69 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -624,7 +624,7 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) * @fs_info: fs_info of interest * * The unused_bgs list needs to be punted to the discard lists because the - * order of operations is changed. In the normal sychronous discard path, the + * order of operations is changed. In the normal synchronous discard path, the * block groups are trimmed via a single large trim in transaction commit. This * is ultimately what we are trying to avoid with asynchronous discard. Thus, * it must be done before going down the unused_bgs path. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c9a3036c23bf..b117dd3b8172 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -241,7 +241,6 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, { struct extent_state *cached_state = NULL; int ret; - bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB); if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; @@ -249,9 +248,6 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; - if (need_lock) - btrfs_tree_read_lock(eb); - lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (extent_buffer_uptodate(eb) && @@ -264,22 +260,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; - - /* - * Things reading via commit roots that don't have normal protection, - * like send, can have a really old block in cache that may point at a - * block that has been freed and re-allocated. 
So don't clear uptodate - * if we find an eb that is under IO (dirty/writeback) because we could - * end up reading in the stale data and then writing it back out and - * making everybody very sad. - */ - if (!extent_buffer_under_io(eb)) - clear_extent_buffer_uptodate(eb); + clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); - if (need_lock) - btrfs_tree_read_unlock(eb); return ret; } @@ -584,6 +568,7 @@ static int validate_extent_buffer(struct extent_buffer *eb) const u32 csum_size = fs_info->csum_size; u8 found_level; u8 result[BTRFS_CSUM_SIZE]; + const u8 *header_csum; int ret = 0; found_start = btrfs_header_bytenr(eb); @@ -608,15 +593,14 @@ static int validate_extent_buffer(struct extent_buffer *eb) } csum_tree_block(eb, result); + header_csum = page_address(eb->pages[0]) + + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum)); - if (memcmp_extent_buffer(eb, result, 0, csum_size)) { - u8 val[BTRFS_CSUM_SIZE] = { 0 }; - - read_extent_buffer(eb, &val, 0, csum_size); + if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, - "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", - fs_info->sb->s_id, eb->start, - CSUM_FMT_VALUE(csum_size, val), + "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", + eb->start, + CSUM_FMT_VALUE(csum_size, header_csum), CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb)); ret = -EUCLEAN; @@ -917,23 +901,22 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, return btree_csum_one_bio(bio); } -static int check_async_write(struct btrfs_fs_info *fs_info, +static bool should_async_write(struct btrfs_fs_info *fs_info, struct btrfs_inode *bi) { if (btrfs_is_zoned(fs_info)) - return 0; + return false; if (atomic_read(&bi->sync_writers)) - return 0; + return false; if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) - return 0; - return 1; + return false; + return true; } blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int async = check_async_write(fs_info, BTRFS_I(inode)); blk_status_t ret; if (btrfs_op(bio) != BTRFS_MAP_WRITE) { @@ -946,7 +929,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, if (ret) goto out_w_error; ret = btrfs_map_bio(fs_info, bio, mirror_num); - } else if (!async) { + } else if (!should_async_write(fs_info, BTRFS_I(inode))) { ret = btree_csum_one_bio(bio); if (ret) goto out_w_error; @@ -2252,6 +2235,7 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; init_waitqueue_head(&fs_info->balance_wait_q); + atomic_set(&fs_info->reloc_cancel_req, 0); } static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) @@ -2648,6 +2632,24 @@ static int validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, + BTRFS_FSID_SIZE)) { + btrfs_err(fs_info, + "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", + fs_info->super_copy->fsid, fs_info->fs_devices->fsid); + ret = -EINVAL; + } + + if (btrfs_fs_incompat(fs_info, METADATA_UUID) && + memcmp(fs_info->fs_devices->metadata_uuid, + fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) { + btrfs_err(fs_info, +"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU 
!= %pU", + fs_info->super_copy->metadata_uuid, + fs_info->fs_devices->metadata_uuid); + ret = -EINVAL; + } + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, @@ -2981,6 +2983,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; + spin_lock_init(&fs_info->send_reloc_lock); fs_info->send_in_progress = 0; fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; @@ -3279,14 +3282,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device disk_super = fs_info->super_copy; - ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, - BTRFS_FSID_SIZE)); - - if (btrfs_fs_incompat(fs_info, METADATA_UUID)) { - ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid, - fs_info->super_copy->metadata_uuid, - BTRFS_FSID_SIZE)); - } features = btrfs_super_flags(disk_super); if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { @@ -3461,7 +3456,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * At this point we know all the devices that make this filesystem, * including the seed devices but we don't know yet if the replace * target is required. So free devices that are not part of this - * filesystem but skip the replace traget device which is checked + * filesystem but skip the replace target device which is checked * below in btrfs_init_dev_replace(). */ btrfs_free_extra_devids(fs_devices); @@ -3588,8 +3583,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { ret = btrfsic_mount(fs_info, fs_devices, btrfs_test_opt(fs_info, - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? - 1 : 0, + CHECK_INTEGRITY_DATA) ? 1 : 0, fs_info->check_integrity_print_mask); if (ret) btrfs_warn(fs_info, @@ -4686,9 +4680,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, cache->space_info->bytes_reserved -= head->num_bytes; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - percpu_counter_add_batch( - &cache->space_info->total_bytes_pinned, - head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); btrfs_put_block_group(cache); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f1d15b68994a..d296483d148f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1425,7 +1425,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * bytenr of the parent block. Since new extents are always * created with indirect references, this will only be the case * when relocating a shared extent. In that case, root_objectid - * will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must + * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must * be 0 * * @root_objectid: The id of the root where this modification has originated, @@ -1804,19 +1804,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); } - /* - * We were dropping refs, or had a new ref and dropped it, and thus must - * adjust down our total_bytes_pinned, the space may or may not have - * been pinned and so is accounted for properly in the pinned space by - * now. 
- */ - if (head->total_ref_mod < 0 || - (head->total_ref_mod == 0 && head->must_insert_reserved)) { - u64 flags = btrfs_ref_head_to_space_flags(head); - - btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes); - } - btrfs_delayed_refs_rsv_release(fs_info, nr_items); } @@ -1868,7 +1855,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); btrfs_put_delayed_ref_head(head); - return 0; + return ret; } static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( @@ -2551,7 +2538,6 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; @@ -2762,7 +2748,6 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, cache->pinned -= len; btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); space_info->max_extent_size = 0; - __btrfs_mod_total_bytes_pinned(space_info, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; @@ -4784,7 +4769,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } - trans->dirty = true; /* this returns a buffer locked for blocking */ return buf; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 074a78a202b8..9e81d25dea70 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -136,7 +136,7 @@ struct tree_entry { }; struct extent_page_data { - struct bio *bio; + struct btrfs_bio_ctrl bio_ctrl; /* tells writepage not to lock the state bits for this range * it still does the unlocking */ @@ -185,10 +185,12 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num, /* Cleanup unsubmitted bios */ static void end_write_bio(struct extent_page_data *epd, int ret) { - if (epd->bio) { - epd->bio->bi_status = errno_to_blk_status(ret); - bio_endio(epd->bio); - epd->bio = NULL; + struct bio *bio = epd->bio_ctrl.bio; + + if (bio) { + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + epd->bio_ctrl.bio = NULL; } } @@ -201,9 +203,10 @@ static void end_write_bio(struct extent_page_data *epd, int ret) static int __must_check flush_write_bio(struct extent_page_data *epd) { int ret = 0; + struct bio *bio = epd->bio_ctrl.bio; - if (epd->bio) { - ret = submit_one_bio(epd->bio, 0, 0); + if (bio) { + ret = submit_one_bio(bio, 0, 0); /* * Clean up of epd->bio is handled by its endio function. * And endio is either triggered by successful bio execution @@ -211,7 +214,7 @@ static int __must_check flush_write_bio(struct extent_page_data *epd) * So at this point, no matter what happened, we don't need * to clean up epd->bio. */ - epd->bio = NULL; + epd->bio_ctrl.bio = NULL; } return ret; } @@ -1805,10 +1808,130 @@ out: return found; } +/* + * Process one page for __process_pages_contig(). + * + * Return >0 if we hit @page == @locked_page. + * Return 0 if we updated the page status. + * Return -EGAIN if the we need to try again. 
+ * (For PAGE_LOCK case but got dirty page or page not belong to mapping) + */ +static int process_one_page(struct btrfs_fs_info *fs_info, + struct address_space *mapping, + struct page *page, struct page *locked_page, + unsigned long page_ops, u64 start, u64 end) +{ + u32 len; + + ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); + len = end + 1 - start; + + if (page_ops & PAGE_SET_ORDERED) + btrfs_page_clamp_set_ordered(fs_info, page, start, len); + if (page_ops & PAGE_SET_ERROR) + btrfs_page_clamp_set_error(fs_info, page, start, len); + if (page_ops & PAGE_START_WRITEBACK) { + btrfs_page_clamp_clear_dirty(fs_info, page, start, len); + btrfs_page_clamp_set_writeback(fs_info, page, start, len); + } + if (page_ops & PAGE_END_WRITEBACK) + btrfs_page_clamp_clear_writeback(fs_info, page, start, len); + + if (page == locked_page) + return 1; + + if (page_ops & PAGE_LOCK) { + int ret; + + ret = btrfs_page_start_writer_lock(fs_info, page, start, len); + if (ret) + return ret; + if (!PageDirty(page) || page->mapping != mapping) { + btrfs_page_end_writer_lock(fs_info, page, start, len); + return -EAGAIN; + } + } + if (page_ops & PAGE_UNLOCK) + btrfs_page_end_writer_lock(fs_info, page, start, len); + return 0; +} + static int __process_pages_contig(struct address_space *mapping, struct page *locked_page, - pgoff_t start_index, pgoff_t end_index, - unsigned long page_ops, pgoff_t *index_ret); + u64 start, u64 end, unsigned long page_ops, + u64 *processed_end) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + pgoff_t index = start_index; + unsigned long nr_pages = end_index - start_index + 1; + unsigned long pages_processed = 0; + struct page *pages[16]; + int err = 0; + int i; + + if (page_ops & PAGE_LOCK) { + ASSERT(page_ops == PAGE_LOCK); + ASSERT(processed_end && *processed_end == start); + } + + if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + mapping_set_error(mapping, -EIO); + + while (nr_pages > 0) { + int found_pages; + + found_pages = find_get_pages_contig(mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + if (found_pages == 0) { + /* + * Only if we're going to lock these pages, we can find + * nothing at @index. + */ + ASSERT(page_ops & PAGE_LOCK); + err = -EAGAIN; + goto out; + } + + for (i = 0; i < found_pages; i++) { + int process_ret; + + process_ret = process_one_page(fs_info, mapping, + pages[i], locked_page, page_ops, + start, end); + if (process_ret < 0) { + for (; i < found_pages; i++) + put_page(pages[i]); + err = -EAGAIN; + goto out; + } + put_page(pages[i]); + pages_processed++; + } + nr_pages -= found_pages; + index += found_pages; + cond_resched(); + } +out: + if (err && processed_end) { + /* + * Update @processed_end. I know this is awful since it has + * two different return value patterns (inclusive vs exclusive). + * + * But the exclusive pattern is necessary if @start is 0, or we + * underflow and check against processed_end won't work as + * expected. 
+ */ + if (pages_processed) + *processed_end = min(end, + ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1); + else + *processed_end = start; + } + return err; +} static noinline void __unlock_for_delalloc(struct inode *inode, struct page *locked_page, @@ -1821,7 +1944,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode, if (index == locked_page->index && end_index == index) return; - __process_pages_contig(inode->i_mapping, locked_page, index, end_index, + __process_pages_contig(inode->i_mapping, locked_page, start, end, PAGE_UNLOCK, NULL); } @@ -1831,19 +1954,19 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 delalloc_end) { unsigned long index = delalloc_start >> PAGE_SHIFT; - unsigned long index_ret = index; unsigned long end_index = delalloc_end >> PAGE_SHIFT; + u64 processed_end = delalloc_start; int ret; ASSERT(locked_page); if (index == locked_page->index && index == end_index) return 0; - ret = __process_pages_contig(inode->i_mapping, locked_page, index, - end_index, PAGE_LOCK, &index_ret); - if (ret == -EAGAIN) + ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start, + delalloc_end, PAGE_LOCK, &processed_end); + if (ret == -EAGAIN && processed_end > delalloc_start) __unlock_for_delalloc(inode, locked_page, delalloc_start, - (u64)index_ret << PAGE_SHIFT); + processed_end); return ret; } @@ -1936,84 +2059,6 @@ out_failed: return found; } -static int __process_pages_contig(struct address_space *mapping, - struct page *locked_page, - pgoff_t start_index, pgoff_t end_index, - unsigned long page_ops, pgoff_t *index_ret) -{ - unsigned long nr_pages = end_index - start_index + 1; - unsigned long pages_processed = 0; - pgoff_t index = start_index; - struct page *pages[16]; - unsigned ret; - int err = 0; - int i; - - if (page_ops & PAGE_LOCK) { - ASSERT(page_ops == PAGE_LOCK); - ASSERT(index_ret && *index_ret == start_index); - } - - if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) - mapping_set_error(mapping, -EIO); - - while (nr_pages > 0) { - ret = find_get_pages_contig(mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (ret == 0) { - /* - * Only if we're going to lock these pages, - * can we find nothing at @index. 
- */ - ASSERT(page_ops & PAGE_LOCK); - err = -EAGAIN; - goto out; - } - - for (i = 0; i < ret; i++) { - if (page_ops & PAGE_SET_PRIVATE2) - SetPagePrivate2(pages[i]); - - if (locked_page && pages[i] == locked_page) { - put_page(pages[i]); - pages_processed++; - continue; - } - if (page_ops & PAGE_START_WRITEBACK) { - clear_page_dirty_for_io(pages[i]); - set_page_writeback(pages[i]); - } - if (page_ops & PAGE_SET_ERROR) - SetPageError(pages[i]); - if (page_ops & PAGE_END_WRITEBACK) - end_page_writeback(pages[i]); - if (page_ops & PAGE_UNLOCK) - unlock_page(pages[i]); - if (page_ops & PAGE_LOCK) { - lock_page(pages[i]); - if (!PageDirty(pages[i]) || - pages[i]->mapping != mapping) { - unlock_page(pages[i]); - for (; i < ret; i++) - put_page(pages[i]); - err = -EAGAIN; - goto out; - } - } - put_page(pages[i]); - pages_processed++; - } - nr_pages -= ret; - index += ret; - cond_resched(); - } -out: - if (err && index_ret) - *index_ret = start_index + pages_processed - 1; - return err; -} - void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 clear_bits, unsigned long page_ops) @@ -2021,8 +2066,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, - start >> PAGE_SHIFT, end >> PAGE_SHIFT, - page_ops, NULL); + start, end, page_ops, NULL); } /* @@ -2381,13 +2425,6 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, BUG_ON(!failrec->this_mirror); - if (failrec->in_validation) { - /* there was no real error, just free the record */ - btrfs_debug(fs_info, - "clean_io_failure: freeing dummy error at %llu", - failrec->start); - goto out; - } if (sb_rdonly(fs_info->sb)) goto out; @@ -2449,7 +2486,7 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) } static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - u64 start, u64 end) + u64 start) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct io_failure_record *failrec; @@ -2457,15 +2494,15 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + const u32 sectorsize = fs_info->sectorsize; int ret; u64 logical; failrec = get_state_failrec(failure_tree, start); if (!IS_ERR(failrec)) { btrfs_debug(fs_info, - "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", - failrec->logical, failrec->start, failrec->len, - failrec->in_validation); + "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", + failrec->logical, failrec->start, failrec->len); /* * when data can be on disk more than twice, add to failrec here * (e.g. 
with a list for failed_mirror) to make @@ -2480,10 +2517,9 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return ERR_PTR(-ENOMEM); failrec->start = start; - failrec->len = end - start + 1; + failrec->len = sectorsize; failrec->this_mirror = 0; failrec->bio_flags = 0; - failrec->in_validation = 0; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, failrec->len); @@ -2519,12 +2555,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode free_extent_map(em); /* Set the bits in the private failure tree */ - ret = set_extent_bits(failure_tree, start, end, + ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, EXTENT_LOCKED | EXTENT_DIRTY); if (ret >= 0) { ret = set_state_failrec(failure_tree, start, failrec); /* Set the bits in the inode's tree */ - ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); + ret = set_extent_bits(tree, start, start + sectorsize - 1, + EXTENT_DAMAGED); } else if (ret < 0) { kfree(failrec); return ERR_PTR(ret); @@ -2533,7 +2570,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, +static bool btrfs_check_repairable(struct inode *inode, struct io_failure_record *failrec, int failed_mirror) { @@ -2553,39 +2590,22 @@ static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, return false; } + /* The failure record should only contain one sector */ + ASSERT(failrec->len == fs_info->sectorsize); + /* - * there are two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk + * There are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + * + * Since we're only doing repair for one sector, we only need to get + * a good copy of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. */ - if (needs_validation) { - /* - * to fulfill b), we need to know the exact failing sectors, as - * we don't want to rewrite any more than the failed ones. thus, - * we need separate read requests for the failed bio - * - * if the following BUG_ON triggers, our validation request got - * merged. we need separate requests for our algorithm to work. - */ - BUG_ON(failrec->in_validation); - failrec->in_validation = 1; - failrec->this_mirror = failed_mirror; - } else { - /* - * we're ready to fulfill a) and b) alongside. get a good copy - * of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. - */ - if (failrec->in_validation) { - BUG_ON(failrec->this_mirror != failed_mirror); - failrec->in_validation = 0; - failrec->this_mirror = 0; - } - failrec->failed_mirror = failed_mirror; + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; - } if (failrec->this_mirror > num_copies) { btrfs_debug(fs_info, @@ -2597,53 +2617,11 @@ static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, return true; } -static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio) -{ - u64 len = 0; - const u32 blocksize = inode->i_sb->s_blocksize; - - /* - * If bi_status is BLK_STS_OK, then this was a checksum error, not an - * I/O error. 
In this case, we already know exactly which sector was - * bad, so we don't need to validate. - */ - if (bio->bi_status == BLK_STS_OK) - return false; - - /* - * We need to validate each sector individually if the failed I/O was - * for multiple sectors. - * - * There are a few possible bios that can end up here: - * 1. A buffered read bio, which is not cloned. - * 2. A direct I/O read bio, which is cloned. - * 3. A (buffered or direct) repair bio, which is not cloned. - * - * For cloned bios (case 2), we can get the size from - * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get - * it from the bvecs. - */ - if (bio_flagged(bio, BIO_CLONED)) { - if (btrfs_io_bio(bio)->iter.bi_size > blocksize) - return true; - } else { - struct bio_vec *bvec; - int i; - - bio_for_each_bvec_all(bvec, bio, i) { - len += bvec->bv_len; - if (len > blocksize) - return true; - } - } - return false; -} - -blk_status_t btrfs_submit_read_repair(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - submit_bio_hook_t *submit_bio_hook) +int btrfs_repair_one_sector(struct inode *inode, + struct bio *failed_bio, u32 bio_offset, + struct page *page, unsigned int pgoff, + u64 start, int failed_mirror, + submit_bio_hook_t *submit_bio_hook) { struct io_failure_record *failrec; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2651,7 +2629,6 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); const int icsum = bio_offset >> fs_info->sectorsize_bits; - bool need_validation; struct bio *repair_bio; struct btrfs_io_bio *repair_io_bio; blk_status_t status; @@ -2661,23 +2638,19 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, start, end); + failrec = btrfs_get_io_failure_record(inode, start); if (IS_ERR(failrec)) - return errno_to_blk_status(PTR_ERR(failrec)); + return PTR_ERR(failrec); - need_validation = btrfs_io_needs_validation(inode, failed_bio); - if (!btrfs_check_repairable(inode, need_validation, failrec, - failed_mirror)) { + if (!btrfs_check_repairable(inode, failrec, failed_mirror)) { free_io_failure(failure_tree, tree, failrec); - return BLK_STS_IOERR; + return -EIO; } repair_bio = btrfs_io_bio_alloc(1); repair_io_bio = btrfs_io_bio(repair_bio); repair_bio->bi_opf = REQ_OP_READ; - if (need_validation) - repair_bio->bi_opf |= REQ_FAILFAST_DEV; repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; repair_bio->bi_private = failed_bio->bi_private; @@ -2695,8 +2668,8 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, repair_io_bio->iter = repair_bio->bi_iter; btrfs_debug(btrfs_sb(inode->i_sb), -"repair read error: submitting new read to mirror %d, in_validation=%d", - failrec->this_mirror, failrec->in_validation); + "repair read error: submitting new read to mirror %d", + failrec->this_mirror); status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags); @@ -2704,17 +2677,114 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, free_io_failure(failure_tree, tree, failrec); bio_put(repair_bio); } - return status; + return blk_status_to_errno(status); +} + +static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) +{ + struct btrfs_fs_info *fs_info 
= btrfs_sb(page->mapping->host->i_sb); + + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); + + if (uptodate) { + btrfs_page_set_uptodate(fs_info, page, start, len); + } else { + btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_page_set_error(fs_info, page, start, len); + } + + if (fs_info->sectorsize == PAGE_SIZE) + unlock_page(page); + else + btrfs_subpage_end_reader(fs_info, page, start, len); +} + +static blk_status_t submit_read_repair(struct inode *inode, + struct bio *failed_bio, u32 bio_offset, + struct page *page, unsigned int pgoff, + u64 start, u64 end, int failed_mirror, + unsigned int error_bitmap, + submit_bio_hook_t *submit_bio_hook) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; + const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; + int error = 0; + int i; + + BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + + /* We're here because we had some read errors or csum mismatch */ + ASSERT(error_bitmap); + + /* + * We only get called on buffered IO, thus page must be mapped and bio + * must not be cloned. + */ + ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED)); + + /* Iterate through all the sectors in the range */ + for (i = 0; i < nr_bits; i++) { + const unsigned int offset = i * sectorsize; + struct extent_state *cached = NULL; + bool uptodate = false; + int ret; + + if (!(error_bitmap & (1U << i))) { + /* + * This sector has no error, just end the page read + * and unlock the range. + */ + uptodate = true; + goto next; + } + + ret = btrfs_repair_one_sector(inode, failed_bio, + bio_offset + offset, + page, pgoff + offset, start + offset, + failed_mirror, submit_bio_hook); + if (!ret) { + /* + * We have submitted the read repair, the page release + * will be handled by the endio function of the + * submitted repair bio. + * Thus we don't need to do any thing here. + */ + continue; + } + /* + * Repair failed, just record the error but still continue. + * Or the remaining sectors will not be properly unlocked. + */ + if (!error) + error = ret; +next: + end_page_read(page, uptodate, start + offset, sectorsize); + if (uptodate) + set_extent_uptodate(&BTRFS_I(inode)->io_tree, + start + offset, + start + offset + sectorsize - 1, + &cached, GFP_ATOMIC); + unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree, + start + offset, + start + offset + sectorsize - 1, + &cached); + } + return errno_to_blk_status(error); } /* lots and lots of room for performance fixes in the end_bio funcs */ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { + struct btrfs_inode *inode; int uptodate = (err == 0); int ret = 0; - btrfs_writepage_endio_finish_ordered(page, start, end, uptodate); + ASSERT(page && page->mapping); + inode = BTRFS_I(page->mapping->host); + btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); if (!uptodate) { ClearPageUptodate(page); @@ -2747,25 +2817,20 @@ static void end_bio_extent_writepage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; - /* We always issue full-page reads, but if some block - * in a page fails to read, blk_update_request() will - * advance bv_offset and adjust bv_len to compensate. - * Print a warning for nonzero offsets, and an error - * if they don't add up to a full page. 
*/ - if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { - if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) - btrfs_err(fs_info, - "partial page write in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else - btrfs_info(fs_info, - "incomplete page write in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - } + /* Our read/write should always be sector aligned. */ + if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + btrfs_err(fs_info, + "partial page write in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else if (!IS_ALIGNED(bvec->bv_len, sectorsize)) + btrfs_info(fs_info, + "incomplete page write with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); - start = page_offset(page); - end = start + bvec->bv_offset + bvec->bv_len - 1; + start = page_offset(page) + bvec->bv_offset; + end = start + bvec->bv_len - 1; if (first_bvec) { btrfs_record_physical_zoned(inode, start, bio); @@ -2773,7 +2838,8 @@ static void end_bio_extent_writepage(struct bio *bio) } end_extent_writepage(page, error, start, end); - end_page_writeback(page); + + btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); } bio_put(bio); @@ -2862,30 +2928,6 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); } -static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); - - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); - - if (uptodate) { - btrfs_page_set_uptodate(fs_info, page, start, len); - } else { - btrfs_page_clear_uptodate(fs_info, page, start, len); - btrfs_page_set_error(fs_info, page, start, len); - } - - if (fs_info->sectorsize == PAGE_SIZE) - unlock_page(page); - else if (is_data_inode(page->mapping->host)) - /* - * For subpage data, unlock the page if we're the last reader. - * For subpage metadata, page lock is not utilized for read. - */ - btrfs_subpage_end_reader(fs_info, page, start, len); -} - /* * Find extent buffer for a givne bytenr. 
* @@ -2929,7 +2971,6 @@ static struct extent_buffer *find_extent_buffer_readpage( static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - int uptodate = !bio->bi_status; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree, *failure_tree; struct processed_extent processed = { 0 }; @@ -2944,10 +2985,12 @@ static void end_bio_extent_readpage(struct bio *bio) ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { + bool uptodate = !bio->bi_status; struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; + unsigned int error_bitmap = (unsigned int)-1; u64 start; u64 end; u32 len; @@ -2982,14 +3025,16 @@ static void end_bio_extent_readpage(struct bio *bio) mirror = io_bio->mirror_num; if (likely(uptodate)) { - if (is_data_inode(inode)) - ret = btrfs_verify_data_csum(io_bio, + if (is_data_inode(inode)) { + error_bitmap = btrfs_verify_data_csum(io_bio, bio_offset, page, start, end); - else + ret = error_bitmap; + } else { ret = btrfs_validate_metadata_buffer(io_bio, page, start, end, mirror); + } if (ret) - uptodate = 0; + uptodate = false; else clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, tree, start, @@ -3001,27 +3046,18 @@ static void end_bio_extent_readpage(struct bio *bio) goto readpage_ok; if (is_data_inode(inode)) { - /* - * The generic bio_readpage_error handles errors the - * following way: If possible, new read requests are - * created and submitted and will end up in - * end_bio_extent_readpage as well (if we're lucky, - * not in the !uptodate case). In that case it returns - * 0 and we just go on with the next page in our bio. - * If it can't handle the error it will return -EIO and - * we remain responsible for that page. + * btrfs_submit_read_repair() will handle all the good + * and bad sectors, we just continue to the next bvec. */ - if (!btrfs_submit_read_repair(inode, bio, bio_offset, - page, - start - page_offset(page), - start, end, mirror, - btrfs_submit_data_bio)) { - uptodate = !bio->bi_status; - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - continue; - } + submit_read_repair(inode, bio, bio_offset, page, + start - page_offset(page), start, + end, mirror, error_bitmap, + btrfs_submit_data_bio); + + ASSERT(bio_offset + len > bio_offset); + bio_offset += len; + continue; } else { struct extent_buffer *eb; @@ -3151,42 +3187,99 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) * * Return true if successfully page added. Otherwise, return false. 
*/ -static bool btrfs_bio_add_page(struct bio *bio, struct page *page, +static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, + struct page *page, u64 disk_bytenr, unsigned int size, unsigned int pg_offset, - unsigned long prev_bio_flags, unsigned long bio_flags) { + struct bio *bio = bio_ctrl->bio; + u32 bio_size = bio->bi_iter.bi_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig; int ret; - if (prev_bio_flags != bio_flags) + ASSERT(bio); + /* The limit should be calculated when bio_ctrl->bio is allocated */ + ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); + if (bio_ctrl->bio_flags != bio_flags) return false; - if (prev_bio_flags & EXTENT_BIO_COMPRESSED) + if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; if (!contig) return false; - if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags)) + if (bio_size + size > bio_ctrl->len_to_oe_boundary || + bio_size + size > bio_ctrl->len_to_stripe_boundary) return false; - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct page *first_page = bio_first_bvec_all(bio)->bv_page; - - if (!btrfs_bio_fits_in_ordered_extent(first_page, bio, size)) - return false; + if (bio_op(bio) == REQ_OP_ZONE_APPEND) ret = bio_add_zone_append_page(bio, page, size, pg_offset); - } else { + else ret = bio_add_page(bio, page, size, pg_offset); - } return ret == size; } +static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, + struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_io_geometry geom; + struct btrfs_ordered_extent *ordered; + struct extent_map *em; + u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); + int ret; + + /* + * Pages for compressed extent are never submitted to disk directly, + * thus it has no real boundary, just set them to U32_MAX. + * + * The split happens for real compressed bio, which happens in + * btrfs_submit_compressed_read/write(). 
+ */ + if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) { + bio_ctrl->len_to_oe_boundary = U32_MAX; + bio_ctrl->len_to_stripe_boundary = U32_MAX; + return 0; + } + em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); + if (IS_ERR(em)) + return PTR_ERR(em); + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), + logical, &geom); + free_extent_map(em); + if (ret < 0) { + return ret; + } + if (geom.len > U32_MAX) + bio_ctrl->len_to_stripe_boundary = U32_MAX; + else + bio_ctrl->len_to_stripe_boundary = (u32)geom.len; + + if (!btrfs_is_zoned(fs_info) || + bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { + bio_ctrl->len_to_oe_boundary = U32_MAX; + return 0; + } + + ASSERT(fs_info->max_zone_append_size > 0); + /* Ordered extent not yet created, so we're good */ + ordered = btrfs_lookup_ordered_extent(inode, logical); + if (!ordered) { + bio_ctrl->len_to_oe_boundary = U32_MAX; + return 0; + } + + bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, + ordered->disk_bytenr + ordered->disk_num_bytes - logical); + btrfs_put_ordered_extent(ordered); + return 0; +} + /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting @@ -3203,12 +3296,11 @@ static bool btrfs_bio_add_page(struct bio *bio, struct page *page, */ static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, + struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, size_t size, unsigned long pg_offset, - struct bio **bio_ret, bio_end_io_t end_io_func, int mirror_num, - unsigned long prev_bio_flags, unsigned long bio_flags, bool force_bio_submit) { @@ -3219,19 +3311,19 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree = &inode->io_tree; struct btrfs_fs_info *fs_info = inode->root->fs_info; - ASSERT(bio_ret); + ASSERT(bio_ctrl); - if (*bio_ret) { - bio = *bio_ret; + ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && + pg_offset + size <= PAGE_SIZE); + if (bio_ctrl->bio) { + bio = bio_ctrl->bio; if (force_bio_submit || - !btrfs_bio_add_page(bio, page, disk_bytenr, io_size, - pg_offset, prev_bio_flags, bio_flags)) { - ret = submit_one_bio(bio, mirror_num, prev_bio_flags); - if (ret < 0) { - *bio_ret = NULL; + !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size, + pg_offset, bio_flags)) { + ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags); + bio_ctrl->bio = NULL; + if (ret < 0) return ret; - } - bio = NULL; } else { if (wbc) wbc_account_cgroup_owner(wbc, page, io_size); @@ -3254,22 +3346,18 @@ static int submit_extent_page(unsigned int opf, wbc_account_cgroup_owner(wbc, page, io_size); } if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct extent_map *em; - struct map_lookup *map; - - em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size); - if (IS_ERR(em)) - return PTR_ERR(em); + struct btrfs_device *device; - map = em->map_lookup; - /* We only support single profile for now */ - ASSERT(map->num_stripes == 1); - btrfs_io_bio(bio)->device = map->stripes[0].dev; + device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size); + if (IS_ERR(device)) + return PTR_ERR(device); - free_extent_map(em); + btrfs_io_bio(bio)->device = device; } - *bio_ret = bio; + bio_ctrl->bio = bio; + bio_ctrl->bio_flags = bio_flags; + ret = calc_bio_boundaries(bio_ctrl, inode); return ret; } @@ -3382,7 +3470,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * return 0 on success, otherwise return error */ int btrfs_do_readpage(struct page *page, struct extent_map 
**em_cached, - struct bio **bio, unsigned long *bio_flags, + struct btrfs_bio_ctrl *bio_ctrl, unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; @@ -3558,15 +3646,13 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - page, disk_bytenr, iosize, - pg_offset, bio, + bio_ctrl, page, disk_bytenr, iosize, + pg_offset, end_bio_extent_readpage, 0, - *bio_flags, this_bio_flag, force_bio_submit); if (!ret) { nr++; - *bio_flags = this_bio_flag; } else { unlock_extent(tree, cur, cur + iosize - 1); end_page_read(page, false, cur, iosize); @@ -3580,11 +3666,10 @@ out: } static inline void contiguous_readpages(struct page *pages[], int nr_pages, - u64 start, u64 end, - struct extent_map **em_cached, - struct bio **bio, - unsigned long *bio_flags, - u64 *prev_em_start) + u64 start, u64 end, + struct extent_map **em_cached, + struct btrfs_bio_ctrl *bio_ctrl, + u64 *prev_em_start) { struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); int index; @@ -3592,7 +3677,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - btrfs_do_readpage(pages[index], em_cached, bio, bio_flags, + btrfs_do_readpage(pages[index], em_cached, bio_ctrl, REQ_RAHEAD, prev_em_start); put_page(pages[index]); } @@ -3680,6 +3765,54 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, } /* + * Find the first byte we need to write. + * + * For subpage, one page can contain several sectors, and + * __extent_writepage_io() will just grab all extent maps in the page + * range and try to submit all non-inline/non-compressed extents. + * + * This is a big problem for subpage, we shouldn't re-submit already written + * data at all. + * This function will lookup subpage dirty bit to find which range we really + * need to submit. + * + * Return the next dirty range in [@start, @end). + * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. + */ +static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, + struct page *page, u64 *start, u64 *end) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u64 orig_start = *start; + /* Declare as unsigned long so we can use bitmap ops */ + unsigned long dirty_bitmap; + unsigned long flags; + int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits; + int range_start_bit = nbits; + int range_end_bit; + + /* + * For regular sector size == page size case, since one page only + * contains one sector, we return the page offset directly. + */ + if (fs_info->sectorsize == PAGE_SIZE) { + *start = page_offset(page); + *end = page_offset(page) + PAGE_SIZE; + return; + } + + /* We should have the page locked, but just in case */ + spin_lock_irqsave(&subpage->lock, flags); + dirty_bitmap = subpage->dirty_bitmap; + spin_unlock_irqrestore(&subpage->lock, flags); + + bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit, + BTRFS_SUBPAGE_BITMAP_SIZE); + *start = page_offset(page) + range_start_bit * fs_info->sectorsize; + *end = page_offset(page) + range_end_bit * fs_info->sectorsize; +} + +/* * helper for __extent_writepage. This calls the writepage start hooks, * and does the loop to map the page into extents and bios. 
* @@ -3696,7 +3829,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, int *nr_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_io_tree *tree = &inode->io_tree; u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; u64 cur = start; @@ -3727,15 +3859,26 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, while (cur <= end) { u64 disk_bytenr; u64 em_end; + u64 dirty_range_start = cur; + u64 dirty_range_end; u32 iosize; if (cur >= i_size) { - btrfs_writepage_endio_finish_ordered(page, cur, end, 1); + btrfs_writepage_endio_finish_ordered(inode, page, cur, + end, 1); break; } + + find_next_dirty_byte(fs_info, page, &dirty_range_start, + &dirty_range_end); + if (cur < dirty_range_start) { + cur = dirty_range_start; + continue; + } + em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); if (IS_ERR_OR_NULL(em)) { - SetPageError(page); + btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); break; } @@ -3750,10 +3893,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); disk_bytenr = em->block_start + extent_offset; - /* Note that em_end from extent_map_end() is exclusive */ - iosize = min(em_end, end + 1) - cur; + /* + * Note that em_end from extent_map_end() and dirty_range_end from + * find_next_dirty_byte() are all exclusive + */ + iosize = min(min(em_end, end + 1), dirty_range_end) - cur; - if (btrfs_use_zone_append(inode, em)) + if (btrfs_use_zone_append(inode, em->block_start)) opf = REQ_OP_ZONE_APPEND; free_extent_map(em); @@ -3768,28 +3914,38 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (compressed) nr++; else - btrfs_writepage_endio_finish_ordered(page, cur, - cur + iosize - 1, 1); + btrfs_writepage_endio_finish_ordered(inode, + page, cur, cur + iosize - 1, 1); cur += iosize; continue; } - btrfs_set_range_writeback(tree, cur, cur + iosize - 1); + btrfs_set_range_writeback(inode, cur, cur + iosize - 1); if (!PageWriteback(page)) { btrfs_err(inode->root->fs_info, "page %lu not writeback, cur %llu end %llu", page->index, cur, end); } - ret = submit_extent_page(opf | write_flags, wbc, page, + /* + * Although the PageDirty bit is cleared before entering this + * function, subpage dirty bit is not cleared. + * So clear subpage dirty bit here so next time we won't submit + * page for range already written to disk. + */ + btrfs_page_clear_dirty(fs_info, page, cur, iosize); + + ret = submit_extent_page(opf | write_flags, wbc, + &epd->bio_ctrl, page, disk_bytenr, iosize, - cur - page_offset(page), &epd->bio, + cur - page_offset(page), end_bio_extent_writepage, - 0, 0, 0, false); + 0, 0, false); if (ret) { - SetPageError(page); + btrfs_page_set_error(fs_info, page, cur, iosize); if (PageWriteback(page)) - end_page_writeback(page); + btrfs_page_clear_writeback(fs_info, page, cur, + iosize); } cur += iosize; @@ -4098,12 +4254,15 @@ static struct extent_buffer *find_extent_buffer_nolock( * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() * after all extent buffers in the page has finished their writeback. 
*/ -static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info, - struct bio *bio) +static void end_bio_subpage_eb_writepage(struct bio *bio) { + struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct bvec_iter_all iter_all; + fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); + ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; @@ -4154,16 +4313,11 @@ static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info, static void end_bio_extent_buffer_writepage(struct bio *bio) { - struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct extent_buffer *eb; int done; struct bvec_iter_all iter_all; - fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) - return end_bio_subpage_eb_writepage(fs_info, bio); - ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; @@ -4189,12 +4343,34 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) bio_put(bio); } +static void prepare_eb_write(struct extent_buffer *eb) +{ + u32 nritems; + unsigned long start; + unsigned long end; + + clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); + atomic_set(&eb->io_pages, num_extent_pages(eb)); + + /* Set btree blocks beyond nritems with 0 to avoid stale content */ + nritems = btrfs_header_nritems(eb); + if (btrfs_header_level(eb) > 0) { + end = btrfs_node_key_ptr_offset(nritems); + memzero_extent_buffer(eb, end, eb->len - end); + } else { + /* + * Leaf: + * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 + */ + start = btrfs_item_nr_offset(nritems); + end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); + memzero_extent_buffer(eb, start, end - start); + } +} + /* * Unlike the work in write_one_eb(), we rely completely on extent locking. * Page locking is only utilized at minimum to keep the VMM code happy. - * - * Caller should still call write_one_eb() other than this function directly. - * As write_one_eb() has extra preparation before submitting the extent buffer. 
*/ static int write_one_subpage_eb(struct extent_buffer *eb, struct writeback_control *wbc, @@ -4206,6 +4382,8 @@ static int write_one_subpage_eb(struct extent_buffer *eb, bool no_dirty_ebs = false; int ret; + prepare_eb_write(eb); + /* clear_page_dirty_for_io() in subpage helper needs page locked */ lock_page(page); btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); @@ -4216,10 +4394,10 @@ static int write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); - ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page, - eb->start, eb->len, eb->start - page_offset(page), - &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, - false); + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, + &epd->bio_ctrl, page, eb->start, eb->len, + eb->start - page_offset(page), + end_bio_subpage_eb_writepage, 0, 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); set_btree_ioerr(page, eb); @@ -4244,45 +4422,23 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct extent_page_data *epd) { u64 disk_bytenr = eb->start; - u32 nritems; int i, num_pages; - unsigned long start, end; unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; int ret = 0; - clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); - num_pages = num_extent_pages(eb); - atomic_set(&eb->io_pages, num_pages); - - /* set btree blocks beyond nritems with 0 to avoid stale content. */ - nritems = btrfs_header_nritems(eb); - if (btrfs_header_level(eb) > 0) { - end = btrfs_node_key_ptr_offset(nritems); - - memzero_extent_buffer(eb, end, eb->len - end); - } else { - /* - * leaf: - * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 - */ - start = btrfs_item_nr_offset(nritems); - end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); - memzero_extent_buffer(eb, start, end - start); - } - - if (eb->fs_info->sectorsize < PAGE_SIZE) - return write_one_subpage_eb(eb, wbc, epd); + prepare_eb_write(eb); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - p, disk_bytenr, PAGE_SIZE, 0, - &epd->bio, + &epd->bio_ctrl, p, disk_bytenr, + PAGE_SIZE, 0, end_bio_extent_buffer_writepage, - 0, 0, 0, false); + 0, 0, false); if (ret) { set_btree_ioerr(p, eb); if (PageWriteback(p)) @@ -4386,7 +4542,7 @@ static int submit_eb_subpage(struct page *page, free_extent_buffer(eb); goto cleanup; } - ret = write_one_eb(eb, wbc, epd); + ret = write_one_subpage_eb(eb, wbc, epd); free_extent_buffer(eb); if (ret < 0) goto cleanup; @@ -4498,7 +4654,7 @@ int btree_write_cache_pages(struct address_space *mapping, { struct extent_buffer *eb_context = NULL; struct extent_page_data epd = { - .bio = NULL, + .bio_ctrl = { 0 }, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4780,7 +4936,7 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) { int ret; struct extent_page_data epd = { - .bio = NULL, + .bio_ctrl = { 0 }, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4807,7 +4963,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, PAGE_SHIFT; struct extent_page_data epd = { - .bio = NULL, + .bio_ctrl = { 0 }, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, }; @@ -4827,8 +4983,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, if (clear_page_dirty_for_io(page)) ret = __extent_writepage(page, 
&wbc_writepages, &epd); else { - btrfs_writepage_endio_finish_ordered(page, start, - start + PAGE_SIZE - 1, 1); + btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), + page, start, start + PAGE_SIZE - 1, 1); unlock_page(page); } put_page(page); @@ -4850,7 +5006,7 @@ int extent_writepages(struct address_space *mapping, { int ret = 0; struct extent_page_data epd = { - .bio = NULL, + .bio_ctrl = { 0 }, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4867,8 +5023,7 @@ int extent_writepages(struct address_space *mapping, void extent_readahead(struct readahead_control *rac) { - struct bio *bio = NULL; - unsigned long bio_flags = 0; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; struct page *pagepool[16]; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; @@ -4879,14 +5034,14 @@ void extent_readahead(struct readahead_control *rac) u64 contig_end = contig_start + readahead_batch_length(rac) - 1; contiguous_readpages(pagepool, nr, contig_start, contig_end, - &em_cached, &bio, &bio_flags, &prev_em_start); + &em_cached, &bio_ctrl, &prev_em_start); } if (em_cached) free_extent_map(em_cached); - if (bio) { - if (submit_one_bio(bio, 0, bio_flags)) + if (bio_ctrl.bio) { + if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags)) return; } } @@ -5196,7 +5351,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int ret = 0; - u64 off = start; + u64 off; u64 max = start + len; u32 flags = 0; u32 found_type; @@ -5231,6 +5386,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto out_free_ulist; } + /* + * We can't initialize that to 'start' as this could miss extents due + * to extent item merging + */ + off = 0; start = round_down(start, btrfs_inode_sectorsize(inode)); len = round_up(max, btrfs_inode_sectorsize(inode)) - start; @@ -5424,6 +5584,12 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) subpage = (struct btrfs_subpage *)page->private; if (atomic_read(&subpage->eb_refs)) return true; + /* + * Even there is no eb refs here, we may still have + * end_page_read() call relying on page::private. + */ + if (atomic_read(&subpage->readers)) + return true; } return false; } @@ -5484,7 +5650,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag /* * We can only detach the page private if there are no other ebs in the - * page range. + * page range and no unfinished IO. 
*/ if (!page_range_has_eb(fs_info, page)) btrfs_detach_subpage(fs_info, page); @@ -6171,7 +6337,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; - struct bio *bio = NULL; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; int ret = 0; ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); @@ -6179,10 +6345,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; if (wait == WAIT_NONE) { - ret = try_lock_extent(io_tree, eb->start, - eb->start + eb->len - 1); - if (ret <= 0) - return ret; + if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) + return -EAGAIN; } else { ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); if (ret < 0) @@ -6204,9 +6368,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, check_buffer_tree_ref(eb); btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); - ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start, - eb->len, eb->start - page_offset(page), &bio, - end_bio_extent_readpage, mirror_num, 0, 0, + btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); + ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl, + page, eb->start, eb->len, + eb->start - page_offset(page), + end_bio_extent_readpage, mirror_num, 0, true); if (ret) { /* @@ -6216,10 +6382,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } - if (bio) { + if (bio_ctrl.bio) { int tmp; - tmp = submit_one_bio(bio, mirror_num, 0); + tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0); + bio_ctrl.bio = NULL; if (tmp < 0) return tmp; } @@ -6242,8 +6409,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) int all_uptodate = 1; int num_pages; unsigned long num_reads = 0; - struct bio *bio = NULL; - unsigned long bio_flags = 0; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -6307,9 +6473,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) ClearPageError(page); err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, - page, page_offset(page), PAGE_SIZE, 0, - &bio, end_bio_extent_readpage, - mirror_num, 0, 0, false); + &bio_ctrl, page, page_offset(page), + PAGE_SIZE, 0, end_bio_extent_readpage, + mirror_num, 0, false); if (err) { /* * We failed to submit the bio so it's the @@ -6326,8 +6492,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } - if (bio) { - err = submit_one_bio(bio, mirror_num, bio_flags); + if (bio_ctrl.bio) { + err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags); + bio_ctrl.bio = NULL; if (err) return err; } @@ -6510,9 +6677,10 @@ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, char *kaddr; assert_eb_page_uptodate(eb, eb->pages[0]); - kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); - memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, - BTRFS_FSID_SIZE); + kaddr = page_address(eb->pages[0]) + + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, + chunk_tree_uuid)); + memcpy(kaddr, srcv, BTRFS_FSID_SIZE); } void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) @@ -6520,9 +6688,9 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) char *kaddr; 
assert_eb_page_uptodate(eb, eb->pages[0]); - kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); - memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, - BTRFS_FSID_SIZE); + kaddr = page_address(eb->pages[0]) + + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid)); + memcpy(kaddr, srcv, BTRFS_FSID_SIZE); } void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 227215a5722c..62027f551b44 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -39,7 +39,7 @@ enum { /* Page starts writeback, clear dirty bit and set writeback bit */ #define PAGE_START_WRITEBACK (1 << 1) #define PAGE_END_WRITEBACK (1 << 2) -#define PAGE_SET_PRIVATE2 (1 << 3) +#define PAGE_SET_ORDERED (1 << 3) #define PAGE_SET_ERROR (1 << 4) #define PAGE_LOCK (1 << 5) @@ -102,6 +102,17 @@ struct extent_buffer { }; /* + * Structure to record info about the bio being assembled, and other info like + * how many bytes are there before stripe/ordered extent boundary. + */ +struct btrfs_bio_ctrl { + struct bio *bio; + unsigned long bio_flags; + u32 len_to_stripe_boundary; + u32 len_to_oe_boundary; +}; + +/* * Structure to record how many bytes and which ranges are set/cleared */ struct extent_changeset { @@ -169,7 +180,7 @@ int try_release_extent_buffer(struct page *page); int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags); int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, - struct bio **bio, unsigned long *bio_flags, + struct btrfs_bio_ctrl *bio_ctrl, unsigned int read_flags, u64 *prev_em_start); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, @@ -281,7 +292,7 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); * When IO fails, either with EIO or csum verification fails, we * try other mirrors that might have a good copy of the data. This * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the page is set up to date + * mirrors. If another mirror has good data, the sector is set up to date * and things continue. If a good mirror can't be found, the original * bio end_io callback is called to indicate things have failed. */ @@ -293,15 +304,13 @@ struct io_failure_record { unsigned long bio_flags; int this_mirror; int failed_mirror; - int in_validation; }; - -blk_status_t btrfs_submit_read_repair(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - submit_bio_hook_t *submit_bio_hook); +int btrfs_repair_one_sector(struct inode *inode, + struct bio *failed_bio, u32 bio_offset, + struct page *page, unsigned int pgoff, + u64 start, int failed_mirror, + submit_bio_hook_t *submit_bio_hook); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 294602f139ef..df6631eefc65 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -618,7 +618,7 @@ fail: * @file_start: offset in file this bio begins to describe * @contig: Boolean. If true/1 means all bio vecs in this bio are * contiguous and they begin at @file_start in the file. 
False/0 - * means this bio can contains potentially discontigous bio vecs + * means this bio can contain potentially discontiguous bio vecs * so the logical offset of each should be calculated separately. */ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, @@ -788,7 +788,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 end_byte = bytenr + len; u64 csum_end; struct extent_buffer *leaf; - int ret; + int ret = 0; const u32 csum_size = fs_info->csum_size; u32 blocksize_bits = fs_info->sectorsize_bits; @@ -806,6 +806,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { + ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; @@ -862,7 +863,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, root, path, path->slots[0], del_nr); if (ret) - goto out; + break; if (key.offset == bytenr) break; } else if (key.offset < bytenr && csum_end > end_byte) { @@ -906,8 +907,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, ret = btrfs_split_item(trans, root, path, &key, offset); if (ret && ret != -EAGAIN) { btrfs_abort_transaction(trans, ret); - goto out; + break; } + ret = 0; key.offset = end_byte - 1; } else { @@ -917,12 +919,41 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, } btrfs_release_path(path); } - ret = 0; -out: btrfs_free_path(path); return ret; } +static int find_next_csum_offset(struct btrfs_root *root, + struct btrfs_path *path, + u64 *next_offset) +{ + const u32 nritems = btrfs_header_nritems(path->nodes[0]); + struct btrfs_key found_key; + int slot = path->slots[0] + 1; + int ret; + + if (nritems == 0 || slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + return ret; + } else if (ret > 0) { + *next_offset = (u64)-1; + return 0; + } + slot = path->slots[0]; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); + + if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + found_key.type != BTRFS_EXTENT_CSUM_KEY) + *next_offset = (u64)-1; + else + *next_offset = found_key.offset; + + return 0; +} + int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums) @@ -938,7 +969,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, u64 total_bytes = 0; u64 csum_offset; u64 bytenr; - u32 nritems; u32 ins_size; int index = 0; int found_next; @@ -981,26 +1011,10 @@ again: goto insert; } } else { - int slot = path->slots[0] + 1; - /* we didn't find a csum item, insert one */ - nritems = btrfs_header_nritems(path->nodes[0]); - if (!nritems || (path->slots[0] >= nritems - 1)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - found_next = 1; - goto insert; - } - slot = path->slots[0]; - } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); - if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || - found_key.type != BTRFS_EXTENT_CSUM_KEY) { - found_next = 1; - goto insert; - } - next_offset = found_key.offset; + /* We didn't find a csum item, insert one. */ + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; found_next = 1; goto insert; } @@ -1056,8 +1070,48 @@ extend_csum: tmp = sums->len - total_bytes; tmp >>= fs_info->sectorsize_bits; WARN_ON(tmp < 1); + extend_nr = max_t(int, 1, tmp); + + /* + * A log tree can already have checksum items with a subset of + * the checksums we are trying to log. 
This can happen after + * doing a sequence of partial writes into prealloc extents and + * fsyncs in between, with a full fsync logging a larger subrange + * of an extent for which a previous fast fsync logged a smaller + * subrange. And this happens in particular due to merging file + * extent items when we complete an ordered extent for a range + * covered by a prealloc extent - this is done at + * btrfs_mark_extent_written(). + * + * So if we try to extend the previous checksum item, which has + * a range that ends at the start of the range we want to insert, + * make sure we don't extend beyond the start offset of the next + * checksum item. If we are at the last item in the leaf, then + * forget the optimization of extending and add a new checksum + * item - it is not worth the complexity of releasing the path, + * getting the first key for the next leaf, repeat the btree + * search, etc, because log trees are temporary anyway and it + * would only save a few bytes of leaf space. + */ + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if (path->slots[0] + 1 >= + btrfs_header_nritems(path->nodes[0])) { + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; + found_next = 1; + goto insert; + } + + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; + + tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits; + if (tmp <= INT_MAX) + extend_nr = min_t(int, extend_nr, tmp); + } - extend_nr = max_t(int, 1, (int)tmp); diff = (csum_offset + extend_nr) * csum_size; diff = min(diff, MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3b10d98b4ebb..28a05ba47060 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -28,6 +28,7 @@ #include "compression.h" #include "delalloc-space.h" #include "reflink.h" +#include "subpage.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -482,6 +483,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, start_pos = round_down(pos, fs_info->sectorsize); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); + ASSERT(num_bytes <= U32_MAX); end_of_last_block = start_pos + num_bytes - 1; @@ -500,9 +502,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; - SetPageUptodate(p); + + btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes); ClearPageChecked(p); - set_page_dirty(p); + btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes); } /* @@ -1094,7 +1097,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int del_nr = 0; int del_slot = 0; int recow; - int ret; + int ret = 0; u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); @@ -1315,7 +1318,7 @@ again: } out: btrfs_free_path(path); - return 0; + return ret; } /* @@ -2483,6 +2486,17 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, const u64 lockend, struct extent_state **cached_state) { + /* + * For subpage case, if the range is not at page boundary, we could + * have pages at the leading/tailing part of the range. + * This could lead to dead loop since filemap_range_has_page() + * will always return true. + * So here we need to do extra page alignment for + * filemap_range_has_page(). 
+ */ + const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); + const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; + while (1) { struct btrfs_ordered_extent *ordered; int ret; @@ -2503,7 +2517,7 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, (ordered->file_offset + ordered->num_bytes <= lockstart || ordered->file_offset > lockend)) && !filemap_range_has_page(inode->i_mapping, - lockstart, lockend)) { + page_lockstart, page_lockend)) { if (ordered) btrfs_put_ordered_extent(ordered); break; @@ -3034,22 +3048,20 @@ struct falloc_range { */ static int add_falloc_range(struct list_head *head, u64 start, u64 len) { - struct falloc_range *prev = NULL; struct falloc_range *range = NULL; - if (list_empty(head)) - goto insert; - - /* - * As fallocate iterate by bytenr order, we only need to check - * the last range. - */ - prev = list_entry(head->prev, struct falloc_range, list); - if (prev->start + prev->len == start) { - prev->len += len; - return 0; + if (!list_empty(head)) { + /* + * As fallocate iterates by bytenr order, we only need to check + * the last range. + */ + range = list_last_entry(head, struct falloc_range, list); + if (range->start + range->len == start) { + range->len += len; + return 0; + } } -insert: + range = kmalloc(sizeof(*range), GFP_KERNEL); if (!range) return -ENOMEM; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4806295116d8..2131ae5b9ed7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -327,7 +327,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, * need to check for -EAGAIN. */ ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - 0, BTRFS_EXTENT_DATA_KEY); + 0, BTRFS_EXTENT_DATA_KEY, NULL); if (ret) goto fail; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index eb6fddf40841..e6eb20987351 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -51,6 +51,7 @@ #include "block-group.h" #include "space-info.h" #include "zoned.h" +#include "subpage.h" struct btrfs_iget_args { u64 ino; @@ -166,22 +167,47 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, struct page *page; while (index <= end_index) { + /* + * For locked page, we will call end_extent_writepage() on it + * in run_delalloc_range() for the error handling. That + * end_extent_writepage() function will call + * btrfs_mark_ordered_io_finished() to clear page Ordered and + * run the ordered extent accounting. + * + * Here we can't just clear the Ordered bit, or + * btrfs_mark_ordered_io_finished() would skip the accounting + * for the page range, and the ordered extent will never finish. + */ + if (index == (page_offset(locked_page) >> PAGE_SHIFT)) { + index++; + continue; + } page = find_get_page(inode->vfs_inode.i_mapping, index); index++; if (!page) continue; - ClearPagePrivate2(page); + + /* + * Here we just clear all Ordered bits for every page in the + * range, then __endio_write_update_ordered() will handle + * the ordered extent accounting for the range. 
+ */ + btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, + offset, bytes); put_page(page); } + /* The locked page covers the full range, nothing needs to be done */ + if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE) + return; /* * In case this page belongs to the delalloc range being instantiated * then skip it, since the first page of a range is going to be * properly cleaned up by the caller of run_delalloc_range */ if (page_start >= offset && page_end <= (offset + bytes - 1)) { - offset += PAGE_SIZE; - bytes -= PAGE_SIZE; + bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; + offset = page_offset(locked_page) + PAGE_SIZE; } return __endio_write_update_ordered(inode, offset, bytes, false); @@ -603,7 +629,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (inode_need_compress(BTRFS_I(inode), start, end)) { + if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -946,7 +972,8 @@ retry: const u64 end = start + async_extent->ram_size - 1; p->mapping = inode->vfs_inode.i_mapping; - btrfs_writepage_endio_finish_ordered(p, start, end, 0); + btrfs_writepage_endio_finish_ordered(inode, p, start, + end, 0); p->mapping = NULL; extent_clear_unlock_delalloc(inode, start, end, NULL, 0, @@ -1064,7 +1091,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, NULL, + extent_clear_unlock_delalloc(inode, start, end, + locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1072,6 +1100,19 @@ static noinline int cow_file_range(struct btrfs_inode *inode, *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; + /* + * locked_page is locked by the caller of + * writepage_delalloc(), not locked by + * __process_pages_contig(). + * + * We can't let __process_pages_contig() to unlock it, + * as it doesn't have any subpage::writers recorded. + * + * Here we manually unlock the page, since the caller + * can't use page_started to determine if it's an + * inline extent or a compressed extent. + */ + unlock_page(locked_page); goto out; } else if (ret < 0) { goto out_unlock; @@ -1150,15 +1191,16 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* we're not doing compressed IO, don't unlock the first - * page (which the caller expects to stay locked), don't - * clear any dirty bits and don't set any writeback bits + /* + * We're not doing compressed IO, don't unlock the first page + * (which the caller expects to stay locked), don't clear any + * dirty bits and don't set any writeback bits * - * Do set the Private2 bit so we know this page was properly - * setup for writepage + * Do set the Ordered (Private2) bit so we know this page was + * properly setup for writepage. */ page_ops = unlock ? 
PAGE_UNLOCK : 0; - page_ops |= PAGE_SET_PRIVATE2; + page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, locked_page, @@ -1822,7 +1864,7 @@ out_check: locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_PRIVATE2); + PAGE_UNLOCK | PAGE_SET_ORDERED); cur_offset = extent_end; @@ -2193,26 +2235,22 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 logical = bio->bi_iter.bi_sector << 9; + u32 bio_len = bio->bi_iter.bi_size; struct extent_map *em; - u64 length = 0; - u64 map_length; int ret = 0; struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; - length = bio->bi_iter.bi_size; - map_length = length; - em = btrfs_get_chunk_map(fs_info, logical, map_length); + em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); if (IS_ERR(em)) return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, - map_length, &geom); + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom); if (ret < 0) goto out; - if (geom.len < length + size) + if (geom.len < bio_len + size) ret = 1; out: free_extent_map(em); @@ -2233,33 +2271,6 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } -bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, - unsigned int size) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered; - u64 len = bio->bi_iter.bi_size + size; - bool ret = true; - - ASSERT(btrfs_is_zoned(fs_info)); - ASSERT(fs_info->max_zone_append_size > 0); - ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND); - - /* Ordered extent not yet created, so we're good */ - ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); - if (!ordered) - return ret; - - if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len > - ordered->disk_bytenr + ordered->disk_num_bytes) - ret = false; - - btrfs_put_ordered_extent(ordered); - - return ret; -} - static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, struct bio *bio, loff_t file_offset) { @@ -2601,7 +2612,7 @@ again: lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ - if (PagePrivate2(page)) + if (PageOrdered(page)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); @@ -2676,8 +2687,8 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_writepage_fixup *fixup; - /* this page is properly in the ordered list */ - if (TestClearPagePrivate2(page)) + /* This page has ordered extent covering it already */ + if (PageOrdered(page)) return 0; /* @@ -2773,7 +2784,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, /* * If we dropped an inline extent here, we know the range where it is * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the - * number of bytes only for that range contaning the inline extent. + * number of bytes only for that range containing the inline extent. * The remaining of the range will be processed when clearning the * EXTENT_DELALLOC_BIT bit through the ordered extent completion. 
*/ @@ -3000,6 +3011,18 @@ out: if (ret || truncated) { u64 unwritten_start = start; + /* + * If we failed to finish this ordered extent for any reason we + * need to make sure BTRFS_ORDERED_IOERR is set on the ordered + * extent, and mark the inode with the error if it wasn't + * already set. Any error during writeback would have already + * set the mapping error, so we need to set it if we're the ones + * marking this ordered extent as failed. + */ + if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, + &ordered_extent->flags)) + mapping_set_error(ordered_extent->inode->i_mapping, -EIO); + if (truncated) unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); @@ -3057,28 +3080,14 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } -void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, u64 end, int uptodate) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workqueue *wq; + trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); - - ClearPagePrivate2(page); - if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 1, uptodate)) - return; - - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - - btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); - btrfs_queue_work(wq, &ordered_extent->work); + btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, + finish_ordered_fn, uptodate); } /* @@ -3140,15 +3149,19 @@ zeroit: * @bio_offset: offset to the beginning of the bio (in bytes) * @start: file offset of the range start * @end: file offset of the range end (inclusive) + * + * Return a bitmap where bit set means a csum mismatch, and bit not set means + * csum match. 
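A minimal sketch, not taken from the patch, of how a read-completion caller could consume the new bitmap: bit N corresponds to the N-th sector of [@start, @end], so the file offset of a bad sector is start + (N << sectorsize_bits). The helper name and the btrfs_warn() message are illustrative assumptions; only fs_info->sectorsize_bits and the bitmap layout come from the surrounding code.

/* Illustrative sketch: walk the csum mismatch bitmap returned above. */
static void example_walk_csum_errors(struct btrfs_fs_info *fs_info,
				     unsigned int error_bitmap,
				     u64 start, u64 end)
{
	const u32 nr_sectors = (u32)((end + 1 - start) >> fs_info->sectorsize_bits);
	u32 i;

	for (i = 0; i < nr_sectors; i++) {
		if (!(error_bitmap & (1U << i)))
			continue;
		btrfs_warn(fs_info, "csum mismatch at file offset %llu",
			   start + ((u64)i << fs_info->sectorsize_bits));
	}
}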
*/ -int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end) +unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, + struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_root *root = BTRFS_I(inode)->root; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; + unsigned int result = 0; if (PageChecked(page)) { ClearPageChecked(page); @@ -3176,10 +3189,14 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, page_offset(page) + pg_off); - if (ret < 0) - return -EIO; + if (ret < 0) { + const int nr_bit = (pg_off - offset_in_page(start)) >> + root->fs_info->sectorsize_bits; + + result |= (1U << nr_bit); + } } - return 0; + return result; } /* @@ -3241,6 +3258,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); run_delayed_iput_locked(fs_info, inode); + cond_resched_lock(&fs_info->delayed_iput_lock); } spin_unlock(&fs_info->delayed_iput_lock); } @@ -4096,7 +4114,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * This is a placeholder inode for a subvolume we didn't have a * reference to at the time of the snapshot creation. In the meantime * we could have renamed the real subvol link into our snapshot, so - * depending on btrfs_del_root_ref to return -ENOENT here is incorret. + * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. * Instead simply lookup the dir_index_item for this entry so we can * remove it. Otherwise we know we have a ref to the root and we can * call btrfs_del_root_ref, and it _shouldn't_ fail. @@ -4451,20 +4469,36 @@ out: #define NEED_TRUNCATE_BLOCK 1 /* - * this can truncate away extent items, csum items and directory items. - * It starts at a high offset and removes keys until it can't find - * any higher than new_size + * Remove inode items from a given root. * - * csum items that cross the new i_size are truncated to the new size - * as well. + * @trans: A transaction handle. + * @root: The root from which to remove items. + * @inode: The inode whose items we want to remove. + * @new_size: The new i_size for the inode. This is only applicable when + * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise. + * @min_type: The minimum key type to remove. All keys with a type + * greater than this value are removed and all keys with + * this type are removed only if their offset is >= @new_size. + * @extents_found: Output parameter that will contain the number of file + * extent items that were removed or adjusted to the new + * inode i_size. The caller is responsible for initializing + * the counter. Also, it can be NULL if the caller does not + * need this counter. * - * min_type is the minimum key type to truncate down to. If set to 0, this - * will kill all the items on this inode, including the INODE_ITEM_KEY. + * Remove all keys associated with the inode from the given root that have a key + * with a type greater than or equals to @min_type. When @min_type has a value of + * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value + * greater than or equals to @new_size. If a file extent item that starts before + * @new_size and ends after it is found, its length is adjusted. 
+ * + * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is + * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. */ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, - u64 new_size, u32 min_type) + u64 new_size, u32 min_type, + u64 *extents_found) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; @@ -4610,6 +4644,9 @@ search_again: if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; + if (extents_found != NULL) + (*extents_found)++; + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; @@ -4928,7 +4965,7 @@ again: flush_dcache_page(page); } ClearPageChecked(page); - set_page_dirty(page); + btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) @@ -5442,7 +5479,7 @@ void btrfs_evict_inode(struct inode *inode) trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - 0, 0); + 0, 0, NULL); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -7785,7 +7822,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->bdev = fs_info->fs_devices->latest_bdev; iomap->length = len; - if (write && btrfs_use_zone_append(BTRFS_I(inode), em)) + if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) iomap->flags |= IOMAP_F_ZONE_APPEND; free_extent_map(em); @@ -7924,19 +7961,17 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, btrfs_ino(BTRFS_I(inode)), pgoff); } else { - blk_status_t status; + int ret; ASSERT((start - io_bio->logical) < UINT_MAX); - status = btrfs_submit_read_repair(inode, - &io_bio->bio, - start - io_bio->logical, - bvec.bv_page, pgoff, - start, - start + sectorsize - 1, - io_bio->mirror_num, - submit_dio_repair_bio); - if (status) - err = status; + ret = btrfs_repair_one_sector(inode, + &io_bio->bio, + start - io_bio->logical, + bvec.bv_page, pgoff, + start, io_bio->mirror_num, + submit_dio_repair_bio); + if (ret) + err = errno_to_blk_status(ret); } start += sectorsize; ASSERT(bio_offset + sectorsize > bio_offset); @@ -7951,41 +7986,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered = NULL; - struct btrfs_workqueue *wq; - u64 ordered_offset = offset; - u64 ordered_bytes = bytes; - u64 last_offset; - - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - - while (ordered_offset < offset + bytes) { - last_offset = ordered_offset; - if (btrfs_dec_test_first_ordered_pending(inode, &ordered, - &ordered_offset, - ordered_bytes, - uptodate)) { - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, - NULL); - btrfs_queue_work(wq, &ordered->work); - } - - /* No ordered extent found in the range, exit */ - if (ordered_offset == last_offset) - return; - /* - * Our bio might span multiple ordered extents. In this case - * we keep going until we have accounted the whole dio. 
- */ - if (ordered_offset < offset + bytes) { - ordered_bytes = offset + bytes - ordered_offset; - ordered = NULL; - } - } + btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, + finish_ordered_fn, uptodate); } static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, @@ -8159,7 +8161,7 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, goto out_err_em; } ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), - logical, submit_len, &geom); + logical, &geom); if (ret) { status = errno_to_blk_status(ret); goto out_err_em; @@ -8263,15 +8265,14 @@ int btrfs_readpage(struct file *file, struct page *page) struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - unsigned long bio_flags = 0; - struct bio *bio = NULL; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); - if (bio) - ret = submit_one_bio(bio, 0, bio_flags); + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); + if (bio_ctrl.bio) + ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); return ret; } @@ -8340,9 +8341,9 @@ static int btrfs_migratepage(struct address_space *mapping, if (page_has_private(page)) attach_page_private(newpage, detach_page_private(page)); - if (PagePrivate2(page)) { - ClearPagePrivate2(page); - SetPagePrivate2(newpage); + if (PageOrdered(page)) { + ClearPageOrdered(page); + SetPageOrdered(newpage); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -8357,27 +8358,42 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; - struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; - u64 start; - u64 end; + u64 cur; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; - bool found_ordered = false; - bool completed_ordered = false; /* - * we have the page locked, so new writeback can't start, - * and the dirty bit won't be cleared while we are here. + * We have page locked so no new ordered extent can be created on this + * page, nor bio can be submitted for this page. * - * Wait for IO on this page so that we can safely clear - * the PagePrivate2 bit and do ordered accounting + * But already submitted bio can still be finished on this page. + * Furthermore, endio function won't skip page which has Ordered + * (Private2) already cleared, so it's possible for endio and + * invalidatepage to do the same ordered extent accounting twice + * on one page. + * + * So here we wait for any submitted bios to finish, so that we won't + * do double ordered extent accounting on the same page. */ wait_on_page_writeback(page); - if (offset) { + /* + * For subpage case, we have call sites like + * btrfs_punch_hole_lock_range() which passes range not aligned to + * sectorsize. + * If the range doesn't cover the full page, we don't need to and + * shouldn't clear page extent mapped, as page->private can still + * record subpage dirty bits for other part of the range. + * + * For cases that can invalidate the full even the range doesn't + * cover the full page, like invalidating the last page, we're + * still safe to wait for ordered extent to finish. 
+ */ + if (!(offset == 0 && length == PAGE_SIZE)) { btrfs_releasepage(page, GFP_NOFS); return; } @@ -8385,89 +8401,123 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (!inode_evicting) lock_extent_bits(tree, page_start, page_end, &cached_state); - start = page_start; -again: - ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); - if (ordered) { - found_ordered = true; - end = min(page_end, - ordered->file_offset + ordered->num_bytes - 1); + cur = page_start; + while (cur < page_end) { + struct btrfs_ordered_extent *ordered; + bool delete_states; + u64 range_end; + u32 range_len; + + ordered = btrfs_lookup_first_ordered_range(inode, cur, + page_end + 1 - cur); + if (!ordered) { + range_end = page_end; + /* + * No ordered extent covering this range, we are safe + * to delete all extent states in the range. + */ + delete_states = true; + goto next; + } + if (ordered->file_offset > cur) { + /* + * There is a range between [cur, oe->file_offset) not + * covered by any ordered extent. + * We are safe to delete all extent states, and handle + * the ordered extent in the next iteration. + */ + range_end = ordered->file_offset - 1; + delete_states = true; + goto next; + } + + range_end = min(ordered->file_offset + ordered->num_bytes - 1, + page_end); + ASSERT(range_end + 1 - cur < U32_MAX); + range_len = range_end + 1 - cur; + if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) { + /* + * If Ordered (Private2) is cleared, it means endio has + * already been executed for the range. + * We can't delete the extent states as + * btrfs_finish_ordered_io() may still use some of them. + */ + delete_states = false; + goto next; + } + btrfs_page_clear_ordered(fs_info, page, cur, range_len); + /* * IO on this page will never be started, so we need to account * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW * here, must leave that up for the ordered extent completion. + * + * This will also unlock the range for incoming + * btrfs_finish_ordered_io(). */ if (!inode_evicting) - clear_extent_bit(tree, start, end, + clear_extent_bit(tree, cur, range_end, EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state); + + spin_lock_irq(&inode->ordered_tree.lock); + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); + ordered->truncated_len = min(ordered->truncated_len, + cur - ordered->file_offset); + spin_unlock_irq(&inode->ordered_tree.lock); + + if (btrfs_dec_test_ordered_pending(inode, &ordered, + cur, range_end + 1 - cur, 1)) { + btrfs_finish_ordered_io(ordered); + /* + * The ordered extent has finished, now we're again + * safe to delete all extent states of the range. + */ + delete_states = true; + } else { + /* + * btrfs_finish_ordered_io() will get executed by endio + * of other pages, thus we can't delete extent states + * anymore + */ + delete_states = false; + } +next: + if (ordered) + btrfs_put_ordered_extent(ordered); /* - * whoever cleared the private bit is responsible - * for the finish_ordered_io + * Qgroup reserved space handler + * Sector(s) here will be either: + * + * 1) Already written to disk or bio already finished + * Then its QGROUP_RESERVED bit in io_tree is already cleared. + * Qgroup will be handled by its qgroup_record then. + * btrfs_qgroup_free_data() call will do nothing here. + * + * 2) Not written to disk yet + * Then btrfs_qgroup_free_data() call will clear the + * QGROUP_RESERVED bit of its io_tree, and free the qgroup + * reserved data space. 
+ * Since the IO will never happen for this page. */ - if (TestClearPagePrivate2(page)) { - spin_lock_irq(&inode->ordered_tree.lock); - set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); - ordered->truncated_len = min(ordered->truncated_len, - start - ordered->file_offset); - spin_unlock_irq(&inode->ordered_tree.lock); - - if (btrfs_dec_test_ordered_pending(inode, &ordered, - start, - end - start + 1, 1)) { - btrfs_finish_ordered_io(ordered); - completed_ordered = true; - } - } - btrfs_put_ordered_extent(ordered); + btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); if (!inode_evicting) { - cached_state = NULL; - lock_extent_bits(tree, start, end, - &cached_state); + clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_UPTODATE | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, + delete_states, &cached_state); } - - start = end + 1; - if (start < page_end) - goto again; + cur = range_end + 1; } - /* - * Qgroup reserved space handler - * Page here will be either - * 1) Already written to disk or ordered extent already submitted - * Then its QGROUP_RESERVED bit in io_tree is already cleaned. - * Qgroup will be handled by its qgroup_record then. - * btrfs_qgroup_free_data() call will do nothing here. - * - * 2) Not written to disk yet - * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED - * bit of its io_tree, and free the qgroup reserved data space. - * Since the IO will never happen for this page. + * We have iterated through all ordered extents of the page, the page + * should not have Ordered (Private2) anymore, or the above iteration + * did something wrong. */ - btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); - if (!inode_evicting) { - bool delete = true; - - /* - * If there's an ordered extent for this range and we have not - * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set - * in the range for the ordered extent completion. We must also - * not delete the range, otherwise we would lose that bit (and - * any other bits set in the range). Make sure EXTENT_UPTODATE - * is cleared if we don't delete, otherwise it can lead to - * corruptions if the i_size is extented later. - */ - if (found_ordered && !completed_ordered) - delete = false; - clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, - delete, &cached_state); - + ASSERT(!PageOrdered(page)); + if (!inode_evicting) __btrfs_releasepage(page, GFP_NOFS); - } - ClearPageChecked(page); clear_page_extent_mapped(page); } @@ -8613,8 +8663,8 @@ again: flush_dcache_page(page); } ClearPageChecked(page); - set_page_dirty(page); - SetPageUptodate(page); + btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); + btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); @@ -8648,6 +8698,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; u64 min_size = btrfs_calc_metadata_size(fs_info, 1); + u64 extents_found = 0; if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), @@ -8705,20 +8756,13 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) min_size, false); BUG_ON(ret); - /* - * So if we truncate and then write and fsync we normally would just - * write the extents that changed, which is a problem if we need to - * first truncate that entire inode. 
So set this flag so we write out - * all of the extents in the inode to the sync log so we're completely - * safe. - */ - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); trans->block_rsv = rsv; while (1) { ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), inode->i_size, - BTRFS_EXTENT_DATA_KEY); + BTRFS_EXTENT_DATA_KEY, + &extents_found); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; @@ -8780,6 +8824,22 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) } out: btrfs_free_block_rsv(fs_info, rsv); + /* + * So if we truncate and then write and fsync we normally would just + * write the extents that changed, which is a problem if we need to + * first truncate that entire inode. So set this flag so we write out + * all of the extents in the inode to the sync log so we're completely + * safe. + * + * If no extents were dropped or trimmed we don't need to force the next + * fsync to truncate all the inode's items from the log and re-log them + * all. This means the truncate operation did not change the file size, + * or changed it to a smaller size but there was only an implicit hole + * between the old i_size and the new i_size, and there were no prealloc + * extents beyond i_size to drop. + */ + if (extents_found > 0) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); return ret; } @@ -9075,6 +9135,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; + bool need_abort = false; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) @@ -9134,6 +9195,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, old_idx); if (ret) goto out_fail; + need_abort = true; } /* And now for the dest. */ @@ -9149,8 +9211,11 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); - if (ret) + if (ret) { + if (need_abort) + btrfs_abort_transaction(trans, ret); goto out_fail; + } } /* Update inode version and ctime/mtime. 
*/ @@ -10181,17 +10246,21 @@ out: return ret; } -void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) { - struct inode *inode = tree->private_data; + struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; struct page *page; + u32 len; + ASSERT(end + 1 - start <= U32_MAX); + len = end + 1 - start; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); + page = find_get_page(inode->vfs_inode.i_mapping, index); ASSERT(page); /* Pages should be in the extent_io_tree */ - set_page_writeback(page); + + btrfs_page_set_writeback(fs_info, page, start, len); put_page(page); index++; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5dc2fd843ae3..0ba98e08a029 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -353,15 +353,55 @@ update_flags: return ret; } +/* + * Start exclusive operation @type, return true on success + */ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type) { - return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type); + bool ret = false; + + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { + fs_info->exclusive_operation = type; + ret = true; + } + spin_unlock(&fs_info->super_lock); + + return ret; +} + +/* + * Conditionally allow to enter the exclusive operation in case it's compatible + * with the running one. This must be paired with btrfs_exclop_start_unlock and + * btrfs_exclop_finish. + * + * Compatibility: + * - the same type is already running + * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller + * must check the condition first that would allow none -> @type + */ +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == type) + return true; + + spin_unlock(&fs_info->super_lock); + return false; +} + +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) +{ + spin_unlock(&fs_info->super_lock); } void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) { + spin_lock(&fs_info->super_lock); WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + spin_unlock(&fs_info->super_lock); sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); } @@ -1455,7 +1495,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (btrfs_defrag_cancelled(fs_info)) { btrfs_debug(fs_info, "defrag_file cancelled"); ret = -EAGAIN; - break; + goto error; } if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, @@ -1533,6 +1573,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } } + ret = defrag_count; +error: if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { filemap_flush(inode->i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, @@ -1546,8 +1588,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } - ret = defrag_count; - out_ra: if (do_compress) { btrfs_inode_lock(inode, 0); @@ -1560,6 +1600,48 @@ out_ra: return ret; } +/* + * Try to start exclusive operation @type or cancel it if it's running. 
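The pairing of the three exclop helpers above can be condensed into a sketch like the following (illustrative only, assuming the usual btrfs headers; the atomic_t cancel counter is a stand-in for a per-operation request flag such as reloc_cancel_req added elsewhere in this series). btrfs_exclop_start_try_lock() returns with fs_info->super_lock held on success, so the caller publishes its cancel request and only then drops the lock via btrfs_exclop_start_unlock().

static bool example_request_cancel(struct btrfs_fs_info *fs_info,
				   enum btrfs_exclusive_operation type,
				   atomic_t *cancel_req)
{
	/* Fails (and takes no lock) when no operation of @type is running. */
	if (!btrfs_exclop_start_try_lock(fs_info, type))
		return false;

	/* super_lock is held here, so the running op cannot finish yet. */
	atomic_inc(cancel_req);
	btrfs_exclop_start_unlock(fs_info);
	return true;
}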
+ * + * Return: + * 0 - normal mode, newly claimed op started + * >0 - normal mode, something else is running, + * return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space + * ECANCELED - cancel mode, successful cancel + * ENOTCONN - cancel mode, operation not running anymore + */ +static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type, bool cancel) +{ + if (!cancel) { + /* Start normal op */ + if (!btrfs_exclop_start(fs_info, type)) + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + /* Exclusive operation is now claimed */ + return 0; + } + + /* Cancel running op */ + if (btrfs_exclop_start_try_lock(fs_info, type)) { + /* + * This blocks any exclop finish from setting it to NONE, so we + * request cancellation. Either it runs and we will wait for it, + * or it has finished and no waiting will happen. + */ + atomic_inc(&fs_info->reloc_cancel_req); + btrfs_exclop_start_unlock(fs_info); + + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) + wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING, + TASK_INTERRUPTIBLE); + + return -ECANCELED; + } + + /* Something else is running or none */ + return -ENOTCONN; +} + static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { @@ -1577,6 +1659,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, char *devstr = NULL; int ret = 0; int mod = 0; + bool cancel; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1585,20 +1668,23 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) { - mnt_drop_write_file(file); - return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; - } - + /* + * Read the arguments before checking exclusivity to be able to + * distinguish regular resize and cancel + */ vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); - goto out; + goto out_drop; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - sizestr = vol_args->name; + cancel = (strcmp("cancel", sizestr) == 0); + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel); + if (ret) + goto out_free; + /* Exclusive operation is now claimed */ + devstr = strchr(sizestr, ':'); if (devstr) { sizestr = devstr + 1; @@ -1606,10 +1692,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, devstr = vol_args->name; ret = kstrtoull(devstr, 10, &devid); if (ret) - goto out_free; + goto out_finish; if (!devid) { ret = -EINVAL; - goto out_free; + goto out_finish; } btrfs_info(fs_info, "resizing devid %llu", devid); } @@ -1619,7 +1705,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, btrfs_info(fs_info, "resizer unable to find device %llu", devid); ret = -ENODEV; - goto out_free; + goto out_finish; } if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -1627,7 +1713,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, "resizer unable to apply on readonly device %llu", devid); ret = -EPERM; - goto out_free; + goto out_finish; } if (!strcmp(sizestr, "max")) @@ -1643,13 +1729,13 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = memparse(sizestr, &retptr); if (*retptr != '\0' || new_size == 0) { ret = -EINVAL; - goto out_free; + goto out_finish; } } if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -EPERM; - goto out_free; + goto out_finish; } old_size = btrfs_device_get_total_bytes(device); @@ -1657,24 +1743,24 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (mod < 0) { if 
(new_size > old_size) { ret = -EINVAL; - goto out_free; + goto out_finish; } new_size = old_size - new_size; } else if (mod > 0) { if (new_size > ULLONG_MAX - old_size) { ret = -ERANGE; - goto out_free; + goto out_finish; } new_size = old_size + new_size; } if (new_size < SZ_256M) { ret = -EINVAL; - goto out_free; + goto out_finish; } if (new_size > device->bdev->bd_inode->i_size) { ret = -EFBIG; - goto out_free; + goto out_finish; } new_size = round_down(new_size, fs_info->sectorsize); @@ -1683,7 +1769,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out_free; + goto out_finish; } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans); @@ -1696,10 +1782,11 @@ static noinline int btrfs_ioctl_resize(struct file *file, "resize device %s (devid %llu) from %llu to %llu", rcu_str_deref(device->name), device->devid, old_size, new_size); +out_finish: + btrfs_exclop_finish(fs_info); out_free: kfree(vol_args); -out: - btrfs_exclop_finish(fs_info); +out_drop: mnt_drop_write_file(file); return ret; } @@ -2897,7 +2984,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = PTR_ERR(subvol_name_ptr); goto free_parent; } - /* subvol_name_ptr is already NULL termined */ + /* subvol_name_ptr is already nul terminated */ subvol_name = (char *)kbasename(subvol_name_ptr); } } else { @@ -3119,6 +3206,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; + bool cancel = false; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3137,18 +3225,22 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) ret = -EOPNOTSUPP; goto out; } + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) && + strcmp("cancel", vol_args->name) == 0) + cancel = true; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, + cancel); + if (ret) goto out; - } + /* Exclusive operation is now claimed */ - if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) ret = btrfs_rm_device(fs_info, NULL, vol_args->devid); - } else { - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + else ret = btrfs_rm_device(fs_info, vol_args->name, 0); - } + btrfs_exclop_finish(fs_info); if (!ret) { @@ -3172,6 +3264,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; int ret; + bool cancel; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3180,25 +3273,24 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; - goto out_drop_write; - } - vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); - goto out; + goto out_drop_write; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_rm_device(fs_info, vol_args->name, 0); + cancel = (strcmp("cancel", vol_args->name) == 0); + + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, + cancel); + if (ret == 0) { + ret = btrfs_rm_device(fs_info, vol_args->name, 0); + if (!ret) + btrfs_info(fs_info, "disk 
deleted %s", vol_args->name); + btrfs_exclop_finish(fs_info); + } - if (!ret) - btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); -out: - btrfs_exclop_finish(fs_info); out_drop_write: mnt_drop_write_file(file); @@ -3551,7 +3643,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, goto out; } transid = trans->transid; - ret = btrfs_commit_transaction_async(trans, 0); + ret = btrfs_commit_transaction_async(trans); if (ret) { btrfs_end_transaction(trans); return ret; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 5fafc5e89bb7..313d9d685adb 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -57,7 +57,7 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) /* * Try-lock for read. * - * Retrun 1 if the rwlock has been taken, 0 otherwise + * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_read_lock(struct extent_buffer *eb) { @@ -72,7 +72,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) /* * Try-lock for write. * - * Retrun 1 if the rwlock has been taken, 0 otherwise + * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_write_lock(struct extent_buffer *eb) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6c413bb451a3..6eb41b7c0c84 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -16,6 +16,7 @@ #include "compression.h" #include "delalloc-space.h" #include "qgroup.h" +#include "subpage.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -300,81 +301,142 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, } /* - * Finish IO for one ordered extent across a given range. The range can - * contain several ordered extents. + * Mark all ordered extents io inside the specified range finished. * - * @found_ret: Return the finished ordered extent - * @file_offset: File offset for the finished IO - * Will also be updated to one byte past the range that is - * recordered as finished. This allows caller to walk forward. - * @io_size: Length of the finish IO range - * @uptodate: If the IO finished without problem - * - * Return true if any ordered extent is finished in the range, and update - * @found_ret and @file_offset. - * Return false otherwise. + * @page: The invovled page for the opeartion. + * For uncompressed buffered IO, the page status also needs to be + * updated to indicate whether the pending ordered io is finished. + * Can be NULL for direct IO and compressed write. + * For these cases, callers are ensured they won't execute the + * endio function twice. + * @finish_func: The function to be executed when all the IO of an ordered + * extent are finished. * - * NOTE: Although The range can cross multiple ordered extents, only one - * ordered extent will be updated during one call. The caller is responsible to - * iterate all ordered extents in the range. + * This function is called for endio, thus the range must have ordered + * extent(s) coveri it. 
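A rough usage sketch, not taken from the patch: buffered write-back completion hands the whole page range to the helper and lets it walk the overlapping ordered extents, clear the per-range Ordered (Private2) bits and queue the finish work once an extent's bytes_left reaches zero. finish_ordered_fn stands for the work function the patch uses in inode.c; its visibility here is assumed for illustration.

static void example_finish_page_writeback(struct btrfs_inode *inode,
					  struct page *page)
{
	/* Covers [page_offset(page), page_offset(page) + PAGE_SIZE). */
	btrfs_mark_ordered_io_finished(inode, page, page_offset(page),
				       PAGE_SIZE, finish_ordered_fn, true);
}

For example, with two ordered extents [0, 64K) and [64K, 128K) and a finished range of [32K, 96K), the loop clamps the range to each extent in turn and decrements bytes_left of both extents by 32K.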
*/ -bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **finished_ret, - u64 *file_offset, u64 io_size, int uptodate) +void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, + struct page *page, u64 file_offset, + u64 num_bytes, btrfs_func_t finish_func, + bool uptodate) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_workqueue *wq; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - bool finished = false; unsigned long flags; - u64 dec_end; - u64 dec_start; - u64 to_dec; + u64 cur = file_offset; + + if (btrfs_is_free_space_inode(inode)) + wq = fs_info->endio_freespace_worker; + else + wq = fs_info->endio_write_workers; + + if (page) + ASSERT(page->mapping && page_offset(page) <= file_offset && + file_offset + num_bytes <= page_offset(page) + PAGE_SIZE); spin_lock_irqsave(&tree->lock, flags); - node = tree_search(tree, *file_offset); - if (!node) - goto out; + while (cur < file_offset + num_bytes) { + u64 entry_end; + u64 end; + u32 len; - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!in_range(*file_offset, entry->file_offset, entry->num_bytes)) - goto out; + node = tree_search(tree, cur); + /* No ordered extents at all */ + if (!node) + break; - dec_start = max(*file_offset, entry->file_offset); - dec_end = min(*file_offset + io_size, - entry->file_offset + entry->num_bytes); - *file_offset = dec_end; - if (dec_start > dec_end) { - btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu", - dec_start, dec_end); - } - to_dec = dec_end - dec_start; - if (to_dec > entry->bytes_left) { - btrfs_crit(fs_info, - "bad ordered accounting left %llu size %llu", - entry->bytes_left, to_dec); - } - entry->bytes_left -= to_dec; - if (!uptodate) - set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + entry_end = entry->file_offset + entry->num_bytes; + /* + * |<-- OE --->| | + * cur + * Go to next OE. + */ + if (cur >= entry_end) { + node = rb_next(node); + /* No more ordered extents, exit */ + if (!node) + break; + entry = rb_entry(node, struct btrfs_ordered_extent, + rb_node); + + /* Go to next ordered extent and continue */ + cur = entry->file_offset; + continue; + } + /* + * | |<--- OE --->| + * cur + * Go to the start of OE. + */ + if (cur < entry->file_offset) { + cur = entry->file_offset; + continue; + } - if (entry->bytes_left == 0) { /* - * Ensure only one caller can set the flag and finished_ret - * accordingly + * Now we are definitely inside one ordered extent. + * + * |<--- OE --->| + * | + * cur */ - finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - /* test_and_set_bit implies a barrier */ - cond_wake_up_nomb(&entry->wait); - } -out: - if (finished && finished_ret && entry) { - *finished_ret = entry; - refcount_inc(&entry->refs); + end = min(entry->file_offset + entry->num_bytes, + file_offset + num_bytes) - 1; + ASSERT(end + 1 - cur < U32_MAX); + len = end + 1 - cur; + + if (page) { + /* + * Ordered (Private2) bit indicates whether we still + * have pending io unfinished for the ordered extent. + * + * If there's no such bit, we need to skip to next range. 
+ */ + if (!btrfs_page_test_ordered(fs_info, page, cur, len)) { + cur += len; + continue; + } + btrfs_page_clear_ordered(fs_info, page, cur, len); + } + + /* Now we're fine to update the accounting */ + if (unlikely(len > entry->bytes_left)) { + WARN_ON(1); + btrfs_crit(fs_info, +"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu", + inode->root->root_key.objectid, + btrfs_ino(inode), + entry->file_offset, + entry->num_bytes, + len, entry->bytes_left); + entry->bytes_left = 0; + } else { + entry->bytes_left -= len; + } + + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + + /* + * All the IO of the ordered extent is finished, we need to queue + * the finish_func to be executed. + */ + if (entry->bytes_left == 0) { + set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + cond_wake_up(&entry->wait); + refcount_inc(&entry->refs); + spin_unlock_irqrestore(&tree->lock, flags); + btrfs_init_work(&entry->work, finish_func, NULL, NULL); + btrfs_queue_work(wq, &entry->work); + spin_lock_irqsave(&tree->lock, flags); + } + cur += len; } spin_unlock_irqrestore(&tree->lock, flags); - return finished; } /* @@ -870,6 +932,81 @@ out: } /* + * Lookup the first ordered extent that overlaps the range + * [@file_offset, @file_offset + @len). + * + * The difference between this and btrfs_lookup_first_ordered_extent() is + * that this one won't return any ordered extent that does not overlap the range. + * And the difference against btrfs_lookup_ordered_extent() is, this function + * ensures the first ordered extent gets returned. + */ +struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( + struct btrfs_inode *inode, u64 file_offset, u64 len) +{ + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct rb_node *node; + struct rb_node *cur; + struct rb_node *prev; + struct rb_node *next; + struct btrfs_ordered_extent *entry = NULL; + + spin_lock_irq(&tree->lock); + node = tree->tree.rb_node; + /* + * Here we don't want to use tree_search() which will use tree->last + * and screw up the search order. + * And __tree_search() can't return the adjacent ordered extents + * either, thus here we do our own search. + */ + while (node) { + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + + if (file_offset < entry->file_offset) { + node = node->rb_left; + } else if (file_offset >= entry_end(entry)) { + node = node->rb_right; + } else { + /* + * Direct hit, got an ordered extent that starts at + * @file_offset + */ + goto out; + } + } + if (!entry) { + /* Empty tree */ + goto out; + } + + cur = &entry->rb_node; + /* We got an entry around @file_offset, check adjacent entries */ + if (entry->file_offset < file_offset) { + prev = cur; + next = rb_next(cur); + } else { + prev = rb_prev(cur); + next = cur; + } + if (prev) { + entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + goto out; + } + if (next) { + entry = rb_entry(next, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + goto out; + } + /* No ordered extent in the range */ + entry = NULL; +out: + if (entry) + refcount_inc(&entry->refs); + spin_unlock_irq(&tree->lock); + return entry; +} + +/* * btrfs_flush_ordered_range - Lock the passed range and ensures all pending * ordered extents in it are run to completion. 
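A minimal lookup/put sketch (illustrative, not from the patch) for the new helper introduced above: it returns the first ordered extent overlapping [start, start + len), taking a reference that the caller must drop.

static bool example_range_has_ordered(struct btrfs_inode *inode,
				      u64 start, u64 len)
{
	struct btrfs_ordered_extent *ordered;

	ordered = btrfs_lookup_first_ordered_range(inode, start, len);
	if (!ordered)
		return false;

	/* Drop the reference taken by the lookup. */
	btrfs_put_ordered_extent(ordered);
	return true;
}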
* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e60c07f36427..566472004edd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -172,13 +172,13 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); +void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, + struct page *page, u64 file_offset, + u64 num_bytes, btrfs_func_t finish_func, + bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate); -bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **finished_ret, - u64 *file_offset, u64 io_size, - int uptodate); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type); @@ -196,6 +196,8 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); +struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( + struct btrfs_inode *inode, u64 file_offset, u64 len); struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 2dcb1cb21634..b1cb5a8c2999 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -260,6 +260,10 @@ static int prop_compression_validate(const char *value, size_t len) if (btrfs_compress_is_valid_type(value, len)) return 0; + if ((len == 2 && strncmp("no", value, 2) == 0) || + (len == 4 && strncmp("none", value, 4) == 0)) + return 0; + return -EINVAL; } @@ -269,7 +273,17 @@ static int prop_compression_apply(struct inode *inode, const char *value, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int type; + /* Reset to defaults */ if (len == 0) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + return 0; + } + + /* Set NOCOMPRESS flag */ + if ((len == 2 && strncmp("no", value, 2) == 0) || + (len == 4 && strncmp("none", value, 4) == 0)) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; @@ -348,7 +362,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, /* * This is not strictly necessary as the property should be - * valid, but in case it isn't, don't propagate it futher. + * valid, but in case it isn't, don't propagate it further. */ ret = h->validate(value, strlen(value)); if (ret) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3ded812f522c..07ec06d4e972 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2521,7 +2521,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, int ret = 0; /* - * If quotas get disabled meanwhile, the resouces need to be freed and + * If quotas get disabled meanwhile, the resources need to be freed and * we can't just exit here. 
*/ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) @@ -3545,13 +3545,7 @@ static int try_flush_qgroup(struct btrfs_root *root) struct btrfs_trans_handle *trans; int ret; - /* - * Can't hold an open transaction or we run the risk of deadlocking, - * and can't either be under the context of a send operation (where - * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that - * would result in a crash when starting a transaction and does not - * make sense either (send is a read-only operation). - */ + /* Can't hold an open transaction or we run the risk of deadlocking. */ ASSERT(current->journal_info == NULL); if (WARN_ON(current->journal_info)) return 0; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 3928ecc40d7b..9b0814318e72 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -7,6 +7,7 @@ #include "delalloc-space.h" #include "reflink.h" #include "transaction.h" +#include "subpage.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M @@ -52,7 +53,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, const u64 datal, const u8 comp_type) { - const u64 block_size = btrfs_inode_sectorsize(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 block_size = fs_info->sectorsize; const u64 range_end = file_offset + block_size - 1; const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); @@ -106,10 +108,12 @@ static int copy_inline_to_page(struct btrfs_inode *inode, set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); if (comp_type == BTRFS_COMPRESS_NONE) { - memcpy_to_page(page, 0, data_start, datal); + memcpy_to_page(page, offset_in_page(file_offset), data_start, + datal); flush_dcache_page(page); } else { - ret = btrfs_decompress(comp_type, data_start, page, 0, + ret = btrfs_decompress(comp_type, data_start, page, + offset_in_page(file_offset), inline_size, datal); if (ret) goto out_unlock; @@ -133,9 +137,9 @@ static int copy_inline_to_page(struct btrfs_inode *inode, flush_dcache_page(page); } - SetPageUptodate(page); + btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); ClearPageChecked(page); - set_page_dirty(page); + btrfs_page_set_dirty(fs_info, page, file_offset, block_size); out_unlock: if (page) { unlock_page(page); @@ -203,10 +207,7 @@ static int clone_copy_inline_extent(struct inode *dst, * inline extent's data to the page. */ ASSERT(key.offset > 0); - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, - inline_data, size, datal, - comp_type); - goto out; + goto copy_to_page; } } else if (i_size_read(dst) <= datal) { struct btrfs_file_extent_item *ei; @@ -222,13 +223,10 @@ static int clone_copy_inline_extent(struct inode *dst, BTRFS_FILE_EXTENT_INLINE) goto copy_inline_extent; - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, - inline_data, size, datal, comp_type); - goto out; + goto copy_to_page; } copy_inline_extent: - ret = 0; /* * We have no extent items, or we have an extent at offset 0 which may * or may not be inlined. All these cases are dealt the same way. @@ -240,11 +238,13 @@ copy_inline_extent: * clone. Deal with all these cases by copying the inline extent * data into the respective page at the destination inode. */ - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, - inline_data, size, datal, comp_type); - goto out; + goto copy_to_page; } + /* + * Release path before starting a new transaction so we don't hold locks + * that would confuse lockdep. 
+ */ btrfs_release_path(path); /* * If we end up here it means were copy the inline extent into a leaf @@ -301,6 +301,21 @@ out: *trans_out = trans; return ret; + +copy_to_page: + /* + * Release our path because we don't need it anymore and also because + * copy_inline_to_page() needs to reserve data and metadata, which may + * need to flush delalloc when we are low on available space and + * therefore cause a deadlock if writeback of an inline extent needs to + * write to the same leaf or an ordered extent completion needs to write + * to the same leaf. + */ + btrfs_release_path(path); + + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); + goto out; } /** diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b70be2ac2e9e..fc831597cb22 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2876,11 +2876,12 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, } /* - * Allow error injection to test balance cancellation + * Allow error injection to test balance/relocation cancellation */ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) { return atomic_read(&fs_info->balance_cancel_req) || + atomic_read(&fs_info->reloc_cancel_req) || fatal_signal_pending(current); } ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); @@ -3780,6 +3781,60 @@ out: return inode; } +/* + * Mark start of chunk relocation that is cancellable. Check if the cancellation + * has been requested meanwhile and don't start in that case. + * + * Return: + * 0 success + * -EINPROGRESS operation is already in progress, that's probably a bug + * -ECANCELED cancellation request was set before the operation started + * -EAGAIN can not start because there are ongoing send operations + */ +static int reloc_chunk_start(struct btrfs_fs_info *fs_info) +{ + spin_lock(&fs_info->send_reloc_lock); + if (fs_info->send_in_progress) { + btrfs_warn_rl(fs_info, +"cannot run relocation while send operations are in progress (%d in progress)", + fs_info->send_in_progress); + spin_unlock(&fs_info->send_reloc_lock); + return -EAGAIN; + } + if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { + /* This should not happen */ + spin_unlock(&fs_info->send_reloc_lock); + btrfs_err(fs_info, "reloc already running, cannot start"); + return -EINPROGRESS; + } + spin_unlock(&fs_info->send_reloc_lock); + + if (atomic_read(&fs_info->reloc_cancel_req) > 0) { + btrfs_info(fs_info, "chunk relocation canceled on start"); + /* + * On cancel, clear all requests but let the caller mark + * the end after cleanup operations. + */ + atomic_set(&fs_info->reloc_cancel_req, 0); + return -ECANCELED; + } + return 0; +} + +/* + * Mark end of chunk relocation that is cancellable and wake any waiters. 
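A sketch of how the new bracketing helpers are meant to be used by a relocation entry point (illustrative only; reloc_chunk_start() and reloc_chunk_end() are static to relocation.c, so assume this sits next to them). The start helper refuses to run during send or when cancellation was already requested, and btrfs_should_cancel_balance() now also fires once reloc_cancel_req is raised.

static int example_relocation_entry(struct btrfs_fs_info *fs_info)
{
	int ret;

	ret = reloc_chunk_start(fs_info);
	if (ret < 0)
		return ret;	/* -EAGAIN, -EINPROGRESS or -ECANCELED */

	/*
	 * The real work (relocating block groups, merging reloc roots)
	 * happens here and polls btrfs_should_cancel_balance() regularly.
	 */
	if (btrfs_should_cancel_balance(fs_info))
		ret = -ECANCELED;

	reloc_chunk_end(fs_info);
	return ret;
}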
+ */ +static void reloc_chunk_end(struct btrfs_fs_info *fs_info) +{ + /* Requested after start, clear bit first so any waiters can continue */ + if (atomic_read(&fs_info->reloc_cancel_req) > 0) + btrfs_info(fs_info, "chunk relocation canceled during operation"); + spin_lock(&fs_info->send_reloc_lock); + clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); + spin_unlock(&fs_info->send_reloc_lock); + atomic_set(&fs_info->reloc_cancel_req, 0); +} + static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) { struct reloc_control *rc; @@ -3862,6 +3917,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) return -ENOMEM; } + ret = reloc_chunk_start(fs_info); + if (ret < 0) { + err = ret; + goto out_put_bg; + } + rc->extent_root = extent_root; rc->block_group = bg; @@ -3952,7 +4013,9 @@ out: if (err && rw) btrfs_dec_block_group_ro(rc->block_group); iput(rc->data_inode); - btrfs_put_block_group(rc->block_group); +out_put_bg: + btrfs_put_block_group(bg); + reloc_chunk_end(fs_info); free_reloc_control(rc); return err; } @@ -4073,6 +4136,12 @@ int btrfs_recover_relocation(struct btrfs_root *root) goto out; } + ret = reloc_chunk_start(fs_info); + if (ret < 0) { + err = ret; + goto out_end; + } + rc->extent_root = fs_info->extent_root; set_reloc_control(rc); @@ -4137,6 +4206,8 @@ out_clean: err = ret; out_unset: unset_reloc_control(rc); +out_end: + reloc_chunk_end(fs_info); free_reloc_control(rc); out: free_reloc_roots(&reloc_roots); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 485cda3eb8d7..088641ba7a8e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -165,6 +165,10 @@ struct scrub_ctx { int readonly; int pages_per_rd_bio; + /* State of IO submission throttling affecting the associated device */ + ktime_t throttle_deadline; + u64 throttle_sent; + int is_dev_replace; u64 write_pointer; @@ -605,6 +609,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( spin_lock_init(&sctx->list_lock); spin_lock_init(&sctx->stat_lock); init_waitqueue_head(&sctx->list_wait); + sctx->throttle_deadline = 0; WARN_ON(sctx->wr_curr_bio != NULL); mutex_init(&sctx->wr_lock); @@ -626,7 +631,6 @@ nomem: static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *warn_ctx) { - u64 isize; u32 nlink; int ret; int i; @@ -662,7 +666,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, eb = swarn->path->nodes[0]; inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], struct btrfs_inode_item); - isize = btrfs_inode_size(eb, inode_item); nlink = btrfs_inode_nlink(eb, inode_item); btrfs_release_path(swarn->path); @@ -691,12 +694,12 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", +"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), swarn->physical, root, inum, offset, - min(isize - offset, (u64)PAGE_SIZE), nlink, + fs_info->sectorsize, nlink, (char *)(unsigned long)ipath->fspath->val[i]); btrfs_put_root(local_root); @@ -885,25 +888,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * read all mirrors one after the other. 
This includes to * re-read the extent or metadata block that failed (that was * the cause that this fixup code is called) another time, - * page by page this time in order to know which pages + * sector by sector this time in order to know which sectors * caused I/O errors and which ones are good (for all mirrors). * It is the goal to handle the situation when more than one * mirror contains I/O errors, but the errors do not * overlap, i.e. the data can be repaired by selecting the - * pages from those mirrors without I/O error on the - * particular pages. One example (with blocks >= 2 * PAGE_SIZE) - * would be that mirror #1 has an I/O error on the first page, - * the second page is good, and mirror #2 has an I/O error on - * the second page, but the first page is good. - * Then the first page of the first mirror can be repaired by - * taking the first page of the second mirror, and the - * second page of the second mirror can be repaired by - * copying the contents of the 2nd page of the 1st mirror. - * One more note: if the pages of one mirror contain I/O + * sectors from those mirrors without I/O error on the + * particular sectors. One example (with blocks >= 2 * sectorsize) + * would be that mirror #1 has an I/O error on the first sector, + * the second sector is good, and mirror #2 has an I/O error on + * the second sector, but the first sector is good. + * Then the first sector of the first mirror can be repaired by + * taking the first sector of the second mirror, and the + * second sector of the second mirror can be repaired by + * copying the contents of the 2nd sector of the 1st mirror. + * One more note: if the sectors of one mirror contain I/O * errors, the checksum cannot be verified. In order to get * the best data for repairing, the first attempt is to find * a mirror without I/O errors and with a validated checksum. - * Only if this is not possible, the pages are picked from + * Only if this is not possible, the sectors are picked from * mirrors with I/O errors without considering the checksum. * If the latter is the case, at the end, the checksum of the * repaired area is verified in order to correctly maintain @@ -1060,26 +1063,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* * In case of I/O errors in the area that is supposed to be - * repaired, continue by picking good copies of those pages. - * Select the good pages from mirrors to rewrite bad pages from + * repaired, continue by picking good copies of those sectors. + * Select the good sectors from mirrors to rewrite bad sectors from * the area to fix. Afterwards verify the checksum of the block * that is supposed to be repaired. This verification step is * only done for the purpose of statistic counting and for the * final scrub report, whether errors remain. * A perfect algorithm could make use of the checksum and try - * all possible combinations of pages from the different mirrors + * all possible combinations of sectors from the different mirrors * until the checksum verification succeeds. For example, when - * the 2nd page of mirror #1 faces I/O errors, and the 2nd page + * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector * of mirror #2 is readable but the final checksum test fails, - * then the 2nd page of mirror #3 could be tried, whether now + * then the 2nd sector of mirror #3 could be tried, whether now * the final checksum succeeds. But this would be a rare * exception and is therefore not implemented. 
At least it is * avoided that the good copy is overwritten. * A more useful improvement would be to pick the sectors * without I/O error based on sector sizes (512 bytes on legacy - * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one + * disks) instead of on sectorsize. Then maybe 512 byte of one * mirror could be repaired by taking 512 byte of a different - * mirror, even if other 512 byte sectors in the same PAGE_SIZE + * mirror, even if other 512 byte sectors in the same sectorsize * area are unreadable. */ success = 1; @@ -1260,7 +1263,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, { struct scrub_ctx *sctx = original_sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 length = original_sblock->page_count * PAGE_SIZE; + u64 length = original_sblock->page_count * fs_info->sectorsize; u64 logical = original_sblock->pagev[0]->logical; u64 generation = original_sblock->pagev[0]->generation; u64 flags = original_sblock->pagev[0]->flags; @@ -1283,13 +1286,13 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, */ while (length > 0) { - sublen = min_t(u64, length, PAGE_SIZE); + sublen = min_t(u64, length, fs_info->sectorsize); mapped_length = sublen; bbio = NULL; /* - * with a length of PAGE_SIZE, each returned stripe - * represents one mirror + * With a length of sectorsize, each returned stripe represents + * one mirror */ btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, @@ -1480,7 +1483,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, bio = btrfs_io_bio_alloc(1); bio_set_dev(bio, spage->dev->bdev); - bio_add_page(bio, spage->page, PAGE_SIZE, 0); + bio_add_page(bio, spage->page, fs_info->sectorsize, 0); bio->bi_iter.bi_sector = spage->physical >> 9; bio->bi_opf = REQ_OP_READ; @@ -1544,6 +1547,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; struct scrub_page *spage_good = sblock_good->pagev[page_num]; struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; + const u32 sectorsize = fs_info->sectorsize; BUG_ON(spage_bad->page == NULL); BUG_ON(spage_good->page == NULL); @@ -1563,8 +1567,8 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, bio->bi_iter.bi_sector = spage_bad->physical >> 9; bio->bi_opf = REQ_OP_WRITE; - ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0); - if (PAGE_SIZE != ret) { + ret = bio_add_page(bio, spage_good->page, sectorsize, 0); + if (ret != sectorsize) { bio_put(bio); return -EIO; } @@ -1642,6 +1646,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, { struct scrub_bio *sbio; int ret; + const u32 sectorsize = sctx->fs_info->sectorsize; mutex_lock(&sctx->wr_lock); again: @@ -1681,16 +1686,16 @@ again: bio->bi_iter.bi_sector = sbio->physical >> 9; bio->bi_opf = REQ_OP_WRITE; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + } else if (sbio->physical + sbio->page_count * sectorsize != spage->physical_for_dev_replace || - sbio->logical + sbio->page_count * PAGE_SIZE != + sbio->logical + sbio->page_count * sectorsize != spage->logical) { scrub_wr_submit(sctx); goto again; } - ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); - if (ret != PAGE_SIZE) { + ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + if (ret != sectorsize) { if (sbio->page_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; @@ -1729,7 +1734,8 @@ static void 
scrub_wr_submit(struct scrub_ctx *sctx) btrfsic_submit_bio(sbio->bio); if (btrfs_is_zoned(sctx->fs_info)) - sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE; + sctx->write_pointer = sbio->physical + sbio->page_count * + sctx->fs_info->sectorsize; } static void scrub_wr_bio_end_io(struct bio *bio) @@ -1988,6 +1994,65 @@ static void scrub_page_put(struct scrub_page *spage) } } +/* + * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 + * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. + */ +static void scrub_throttle(struct scrub_ctx *sctx) +{ + const int time_slice = 1000; + struct scrub_bio *sbio; + struct btrfs_device *device; + s64 delta; + ktime_t now; + u32 div; + u64 bwlimit; + + sbio = sctx->bios[sctx->curr]; + device = sbio->dev; + bwlimit = READ_ONCE(device->scrub_speed_max); + if (bwlimit == 0) + return; + + /* + * Slice is divided into intervals when the IO is submitted, adjust by + * bwlimit and maximum of 64 intervals. + */ + div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); + div = min_t(u32, 64, div); + + /* Start new epoch, set deadline */ + now = ktime_get(); + if (sctx->throttle_deadline == 0) { + sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); + sctx->throttle_sent = 0; + } + + /* Still in the time to send? */ + if (ktime_before(now, sctx->throttle_deadline)) { + /* If current bio is within the limit, send it */ + sctx->throttle_sent += sbio->bio->bi_iter.bi_size; + if (sctx->throttle_sent <= div_u64(bwlimit, div)) + return; + + /* We're over the limit, sleep until the rest of the slice */ + delta = ktime_ms_delta(sctx->throttle_deadline, now); + } else { + /* New request after deadline, start new epoch */ + delta = 0; + } + + if (delta) { + long timeout; + + timeout = div_u64(delta * HZ, 1000); + schedule_timeout_interruptible(timeout); + } + + /* Next call will start the deadline period */ + sctx->throttle_deadline = 0; +} + static void scrub_submit(struct scrub_ctx *sctx) { struct scrub_bio *sbio; @@ -1995,6 +2060,8 @@ static void scrub_submit(struct scrub_ctx *sctx) if (sctx->curr == -1) return; + scrub_throttle(sctx); + sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); @@ -2006,6 +2073,7 @@ static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, { struct scrub_block *sblock = spage->sblock; struct scrub_bio *sbio; + const u32 sectorsize = sctx->fs_info->sectorsize; int ret; again: @@ -2044,9 +2112,9 @@ again: bio->bi_iter.bi_sector = sbio->physical >> 9; bio->bi_opf = REQ_OP_READ; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + } else if (sbio->physical + sbio->page_count * sectorsize != spage->physical || - sbio->logical + sbio->page_count * PAGE_SIZE != + sbio->logical + sbio->page_count * sectorsize != spage->logical || sbio->dev != spage->dev) { scrub_submit(sctx); @@ -2054,8 +2122,8 @@ again: } sbio->pagev[sbio->page_count] = spage; - ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); - if (ret != PAGE_SIZE) { + ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + if (ret != sectorsize) { if (sbio->page_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; @@ -2398,7 +2466,7 @@ static void scrub_block_complete(struct scrub_block *sblock) if (sblock->sparity && corrupted && !sblock->data_corrected) { u64 start = sblock->pagev[0]->logical; u64 end = sblock->pagev[sblock->page_count - 1]->logical + - PAGE_SIZE; + sblock->sctx->fs_info->sectorsize; ASSERT(end - start <= U32_MAX); 
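The scrub_throttle() helper added in this hunk divides a one-second slice into up to 64 intervals based on the configured bandwidth, charges every submitted bio against a per-interval byte budget, and sleeps out the remainder of the interval once that budget is exceeded. The userspace sketch below models only that accounting under assumed names (throttle_demo, throttle_demo_charge, now_ms); clock_gettime() and nanosleep() stand in for the kernel's ktime and schedule_timeout_interruptible() calls.

/* Illustrative userspace model of time-sliced bandwidth throttling. */
#include <stdint.h>
#include <time.h>

static int64_t now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

struct throttle_demo {
	uint64_t bwlimit;	/* bytes per second, 0 means unlimited */
	uint64_t sent;		/* bytes charged in the current interval */
	int64_t deadline_ms;	/* end of the current interval, 0 means unset */
};

/* Charge @bytes against the budget; sleep out the interval when over it. */
static void throttle_demo_charge(struct throttle_demo *th, uint64_t bytes)
{
	const int64_t time_slice_ms = 1000;
	int64_t now = now_ms();
	uint32_t div;

	if (th->bwlimit == 0)
		return;

	/* The 1s slice is split into 1..64 intervals depending on the limit. */
	div = (uint32_t)(th->bwlimit / (16 * 1024 * 1024));
	if (div < 1)
		div = 1;
	if (div > 64)
		div = 64;

	/* Start a new interval and reset the byte counter. */
	if (th->deadline_ms == 0) {
		th->deadline_ms = now + time_slice_ms / div;
		th->sent = 0;
	}

	if (now < th->deadline_ms) {
		th->sent += bytes;
		if (th->sent <= th->bwlimit / div)
			return;		/* still within the per-interval budget */

		/* Over budget: sleep for the rest of the interval. */
		struct timespec rest = {
			.tv_sec = (th->deadline_ms - now) / 1000,
			.tv_nsec = ((th->deadline_ms - now) % 1000) * 1000000L,
		};
		nanosleep(&rest, NULL);
	}

	/* The next charge starts a fresh interval. */
	th->deadline_ms = 0;
}

As the kernel comment notes, the real limit comes from the per-device scrub_speed_max sysfs attribute, and a value of zero disables the throttling entirely.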
scrub_parity_mark_sectors_error(sblock->sparity, @@ -2418,7 +2486,7 @@ static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *su * the csum into @csum. * * The search source is sctx->csum_list, which is a pre-populated list - * storing bytenr ordered csum ranges. We're reponsible to cleanup any range + * storing bytenr ordered csum ranges. We're responsible to cleanup any range * that is before @logical. * * Return 0 if there is no csum for the range. @@ -3138,28 +3206,23 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, physical = map->stripes[num].physical; offset = 0; nstripes = div64_u64(length, map->stripe_len); + mirror_num = 1; + increment = map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; - mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; offset = map->stripe_len * (num / map->sub_stripes); increment = map->stripe_len * factor; mirror_num = num % map->sub_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { get_raid56_logic_offset(physical, num, map, &offset, NULL); increment = map->stripe_len * nr_data_stripes(map); - mirror_num = 1; - } else { - increment = map->stripe_len; - mirror_num = 1; } path = btrfs_alloc_path(); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index bd69db72acc5..6ac37ae6c811 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2078,16 +2078,6 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, } /* - * Removes the entry from the list and adds it back to the end. This marks the - * entry as recently used so that name_cache_clean_unused does not remove it. - */ -static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce) -{ - list_del(&nce->list); - list_add_tail(&nce->list, &sctx->name_cache_list); -} - -/* * Remove some entries from the beginning of name_cache_list. */ static void name_cache_clean_unused(struct send_ctx *sctx) @@ -2147,7 +2137,13 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, kfree(nce); nce = NULL; } else { - name_cache_used(sctx, nce); + /* + * Removes the entry from the list and adds it back to + * the end. This marks the entry as recently used so + * that name_cache_clean_unused does not remove it. + */ + list_move_tail(&nce->list, &sctx->name_cache_list); + *parent_ino = nce->parent_ino; *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); @@ -4064,6 +4060,17 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; } else { + /* + * If we previously orphanized a directory that + * collided with a new reference that we already + * processed, recompute the current path because + * that directory may be part of the path. 
+ */ + if (orphanized_dir) { + ret = refresh_ref_path(sctx, cur); + if (ret < 0) + goto out; + } ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; @@ -6507,7 +6514,7 @@ static int changed_extent(struct send_ctx *sctx, * updates the inode item, but it only changes the iversion (sequence * field in the inode item) of the inode, so if a file is deduplicated * the same amount of times in both the parent and send snapshots, its - * iversion becames the same in both snapshots, whence the inode item is + * iversion becomes the same in both snapshots, whence the inode item is * the same on both snapshots. */ if (sctx->cur_ino != sctx->cmp_key->objectid) @@ -7409,23 +7416,21 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (ret) goto out; - mutex_lock(&fs_info->balance_mutex); - if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - mutex_unlock(&fs_info->balance_mutex); + spin_lock(&fs_info->send_reloc_lock); + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { + spin_unlock(&fs_info->send_reloc_lock); btrfs_warn_rl(fs_info, - "cannot run send because a balance operation is in progress"); + "cannot run send because a relocation operation is in progress"); ret = -EAGAIN; goto out; } fs_info->send_in_progress++; - mutex_unlock(&fs_info->balance_mutex); + spin_unlock(&fs_info->send_reloc_lock); - current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); - current->journal_info = NULL; - mutex_lock(&fs_info->balance_mutex); + spin_lock(&fs_info->send_reloc_lock); fs_info->send_in_progress--; - mutex_unlock(&fs_info->balance_mutex); + spin_unlock(&fs_info->send_reloc_lock); if (ret < 0) goto out; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2dc674b7c3b1..f79bf85f2439 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -133,18 +133,13 @@ * operations, however they won't be usable until the transaction commits. * * COMMIT_TRANS - * may_commit_transaction() is the ultimate arbiter on whether we commit the - * transaction or not. In order to avoid constantly churning we do all the - * above flushing first and then commit the transaction as the last resort. - * However we need to take into account things like pinned space that would - * be freed, plus any delayed work we may not have gotten rid of in the case - * of metadata. - * - * FORCE_COMMIT_TRANS - * For use by the preemptive flusher. We use this to bypass the ticketing - * checks in may_commit_transaction, as we have more information about the - * overall state of the system and may want to commit the transaction ahead - * of actual ENOSPC conditions. + * This will commit the transaction. Historically we had a lot of logic + * surrounding whether or not we'd commit the transaction, but this waits born + * out of a pre-tickets era where we could end up committing the transaction + * thousands of times in a row without making progress. Now thanks to our + * ticketing system we know if we're not making progress and can error + * everybody out after a few commits rather than burning the disk hoping for + * a different answer. 
* * OVERCOMMIT * @@ -197,13 +192,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (!space_info) return -ENOMEM; - ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, - GFP_KERNEL); - if (ret) { - kfree(space_info); - return ret; - } - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); @@ -389,7 +377,7 @@ again: ticket = list_first_entry(head, struct reserve_ticket, list); - /* Check and see if our ticket can be satisified now. */ + /* Check and see if our ticket can be satisfied now. */ if ((used + ticket->bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { @@ -495,7 +483,8 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, */ static void shrink_delalloc(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, - u64 to_reclaim, bool wait_ordered) + u64 to_reclaim, bool wait_ordered, + bool for_preempt) { struct btrfs_trans_handle *trans; u64 delalloc_bytes; @@ -532,7 +521,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, * ordered extents, otherwise we'll waste time trying to flush delalloc * that likely won't give us the space back we need. */ - if (ordered_bytes > delalloc_bytes) + if (ordered_bytes > delalloc_bytes && !for_preempt) wait_ordered = true; loops = 0; @@ -551,6 +540,14 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, break; } + /* + * If we are for preemption we just want a one-shot of delalloc + * flushing so we can stop flushing if we decide we don't need + * to anymore. + */ + if (for_preempt) + break; + spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { @@ -566,109 +563,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, } } -/** - * Possibly commit the transaction if its ok to - * - * @fs_info: the filesystem - * @space_info: space_info we are checking for commit, either data or metadata - * - * This will check to make sure that committing the transaction will actually - * get us somewhere and then commit the transaction if it does. Otherwise it - * will return -ENOSPC. 
- */ -static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) -{ - struct reserve_ticket *ticket = NULL; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; - struct btrfs_trans_handle *trans; - u64 reclaim_bytes = 0; - u64 bytes_needed = 0; - u64 cur_free_bytes = 0; - - trans = (struct btrfs_trans_handle *)current->journal_info; - if (trans) - return -EAGAIN; - - spin_lock(&space_info->lock); - cur_free_bytes = btrfs_space_info_used(space_info, true); - if (cur_free_bytes < space_info->total_bytes) - cur_free_bytes = space_info->total_bytes - cur_free_bytes; - else - cur_free_bytes = 0; - - if (!list_empty(&space_info->priority_tickets)) - ticket = list_first_entry(&space_info->priority_tickets, - struct reserve_ticket, list); - else if (!list_empty(&space_info->tickets)) - ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); - if (ticket) - bytes_needed = ticket->bytes; - - if (bytes_needed > cur_free_bytes) - bytes_needed -= cur_free_bytes; - else - bytes_needed = 0; - spin_unlock(&space_info->lock); - - if (!bytes_needed) - return 0; - - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - /* - * See if there is enough pinned space to make this reservation, or if - * we have block groups that are going to be freed, allowing us to - * possibly do a chunk allocation the next loop through. - */ - if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || - __percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) - goto commit; - - /* - * See if there is some space in the delayed insertion reserve for this - * reservation. If the space_info's don't match (like for DATA or - * SYSTEM) then just go enospc, reclaiming this space won't recover any - * space to satisfy those reservations. - */ - if (space_info != delayed_rsv->space_info) - goto enospc; - - spin_lock(&delayed_rsv->lock); - reclaim_bytes += delayed_rsv->reserved; - spin_unlock(&delayed_rsv->lock); - - spin_lock(&delayed_refs_rsv->lock); - reclaim_bytes += delayed_refs_rsv->reserved; - spin_unlock(&delayed_refs_rsv->lock); - - spin_lock(&trans_rsv->lock); - reclaim_bytes += trans_rsv->reserved; - spin_unlock(&trans_rsv->lock); - - if (reclaim_bytes >= bytes_needed) - goto commit; - bytes_needed -= reclaim_bytes; - - if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) - goto enospc; - -commit: - return btrfs_commit_transaction(trans); -enospc: - btrfs_end_transaction(trans); - return -ENOSPC; -} - /* * Try to flush some data based on policy set by @state. This is only advisory * and may fail for various reasons. 
The caller is supposed to examine the @@ -702,7 +596,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: shrink_delalloc(fs_info, space_info, num_bytes, - state == FLUSH_DELALLOC_WAIT); + state == FLUSH_DELALLOC_WAIT, for_preempt); break; case FLUSH_DELAYED_REFS_NR: case FLUSH_DELAYED_REFS: @@ -743,9 +637,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, btrfs_wait_on_delayed_iputs(fs_info); break; case COMMIT_TRANS: - ret = may_commit_transaction(fs_info, space_info); - break; - case FORCE_COMMIT_TRANS: + ASSERT(current->journal_info == NULL); trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -792,12 +684,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { + u64 global_rsv_size = fs_info->global_block_rsv.reserved; u64 ordered, delalloc; u64 thresh = div_factor_fine(space_info->total_bytes, 98); u64 used; /* If we're just plain full then async reclaim just slows us down. */ - if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) + if ((space_info->bytes_used + space_info->bytes_reserved + + global_rsv_size) >= thresh) return false; /* @@ -838,8 +732,10 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, thresh = calc_available_free_space(fs_info, space_info, BTRFS_RESERVE_FLUSH_ALL); - thresh += (space_info->total_bytes - space_info->bytes_used - - space_info->bytes_reserved - space_info->bytes_readonly); + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_readonly + global_rsv_size; + if (used < space_info->total_bytes) + thresh += space_info->total_bytes - used; thresh >>= space_info->clamp; used = space_info->bytes_pinned; @@ -860,14 +756,20 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, * clearly be heavy enough to warrant preemptive flushing. In the case * of heavy DIO or ordered reservations, preemptive flushing will just * waste time and cause us to slow down. + * + * We want to make sure we truly are maxed out on ordered however, so + * cut ordered in half, and if it's still higher than delalloc then we + * can keep flushing. This is to avoid the case where we start + * flushing, and now delalloc == ordered and we stop preemptively + * flushing when we could still have several gigs of delalloc to flush. 
*/ - ordered = percpu_counter_read_positive(&fs_info->ordered_bytes); + ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1; delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); if (ordered >= delalloc) used += fs_info->delayed_refs_rsv.reserved + fs_info->delayed_block_rsv.reserved; else - used += space_info->bytes_may_use; + used += space_info->bytes_may_use - global_rsv_size; return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); @@ -921,7 +823,6 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, { struct reserve_ticket *ticket; u64 tickets_id = space_info->tickets_id; - u64 first_ticket_bytes = 0; if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); @@ -937,21 +838,6 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, steal_from_global_rsv(fs_info, space_info, ticket)) return true; - /* - * may_commit_transaction will avoid committing the transaction - * if it doesn't feel like the space reclaimed by the commit - * would result in the ticket succeeding. However if we have a - * smaller ticket in the queue it may be small enough to be - * satisified by committing the transaction, so if any - * subsequent ticket is smaller than the first ticket go ahead - * and send us back for another loop through the enospc flushing - * code. - */ - if (first_ticket_bytes == 0) - first_ticket_bytes = ticket->bytes; - else if (first_ticket_bytes > ticket->bytes) - return true; - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) btrfs_info(fs_info, "failing ticket with %llu bytes", ticket->bytes); @@ -1117,7 +1003,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) (delayed_block_rsv->reserved + delayed_refs_rsv->reserved)) { to_reclaim = space_info->bytes_pinned; - flush = FORCE_COMMIT_TRANS; + flush = COMMIT_TRANS; } else if (delayed_block_rsv->reserved > delayed_refs_rsv->reserved) { to_reclaim = delayed_block_rsv->reserved; @@ -1171,28 +1057,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * immediately re-usable, it comes in the form of a delayed ref, which must be * run and then the transaction must be committed. * - * FLUSH_DELAYED_REFS - * The above two cases generate delayed refs that will affect - * ->total_bytes_pinned. However this counter can be inconsistent with - * reality if there are outstanding delayed refs. This is because we adjust - * the counter based solely on the current set of delayed refs and disregard - * any on-disk state which might include more refs. So for example, if we - * have an extent with 2 references, but we only drop 1, we'll see that there - * is a negative delayed ref count for the extent and assume that the space - * will be freed, and thus increase ->total_bytes_pinned. - * - * Running the delayed refs gives us the actual real view of what will be - * freed at the transaction commit time. This stage will not actually free - * space for us, it just makes sure that may_commit_transaction() has all of - * the information it needs to make the right decision. - * * COMMIT_TRANS - * This is where we reclaim all of the pinned space generated by the previous - * two stages. We will not commit the transaction if we don't think we're - * likely to satisfy our request, which means if our current free space + - * total_bytes_pinned < reservation we will not commit. 
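One easily misread detail in the need_preemptive_reclaim() change above: ordered bytes are halved before being compared against delalloc, so delalloc keeps driving the heuristic until ordered extents clearly outweigh it, rather than flushing stopping as soon as delalloc merely equals ordered. A minimal model of just that comparison (delalloc_still_dominates is an invented name; the real function also folds in the various reserves):

/* Model of the halved-ordered vs delalloc comparison used for preemptive flushing. */
#include <stdbool.h>
#include <stdint.h>

/*
 * When true, the heuristic keeps counting outstanding delalloc reservations;
 * when false, it switches to the delayed-ref and delayed-inode reserves.
 */
static bool delalloc_still_dominates(uint64_t ordered_bytes,
				     uint64_t delalloc_bytes)
{
	/* Halve ordered first so flushing does not stop the moment it catches up. */
	return (ordered_bytes >> 1) < delalloc_bytes;
}

For instance, with 4 GiB of delalloc and 4 GiB of ordered data the halved ordered value (2 GiB) is still below delalloc, so the delalloc side of the accounting is still used, which is exactly the "delalloc == ordered" case the comment wants to keep flushing through.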
This is why the - * previous states are actually important, to make sure we know for sure - * whether committing the transaction will allow us to make progress. + * This is where we reclaim all of the pinned space generated by running the + * iputs * * ALLOC_CHUNK_FORCE * For data we start with alloc chunk force, however we could have been full @@ -1202,7 +1069,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_WAIT, RUN_DELAYED_IPUTS, - FLUSH_DELAYED_REFS, COMMIT_TRANS, ALLOC_CHUNK_FORCE, }; @@ -1561,6 +1427,15 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, flush == BTRFS_RESERVE_FLUSH_DATA) { list_add_tail(&ticket.list, &space_info->tickets); if (!space_info->flush) { + /* + * We were forced to add a reserve ticket, so + * our preemptive flushing is unable to keep + * up. Clamp down on the threshold for the + * preemptive flushing in order to keep up with + * the workload. + */ + maybe_clamp_preempt(fs_info, space_info); + space_info->flush = 1; trace_btrfs_trigger_flush(fs_info, space_info->flags, @@ -1572,14 +1447,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, list_add_tail(&ticket.list, &space_info->priority_tickets); } - - /* - * We were forced to add a reserve ticket, so our preemptive - * flushing is unable to keep up. Clamp down on the threshold - * for the preemptive flushing in order to keep up with the - * workload. - */ - maybe_clamp_preempt(fs_info, space_info); } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; /* @@ -1588,8 +1455,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_preemptive_reclaim(fs_info, space_info) && - !work_busy(&fs_info->preempt_reclaim_work)) { + !work_busy(&fs_info->preempt_reclaim_work) && + need_preemptive_reclaim(fs_info, space_info)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); queue_work(system_unbound_wq, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index b1a8ffb03b3e..cb5056472e79 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -43,18 +43,6 @@ struct btrfs_space_info { u64 flags; - /* - * bytes_pinned is kept in line with what is actually pinned, as in - * we've called update_block_group and dropped the bytes_used counter - * and increased the bytes_pinned counter. However this means that - * bytes_pinned does not reflect the bytes that will be pinned once the - * delayed refs are flushed, so this counter is inc'ed every time we - * call btrfs_free_extent so it is a realtime count of what will be - * freed once the transaction is committed. It will be zeroed every - * time the transaction commits. - */ - struct percpu_counter total_bytes_pinned; - struct list_head list; /* Protected by the spinlock 'lock'. 
*/ struct list_head ro_bgs; @@ -157,22 +145,4 @@ static inline void btrfs_space_info_free_bytes_may_use( } int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); - -static inline void __btrfs_mod_total_bytes_pinned( - struct btrfs_space_info *space_info, - s64 mod) -{ - percpu_counter_add_batch(&space_info->total_bytes_pinned, mod, - BTRFS_TOTAL_BYTES_PINNED_BATCH); -} - -static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info, - u64 flags, s64 mod) -{ - struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags); - - ASSERT(space_info); - __btrfs_mod_total_bytes_pinned(space_info, mod); -} - #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 2d19089ab625..640bcd21bf28 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -3,6 +3,7 @@ #include <linux/slab.h> #include "ctree.h" #include "subpage.h" +#include "btrfs_inode.h" /* * Subpage (sectorsize < PAGE_SIZE) support overview: @@ -110,10 +111,12 @@ int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, if (!*ret) return -ENOMEM; spin_lock_init(&(*ret)->lock); - if (type == BTRFS_SUBPAGE_METADATA) + if (type == BTRFS_SUBPAGE_METADATA) { atomic_set(&(*ret)->eb_refs, 0); - else + } else { atomic_set(&(*ret)->readers, 0); + atomic_set(&(*ret)->writers, 0); + } return 0; } @@ -183,12 +186,10 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; const int nbits = len >> fs_info->sectorsize_bits; - int ret; btrfs_subpage_assert(fs_info, page, start, len); - ret = atomic_add_return(nbits, &subpage->readers); - ASSERT(ret == nbits); + atomic_add(nbits, &subpage->readers); } void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, @@ -196,10 +197,95 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; const int nbits = len >> fs_info->sectorsize_bits; + bool is_data; + bool last; btrfs_subpage_assert(fs_info, page, start, len); + is_data = is_data_inode(page->mapping->host); ASSERT(atomic_read(&subpage->readers) >= nbits); - if (atomic_sub_and_test(nbits, &subpage->readers)) + last = atomic_sub_and_test(nbits, &subpage->readers); + + /* + * For data we need to unlock the page if the last read has finished. + * + * And please don't replace @last with atomic_sub_and_test() call + * inside if () condition. + * As we want the atomic_sub_and_test() to be always executed. 
+ */ + if (is_data && last) + unlock_page(page); +} + +static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) +{ + u64 orig_start = *start; + u32 orig_len = *len; + + *start = max_t(u64, page_offset(page), orig_start); + *len = min_t(u64, page_offset(page) + PAGE_SIZE, + orig_start + orig_len) - *start; +} + +void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = (len >> fs_info->sectorsize_bits); + int ret; + + btrfs_subpage_assert(fs_info, page, start, len); + + ASSERT(atomic_read(&subpage->readers) == 0); + ret = atomic_add_return(nbits, &subpage->writers); + ASSERT(ret == nbits); +} + +bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = (len >> fs_info->sectorsize_bits); + + btrfs_subpage_assert(fs_info, page, start, len); + + ASSERT(atomic_read(&subpage->writers) >= nbits); + return atomic_sub_and_test(nbits, &subpage->writers); +} + +/* + * Lock a page for delalloc page writeback. + * + * Return -EAGAIN if the page is not properly initialized. + * Return 0 with the page locked, and writer counter updated. + * + * Even with 0 returned, the page still need extra check to make sure + * it's really the correct page, as the caller is using + * find_get_pages_contig(), which can race with page invalidating. + */ +int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { + lock_page(page); + return 0; + } + lock_page(page); + if (!PagePrivate(page) || !page->private) { + unlock_page(page); + return -EAGAIN; + } + btrfs_subpage_clamp_range(page, &start, &len); + btrfs_subpage_start_writer(fs_info, page, start, len); + return 0; +} + +void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) + return unlock_page(page); + btrfs_subpage_clamp_range(page, &start, &len); + if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) unlock_page(page); } @@ -354,6 +440,32 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } +void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->ordered_bitmap |= tmp; + SetPageOrdered(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->ordered_bitmap &= ~tmp; + if (subpage->ordered_bitmap == 0) + ClearPageOrdered(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} /* * Unlike set/clear which is dependent on each page status, for test all bits * are tested in the same way. 
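btrfs_subpage_clamp_range() above trims an arbitrary (start, len) byte range to the boundaries of the single page it is applied to, before the per-page bitmap helpers run. The arithmetic can be checked in isolation with the small standalone program below; DEMO_PAGE_SIZE and clamp_to_page are illustrative stand-ins, not btrfs symbols.

/* Standalone model of clamping a [start, start + len) range to one page. */
#include <assert.h>
#include <stdint.h>

#define DEMO_PAGE_SIZE 4096u

static void clamp_to_page(uint64_t page_start, uint64_t *start, uint32_t *len)
{
	uint64_t orig_start = *start;
	uint64_t orig_end = *start + *len;
	uint64_t page_end = page_start + DEMO_PAGE_SIZE;

	*start = orig_start > page_start ? orig_start : page_start;
	*len = (uint32_t)((orig_end < page_end ? orig_end : page_end) - *start);
}

int main(void)
{
	/* A range that starts 1 KiB before page 3 and runs for 8 KiB. */
	uint64_t start = 3 * DEMO_PAGE_SIZE - 1024;
	uint32_t len = 8192;

	clamp_to_page(3 * DEMO_PAGE_SIZE, &start, &len);
	assert(start == 3 * DEMO_PAGE_SIZE);	/* clipped to the page start */
	assert(len == DEMO_PAGE_SIZE);		/* only the bytes inside the page */
	return 0;
}

This mirrors what the btrfs_page_clamp_*() wrappers rely on: callers may pass a range that spills past the page, and only the portion inside the page is acted on.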
@@ -376,6 +488,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered); /* * Note that, in selftests (extent-io-tests), we can have empty fs_info passed @@ -408,6 +521,34 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ return test_page_func(page); \ return btrfs_subpage_test_##name(fs_info, page, start, len); \ +} \ +void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + set_page_func(page); \ + return; \ + } \ + btrfs_subpage_clamp_range(page, &start, &len); \ + btrfs_subpage_set_##name(fs_info, page, start, len); \ +} \ +void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + clear_page_func(page); \ + return; \ + } \ + btrfs_subpage_clamp_range(page, &start, &len); \ + btrfs_subpage_clear_##name(fs_info, page, start, len); \ +} \ +bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + return test_page_func(page); \ + btrfs_subpage_clamp_range(page, &start, &len); \ + return btrfs_subpage_test_##name(fs_info, page, start, len); \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, PageUptodate); @@ -416,3 +557,5 @@ IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, PageDirty); IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, PageWriteback); +IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, + PageOrdered); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index bfd626e955be..4d7aca85d915 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -22,6 +22,14 @@ struct btrfs_subpage { u16 error_bitmap; u16 dirty_bitmap; u16 writeback_bitmap; + /* + * Both data and metadata needs to track how many readers are for the + * page. + * Data relies on @readers to unlock the page when last reader finished. + * While metadata doesn't need page unlock, it needs to prevent + * page::private get cleared before the last end_page_read(). + */ + atomic_t readers; union { /* * Structures only used by metadata @@ -32,7 +40,10 @@ struct btrfs_subpage { atomic_t eb_refs; /* Structures only used by data */ struct { - atomic_t readers; + atomic_t writers; + + /* Tracke pending ordered extent in this sector */ + u16 ordered_bitmap; }; }; }; @@ -63,6 +74,15 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); +void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); +bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); +int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); +void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); + /* * Template for subpage related operations. 
* @@ -72,6 +92,10 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, * btrfs_page_*() are for call sites where the page can either be subpage * specific or regular page. The function will handle both cases. * But the range still needs to be inside the page. + * + * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't + * need to be inside the page. Those functions will truncate the range + * automatically. */ #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ @@ -85,12 +109,19 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len); \ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(error); DECLARE_BTRFS_SUBPAGE_OPS(dirty); DECLARE_BTRFS_SUBPAGE_OPS(writeback); +DECLARE_BTRFS_SUBPAGE_OPS(ordered); bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4a396c1147f1..d07b18b2b250 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -299,17 +299,6 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; WRITE_ONCE(trans->aborted, errno); - /* Nothing used. The other threads that have joined this - * transaction may be able to continue. 
*/ - if (!trans->dirty && list_empty(&trans->new_bgs)) { - const char *errstr; - - errstr = btrfs_decode_error(errno); - btrfs_warn(fs_info, - "%s:%d: Aborting unused transaction(%s).", - function, line, errstr); - return; - } WRITE_ONCE(trans->transaction->aborted, errno); /* Wake up anybody who may be waiting on this transaction */ wake_up(&fs_info->transaction_wait); @@ -945,8 +934,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_check_integrity_including_extent_data: btrfs_info(info, "enabling check integrity including extent data"); - btrfs_set_opt(info->mount_opt, - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity: @@ -1527,7 +1515,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) + if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA)) seq_puts(seq, ",check_int_data"); else if (btrfs_test_opt(info, CHECK_INTEGRITY)) seq_puts(seq, ",check_int"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 436ac7b4b334..9d1d140118ff 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -429,7 +429,7 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->discard_ctl.discard_bitmap_bytes); } BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); @@ -451,7 +451,7 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->discard_ctl.discard_extent_bytes); } BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); @@ -665,15 +665,6 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ } \ BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) -static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, - struct kobj_attribute *a, - char *buf) -{ - struct btrfs_space_info *sinfo = to_space_info(kobj); - s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); - return scnprintf(buf, PAGE_SIZE, "%lld\n", val); -} - SPACE_INFO_ATTR(flags); SPACE_INFO_ATTR(total_bytes); SPACE_INFO_ATTR(bytes_used); @@ -684,8 +675,6 @@ SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); -BTRFS_ATTR(space_info, total_bytes_pinned, - btrfs_space_info_show_total_bytes_pinned); static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, flags), @@ -698,7 +687,6 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), - BTRFS_ATTR_PTR(space_info, total_bytes_pinned), NULL, }; ATTRIBUTE_GROUPS(space_info); @@ -706,7 +694,6 @@ ATTRIBUTE_GROUPS(space_info); static void space_info_release(struct kobject *kobj) { struct btrfs_space_info *sinfo = to_space_info(kobj); - percpu_counter_destroy(&sinfo->total_bytes_pinned); kfree(sinfo); } @@ -1455,6 +1442,33 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, } 
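The scrub_speed_max attribute added to the devinfo group a little further below parses its input with memparse(), so the limit can be written with the usual size suffixes. The sketch here is a rough userspace approximation of that style of parsing (parse_size is an invented name and only handles k/m/g); it is not the kernel's memparse().

/* Userspace approximation of memparse()-style size parsing ("128k", "100m", ...). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t parse_size(const char *s)
{
	char *end;
	uint64_t val = strtoull(s, &end, 0);

	switch (*end) {
	case 'g': case 'G':
		val <<= 10;
		/* fall through */
	case 'm': case 'M':
		val <<= 10;
		/* fall through */
	case 'k': case 'K':
		val <<= 10;
		break;
	default:
		break;
	}
	return val;
}

int main(void)
{
	/* e.g. "100m" becomes 104857600 bytes per second for the scrub limit */
	printf("%llu\n", (unsigned long long)parse_size("100m"));
	return 0;
}

Writing 0 to the attribute keeps throttling disabled, since the scrub code treats a zero limit as unlimited.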
BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); +static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", + READ_ONCE(device->scrub_speed_max)); +} + +static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + char *endptr; + unsigned long long limit; + + limit = memparse(buf, &endptr); + WRITE_ONCE(device->scrub_speed_max, limit); + return len; +} +BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show, + btrfs_devinfo_scrub_speed_max_store); + static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1468,10 +1482,40 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); +static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + if (!device->dev_stats_valid) + return scnprintf(buf, PAGE_SIZE, "invalid\n"); + + /* + * Print all at once so we get a snapshot of all values from the same + * time. Keep them in sync and in order of definition of + * btrfs_dev_stat_values. + */ + return scnprintf(buf, PAGE_SIZE, + "write_errs %d\n" + "read_errs %d\n" + "flush_errs %d\n" + "corruption_errs %d\n" + "generation_errs %d\n", + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS)); +} +BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); + static struct attribute *devid_attrs[] = { + BTRFS_ATTR_PTR(devid, error_stats), BTRFS_ATTR_PTR(devid, in_fs_metadata), BTRFS_ATTR_PTR(devid, missing), BTRFS_ATTR_PTR(devid, replace_target), + BTRFS_ATTR_PTR(devid, scrub_speed_max), BTRFS_ATTR_PTR(devid, writeable), NULL }; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index c0aefe6dee0b..319fed82d741 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -557,7 +557,7 @@ int btrfs_test_extent_map(void) { /* * Test a chunk with 2 data stripes one of which - * interesects the physical address of the super block + * intersects the physical address of the super block * is correctly recognised. */ .raid_type = BTRFS_BLOCK_GROUP_RAID1, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f75de9f6c0ad..50318231c1a8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -583,9 +583,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, bool do_chunk_alloc = false; int ret; - /* Send isn't supposed to start transactions. 
*/ - ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return ERR_PTR(-EROFS); @@ -1406,8 +1403,10 @@ int btrfs_defrag_root(struct btrfs_root *root) while (1) { trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } ret = btrfs_defrag_leaves(trans, root); @@ -1476,7 +1475,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } /* @@ -1869,31 +1868,6 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) } /* - * wait for the current transaction commit to start and block subsequent - * transaction joins - */ -static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info, - struct btrfs_transaction *trans) -{ - wait_event(fs_info->transaction_blocked_wait, - trans->state >= TRANS_STATE_COMMIT_START || - TRANS_ABORTED(trans)); -} - -/* - * wait for the current transaction to start and then become unblocked. - * caller holds ref. - */ -static void wait_current_trans_commit_start_and_unblock( - struct btrfs_fs_info *fs_info, - struct btrfs_transaction *trans) -{ - wait_event(fs_info->transaction_wait, - trans->state >= TRANS_STATE_UNBLOCKED || - TRANS_ABORTED(trans)); -} - -/* * commit transactions asynchronously. once btrfs_commit_transaction_async * returns, any subsequent transaction will not be allowed to join. */ @@ -1920,8 +1894,7 @@ static void do_async_commit(struct work_struct *work) kfree(ac); } -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, - int wait_for_unblock) +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_async_commit *ac; @@ -1953,13 +1926,13 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, __sb_writers_release(fs_info->sb, SB_FREEZE_FS); schedule_work(&ac->work); - - /* wait for transaction to start and unblock */ - if (wait_for_unblock) - wait_current_trans_commit_start_and_unblock(fs_info, cur_trans); - else - wait_current_trans_commit_start(fs_info, cur_trans); - + /* + * Wait for the current transaction commit to start and block + * subsequent transaction joins + */ + wait_event(fs_info->transaction_blocked_wait, + cur_trans->state >= TRANS_STATE_COMMIT_START || + TRANS_ABORTED(cur_trans)); if (current->journal_info == trans) current->journal_info = NULL; @@ -2074,14 +2047,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ASSERT(refcount_read(&trans->use_count) == 1); - /* - * Some places just start a transaction to commit it. We need to make - * sure that if this commit fails that the abort code actually marks the - * transaction as failed, so set trans->dirty to make the abort code do - * the right thing. 
- */ - trans->dirty = true; - /* Stop the commit early if ->aborted is set */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 364cfbb4c5c5..07d76029f598 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -122,8 +122,6 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) -#define BTRFS_SEND_TRANS_STUB ((void *)1) - struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; @@ -143,7 +141,6 @@ struct btrfs_trans_handle { bool allocating_chunk; bool can_flush_pending_bgs; bool reloc_reserved; - bool dirty; bool in_fsync; struct btrfs_root *root; struct btrfs_fs_info *fs_info; @@ -227,8 +224,7 @@ void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, - int wait_for_unblock); +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 95a600034d61..cab451d19547 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1574,7 +1574,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) + goto out; } ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; @@ -1749,7 +1751,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (nlink != inode->i_nlink) { set_nlink(inode, nlink); - btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) + goto out; } BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1787,6 +1791,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; if (ret == 1) { + ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; @@ -1799,17 +1804,19 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + break; btrfs_release_path(path); inode = read_one_inode(root, key.offset); - if (!inode) - return -EIO; + if (!inode) { + ret = -EIO; + break; + } ret = fixup_inode_link_count(trans, root, inode); iput(inode); if (ret) - goto out; + break; /* * fixup on a directory may create new entries, @@ -1818,8 +1825,6 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, */ key.offset = (u64)-1; } - ret = 0; -out: btrfs_release_path(path); return ret; } @@ -1858,8 +1863,6 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); } else if (ret == -EEXIST) { ret = 0; - } else { - BUG(); /* Logic Error */ } iput(inode); @@ -3299,6 +3302,22 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * begins and releases it only after writing its superblock. */ mutex_lock(&fs_info->tree_log_mutex); + + /* + * The previous transaction writeout phase could have failed, and thus + * marked the fs in an error state. 
We must not commit here, as we + * could have updated our generation in the super_for_commit and + * writing the super here would result in transid mismatches. If there + * is an error here just bail. + */ + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + ret = -EIO; + btrfs_set_log_full_commit(trans); + btrfs_abort_transaction(trans, ret); + mutex_unlock(&fs_info->tree_log_mutex); + goto out_wake_log_root; + } + btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); ret = write_all_supers(fs_info, 1); @@ -4449,7 +4468,8 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, ret = btrfs_truncate_inode_items(trans, root->log_root, inode, truncate_offset, - BTRFS_EXTENT_DATA_KEY); + BTRFS_EXTENT_DATA_KEY, + NULL); } while (ret == -EAGAIN); if (ret) goto out; @@ -5397,7 +5417,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &inode->runtime_flags); while(1) { ret = btrfs_truncate_inode_items(trans, - log, inode, 0, 0); + log, inode, 0, 0, NULL); if (ret != -EAGAIN) break; } @@ -5447,13 +5467,23 @@ log_extents: btrfs_release_path(dst_path); if (need_log_inode_item) { err = log_inode_item(trans, log, dst_path, inode); - if (!err && !xattrs_logged) { + if (err) + goto out_unlock; + /* + * If we are doing a fast fsync and the inode was logged before + * in this transaction, we don't need to log the xattrs because + * they were logged before. If xattrs were added, changed or + * deleted since the last time we logged the inode, then we have + * already logged them because the inode had the runtime flag + * BTRFS_INODE_COPY_EVERYTHING set. + */ + if (!xattrs_logged && inode->logged_trans < trans->transid) { err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + if (err) + goto out_unlock; btrfs_release_path(path); } - if (err) - goto out_unlock; } if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, @@ -6352,6 +6382,7 @@ next: error: if (wc.trans) btrfs_end_transaction(wc.trans); + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); btrfs_free_path(path); return ret; } @@ -6463,6 +6494,24 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, (!old_dir || old_dir->logged_trans < trans->transid)) return; + /* + * If we are doing a rename (old_dir is not NULL) from a directory that + * was previously logged, make sure the next log attempt on the directory + * is not skipped and logs the inode again. This is because the log may + * not currently be authoritative for a range including the old + * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make + * sure after a log replay we do not end up with both the new and old + * dentries around (in case the inode is a directory we would have a + * directory with two hard links and 2 inode references for different + * parents). The next log attempt of old_dir will happen at + * btrfs_log_all_parents(), called through btrfs_log_inode_parent() + * below, because we have previously set inode->last_unlink_trans to the + * current transaction ID, either here or at btrfs_record_unlink_dir() in + * case inode is a directory. 
+ */ + if (old_dir) + old_dir->logged_trans = 0; + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); ctx.logging_new_name = true; /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9a1ead0c4a31..782e16795bc4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -717,7 +717,7 @@ static struct btrfs_fs_devices *find_fsid_changed( /* * Handles the case where scanned device is part of an fs that had - * multiple successful changes of FSID but curently device didn't + * multiple successful changes of FSID but currently device didn't * observe it. Meaning our fsid will be different than theirs. We need * to handle two subcases : * 1 - The fs still continues to have different METADATA/FSID uuids. @@ -1459,7 +1459,7 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, /* Given hole range was invalid (outside of device) */ if (ret == -ERANGE) { *hole_start += *hole_size; - *hole_size = false; + *hole_size = 0; return true; } @@ -1550,7 +1550,7 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, * check to ensure dev extents are not double allocated. * This makes the function safe to allocate dev extents but may not report * correct usable device space, as device extent freed in current transaction - * is not reported as avaiable. + * is not reported as available. */ static int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes, u64 search_start, u64 *start, @@ -4217,14 +4217,6 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, btrfs_bg_type_to_raid_name(data_target)); } - if (fs_info->send_in_progress) { - btrfs_warn_rl(fs_info, -"cannot run balance while send operations are in progress (%d in progress)", - fs_info->send_in_progress); - ret = -EAGAIN; - goto out; - } - ret = insert_balance_item(fs_info, bctl); if (ret && ret != -EEXIST) goto out; @@ -6127,17 +6119,17 @@ static bool need_full_stripe(enum btrfs_map_op op) * @em: mapping containing the logical extent * @op: type of operation - write or read * @logical: address that we want to figure out the geometry of - * @len: the length of IO we are going to perform, starting at @logical * @io_geom: pointer used to return values * * Returns < 0 in case a chunk for the given logical address cannot be found, * usually shouldn't happen unless @logical is corrupted, 0 otherwise. */ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, - enum btrfs_map_op op, u64 logical, u64 len, + enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom) { struct map_lookup *map; + u64 len; u64 offset; u64 stripe_offset; u64 stripe_nr; @@ -6152,7 +6144,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, offset = logical - em->start; /* Len of a stripe in a chunk */ stripe_len = map->stripe_len; - /* Stripe wher this block falls in */ + /* Stripe where this block falls in */ stripe_nr = div64_u64(offset, stripe_len); /* Offset of stripe in the chunk */ stripe_offset = stripe_nr * stripe_len; @@ -6243,7 +6235,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, em = btrfs_get_chunk_map(fs_info, logical, *length); ASSERT(!IS_ERR(em)); - ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom); + ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); if (ret < 0) return ret; @@ -6670,8 +6662,6 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, * * If devid and uuid are both specified, the match must be exact, otherwise * only devid is used. 
- * - * If @seed is true, traverse through the seed devices. */ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, u64 devid, u8 *uuid, u8 *fsid) @@ -7865,7 +7855,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, ret = -EUCLEAN; } - /* Make sure no dev extent is beyond device bondary */ + /* Make sure no dev extent is beyond device boundary */ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 9c0d84e5ec06..c7fc7caf575c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -143,6 +143,9 @@ struct btrfs_device { struct completion kobj_unregister; /* For sysfs/FSID/devinfo/devid/ */ struct kobject devid_kobj; + + /* Bandwidth limit for scrub, in bytes */ + u64 scrub_speed_max; }; /* @@ -443,7 +446,7 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, - enum btrfs_map_op op, u64 logical, u64 len, + enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 304ce64c70a4..297c0b1c0634 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -81,7 +81,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, * *: Special case, no superblock is written * 0: Use write pointer of zones[0] * 1: Use write pointer of zones[1] - * C: Compare super blcoks from zones[0] and zones[1], use the latest + * C: Compare super blocks from zones[0] and zones[1], use the latest * one determined by generation * x: Invalid state */ @@ -150,6 +150,18 @@ static inline u32 sb_zone_number(int shift, int mirror) return (u32)zone; } +static inline sector_t zone_start_sector(u32 zone_number, + struct block_device *bdev) +{ + return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev)); +} + +static inline u64 zone_start_physical(u32 zone_number, + struct btrfs_zoned_device_info *zone_info) +{ + return (u64)zone_number << zone_info->zone_size_shift; +} + /* * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block * device into static sized chunks and fake a conventional zone on each of @@ -405,8 +417,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (sb_zone + 1 >= zone_info->nr_zones) continue; - sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT); - ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, + ret = btrfs_get_dev_zones(device, + zone_start_physical(sb_zone, zone_info), &zone_info->sb_zones[sb_pos], &nr_zones); if (ret) @@ -421,7 +433,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) } /* - * If zones[0] is conventional, always use the beggining of the + * If zones[0] is conventional, always use the beginning of the * zone to record superblock. No need to validate in that case. 
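
The zone_start_sector()/zone_start_physical() helpers added above both turn a zone number into a start position by shifting with the log2 of a power-of-two zone size. A small standalone sketch of the same conversion, with made-up zone geometry:

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: converting a zone number to a start sector and a
 * start byte offset by shifting with log2 of the zone size, as the new
 * helpers do.  The zone geometry below is a made-up example.
 */
int main(void)
{
    uint32_t zone = 5;
    uint64_t zone_sectors = 1ULL << 19;  /* 512-byte sectors per zone (256 MiB zone) */
    unsigned int sector_shift = 19;      /* log2(zone_sectors) */
    unsigned int byte_shift = 19 + 9;    /* log2(zone size in bytes) */

    uint64_t start_sector = (uint64_t)zone << sector_shift;
    uint64_t start_byte = (uint64_t)zone << byte_shift;

    /* the shifts are just multiplications by the (power-of-two) zone size */
    printf("start sector %llu (%llu), start byte %llu (%llu)\n",
           (unsigned long long)start_sector,
           (unsigned long long)(zone * zone_sectors),
           (unsigned long long)start_byte,
           (unsigned long long)(zone * (zone_sectors << 9)));
    return 0;
}
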
*/ if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type == @@ -721,7 +733,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, if (sb_zone + 1 >= nr_zones) return -ENOENT; - ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift, + ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev), BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb, zones); if (ret < 0) @@ -826,7 +838,7 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) return -ENOENT; return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - sb_zone << zone_sectors_shift, + zone_start_sector(sb_zone, bdev), zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); } @@ -878,7 +890,8 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, if (!(end <= sb_zone || sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) { have_sb = true; - pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift; + pos = zone_start_physical( + sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo); break; } @@ -1127,6 +1140,10 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + btrfs_err_in_rcu(fs_info, + "zoned: unexpected conventional zone %llu on device %s (devid %llu)", + zone.start << SECTOR_SHIFT, + rcu_str_deref(device->name), device->devid); ret = -EIO; goto out; } @@ -1187,6 +1204,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case 0: /* single */ + if (alloc_offsets[0] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical); + ret = -EIO; + goto out; + } cache->alloc_offset = alloc_offsets[0]; break; case BTRFS_BLOCK_GROUP_DUP: @@ -1204,6 +1228,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } out: + if (cache->alloc_offset > fs_info->zone_size) { + btrfs_err(fs_info, + "zoned: invalid write pointer %llu in block group %llu", + cache->alloc_offset, cache->start); + ret = -EIO; + } + /* An extent is allocated after the write pointer */ if (!ret && num_conventional && last_alloc > cache->alloc_offset) { btrfs_err(fs_info, @@ -1278,7 +1309,7 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) spin_unlock(&trans->releasing_ebs_lock); } -bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) +bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_group *cache; @@ -1293,7 +1324,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) if (!is_data_inode(&inode->vfs_inode)) return false; - cache = btrfs_lookup_block_group(fs_info, em->block_start); + cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); if (!cache) return false; @@ -1502,3 +1533,24 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, length = wp - physical_pos; return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); } + +struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct btrfs_device *device; + struct extent_map *em; + struct map_lookup *map; + + em = btrfs_get_chunk_map(fs_info, logical, length); + if (IS_ERR(em)) + return ERR_CAST(em); + + map = em->map_lookup; + /* We only support single profile for now */ + ASSERT(map->num_stripes == 1); + device = map->stripes[0].dev; + + free_extent_map(em); + + return device; +} diff --git 
a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 5e41a74a9cb2..b0ae2608cb6b 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -53,7 +53,7 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); -bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em); +bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start); void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, struct bio *bio); void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); @@ -65,6 +65,8 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); +struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -152,8 +154,7 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } -static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, - struct extent_map *em) +static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) { return false; } @@ -192,6 +193,13 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, return -EOPNOTSUPP; } +static inline struct btrfs_device *btrfs_zoned_get_device( + struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + return ERR_PTR(-EOPNOTSUPP); +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 5624fae7a603..9ba79b6531fb 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -668,14 +668,13 @@ out: * Handle lookups for the hidden .snap directory. */ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, - struct dentry *dentry, int err) + struct dentry *dentry) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ /* .snap dir? 
*/ - if (err == -ENOENT && - ceph_snap(parent) == CEPH_NOSNAP && + if (ceph_snap(parent) == CEPH_NOSNAP && strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) { struct dentry *res; struct inode *inode = ceph_get_snapdir(parent); @@ -742,7 +741,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; - struct dentry *res; int op; int mask; int err; @@ -793,12 +791,16 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, req->r_parent = dir; set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, NULL, req); - res = ceph_handle_snapdir(req, dentry, err); - if (IS_ERR(res)) { - err = PTR_ERR(res); - } else { - dentry = res; - err = 0; + if (err == -ENOENT) { + struct dentry *res; + + res = ceph_handle_snapdir(req, dentry); + if (IS_ERR(res)) { + err = PTR_ERR(res); + } else { + dentry = res; + err = 0; + } } dentry = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); /* will dput(dentry) */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 77fc037d5beb..d51af3698032 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -578,6 +578,7 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, struct ceph_inode_info *ci = ceph_inode(dir); struct inode *inode; struct timespec64 now; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_vino vino = { .ino = req->r_deleg_ino, .snap = CEPH_NOSNAP }; @@ -615,8 +616,10 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, ceph_file_layout_to_legacy(lo, &in.layout); + down_read(&mdsc->snap_rwsem); ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, req->r_fmode, NULL); + up_read(&mdsc->snap_rwsem); if (ret) { dout("%s failed to fill inode: %d\n", __func__, ret); ceph_dir_clear_complete(dir); @@ -739,14 +742,16 @@ retry: err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? 
dir : NULL, req); - dentry = ceph_handle_snapdir(req, dentry, err); - if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); - goto out_req; + if (err == -ENOENT) { + dentry = ceph_handle_snapdir(req, dentry); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_req; + } + err = 0; } - err = 0; - if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); if (d_in_lookup(dentry)) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e1c63adb196d..df0c8a724609 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -777,6 +777,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, umode_t mode = le32_to_cpu(info->mode); dev_t rdev = le32_to_cpu(info->rdev); + lockdep_assert_held(&mdsc->snap_rwsem); + dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__, inode, ceph_vinop(inode), le64_to_cpu(info->version), ci->i_version); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index db80d89556b1..839e6b0239ee 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1218,7 +1218,7 @@ extern const struct dentry_operations ceph_dentry_ops; extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, - struct dentry *dentry, int err); + struct dentry *dentry); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 4a97fe12006b..37fc7d6ac457 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -72,15 +72,28 @@ struct smb3_key_debug_info { } __packed; /* - * Dump full key (32 byte encrypt/decrypt keys instead of 16 bytes) - * is needed if GCM256 (stronger encryption) negotiated + * Dump variable-sized keys */ struct smb3_full_key_debug_info { - __u64 Suid; + /* INPUT: size of userspace buffer */ + __u32 in_size; + + /* + * INPUT: 0 for current user, otherwise session to dump + * OUTPUT: session id that was dumped + */ + __u64 session_id; __u16 cipher_type; - __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */ - __u8 smb3encryptionkey[32]; /* SMB3_ENC_DEC_KEY_SIZE */ - __u8 smb3decryptionkey[32]; /* SMB3_ENC_DEC_KEY_SIZE */ + __u8 session_key_length; + __u8 server_in_key_length; + __u8 server_out_key_length; + __u8 data[]; + /* + * return this struct with the keys appended at the end: + * __u8 session_key[session_key_length]; + * __u8 server_in_key[server_in_key_length]; + * __u8 server_out_key[server_out_key_length]; + */ } __packed; struct smb3_notify { diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index d7ea9c5fe0f8..2ffcb29d5c8f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -133,7 +133,7 @@ struct workqueue_struct *cifsiod_wq; struct workqueue_struct *decrypt_wq; struct workqueue_struct *fileinfo_put_wq; struct workqueue_struct *cifsoplockd_wq; -struct workqueue_struct *deferredclose_wq; +struct workqueue_struct *deferredclose_wq; __u32 cifs_lock_secret; /* diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d88b4b523dcc..8488d7024462 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1257,8 +1257,7 @@ struct cifsFileInfo { struct work_struct oplock_break; /* work for oplock breaks */ struct work_struct put; /* work for the final part of _put */ struct delayed_work deferred; - bool oplock_break_received; /* Flag to indicate oplock break */ - bool 
deferred_scheduled; + bool deferred_close_scheduled; /* Flag to indicate close is scheduled */ }; struct cifs_io_parms { @@ -1418,6 +1417,7 @@ struct cifsInodeInfo { struct inode vfs_inode; struct list_head deferred_closes; /* list of deferred closes */ spinlock_t deferred_lock; /* protection on deferred list */ + bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */ }; static inline struct cifsInodeInfo * diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index b53a87db282f..554d64fe171e 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -148,7 +148,8 @@ #define SMB3_SIGN_KEY_SIZE (16) /* - * Size of the smb3 encryption/decryption keys + * Size of the smb3 encryption/decryption key storage. + * This size is big enough to store any cipher key types. */ #define SMB3_ENC_DEC_KEY_SIZE (32) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6caad100c3f3..379a427f3c2f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -323,8 +323,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->dentry = dget(dentry); cfile->f_flags = file->f_flags; cfile->invalidHandle = false; - cfile->oplock_break_received = false; - cfile->deferred_scheduled = false; + cfile->deferred_close_scheduled = false; cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); INIT_WORK(&cfile->put, cifsFileInfo_put_work); @@ -574,21 +573,18 @@ int cifs_open(struct inode *inode, struct file *file) file->f_op = &cifs_file_direct_ops; } - spin_lock(&CIFS_I(inode)->deferred_lock); /* Get the cached handle as SMB2 close is deferred */ rc = cifs_get_readable_path(tcon, full_path, &cfile); if (rc == 0) { if (file->f_flags == cfile->f_flags) { file->private_data = cfile; + spin_lock(&CIFS_I(inode)->deferred_lock); cifs_del_deferred_close(cfile); spin_unlock(&CIFS_I(inode)->deferred_lock); goto out; } else { - spin_unlock(&CIFS_I(inode)->deferred_lock); _cifsFileInfo_put(cfile, true, false); } - } else { - spin_unlock(&CIFS_I(inode)->deferred_lock); } if (server->oplocks) @@ -878,12 +874,8 @@ void smb2_deferred_work_close(struct work_struct *work) struct cifsFileInfo, deferred.work); spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); - if (!cfile->deferred_scheduled) { - spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); - return; - } cifs_del_deferred_close(cfile); - cfile->deferred_scheduled = false; + cfile->deferred_close_scheduled = false; spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); _cifsFileInfo_put(cfile, true, false); } @@ -900,19 +892,26 @@ int cifs_close(struct inode *inode, struct file *file) file->private_data = NULL; dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); if ((cinode->oplock == CIFS_CACHE_RHW_FLG) && + cinode->lease_granted && dclose) { if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) inode->i_ctime = inode->i_mtime = current_time(inode); spin_lock(&cinode->deferred_lock); cifs_add_deferred_close(cfile, dclose); - if (cfile->deferred_scheduled) { - mod_delayed_work(deferredclose_wq, - &cfile->deferred, cifs_sb->ctx->acregmax); + if (cfile->deferred_close_scheduled && + delayed_work_pending(&cfile->deferred)) { + /* + * If there is no pending work, mod_delayed_work queues new work. + * So, Increase the ref count to avoid use-after-free. 
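
The deferred-close hunk above pairs mod_delayed_work() with an extra reference whenever a brand-new work item gets queued, because the work handler drops a reference when it runs. A kernel-style sketch of that pattern in isolation (struct deferred_obj, deferred_obj_release() and flush_deferred_now() are illustrative names, not part of the patch; the work handler is assumed to end with kref_put(&obj->ref, deferred_obj_release)):

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Sketch only, not the cifs code. */
struct deferred_obj {
    struct kref ref;
    struct delayed_work dwork;
};

static void deferred_obj_release(struct kref *ref)
{
    kfree(container_of(ref, struct deferred_obj, ref));
}

/* Force the deferred work to run now instead of at its original delay. */
static void flush_deferred_now(struct workqueue_struct *wq,
                               struct deferred_obj *obj)
{
    /*
     * mod_delayed_work() returns false when nothing was pending and a
     * brand-new work item was queued; that new item's handler will drop
     * a reference, so take one here to pair with it and avoid a
     * use-after-free.
     */
    if (!mod_delayed_work(wq, &obj->dwork, 0))
        kref_get(&obj->ref);
}

The same check shows up again later in this series, in the oplock break path and in cifs_close_all_deferred_files().
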
+ */ + if (!mod_delayed_work(deferredclose_wq, + &cfile->deferred, cifs_sb->ctx->acregmax)) + cifsFileInfo_get(cfile); } else { /* Deferred close for files */ queue_delayed_work(deferredclose_wq, &cfile->deferred, cifs_sb->ctx->acregmax); - cfile->deferred_scheduled = true; + cfile->deferred_close_scheduled = true; spin_unlock(&cinode->deferred_lock); return 0; } @@ -2020,8 +2019,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, if (fsuid_only && !uid_eq(open_file->uid, current_fsuid())) continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) { - if ((!open_file->invalidHandle) && - (!open_file->oplock_break_received)) { + if ((!open_file->invalidHandle)) { /* found a good file */ /* lock it so it will not be closed on us */ cifsFileInfo_get(open_file); @@ -4874,14 +4872,20 @@ oplock_break_ack: } /* * When oplock break is received and there are no active - * file handles but cached, then set the flag oplock_break_received. + * file handles but cached, then schedule deferred close immediately. * So, new open will not use cached handle. */ spin_lock(&CIFS_I(inode)->deferred_lock); is_deferred = cifs_is_deferred_close(cfile, &dclose); - if (is_deferred && cfile->deferred_scheduled) { - cfile->oplock_break_received = true; - mod_delayed_work(deferredclose_wq, &cfile->deferred, 0); + if (is_deferred && + cfile->deferred_close_scheduled && + delayed_work_pending(&cfile->deferred)) { + /* + * If there is no pending work, mod_delayed_work queues new work. + * So, Increase the ref count to avoid use-after-free. + */ + if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) + cifsFileInfo_get(cfile); } spin_unlock(&CIFS_I(inode)->deferred_lock); _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false); diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 5d21cd905315..92d4ab029c91 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -1145,7 +1145,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, /* if iocharset not set then load_nls_default * is used by caller */ - cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset); + cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset); break; case Opt_netbiosname: memset(ctx->source_rfc1001_name, 0x20, diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 28ec8d7c521a..d67d281ab863 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -33,6 +33,7 @@ #include "cifsfs.h" #include "cifs_ioctl.h" #include "smb2proto.h" +#include "smb2glob.h" #include <linux/btrfs.h> static long cifs_ioctl_query_info(unsigned int xid, struct file *filep, @@ -214,48 +215,112 @@ static int cifs_shutdown(struct super_block *sb, unsigned long arg) return 0; } -static int cifs_dump_full_key(struct cifs_tcon *tcon, unsigned long arg) +static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug_info __user *in) { - struct smb3_full_key_debug_info pfull_key_inf; - __u64 suid; - struct list_head *tmp; + struct smb3_full_key_debug_info out; struct cifs_ses *ses; + int rc = 0; bool found = false; + u8 __user *end; - if (!smb3_encryption_required(tcon)) - return -EOPNOTSUPP; + if (!smb3_encryption_required(tcon)) { + rc = -EOPNOTSUPP; + goto out; + } + + /* copy user input into our output buffer */ + if (copy_from_user(&out, in, sizeof(out))) { + rc = -EINVAL; + goto out; + } + + if (!out.session_id) { + /* if ses id is 0, use current user session */ + ses = tcon->ses; + } else { + /* otherwise if a session id is given, look for it in all our sessions */ + struct cifs_ses *ses_it = 
NULL; + struct TCP_Server_Info *server_it = NULL; - ses = tcon->ses; /* default to user id for current user */ - if (get_user(suid, (__u64 __user *)arg)) - suid = 0; - if (suid) { - /* search to see if there is a session with a matching SMB UID */ spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &tcon->ses->server->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); - if (ses->Suid == suid) { - found = true; - break; + list_for_each_entry(server_it, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses_it, &server_it->smb_ses_list, smb_ses_list) { + if (ses_it->Suid == out.session_id) { + ses = ses_it; + /* + * since we are using the session outside the crit + * section, we need to make sure it won't be released + * so increment its refcount + */ + ses->ses_count++; + found = true; + goto search_end; + } } } +search_end: spin_unlock(&cifs_tcp_ses_lock); - if (found == false) - return -EINVAL; - } /* else uses default user's SMB UID (ie current user) */ - - pfull_key_inf.cipher_type = le16_to_cpu(ses->server->cipher_type); - pfull_key_inf.Suid = ses->Suid; - memcpy(pfull_key_inf.auth_key, ses->auth_key.response, - 16 /* SMB2_NTLMV2_SESSKEY_SIZE */); - memcpy(pfull_key_inf.smb3decryptionkey, ses->smb3decryptionkey, - 32 /* SMB3_ENC_DEC_KEY_SIZE */); - memcpy(pfull_key_inf.smb3encryptionkey, - ses->smb3encryptionkey, 32 /* SMB3_ENC_DEC_KEY_SIZE */); - if (copy_to_user((void __user *)arg, &pfull_key_inf, - sizeof(struct smb3_full_key_debug_info))) - return -EFAULT; + if (!found) { + rc = -ENOENT; + goto out; + } + } - return 0; + switch (ses->server->cipher_type) { + case SMB2_ENCRYPTION_AES128_CCM: + case SMB2_ENCRYPTION_AES128_GCM: + out.session_key_length = CIFS_SESS_KEY_SIZE; + out.server_in_key_length = out.server_out_key_length = SMB3_GCM128_CRYPTKEY_SIZE; + break; + case SMB2_ENCRYPTION_AES256_CCM: + case SMB2_ENCRYPTION_AES256_GCM: + out.session_key_length = CIFS_SESS_KEY_SIZE; + out.server_in_key_length = out.server_out_key_length = SMB3_GCM256_CRYPTKEY_SIZE; + break; + default: + rc = -EOPNOTSUPP; + goto out; + } + + /* check if user buffer is big enough to store all the keys */ + if (out.in_size < sizeof(out) + out.session_key_length + out.server_in_key_length + + out.server_out_key_length) { + rc = -ENOBUFS; + goto out; + } + + out.session_id = ses->Suid; + out.cipher_type = le16_to_cpu(ses->server->cipher_type); + + /* overwrite user input with our output */ + if (copy_to_user(in, &out, sizeof(out))) { + rc = -EINVAL; + goto out; + } + + /* append all the keys at the end of the user buffer */ + end = in->data; + if (copy_to_user(end, ses->auth_key.response, out.session_key_length)) { + rc = -EINVAL; + goto out; + } + end += out.session_key_length; + + if (copy_to_user(end, ses->smb3encryptionkey, out.server_in_key_length)) { + rc = -EINVAL; + goto out; + } + end += out.server_in_key_length; + + if (copy_to_user(end, ses->smb3decryptionkey, out.server_out_key_length)) { + rc = -EINVAL; + goto out; + } + +out: + if (found) + cifs_put_smb_ses(ses); + return rc; } long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) @@ -371,6 +436,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EOPNOTSUPP; break; case CIFS_DUMP_KEY: + /* + * Dump encryption keys. This is an old ioctl that only + * handles AES-128-{CCM,GCM}. 
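
Since the reworked CIFS_DUMP_FULL_KEY ioctl above returns variable-sized keys appended after the fixed header, a userspace caller has to size the buffer itself and then walk the lengths the kernel fills in. A rough sketch of such a caller, assuming the struct definition and the CIFS_DUMP_FULL_KEY request code from fs/cifs/cifs_ioctl.h have been made available to userspace (e.g. via a copied header):

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/*
 * Userspace sketch only: fd is an open file on the cifs mount and the
 * caller needs CAP_SYS_ADMIN.
 */
int dump_full_keys(int fd)
{
    /* header plus room for three keys of up to 32 bytes each */
    size_t bufsize = sizeof(struct smb3_full_key_debug_info) + 3 * 32;
    struct smb3_full_key_debug_info *info = calloc(1, bufsize);

    if (!info)
        return -1;

    info->in_size = bufsize;  /* tell the kernel how big our buffer is */
    info->session_id = 0;     /* 0 = use the current user's session */

    if (ioctl(fd, CIFS_DUMP_FULL_KEY, info) < 0) {
        free(info);
        return -1;
    }

    /* keys are appended after the header: session, server-in, server-out */
    unsigned char *session_key = info->data;
    unsigned char *server_in_key = session_key + info->session_key_length;
    unsigned char *server_out_key = server_in_key + info->server_in_key_length;

    printf("session %llx cipher 0x%x key lens %u/%u/%u (first bytes %02x %02x %02x)\n",
           (unsigned long long)info->session_id, info->cipher_type,
           info->session_key_length, info->server_in_key_length,
           info->server_out_key_length,
           session_key[0], server_in_key[0], server_out_key[0]);

    free(info);
    return 0;
}

As the implementation above shows, the kernel rejects the call with -ENOBUFS when in_size is smaller than the header plus the three key lengths, so a caller can retry with a larger buffer.
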
+ */ if (pSMBFile == NULL) break; if (!capable(CAP_SYS_ADMIN)) { @@ -398,11 +467,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) else rc = 0; break; - /* - * Dump full key (32 bytes instead of 16 bytes) is - * needed if GCM256 (stronger encryption) negotiated - */ case CIFS_DUMP_FULL_KEY: + /* + * Dump encryption keys (handles any key sizes) + */ if (pSMBFile == NULL) break; if (!capable(CAP_SYS_ADMIN)) { @@ -410,8 +478,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) break; } tcon = tlink_tcon(pSMBFile->tlink); - rc = cifs_dump_full_key(tcon, arg); - + rc = cifs_dump_full_key(tcon, (void __user *)arg); break; case CIFS_IOC_NOTIFY: if (!S_ISDIR(inode->i_mode)) { diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 524dbdfb7184..7207a63819cb 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -672,6 +672,11 @@ cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } +/* + * Critical section which runs after acquiring deferred_lock. + * As there is no reference count on cifs_deferred_close, pdclose + * should not be used outside deferred_lock. + */ bool cifs_is_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close **pdclose) { @@ -688,6 +693,9 @@ cifs_is_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close ** return false; } +/* + * Critical section which runs after acquiring deferred_lock. + */ void cifs_add_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close *dclose) { @@ -707,6 +715,9 @@ cifs_add_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close * list_add_tail(&dclose->dlist, &CIFS_I(d_inode(cfile->dentry))->deferred_closes); } +/* + * Critical section which runs after acquiring deferred_lock. + */ void cifs_del_deferred_close(struct cifsFileInfo *cfile) { @@ -738,15 +749,19 @@ void cifs_close_all_deferred_files(struct cifs_tcon *tcon) { struct cifsFileInfo *cfile; - struct cifsInodeInfo *cinode; struct list_head *tmp; spin_lock(&tcon->open_file_lock); list_for_each(tmp, &tcon->openFileList) { cfile = list_entry(tmp, struct cifsFileInfo, tlist); - cinode = CIFS_I(d_inode(cfile->dentry)); - if (delayed_work_pending(&cfile->deferred)) - mod_delayed_work(deferredclose_wq, &cfile->deferred, 0); + if (delayed_work_pending(&cfile->deferred)) { + /* + * If there is no pending work, mod_delayed_work queues new work. + * So, Increase the ref count to avoid use-after-free. 
+ */ + if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) + cifsFileInfo_get(cfile); + } } spin_unlock(&tcon->open_file_lock); } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index dd0eb665b680..21ef51d338e0 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1861,6 +1861,8 @@ smb2_copychunk_range(const unsigned int xid, cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk)); /* Request server copy to target from src identified by key */ + kfree(retbuf); + retbuf = NULL; rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, true /* is_fsctl */, (char *)pcchunk, @@ -3981,6 +3983,7 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { oplock &= 0xFF; + cinode->lease_granted = false; if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; if (oplock == SMB2_OPLOCK_LEVEL_BATCH) { @@ -4007,6 +4010,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int new_oplock = 0; oplock &= 0xFF; + cinode->lease_granted = true; if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index a8bf43184773..c205f93e0a10 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -958,6 +958,13 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* Internal types */ server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES; + /* + * SMB3.0 supports only 1 cipher and doesn't have a encryption neg context + * Set the cipher type manually. + */ + if (server->dialect == SMB30_PROT_ID && (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) + server->cipher_type = SMB2_ENCRYPTION_AES128_CCM; + security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, (struct smb2_sync_hdr *)rsp); /* @@ -3900,10 +3907,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * Related requests use info from previous read request * in chain. 
*/ - shdr->SessionId = 0xFFFFFFFF; + shdr->SessionId = 0xFFFFFFFFFFFFFFFF; shdr->TreeId = 0xFFFFFFFF; - req->PersistentFileId = 0xFFFFFFFF; - req->VolatileFileId = 0xFFFFFFFF; + req->PersistentFileId = 0xFFFFFFFFFFFFFFFF; + req->VolatileFileId = 0xFFFFFFFFFFFFFFFF; } } if (remaining_bytes > io_parms->length) diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index d6df908dccad..dafcb6ab050d 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -12,6 +12,11 @@ #include <linux/tracepoint.h> +/* + * Please use this 3-part article as a reference for writing new tracepoints: + * https://lwn.net/Articles/379903/ + */ + /* For logging errors in read or write */ DECLARE_EVENT_CLASS(smb3_rw_err_class, TP_PROTO(unsigned int xid, @@ -529,16 +534,16 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class, TP_ARGS(xid, func_name, rc), TP_STRUCT__entry( __field(unsigned int, xid) - __field(const char *, func_name) + __string(func_name, func_name) __field(int, rc) ), TP_fast_assign( __entry->xid = xid; - __entry->func_name = func_name; + __assign_str(func_name, func_name); __entry->rc = rc; ), TP_printk("\t%s: xid=%u rc=%d", - __entry->func_name, __entry->xid, __entry->rc) + __get_str(func_name), __entry->xid, __entry->rc) ) #define DEFINE_SMB3_EXIT_ERR_EVENT(name) \ @@ -583,14 +588,14 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class, TP_ARGS(xid, func_name), TP_STRUCT__entry( __field(unsigned int, xid) - __field(const char *, func_name) + __string(func_name, func_name) ), TP_fast_assign( __entry->xid = xid; - __entry->func_name = func_name; + __assign_str(func_name, func_name); ), TP_printk("\t%s: xid=%u", - __entry->func_name, __entry->xid) + __get_str(func_name), __entry->xid) ) #define DEFINE_SMB3_ENTER_EXIT_EVENT(name) \ @@ -857,16 +862,16 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_STRUCT__entry( __field(__u64, currmid) __field(__u64, conn_id) - __field(char *, hostname) + __string(hostname, hostname) ), TP_fast_assign( __entry->currmid = currmid; __entry->conn_id = conn_id; - __entry->hostname = hostname; + __assign_str(hostname, hostname); ), TP_printk("conn_id=0x%llx server=%s current_mid=%llu", __entry->conn_id, - __entry->hostname, + __get_str(hostname), __entry->currmid) ) @@ -891,7 +896,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_STRUCT__entry( __field(__u64, currmid) __field(__u64, conn_id) - __field(char *, hostname) + __string(hostname, hostname) __field(int, credits) __field(int, credits_to_add) __field(int, in_flight) @@ -899,7 +904,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_fast_assign( __entry->currmid = currmid; __entry->conn_id = conn_id; - __entry->hostname = hostname; + __assign_str(hostname, hostname); __entry->credits = credits; __entry->credits_to_add = credits_to_add; __entry->in_flight = in_flight; @@ -907,7 +912,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_printk("conn_id=0x%llx server=%s current_mid=%llu " "credits=%d credit_change=%d in_flight=%d", __entry->conn_id, - __entry->hostname, + __get_str(hostname), __entry->currmid, __entry->credits, __entry->credits_to_add, diff --git a/fs/coredump.c b/fs/coredump.c index 2868e3e171ae..c3d8fc14b993 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -519,7 +519,7 @@ static bool dump_interrupted(void) * but then we need to teach dump_write() to restart and clear * TIF_SIGPENDING. 
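
The SMB2 compound-request hunk above widens the sentinel constants because SessionId and the persistent/volatile file ids are 64-bit fields, so assigning 0xFFFFFFFF only sets the low 32 bits rather than the all-ones value the related-request convention expects (TreeId is 32-bit and is left alone). A tiny standalone demonstration of the difference:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: 0xFFFFFFFF zero-extends when stored in a 64-bit field. */
int main(void)
{
    uint64_t session_id_wrong = 0xFFFFFFFF;           /* 0x00000000ffffffff */
    uint64_t session_id_right = 0xFFFFFFFFFFFFFFFF;   /* 0xffffffffffffffff */

    printf("%#llx vs %#llx (all ones: %d)\n",
           (unsigned long long)session_id_wrong,
           (unsigned long long)session_id_right,
           session_id_right == UINT64_MAX);
    return 0;
}
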
*/ - return signal_pending(current); + return fatal_signal_pending(current) || freezing(current); } static void wait_for_dump_helpers(struct file *file) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 6ca7d16593ff..d00455440d08 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -344,13 +344,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, offsetof(struct fscrypt_nokey_name, sha256)); BUILD_BUG_ON(BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX) > NAME_MAX); - if (hash) { - nokey_name.dirhash[0] = hash; - nokey_name.dirhash[1] = minor_hash; - } else { - nokey_name.dirhash[0] = 0; - nokey_name.dirhash[1] = 0; - } + nokey_name.dirhash[0] = hash; + nokey_name.dirhash[1] = minor_hash; + if (iname->len <= sizeof(nokey_name.bytes)) { memcpy(nokey_name.bytes, iname->name, iname->len); size = offsetof(struct fscrypt_nokey_name, bytes[iname->len]); diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 261293fb7097..bca9c6658a7c 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -210,15 +210,40 @@ out_unlock: return err; } +/* + * Derive a SipHash key from the given fscrypt master key and the given + * application-specific information string. + * + * Note that the KDF produces a byte array, but the SipHash APIs expect the key + * as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an + * endianness swap in order to get the same results as on little endian CPUs. + */ +static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, + u8 context, const u8 *info, + unsigned int infolen, siphash_key_t *key) +{ + int err; + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, + (u8 *)key, sizeof(*key)); + if (err) + return err; + + BUILD_BUG_ON(sizeof(*key) != 16); + BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2); + le64_to_cpus(&key->key[0]); + le64_to_cpus(&key->key[1]); + return 0; +} + int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, const struct fscrypt_master_key *mk) { int err; - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_DIRHASH_KEY, - ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, - (u8 *)&ci->ci_dirhash_key, - sizeof(ci->ci_dirhash_key)); + err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, + ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, + &ci->ci_dirhash_key); if (err) return err; ci->ci_dirhash_key_initialized = true; @@ -253,10 +278,9 @@ static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci, if (mk->mk_ino_hash_key_initialized) goto unlock; - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, - HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0, - (u8 *)&mk->mk_ino_hash_key, - sizeof(mk->mk_ino_hash_key)); + err = fscrypt_derive_siphash_key(mk, + HKDF_CONTEXT_INODE_HASH_KEY, + NULL, 0, &mk->mk_ino_hash_key); if (err) goto unlock; /* pairs with smp_load_acquire() above */ diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index e813acfaa6e8..ba7c01cd9a5d 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -893,7 +893,7 @@ ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf, copy[copy_len] = '\n'; - ret = simple_read_from_buffer(user_buf, count, ppos, copy, copy_len); + ret = simple_read_from_buffer(user_buf, count, ppos, copy, len); kfree(copy); return ret; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 1d252164d97b..8129a430d789 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -45,10 +45,13 @@ static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; static int debugfs_setattr(struct user_namespace *mnt_userns, struct dentry 
*dentry, struct iattr *ia) { - int ret = security_locked_down(LOCKDOWN_DEBUGFS); + int ret; - if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) - return ret; + if (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) { + ret = security_locked_down(LOCKDOWN_DEBUGFS); + if (ret) + return ret; + } return simple_setattr(&init_user_ns, dentry, ia); } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 345f8061e3b4..e3f5d7f3c8a0 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -296,10 +296,6 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, struct extent_crypt_result ecr; int rc = 0; - if (!crypt_stat || !crypt_stat->tfm - || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) - return -EINVAL; - if (unlikely(ecryptfs_verbosity > 0)) { ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n", crypt_stat->key_size); diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 858b3339f381..906af0c1998c 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -75,4 +75,3 @@ config EROFS_FS_ZIP Enable fixed-sized output compression for EROFS. If you don't want to enable compression feature, say N. - diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index aea129ddda74..3701c72bacb2 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2019 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_COMPRESS_H #define __EROFS_FS_COMPRESS_H @@ -85,4 +84,3 @@ int z_erofs_decompress(struct z_erofs_decompress_req *rq, struct list_head *pagepool); #endif - diff --git a/fs/erofs/data.c b/fs/erofs/data.c index ebac756cb2a3..3787a5fb0a42 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "internal.h" #include <linux/prefetch.h> @@ -315,4 +314,3 @@ const struct address_space_operations erofs_raw_access_aops = { .readahead = erofs_raw_access_readahead, .bmap = erofs_bmap, }; - diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 88e33addf229..a5bc4b1b7813 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2019 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "compress.h" #include <linux/module.h> @@ -407,4 +406,3 @@ int z_erofs_decompress(struct z_erofs_decompress_req *rq, return z_erofs_shifted_transform(rq, pagepool); return z_erofs_decompress_generic(rq, pagepool); } - diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 2776bb832127..eee9b0b31b63 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "internal.h" @@ -139,4 +138,3 @@ const struct file_operations erofs_dir_fops = { .read = generic_read_dir, .iterate_shared = erofs_readdir, }; - diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 8739d3adf51f..0f8da74570b4 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -4,7 +4,6 @@ * * Copyright (C) 2017-2018 HUAWEI, Inc. 
* https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_H #define __EROFS_FS_H @@ -348,4 +347,3 @@ static inline void erofs_check_ondisk_layout_definitions(void) } #endif - diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 7ed2d7391692..aa8a0d770ba3 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "xattr.h" @@ -374,4 +373,3 @@ const struct inode_operations erofs_fast_symlink_iops = { .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, }; - diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index f92e3e32b9f4..543c2ff97d30 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_INTERNAL_H #define __EROFS_INTERNAL_H @@ -469,4 +468,3 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ - diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index 3a81e1f7fc06..a8271ce5e13f 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "xattr.h" @@ -247,4 +246,3 @@ const struct inode_operations erofs_dir_iops = { .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, }; - diff --git a/fs/erofs/super.c b/fs/erofs/super.c index bbf3bbd908e0..8fc6c04b54f4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include <linux/module.h> #include <linux/buffer_head.h> @@ -285,6 +284,7 @@ static int erofs_read_superblock(struct super_block *sb) goto out; } + ret = -EINVAL; blkszbits = dsb->blkszbits; /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ if (blkszbits != LOG_BLOCK_SIZE) { @@ -751,4 +751,3 @@ module_exit(erofs_module_exit); MODULE_DESCRIPTION("Enhanced ROM File System"); MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); MODULE_LICENSE("GPL"); - diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h index a72897c86744..64ceb7270b5c 100644 --- a/fs/erofs/tagptr.h +++ b/fs/erofs/tagptr.h @@ -1,8 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * A tagged pointer implementation - * - * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_TAGPTR_H #define __EROFS_FS_TAGPTR_H @@ -107,4 +105,3 @@ tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) *ptptr; }) #endif /* __EROFS_FS_TAGPTR_H */ - diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 6758c5b19f7c..bd86067a63f7 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "internal.h" #include <linux/pagevec.h> @@ -278,4 +277,3 @@ void erofs_exit_shrinker(void) unregister_shrinker(&erofs_shrinker_info); } #endif /* !CONFIG_EROFS_FS_ZIP */ - diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 47314a26767a..8dd54b420a1d 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
* https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include <linux/security.h> #include "xattr.h" @@ -709,4 +708,3 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type) return acl; } #endif - diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 815304bd335f..366dcb400525 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_XATTR_H #define __EROFS_XATTR_H diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 78e4b598ecca..cb4d0889eca9 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "zdata.h" #include "compress.h" @@ -380,7 +379,6 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, enum z_erofs_page_type type) { int ret; - bool occupied; /* give priority for inplaceio */ if (clt->mode >= COLLECT_PRIMARY && @@ -388,8 +386,7 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, z_erofs_try_inplace_io(clt, page)) return 0; - ret = z_erofs_pagevec_enqueue(&clt->vector, - page, type, &occupied); + ret = z_erofs_pagevec_enqueue(&clt->vector, page, type); clt->cl->vcnt += (unsigned int)ret; return ret ? 0 : -EAGAIN; @@ -1471,4 +1468,3 @@ const struct address_space_operations z_erofs_aops = { .readpage = z_erofs_readpage, .readahead = z_erofs_readahead, }; - diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 942ee69dff6a..3a008f1b9f78 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_ZDATA_H #define __EROFS_FS_ZDATA_H diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index efaf32596b97..f68aea4baed7 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2018-2019 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "internal.h" #include <asm/unaligned.h> @@ -597,4 +596,3 @@ out: DBG_BUGON(err < 0 && err != -ENOMEM); return err; } - diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h index 1d67cbd38704..dfd7fe0503bb 100644 --- a/fs/erofs/zpvec.h +++ b/fs/erofs/zpvec.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. 
* https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_ZPVEC_H #define __EROFS_FS_ZPVEC_H @@ -107,10 +106,8 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, struct page *page, - enum z_erofs_page_type type, - bool *occupied) + enum z_erofs_page_type type) { - *occupied = false; if (!ctor->next && type) if (ctor->index + 1 == ctor->nr) return false; @@ -125,7 +122,6 @@ static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, /* should remind that collector->next never equal to 1, 2 */ if (type == (uintptr_t)ctor->next) { ctor->next = page; - *occupied = true; } ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); return true; @@ -154,4 +150,3 @@ z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, return tagptr_unfold_ptr(t); } #endif - diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 77c84d6f1af6..cbf37b2cf871 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3206,7 +3206,10 @@ static int ext4_split_extent_at(handle_t *handle, ext4_ext_mark_unwritten(ex2); err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); - if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + if (err != -ENOSPC && err != -EDQUOT) + goto out; + + if (EXT4_EXT_MAY_ZEROOUT & split_flag) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); @@ -3232,25 +3235,22 @@ static int ext4_split_extent_at(handle_t *handle, ext4_ext_pblock(&orig_ex)); } - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_len = cpu_to_le16(ee_len); - ext4_ext_try_to_merge(handle, inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - if (err) - goto fix_extent_len; - - /* update extent status tree */ - err = ext4_zeroout_es(inode, &zero_ex); - - goto out; - } else if (err) - goto fix_extent_len; - -out: - ext4_ext_show_leaf(inode, path); - return err; + if (!err) { + /* update the extent length and mark as initialized */ + ex->ee_len = cpu_to_le16(ee_len); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (!err) + /* update extent status tree */ + err = ext4_zeroout_es(inode, &zero_ex); + /* If we failed at this point, we don't know in which + * state the extent tree exactly is so don't try to fix + * length of the original extent as it may do even more + * damage. 
+ */ + goto out; + } + } fix_extent_len: ex->ee_len = orig_ex.ee_len; @@ -3260,6 +3260,9 @@ fix_extent_len: */ ext4_ext_dirty(handle, inode, path + path->p_depth); return err; +out: + ext4_ext_show_leaf(inode, path); + return err; } /* diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index f98ca4f37ef6..e8195229c252 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1288,28 +1288,29 @@ struct dentry_info_args { }; static inline void tl_to_darg(struct dentry_info_args *darg, - struct ext4_fc_tl *tl) + struct ext4_fc_tl *tl, u8 *val) { - struct ext4_fc_dentry_info *fcd; + struct ext4_fc_dentry_info fcd; - fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl); + memcpy(&fcd, val, sizeof(fcd)); - darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino); - darg->ino = le32_to_cpu(fcd->fc_ino); - darg->dname = fcd->fc_dname; - darg->dname_len = ext4_fc_tag_len(tl) - - sizeof(struct ext4_fc_dentry_info); + darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); + darg->ino = le32_to_cpu(fcd.fc_ino); + darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); + darg->dname_len = le16_to_cpu(tl->fc_len) - + sizeof(struct ext4_fc_dentry_info); } /* Unlink replay function */ -static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl) +static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, + u8 *val) { struct inode *inode, *old_parent; struct qstr entry; struct dentry_info_args darg; int ret = 0; - tl_to_darg(&darg, tl); + tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, darg.parent_ino, darg.dname_len); @@ -1399,13 +1400,14 @@ out: } /* Link replay function */ -static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl) +static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl, + u8 *val) { struct inode *inode; struct dentry_info_args darg; int ret = 0; - tl_to_darg(&darg, tl); + tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, darg.parent_ino, darg.dname_len); @@ -1450,9 +1452,10 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) /* * Inode replay function */ -static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl) +static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, + u8 *val) { - struct ext4_fc_inode *fc_inode; + struct ext4_fc_inode fc_inode; struct ext4_inode *raw_inode; struct ext4_inode *raw_fc_inode; struct inode *inode = NULL; @@ -1460,9 +1463,9 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl) int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); struct ext4_extent_header *eh; - fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl); + memcpy(&fc_inode, val, sizeof(fc_inode)); - ino = le32_to_cpu(fc_inode->fc_ino); + ino = le32_to_cpu(fc_inode.fc_ino); trace_ext4_fc_replay(sb, tag, ino, 0, 0); inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); @@ -1474,12 +1477,13 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl) ext4_fc_record_modified_inode(sb, ino); - raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode; + raw_fc_inode = (struct ext4_inode *) + (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); ret = ext4_get_fc_inode_loc(sb, ino, &iloc); if (ret) goto out; - inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode); + inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode); raw_inode = ext4_raw_inode(&iloc); memcpy(raw_inode, raw_fc_inode, 
offsetof(struct ext4_inode, i_block)); @@ -1547,14 +1551,15 @@ out: * inode for which we are trying to create a dentry here, should already have * been replayed before we start here. */ -static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl) +static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, + u8 *val) { int ret = 0; struct inode *inode = NULL; struct inode *dir = NULL; struct dentry_info_args darg; - tl_to_darg(&darg, tl); + tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, darg.parent_ino, darg.dname_len); @@ -1633,9 +1638,9 @@ static int ext4_fc_record_regions(struct super_block *sb, int ino, /* Replay add range tag */ static int ext4_fc_replay_add_range(struct super_block *sb, - struct ext4_fc_tl *tl) + struct ext4_fc_tl *tl, u8 *val) { - struct ext4_fc_add_range *fc_add_ex; + struct ext4_fc_add_range fc_add_ex; struct ext4_extent newex, *ex; struct inode *inode; ext4_lblk_t start, cur; @@ -1645,15 +1650,14 @@ static int ext4_fc_replay_add_range(struct super_block *sb, struct ext4_ext_path *path = NULL; int ret; - fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl); - ex = (struct ext4_extent *)&fc_add_ex->fc_ex; + memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); + ex = (struct ext4_extent *)&fc_add_ex.fc_ex; trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, - le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block), + le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); - inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino), - EXT4_IGET_NORMAL); + inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { jbd_debug(1, "Inode not found."); return 0; @@ -1762,32 +1766,33 @@ next: /* Replay DEL_RANGE tag */ static int -ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl) +ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, + u8 *val) { struct inode *inode; - struct ext4_fc_del_range *lrange; + struct ext4_fc_del_range lrange; struct ext4_map_blocks map; ext4_lblk_t cur, remaining; int ret; - lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl); - cur = le32_to_cpu(lrange->fc_lblk); - remaining = le32_to_cpu(lrange->fc_len); + memcpy(&lrange, val, sizeof(lrange)); + cur = le32_to_cpu(lrange.fc_lblk); + remaining = le32_to_cpu(lrange.fc_len); trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, - le32_to_cpu(lrange->fc_ino), cur, remaining); + le32_to_cpu(lrange.fc_ino), cur, remaining); - inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL); + inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino)); + jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino)); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", - inode->i_ino, le32_to_cpu(lrange->fc_lblk), - le32_to_cpu(lrange->fc_len)); + inode->i_ino, le32_to_cpu(lrange.fc_lblk), + le32_to_cpu(lrange.fc_len)); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; @@ -1808,8 +1813,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl) } ret = ext4_punch_hole(inode, - le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits, - le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits); + le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits, + le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits); if (ret) jbd_debug(1, "ext4_punch_hole 
returned %d", ret); ext4_ext_replay_shrink_inode(inode, @@ -1925,11 +1930,11 @@ static int ext4_fc_replay_scan(journal_t *journal, struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_replay_state *state; int ret = JBD2_FC_REPLAY_CONTINUE; - struct ext4_fc_add_range *ext; - struct ext4_fc_tl *tl; - struct ext4_fc_tail *tail; - __u8 *start, *end; - struct ext4_fc_head *head; + struct ext4_fc_add_range ext; + struct ext4_fc_tl tl; + struct ext4_fc_tail tail; + __u8 *start, *end, *cur, *val; + struct ext4_fc_head head; struct ext4_extent *ex; state = &sbi->s_fc_replay_state; @@ -1956,15 +1961,17 @@ static int ext4_fc_replay_scan(journal_t *journal, } state->fc_replay_expected_off++; - fc_for_each_tl(start, end, tl) { + for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { + memcpy(&tl, cur, sizeof(tl)); + val = cur + sizeof(tl); jbd_debug(3, "Scan phase, tag:%s, blk %lld\n", - tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr); - switch (le16_to_cpu(tl->fc_tag)) { + tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr); + switch (le16_to_cpu(tl.fc_tag)) { case EXT4_FC_TAG_ADD_RANGE: - ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl); - ex = (struct ext4_extent *)&ext->fc_ex; + memcpy(&ext, val, sizeof(ext)); + ex = (struct ext4_extent *)&ext.fc_ex; ret = ext4_fc_record_regions(sb, - le32_to_cpu(ext->fc_ino), + le32_to_cpu(ext.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_get_actual_len(ex)); if (ret < 0) @@ -1978,18 +1985,18 @@ static int ext4_fc_replay_scan(journal_t *journal, case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl, - sizeof(*tl) + ext4_fc_tag_len(tl)); + state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + sizeof(tl) + le16_to_cpu(tl.fc_len)); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; - tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl); - state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl, - sizeof(*tl) + + memcpy(&tail, val, sizeof(tail)); + state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + sizeof(tl) + offsetof(struct ext4_fc_tail, fc_crc)); - if (le32_to_cpu(tail->fc_tid) == expected_tid && - le32_to_cpu(tail->fc_crc) == state->fc_crc) { + if (le32_to_cpu(tail.fc_tid) == expected_tid && + le32_to_cpu(tail.fc_crc) == state->fc_crc) { state->fc_replay_num_tags = state->fc_cur_tag; state->fc_regions_valid = state->fc_regions_used; @@ -2000,19 +2007,19 @@ static int ext4_fc_replay_scan(journal_t *journal, state->fc_crc = 0; break; case EXT4_FC_TAG_HEAD: - head = (struct ext4_fc_head *)ext4_fc_tag_val(tl); - if (le32_to_cpu(head->fc_features) & + memcpy(&head, val, sizeof(head)); + if (le32_to_cpu(head.fc_features) & ~EXT4_FC_SUPPORTED_FEATURES) { ret = -EOPNOTSUPP; break; } - if (le32_to_cpu(head->fc_tid) != expected_tid) { + if (le32_to_cpu(head.fc_tid) != expected_tid) { ret = JBD2_FC_REPLAY_STOP; break; } state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl, - sizeof(*tl) + ext4_fc_tag_len(tl)); + state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + sizeof(tl) + le16_to_cpu(tl.fc_len)); break; default: ret = state->fc_replay_num_tags ? 
@@ -2036,11 +2043,11 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_fc_tl *tl; - __u8 *start, *end; + struct ext4_fc_tl tl; + __u8 *start, *end, *cur, *val; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; - struct ext4_fc_tail *tail; + struct ext4_fc_tail tail; if (pass == PASS_SCAN) { state->fc_current_pass = PASS_SCAN; @@ -2067,49 +2074,52 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, start = (u8 *)bh->b_data; end = (__u8 *)bh->b_data + journal->j_blocksize - 1; - fc_for_each_tl(start, end, tl) { + for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { + memcpy(&tl, cur, sizeof(tl)); + val = cur + sizeof(tl); + if (state->fc_replay_num_tags == 0) { ret = JBD2_FC_REPLAY_STOP; ext4_fc_set_bitmaps_and_counters(sb); break; } jbd_debug(3, "Replay phase, tag:%s\n", - tag2str(le16_to_cpu(tl->fc_tag))); + tag2str(le16_to_cpu(tl.fc_tag))); state->fc_replay_num_tags--; - switch (le16_to_cpu(tl->fc_tag)) { + switch (le16_to_cpu(tl.fc_tag)) { case EXT4_FC_TAG_LINK: - ret = ext4_fc_replay_link(sb, tl); + ret = ext4_fc_replay_link(sb, &tl, val); break; case EXT4_FC_TAG_UNLINK: - ret = ext4_fc_replay_unlink(sb, tl); + ret = ext4_fc_replay_unlink(sb, &tl, val); break; case EXT4_FC_TAG_ADD_RANGE: - ret = ext4_fc_replay_add_range(sb, tl); + ret = ext4_fc_replay_add_range(sb, &tl, val); break; case EXT4_FC_TAG_CREAT: - ret = ext4_fc_replay_create(sb, tl); + ret = ext4_fc_replay_create(sb, &tl, val); break; case EXT4_FC_TAG_DEL_RANGE: - ret = ext4_fc_replay_del_range(sb, tl); + ret = ext4_fc_replay_del_range(sb, &tl, val); break; case EXT4_FC_TAG_INODE: - ret = ext4_fc_replay_inode(sb, tl); + ret = ext4_fc_replay_inode(sb, &tl, val); break; case EXT4_FC_TAG_PAD: trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, - ext4_fc_tag_len(tl), 0); + le16_to_cpu(tl.fc_len), 0); break; case EXT4_FC_TAG_TAIL: trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, - ext4_fc_tag_len(tl), 0); - tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl); - WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid); + le16_to_cpu(tl.fc_len), 0); + memcpy(&tail, val, sizeof(tail)); + WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); break; case EXT4_FC_TAG_HEAD: break; default: - trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0, - ext4_fc_tag_len(tl), 0); + trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0, + le16_to_cpu(tl.fc_len), 0); ret = -ECANCELED; break; } diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index b77f70f55a62..937c381b4c85 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -153,13 +153,6 @@ struct ext4_fc_replay_state { #define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1) #endif -#define fc_for_each_tl(__start, __end, __tl) \ - for (tl = (struct ext4_fc_tl *)(__start); \ - (__u8 *)tl < (__u8 *)(__end); \ - tl = (struct ext4_fc_tl *)((__u8 *)tl + \ - sizeof(struct ext4_fc_tl) + \ - + le16_to_cpu(tl->fc_len))) - static inline const char *tag2str(__u16 tag) { switch (tag) { @@ -186,16 +179,4 @@ static inline const char *tag2str(__u16 tag) } } -/* Get length of a particular tlv */ -static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl) -{ - return le16_to_cpu(tl->fc_len); -} - -/* Get a pointer to "value" of a tlv */ -static inline __u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl) -{ - return (__u8 *)tl + sizeof(*tl); -} - #endif /* __FAST_COMMIT_H__ */ diff --git 
a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 81a17a3cd80e..9bab7fd4ccd5 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -322,14 +322,16 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (is_directory) { count = ext4_used_dirs_count(sb, gdp) - 1; ext4_used_dirs_set(sb, gdp, count); - percpu_counter_dec(&sbi->s_dirs_counter); + if (percpu_counter_initialized(&sbi->s_dirs_counter)) + percpu_counter_dec(&sbi->s_dirs_counter); } ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); - percpu_counter_inc(&sbi->s_freeinodes_counter); + if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) + percpu_counter_inc(&sbi->s_freeinodes_counter); if (sbi->s_log_groups_per_flex) { struct flex_groups *fg; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3239e6669e84..c2c22c2baac0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3217,7 +3217,7 @@ static int ext4_mb_init_backend(struct super_block *sb) */ if (sbi->s_es->s_log_groups_per_flex >= 32) { ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group"); - goto err_freesgi; + goto err_freebuddy; } sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex, BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index afb9d05a99ba..a4af26d4459a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1376,7 +1376,8 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct dx_hash_info *hinfo = &name->hinfo; int len; - if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding) { + if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding || + (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) { cf_name->name = NULL; return 0; } @@ -1427,7 +1428,8 @@ static bool ext4_match(struct inode *parent, #endif #ifdef CONFIG_UNICODE - if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent)) { + if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) && + (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { if (fname->cf_name.name) { struct qstr cf = {.name = fname->cf_name.name, .len = fname->cf_name.len}; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7dc94f3e18e6..d29f6aa7d96e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4462,14 +4462,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (sb->s_blocksize != blocksize) { + /* + * bh must be released before kill_bdev(), otherwise + * it won't be freed and its page also. kill_bdev() + * is called by sb_set_blocksize(). + */ + brelse(bh); /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); + bh = NULL; goto failed_mount; } - brelse(bh); logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = ext4_sb_bread_unmovable(sb, logical_sb_block); @@ -5202,8 +5208,9 @@ failed_mount: kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); - ext4_blkdev_remove(sbi); + /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. 
*/ brelse(bh); + ext4_blkdev_remove(sbi); out_fail: sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 6f825dedc3d4..55fcab60a59a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -315,7 +315,9 @@ EXT4_ATTR_FEATURE(verity); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); EXT4_ATTR_FEATURE(fast_commit); +#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) EXT4_ATTR_FEATURE(encrypted_casefold); +#endif static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), @@ -333,7 +335,9 @@ static struct attribute *ext4_feat_attrs[] = { #endif ATTR_LIST(metadata_csum_seed), ATTR_LIST(fast_commit), +#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) ATTR_LIST(encrypted_casefold), +#endif NULL, }; ATTRIBUTE_GROUPS(ext4_feat); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index a0b542d84cd9..493a83e3f590 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -911,8 +911,11 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) current->backing_dev_info = inode_to_bdi(inode); buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); current->backing_dev_info = NULL; - if (unlikely(buffered <= 0)) + if (unlikely(buffered <= 0)) { + if (!ret) + ret = buffered; goto out_unlock; + } /* * We need to ensure that the page cache pages are written to diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ea7fc5c641c7..d9cb261f55b0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -582,6 +582,16 @@ out_locked: spin_unlock(&gl->gl_lockref.lock); } +static bool is_system_glock(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + + if (gl == m_ip->i_gl) + return true; + return false; +} + /** * do_xmote - Calls the DLM to change the state of a lock * @gl: The lock state @@ -671,17 +681,25 @@ skip_inval: * to see sd_log_error and withdraw, and in the meantime, requeue the * work for later. * + * We make a special exception for some system glocks, such as the + * system statfs inode glock, which needs to be granted before the + * gfs2_quotad daemon can exit, and that exit needs to finish before + * we can unmount the withdrawn file system. + * * However, if we're just unlocking the lock (say, for unmount, when * gfs2_gl_hash_clear calls clear_glock) and recovery is complete * then it's okay to tell dlm to unlock it. 
*/ if (unlikely(sdp->sd_log_error && !gfs2_withdrawn(sdp))) gfs2_withdraw_delayed(sdp); - if (glock_blocked_by_withdraw(gl)) { - if (target != LM_ST_UNLOCKED || - test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags)) { + if (glock_blocked_by_withdraw(gl) && + (target != LM_ST_UNLOCKED || + test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) { + if (!is_system_glock(gl)) { gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); goto out; + } else { + clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } } @@ -1466,9 +1484,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh) glock_blocked_by_withdraw(gl) && gh->gh_gl != sdp->sd_jinode_gl) { sdp->sd_glock_dqs_held++; + spin_unlock(&gl->gl_lockref.lock); might_sleep(); wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, TASK_UNINTERRUPTIBLE); + spin_lock(&gl->gl_lockref.lock); } if (gh->gh_flags & GL_NOCACHE) handle_callback(gl, LM_ST_UNLOCKED, 0, false); @@ -1775,6 +1795,7 @@ __acquires(&lru_lock) while(!list_empty(list)) { gl = list_first_entry(list, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); + clear_bit(GLF_LRU, &gl->gl_flags); if (!spin_trylock(&gl->gl_lockref.lock)) { add_back_to_lru: list_add(&gl->gl_lru, &lru_list); @@ -1820,7 +1841,6 @@ static long gfs2_scan_glock_lru(int nr) if (!test_bit(GLF_LOCK, &gl->gl_flags)) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); - clear_bit(GLF_LRU, &gl->gl_flags); freed++; continue; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 454095e9fedf..54d3fbeb3002 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -396,7 +396,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) struct timespec64 atime; u16 height, depth; umode_t mode = be32_to_cpu(str->di_mode); - bool is_new = ip->i_inode.i_flags & I_NEW; + bool is_new = ip->i_inode.i_state & I_NEW; if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) goto corrupt; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 97d54e581a7b..42c15cfc0821 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -926,10 +926,10 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) } /** - * ail_drain - drain the ail lists after a withdraw + * gfs2_ail_drain - drain the ail lists after a withdraw * @sdp: Pointer to GFS2 superblock */ -static void ail_drain(struct gfs2_sbd *sdp) +void gfs2_ail_drain(struct gfs2_sbd *sdp) { struct gfs2_trans *tr; @@ -956,6 +956,7 @@ static void ail_drain(struct gfs2_sbd *sdp) list_del(&tr->tr_list); gfs2_trans_free(sdp, tr); } + gfs2_drain_revokes(sdp); spin_unlock(&sdp->sd_ail_lock); } @@ -1162,7 +1163,6 @@ out_withdraw: if (tr && list_empty(&tr->tr_list)) list_add(&tr->tr_list, &sdp->sd_ail1_list); spin_unlock(&sdp->sd_ail_lock); - ail_drain(sdp); /* frees all transactions */ tr = NULL; goto out_end; } diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index eea58015710e..fc905c2af53c 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -93,5 +93,6 @@ extern int gfs2_logd(void *data); extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl); extern void gfs2_flush_revokes(struct gfs2_sbd *sdp); +extern void gfs2_ail_drain(struct gfs2_sbd *sdp); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 221e7118cc3b..8ee05d25dfa6 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -885,7 +885,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) gfs2_log_write_page(sdp, page); } -static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +void 
gfs2_drain_revokes(struct gfs2_sbd *sdp) { struct list_head *head = &sdp->sd_log_revokes; struct gfs2_bufdata *bd; @@ -900,6 +900,11 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) } } +static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + gfs2_drain_revokes(sdp); +} + static void revoke_lo_before_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass) { diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 31b6dd0d2e5d..f707601597dc 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -20,6 +20,7 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); +extern void gfs2_drain_revokes(struct gfs2_sbd *sdp); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { return sdp->sd_ldptrs; diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 3e08027a6c81..f4325b44956d 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -131,6 +131,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) if (test_bit(SDF_NORECOVERY, &sdp->sd_flags) || !sdp->sd_jdesc) return; + gfs2_ail_drain(sdp); /* frees all transactions */ inode = sdp->sd_jdesc->jd_inode; ip = GFS2_I(inode); i_gl = ip->i_gl; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9d9e0097c1d3..30dee68458c7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -529,7 +529,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * the subpool and global reserve usage count can need * to be adjusted. */ - VM_BUG_ON(PagePrivate(page)); + VM_BUG_ON(HPageRestoreReserve(page)); remove_huge_page(page); freed++; if (!truncate_op) { @@ -735,6 +735,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, __SetPageUptodate(page); error = huge_add_to_page_cache(page, mapping, index); if (unlikely(error)) { + restore_reserve_on_error(h, &pseudo_vma, addr, page); put_page(page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; diff --git a/fs/io-wq.c b/fs/io-wq.c index 5361a9b4b47b..b3e8624a37d0 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -979,13 +979,16 @@ static bool io_task_work_match(struct callback_head *cb, void *data) return cwd->wqe->wq == data; } +void io_wq_exit_start(struct io_wq *wq) +{ + set_bit(IO_WQ_BIT_EXIT, &wq->state); +} + static void io_wq_exit_workers(struct io_wq *wq) { struct callback_head *cb; int node; - set_bit(IO_WQ_BIT_EXIT, &wq->state); - if (!wq->task) return; @@ -1003,13 +1006,16 @@ static void io_wq_exit_workers(struct io_wq *wq) struct io_wqe *wqe = wq->wqes[node]; io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL); - spin_lock_irq(&wq->hash->wait.lock); - list_del_init(&wq->wqes[node]->wait.entry); - spin_unlock_irq(&wq->hash->wait.lock); } rcu_read_unlock(); io_worker_ref_put(wq); wait_for_completion(&wq->worker_done); + + for_each_node(node) { + spin_lock_irq(&wq->hash->wait.lock); + list_del_init(&wq->wqes[node]->wait.entry); + spin_unlock_irq(&wq->hash->wait.lock); + } put_task_struct(wq->task); wq->task = NULL; } @@ -1020,8 +1026,6 @@ static void io_wq_destroy(struct io_wq *wq) cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - io_wq_exit_workers(wq); - for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; struct io_cb_cancel_data match = { @@ -1036,16 +1040,13 @@ static void io_wq_destroy(struct io_wq *wq) kfree(wq); } -void io_wq_put(struct io_wq *wq) -{ - if 
(refcount_dec_and_test(&wq->refs)) - io_wq_destroy(wq); -} - void io_wq_put_and_exit(struct io_wq *wq) { + WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state)); + io_wq_exit_workers(wq); - io_wq_put(wq); + if (refcount_dec_and_test(&wq->refs)) + io_wq_destroy(wq); } static bool io_wq_worker_affinity(struct io_worker *worker, void *data) diff --git a/fs/io-wq.h b/fs/io-wq.h index 0e6d310999e8..af2df0680ee2 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -122,7 +122,7 @@ struct io_wq_data { }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); -void io_wq_put(struct io_wq *wq); +void io_wq_exit_start(struct io_wq *wq); void io_wq_put_and_exit(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); diff --git a/fs/io_uring.c b/fs/io_uring.c index e481ac8a757a..fa8794c61af7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -783,6 +783,11 @@ struct io_task_work { task_work_func_t func; }; +enum { + IORING_RSRC_FILE = 0, + IORING_RSRC_BUFFER = 1, +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -5019,10 +5024,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, * Can't handle multishot for double wait for now, turn it * into one-shot mode. */ - if (!(req->poll.events & EPOLLONESHOT)) - req->poll.events |= EPOLLONESHOT; + if (!(poll_one->events & EPOLLONESHOT)) + poll_one->events |= EPOLLONESHOT; /* double add on the same waitqueue head, ignore */ - if (poll->head == head) + if (poll_one->head == head) return; poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { @@ -8228,6 +8233,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, { int i, ret; + imu->acct_pages = 0; for (i = 0; i < nr_pages; i++) { if (!PageCompound(pages[i])) { imu->acct_pages++; @@ -9035,14 +9041,19 @@ static void io_uring_del_task_file(unsigned long index) static void io_uring_clean_tctx(struct io_uring_task *tctx) { + struct io_wq *wq = tctx->io_wq; struct io_tctx_node *node; unsigned long index; xa_for_each(&tctx->xa, index, node) io_uring_del_task_file(index); - if (tctx->io_wq) { - io_wq_put_and_exit(tctx->io_wq); + if (wq) { + /* + * Must be after io_uring_del_task_file() (removes nodes under + * uring_lock) to avoid race with io_uring_try_cancel_iowq(). 
+ */ tctx->io_wq = NULL; + io_wq_put_and_exit(wq); } } @@ -9078,6 +9089,9 @@ static void io_uring_cancel_sqpoll(struct io_sq_data *sqd) if (!current->io_uring) return; + if (tctx->io_wq) + io_wq_exit_start(tctx->io_wq); + WARN_ON_ONCE(!sqd || sqd->thread != current); atomic_inc(&tctx->in_idle); @@ -9112,6 +9126,9 @@ void __io_uring_cancel(struct files_struct *files) DEFINE_WAIT(wait); s64 inflight; + if (tctx->io_wq) + io_wq_exit_start(tctx->io_wq); + /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); do { @@ -9659,7 +9676,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | - IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS; + IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | + IORING_FEAT_RSRC_TAGS; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -9899,7 +9917,7 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, } static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned size) + unsigned size, unsigned type) { struct io_uring_rsrc_update2 up; @@ -9907,13 +9925,13 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; - if (!up.nr) + if (!up.nr || up.resv) return -EINVAL; - return __io_register_rsrc_update(ctx, up.type, &up, up.nr); + return __io_register_rsrc_update(ctx, type, &up, up.nr); } static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, - unsigned int size) + unsigned int size, unsigned int type) { struct io_uring_rsrc_register rr; @@ -9924,10 +9942,10 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, memset(&rr, 0, sizeof(rr)); if (copy_from_user(&rr, arg, size)) return -EFAULT; - if (!rr.nr) + if (!rr.nr || rr.resv || rr.resv2) return -EINVAL; - switch (rr.type) { + switch (type) { case IORING_RSRC_FILE: return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); @@ -9949,8 +9967,10 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_PROBE: case IORING_REGISTER_PERSONALITY: case IORING_UNREGISTER_PERSONALITY: - case IORING_REGISTER_RSRC: - case IORING_REGISTER_RSRC_UPDATE: + case IORING_REGISTER_FILES2: + case IORING_REGISTER_FILES_UPDATE2: + case IORING_REGISTER_BUFFERS2: + case IORING_REGISTER_BUFFERS_UPDATE: return false; default: return true; @@ -10076,11 +10096,19 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_REGISTER_RESTRICTIONS: ret = io_register_restrictions(ctx, arg, nr_args); break; - case IORING_REGISTER_RSRC: - ret = io_register_rsrc(ctx, arg, nr_args); + case IORING_REGISTER_FILES2: + ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); + break; + case IORING_REGISTER_FILES_UPDATE2: + ret = io_register_rsrc_update(ctx, arg, nr_args, + IORING_RSRC_FILE); + break; + case IORING_REGISTER_BUFFERS2: + ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); break; - case IORING_REGISTER_RSRC_UPDATE: - ret = io_register_rsrc_update(ctx, arg, nr_args); + case IORING_REGISTER_BUFFERS_UPDATE: + ret = io_register_rsrc_update(ctx, arg, nr_args, + IORING_RSRC_BUFFER); break; default: ret = -EINVAL; diff --git a/fs/namespace.c b/fs/namespace.c index f63337828e1c..c3f1a78ba369 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3855,8 +3855,12 @@ 
static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) return -EINVAL; + /* Don't yet support filesystem mountable in user namespaces. */ + if (m->mnt_sb->s_user_ns != &init_user_ns) + return -EINVAL; + /* We're not controlling the superblock. */ - if (!ns_capable(m->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* Mount has already been visible in the filesystem hierarchy. */ diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig index 578112713703..b4db21022cb4 100644 --- a/fs/netfs/Kconfig +++ b/fs/netfs/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NETFS_SUPPORT - tristate "Support for network filesystem high-level I/O" + tristate help This option enables support for network filesystems, including helpers for high-level buffered I/O, abstracting out read diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 193841d03de0..0b6cd3b8734c 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -1011,12 +1011,42 @@ out: } EXPORT_SYMBOL(netfs_readpage); -static void netfs_clear_thp(struct page *page) +/** + * netfs_skip_page_read - prep a page for writing without reading first + * @page: page being prepared + * @pos: starting position for the write + * @len: length of write + * + * In some cases, write_begin doesn't need to read at all: + * - full page write + * - write that lies in a page that is completely beyond EOF + * - write that covers the the page from start to EOF or beyond it + * + * If any of these criteria are met, then zero out the unwritten parts + * of the page and return true. Otherwise, return false. + */ +static bool netfs_skip_page_read(struct page *page, loff_t pos, size_t len) { - unsigned int i; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + size_t offset = offset_in_thp(page, pos); + + /* Full page write */ + if (offset == 0 && len >= thp_size(page)) + return true; + + /* pos beyond last page in the file */ + if (pos - offset >= i_size) + goto zero_out; + + /* Write that covers from the start of the page to EOF or beyond */ + if (offset == 0 && (pos + len) >= i_size) + goto zero_out; - for (i = 0; i < thp_nr_pages(page); i++) - clear_highpage(page + i); + return false; +zero_out: + zero_user_segments(page, 0, offset, offset + len, thp_size(page)); + return true; } /** @@ -1024,7 +1054,7 @@ static void netfs_clear_thp(struct page *page) * @file: The file to read from * @mapping: The mapping to read from * @pos: File position at which the write will begin - * @len: The length of the write in this page + * @len: The length of the write (may extend beyond the end of the page chosen) * @flags: AOP_* flags * @_page: Where to put the resultant page * @_fsdata: Place for the netfs to store a cookie @@ -1061,14 +1091,12 @@ int netfs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = file_inode(file); unsigned int debug_index = 0; pgoff_t index = pos >> PAGE_SHIFT; - int pos_in_page = pos & ~PAGE_MASK; - loff_t size; int ret; DEFINE_READAHEAD(ractl, file, NULL, mapping, index); retry: - page = grab_cache_page_write_begin(mapping, index, 0); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; @@ -1090,13 +1118,8 @@ retry: * within the cache granule containing the EOF, in which case we need * to preload the granule. 
*/ - size = i_size_read(inode); if (!ops->is_cache_enabled(inode) && - ((pos_in_page == 0 && len == thp_size(page)) || - (pos >= size) || - (pos_in_page == 0 && (pos + len) >= size))) { - netfs_clear_thp(page); - SetPageUptodate(page); + netfs_skip_page_read(page, pos, len)) { netfs_stat(&netfs_n_rh_write_zskip); goto have_page_no_wait; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index cfeaadf56bf0..330f65727c45 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -406,7 +406,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) if (cl_init->hostname == NULL) { WARN_ON(1); - return NULL; + return ERR_PTR(-EINVAL); } /* see if the client already exists */ diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index d158a500c25c..d2103852475f 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -718,7 +718,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, if (unlikely(!p)) goto out_err; fl->fh_array[i]->size = be32_to_cpup(p++); - if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { + if (fl->fh_array[i]->size > NFS_MAXFHSIZE) { printk(KERN_ERR "NFS: Too big fh %d received %d\n", i, fl->fh_array[i]->size); goto out_err; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 93e60e921f92..bc0c698f3350 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -362,7 +362,7 @@ static const struct kernel_param_ops param_ops_nfs_timeout = { .set = param_set_nfs_timeout, .get = param_get_nfs_timeout, }; -#define param_check_nfs_timeout(name, p) __param_check(name, p, int); +#define param_check_nfs_timeout(name, p) __param_check(name, p, int) module_param(nfs_mountpoint_expiry_timeout, nfs_timeout, 0644); MODULE_PARM_DESC(nfs_mountpoint_expiry_timeout, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 065cb04222a1..543d916f79ab 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -205,6 +205,7 @@ struct nfs4_exception { struct inode *inode; nfs4_stateid *stateid; long timeout; + unsigned char task_is_privileged : 1; unsigned char delay : 1, recovering : 1, retry : 1; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 889a9f4c0310..42719384e25f 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -435,8 +435,8 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, */ nfs_mark_client_ready(clp, -EPERM); } - nfs_put_client(clp); clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags); + nfs_put_client(clp); return old; error: diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 57b3821d975a..a1e5c6b85ded 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -211,7 +211,7 @@ static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) case SEEK_HOLE: case SEEK_DATA: ret = nfs42_proc_llseek(filep, offset, whence); - if (ret != -ENOTSUPP) + if (ret != -EOPNOTSUPP) return ret; fallthrough; default: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 87d04f2c9385..e653654c10bc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -589,6 +589,8 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ goto out_retry; } if (exception->recovering) { + if (exception->task_is_privileged) + return -EDEADLOCK; ret = nfs4_wait_clnt_recover(clp); if (test_bit(NFS_MIG_FAILED, &server->mig_status)) return -EIO; @@ -614,6 +616,8 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server, goto out_retry; } if (exception->recovering) { + if (exception->task_is_privileged) + return -EDEADLOCK; 
rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); @@ -1706,7 +1710,7 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, rcu_read_unlock(); trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0); - if (!signal_pending(current)) { + if (!fatal_signal_pending(current)) { if (schedule_timeout(5*HZ) == 0) status = -EAGAIN; else @@ -3487,7 +3491,7 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, write_sequnlock(&state->seqlock); trace_nfs4_close_stateid_update_wait(state->inode, dst, 0); - if (signal_pending(current)) + if (fatal_signal_pending(current)) status = -EINTR; else if (schedule_timeout(5*HZ) != 0) @@ -3878,6 +3882,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) + server->caps |= NFS_CAP_SECURITY_LABEL; +#endif if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID)) server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID; if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE)) @@ -3898,10 +3906,6 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->fattr_valid &= ~NFS_ATTR_FATTR_CTIME; if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)) server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME; -#ifdef CONFIG_NFS_V4_SECURITY_LABEL - if (!(res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)) - server->fattr_valid &= ~NFS_ATTR_FATTR_V4_SECURITY_LABEL; -#endif memcpy(server->attr_bitmask_nl, res.attr_bitmask, sizeof(server->attr_bitmask)); server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; @@ -5968,6 +5972,14 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen do { err = __nfs4_proc_set_acl(inode, buf, buflen); trace_nfs4_set_acl(inode, err); + if (err == -NFS4ERR_BADOWNER || err == -NFS4ERR_BADNAME) { + /* + * no need to retry since the kernel + * isn't involved in encoding the ACEs. 
+ */ + err = -EINVAL; + break; + } err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry); @@ -6409,6 +6421,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) struct nfs4_exception exception = { .inode = data->inode, .stateid = &data->stateid, + .task_is_privileged = data->args.seq_args.sa_privileged, }; if (!nfs4_sequence_done(task, &data->res.seq_res)) @@ -6532,7 +6545,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, data = kzalloc(sizeof(*data), GFP_NOFS); if (data == NULL) return -ENOMEM; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0); nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, @@ -6563,6 +6575,12 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, } } + if (!data->inode) + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, + 1); + else + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, + 0); task_setup_data.callback_data = data; msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; @@ -9640,15 +9658,20 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) &task_setup_data.rpc_client, &msg); dprintk("--> %s\n", __func__); + lrp->inode = nfs_igrab_and_active(lrp->args.inode); if (!sync) { - lrp->inode = nfs_igrab_and_active(lrp->args.inode); if (!lrp->inode) { nfs4_layoutreturn_release(lrp); return -EAGAIN; } task_setup_data.flags |= RPC_TASK_ASYNC; } - nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1, 0); + if (!lrp->inode) + nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1, + 1); + else + nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1, + 0); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index eb1ef3462e84..ccef43e02b48 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -430,10 +430,6 @@ TRACE_DEFINE_ENUM(O_CLOEXEC); { O_NOATIME, "O_NOATIME" }, \ { O_CLOEXEC, "O_CLOEXEC" }) -TRACE_DEFINE_ENUM(FMODE_READ); -TRACE_DEFINE_ENUM(FMODE_WRITE); -TRACE_DEFINE_ENUM(FMODE_EXEC); - #define show_fmode_flags(mode) \ __print_flags(mode, "|", \ { ((__force unsigned long)FMODE_READ), "READ" }, \ diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6c20b28d9d7c..cf9cc62ec48e 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1094,15 +1094,16 @@ nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *prev = NULL; unsigned int size; - if (mirror->pg_count != 0) { - prev = nfs_list_entry(mirror->pg_list.prev); - } else { + if (list_empty(&mirror->pg_list)) { if (desc->pg_ops->pg_init) desc->pg_ops->pg_init(desc, req); if (desc->pg_error < 0) return 0; mirror->pg_base = req->wb_pgbase; - } + mirror->pg_count = 0; + mirror->pg_recoalesce = 0; + } else + prev = nfs_list_entry(mirror->pg_list.prev); if (desc->pg_maxretrans && req->wb_nio > desc->pg_maxretrans) { if (NFS_SERVER(desc->pg_inode)->flags & NFS_MOUNT_SOFTERR) @@ -1127,18 +1128,13 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - if (!list_empty(&mirror->pg_list)) { int error = desc->pg_ops->pg_doio(desc); if (error < 0) desc->pg_error = error; - else + if (list_empty(&mirror->pg_list)) mirror->pg_bytes_written += mirror->pg_count; } - if (list_empty(&mirror->pg_list)) { - mirror->pg_count = 0; - mirror->pg_base = 0; - } } static void @@ -1227,10 +1223,6 @@ static int 
nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) do { list_splice_init(&mirror->pg_list, &head); - mirror->pg_bytes_written -= mirror->pg_count; - mirror->pg_count = 0; - mirror->pg_base = 0; - mirror->pg_recoalesce = 0; while (!list_empty(&head)) { struct nfs_page *req; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 03e0b34c4a64..2c01ee805306 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1317,6 +1317,11 @@ _pnfs_return_layout(struct inode *ino) { struct pnfs_layout_hdr *lo = NULL; struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; LIST_HEAD(tmp_list); const struct cred *cred; nfs4_stateid stateid; @@ -1344,16 +1349,10 @@ _pnfs_return_layout(struct inode *ino) } valid_layout = pnfs_layout_is_valid(lo); pnfs_clear_layoutcommit(ino, &tmp_list); - pnfs_mark_matching_lsegs_return(lo, &tmp_list, NULL, 0); + pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0); - if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { - struct pnfs_layout_range range = { - .iomode = IOMODE_ANY, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range); - } /* Don't send a LAYOUTRETURN if list was initially empty */ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) || @@ -2678,7 +2677,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - u64 rd_size = req->wb_bytes; + u64 rd_size; pnfs_generic_pg_check_layout(pgio); pnfs_generic_pg_check_range(pgio, req); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 19a212f9725d..fe58525cfed4 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1379,7 +1379,7 @@ static const struct kernel_param_ops param_ops_portnr = { .set = param_set_portnr, .get = param_get_uint, }; -#define param_check_portnr(name, p) __param_check(name, p, unsigned int); +#define param_check_portnr(name, p) __param_check(name, p, unsigned int) module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644); diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 303d71430bdd..9c6c0e2e5880 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -1053,6 +1053,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) nilfs_sysfs_delete_superblock_group(nilfs); nilfs_sysfs_delete_segctor_group(nilfs); kobject_del(&nilfs->ns_dev_kobj); + kobject_put(&nilfs->ns_dev_kobj); kfree(nilfs->ns_dev_subgroups); } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 71fefb30e015..64864fb40b40 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -424,11 +424,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, * events generated by the listener process itself, without disclosing * the pids of other processes. */ - if (!capable(CAP_SYS_ADMIN) && + if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && task_tgid(current) != event->pid) metadata.pid = 0; - if (path && path->mnt && path->dentry) { + /* + * For now, fid mode is required for an unprivileged listener and + * fid mode does not report fd in events. Keep this check anyway + * for safety in case fid mode requirement is relaxed in the future + * to allow unprivileged listener to get events with no fd and no fid. 
+ */ + if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && + path && path->mnt && path->dentry) { fd = create_fd(group, path, &f); if (fd < 0) return fd; @@ -464,7 +471,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, info_type, fanotify_info_name(info), info->name_len, buf, count); if (ret < 0) - return ret; + goto out_close_fd; buf += ret; count -= ret; @@ -512,7 +519,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, fanotify_event_object_fh(event), info_type, dot, dot_len, buf, count); if (ret < 0) - return ret; + goto out_close_fd; buf += ret; count -= ret; @@ -1040,6 +1047,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) int f_flags, fd; unsigned int fid_mode = flags & FANOTIFY_FID_BITS; unsigned int class = flags & FANOTIFY_CLASS_BITS; + unsigned int internal_flags = 0; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); @@ -1053,6 +1061,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) */ if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode) return -EPERM; + + /* + * Setting the internal flag FANOTIFY_UNPRIV on the group + * prevents setting mount/filesystem marks on this group and + * prevents reporting pid and open fd in events. + */ + internal_flags |= FANOTIFY_UNPRIV; } #ifdef CONFIG_AUDITSYSCALL @@ -1105,7 +1120,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) goto out_destroy_group; } - group->fanotify_data.flags = flags; + group->fanotify_data.flags = flags | internal_flags; group->memcg = get_mem_cgroup_from_mm(current->mm); group->fanotify_data.merge_hash = fanotify_alloc_merge_hash(); @@ -1305,11 +1320,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, group = f.file->private_data; /* - * An unprivileged user is not allowed to watch a mount point nor - * a filesystem. + * An unprivileged user is not allowed to setup mount nor filesystem + * marks. This also includes setting up such marks by a group that + * was initialized by an unprivileged user. */ ret = -EPERM; - if (!capable(CAP_SYS_ADMIN) && + if ((!capable(CAP_SYS_ADMIN) || + FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) && mark_type != FAN_MARK_INODE) goto fput_and_out; @@ -1460,6 +1477,7 @@ static int __init fanotify_user_setup(void) max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS, FANOTIFY_DEFAULT_MAX_USER_MARKS); + BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index a712b2aaa9ac..57f0d5d9f934 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -144,7 +144,7 @@ void fanotify_show_fdinfo(struct seq_file *m, struct file *f) struct fsnotify_group *group = f->private_data; seq_printf(m, "fanotify flags:%x event-flags:%x\n", - group->fanotify_data.flags, + group->fanotify_data.flags & FANOTIFY_INIT_FLAGS, group->fanotify_data.f_flags); show_fdinfo(m, f, fanotify_fdinfo); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index f17c3d33fb18..775657943057 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1856,6 +1856,45 @@ out: } /* + * zero out partial blocks of one cluster. + * + * start: file offset where zero starts, will be made upper block aligned. + * len: it will be trimmed to the end of current cluster if "start + len" + * is bigger than it. 
+ */ +static int ocfs2_zeroout_partial_cluster(struct inode *inode, + u64 start, u64 len) +{ + int ret; + u64 start_block, end_block, nr_blocks; + u64 p_block, offset; + u32 cluster, p_cluster, nr_clusters; + struct super_block *sb = inode->i_sb; + u64 end = ocfs2_align_bytes_to_clusters(sb, start); + + if (start + len < end) + end = start + len; + + start_block = ocfs2_blocks_for_bytes(sb, start); + end_block = ocfs2_blocks_for_bytes(sb, end); + nr_blocks = end_block - start_block; + if (!nr_blocks) + return 0; + + cluster = ocfs2_bytes_to_clusters(sb, start); + ret = ocfs2_get_clusters(inode, cluster, &p_cluster, + &nr_clusters, NULL); + if (ret) + return ret; + if (!p_cluster) + return 0; + + offset = start_block - ocfs2_clusters_to_blocks(sb, cluster); + p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset; + return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS); +} + +/* * Parts of this function taken from xfs_change_file_space() */ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, @@ -1865,7 +1904,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, { int ret; s64 llen; - loff_t size; + loff_t size, orig_isize; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *di_bh = NULL; handle_t *handle; @@ -1896,6 +1935,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, goto out_inode_unlock; } + orig_isize = i_size_read(inode); switch (sr->l_whence) { case 0: /*SEEK_SET*/ break; @@ -1903,7 +1943,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, sr->l_start += f_pos; break; case 2: /*SEEK_END*/ - sr->l_start += i_size_read(inode); + sr->l_start += orig_isize; break; default: ret = -EINVAL; @@ -1957,6 +1997,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, default: ret = -EINVAL; } + + /* zeroout eof blocks in the cluster. */ + if (!ret && change_size && orig_isize < size) { + ret = ocfs2_zeroout_partial_cluster(inode, orig_isize, + size - orig_isize); + if (!ret) + i_size_write(inode, size); + } up_write(&OCFS2_I(inode)->ip_alloc_sem); if (ret) { mlog_errno(ret); @@ -1973,9 +2021,6 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, goto out_inode_unlock; } - if (change_size && i_size_read(inode) < size) - i_size_write(inode, size); - inode->i_ctime = inode->i_mtime = current_time(inode); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); if (ret < 0) diff --git a/fs/proc/base.c b/fs/proc/base.c index 3851bfcdba56..9cbd915025ad 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2674,6 +2674,13 @@ out: } #ifdef CONFIG_SECURITY +static int proc_pid_attr_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); + return 0; +} + static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { @@ -2703,6 +2710,10 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, void *page; int rv; + /* A task may only write when it was the opener. 
*/ + if (file->private_data != current->mm) + return -EPERM; + rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (!task) { @@ -2750,9 +2761,11 @@ out: } static const struct file_operations proc_pid_attr_operations = { + .open = proc_pid_attr_open, .read = proc_pid_attr_read, .write = proc_pid_attr_write, .llseek = generic_file_llseek, + .release = mem_release, }; #define LSM_DIR_OPS(LSM) \ diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 8468baee951d..f32878d9a39f 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -16,7 +16,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) get_avenrun(avnrun, FIXED_1/200, 0); - seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n", LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), diff --git a/fs/proc/stat.c b/fs/proc/stat.c index f25e8531fd27..6561a06ef905 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -200,8 +200,8 @@ static int show_stat(struct seq_file *p, void *v) "\nctxt %llu\n" "btime %llu\n" "processes %lu\n" - "procs_running %lu\n" - "procs_blocked %lu\n", + "procs_running %u\n" + "procs_blocked %u\n", nr_context_switches(), (unsigned long long)boottime.tv_sec, total_forks, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 4f1373463766..22d904bde6ab 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -288,14 +288,12 @@ static inline void remove_dquot_hash(struct dquot *dquot) static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, struct kqid qid) { - struct hlist_node *node; struct dquot *dquot; - hlist_for_each (node, dquot_hash+hashent) { - dquot = hlist_entry(node, struct dquot, dq_hash); + hlist_for_each_entry(dquot, dquot_hash+hashent, dq_hash) if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid)) return dquot; - } + return NULL; } diff --git a/fs/signalfd.c b/fs/signalfd.c index 040a1142915f..167b5889db4b 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -114,29 +114,24 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, break; case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: + case SIL_PERF_EVENT: /* - * Fall through to the SIL_FAULT case. Both SIL_FAULT_BNDERR - * and SIL_FAULT_PKUERR are only generated by faults that - * deliver them synchronously to userspace. In case someone - * injects one of these signals and signalfd catches it treat - * it as SIL_FAULT. + * Fall through to the SIL_FAULT case. SIL_FAULT_BNDERR, + * SIL_FAULT_PKUERR, and SIL_PERF_EVENT are only + * generated by faults that deliver them synchronously to + * userspace. In case someone injects one of these signals + * and signalfd catches it treat it as SIL_FAULT. 
*/ case SIL_FAULT: new.ssi_addr = (long) kinfo->si_addr; -#ifdef __ARCH_SI_TRAPNO - new.ssi_trapno = kinfo->si_trapno; -#endif break; - case SIL_FAULT_MCEERR: + case SIL_FAULT_TRAPNO: new.ssi_addr = (long) kinfo->si_addr; -#ifdef __ARCH_SI_TRAPNO new.ssi_trapno = kinfo->si_trapno; -#endif - new.ssi_addr_lsb = (short) kinfo->si_addr_lsb; break; - case SIL_PERF_EVENT: + case SIL_FAULT_MCEERR: new.ssi_addr = (long) kinfo->si_addr; - new.ssi_perf = kinfo->si_perf; + new.ssi_addr_lsb = (short) kinfo->si_addr_lsb; break; case SIL_CHLD: new.ssi_pid = kinfo->si_pid; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 14f92285d04f..dd7a6c62b56f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -337,7 +337,7 @@ out: return ret; } -static inline long userfaultfd_get_blocking_state(unsigned int flags) +static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags) { if (flags & FAULT_FLAG_INTERRUPTIBLE) return TASK_INTERRUPTIBLE; @@ -370,7 +370,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) struct userfaultfd_wait_queue uwq; vm_fault_t ret = VM_FAULT_SIGBUS; bool must_wait; - long blocking_state; + unsigned int blocking_state; /* * We don't do userfault handling for the final child pid update. diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e32a1833d523..bbfea8022a3b 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -325,10 +325,22 @@ out: error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0); if (error2) return error2; - ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + - xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= - pag->pagf_freeblks + pag->pagf_flcount); + + /* + * If there isn't enough space in the AG to satisfy the + * reservation, let the caller know that there wasn't enough + * space. Callers are responsible for deciding what to do + * next, since (in theory) we can stumble along with + * insufficient reservation if data blocks are being freed to + * replenish the AG's free space. + */ + if (!error && + xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + + xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved > + pag->pagf_freeblks + pag->pagf_flcount) + error = -ENOSPC; } + return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 7e3b9b01431e..a3e0e6f672d6 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -605,7 +605,6 @@ xfs_bmap_btree_to_extents( ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); - ASSERT(!xfs_need_iread_extents(ifp)); ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); @@ -5350,7 +5349,6 @@ __xfs_bunmapi( xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ xfs_fileoff_t max_len; - xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; xfs_fileoff_t end; struct xfs_iext_cursor icur; bool done = false; @@ -5442,16 +5440,6 @@ __xfs_bunmapi( del = got; wasdel = isnullstartblock(del.br_startblock); - /* - * Make sure we don't touch multiple AGF headers out of order - * in a single transaction, as that could cause AB-BA deadlocks. 
- */ - if (!wasdel && !isrt) { - agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); - if (prev_agno != NULLAGNUMBER && prev_agno > agno) - break; - prev_agno = agno; - } if (got.br_startoff < start) { del.br_startoff = start; del.br_blockcount -= start - got.br_startoff; diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index a83bdd0c47a8..bde2b4c64dbe 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -770,6 +770,8 @@ struct xfs_scrub_metadata { /* * ioctl commands that are used by Linux filesystems */ +#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS +#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS #define XFS_IOC_GETVERSION FS_IOC_GETVERSION /* @@ -780,6 +782,8 @@ struct xfs_scrub_metadata { #define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) #define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) +#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR #define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) #define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 5c9a7440d9e4..f3254a4f4cb4 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -559,8 +559,17 @@ xfs_dinode_calc_crc( /* * Validate di_extsize hint. * - * The rules are documented at xfs_ioctl_setattr_check_extsize(). - * These functions must be kept in sync with each other. + * 1. Extent size hint is only valid for directories and regular files. + * 2. FS_XFLAG_EXTSIZE is only valid for regular files. + * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. + * 4. Hint cannot be larger than MAXTEXTLEN. + * 5. Can be changed on directories at any time. + * 6. Hint value of 0 turns off hints, clears inode flags. + * 7. Extent size must be a multiple of the appropriate block size. + * For realtime files, this is the rt extent size. + * 8. For non-realtime files, the extent size hint must be limited + * to half the AG size to avoid alignment extending the extent beyond the + * limits of the AG. */ xfs_failaddr_t xfs_inode_validate_extsize( @@ -580,6 +589,28 @@ xfs_inode_validate_extsize( inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); extsize_bytes = XFS_FSB_TO_B(mp, extsize); + /* + * This comment describes a historic gap in this verifier function. + * + * On older kernels, the extent size hint verifier doesn't check that + * the extent size hint is an integer multiple of the realtime extent + * size on a directory with both RTINHERIT and EXTSZINHERIT flags set. + * The verifier has always enforced the alignment rule for regular + * files with the REALTIME flag set. + * + * If a directory with a misaligned extent size hint is allowed to + * propagate that hint into a new regular realtime file, the result + * is that the inode cluster buffer verifier will trigger a corruption + * shutdown the next time it is run. + * + * Unfortunately, there could be filesystems with these misconfigured + * directories in the wild, so we cannot add a check to this verifier + * at this time because that will result a new source of directory + * corruption errors when reading an existing filesystem. Instead, we + * permit the misconfiguration to pass through the verifiers so that + * callers of this function can correct and mitigate externally. 
+ */ + if (rt_flag) blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; else @@ -616,8 +647,15 @@ xfs_inode_validate_extsize( /* * Validate di_cowextsize hint. * - * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). - * These functions must be kept in sync with each other. + * 1. CoW extent size hint can only be set if reflink is enabled on the fs. + * The inode does not have to have any shared blocks, but it must be a v3. + * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files; + * for a directory, the hint is propagated to new files. + * 3. Can be changed on files & directories at any time. + * 4. Hint value of 0 turns off hints, clears inode flags. + * 5. Extent size must be a multiple of the appropriate block size. + * 6. The extent size hint must be limited to half the AG size to avoid + * alignment extending the extent beyond the limits of the AG. */ xfs_failaddr_t xfs_inode_validate_cowextsize( diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 78324e043e25..8d595a5c4abd 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -143,6 +143,23 @@ xfs_trans_log_inode( } /* + * Inode verifiers on older kernels don't check that the extent size + * hint is an integer multiple of the rt extent size on a directory + * with both rtinherit and extszinherit flags set. If we're logging a + * directory that is misconfigured in this way, clear the hint. + */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { + xfs_info_once(ip->i_mount, + "Correcting misaligned extent size hint in inode 0x%llx.", ip->i_ino); + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + + /* * Record the specific change for fdatasync optimisation. This allows * fdatasync to skip log forces for inodes that are only timestamp * dirty. diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index aa874607618a..be38c960da85 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -74,7 +74,9 @@ __xchk_process_error( return true; case -EDEADLOCK: /* Used to restart an op with deadlock avoidance. */ - trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); + trace_xchk_deadlock_retry( + sc->ip ? 
sc->ip : XFS_I(file_inode(sc->file)), + sc->sm, *error); break; case -EFSBADCRC: case -EFSCORRUPTED: diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a5e9d7d34023..0936f3a96fe6 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -71,18 +71,24 @@ xfs_zero_extent( #ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc( - struct xfs_bmalloca *ap) /* bmap alloc argument struct */ + struct xfs_bmalloca *ap) { - int error; /* error return value */ - xfs_mount_t *mp; /* mount point structure */ - xfs_extlen_t prod = 0; /* product factor for allocators */ - xfs_extlen_t mod = 0; /* product factor for allocators */ - xfs_extlen_t ralen = 0; /* realtime allocation length */ - xfs_extlen_t align; /* minimum allocation alignment */ - xfs_rtblock_t rtb; - - mp = ap->ip->i_mount; + struct xfs_mount *mp = ap->ip->i_mount; + xfs_fileoff_t orig_offset = ap->offset; + xfs_rtblock_t rtb; + xfs_extlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t mod = 0; /* product factor for allocators */ + xfs_extlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_extlen_t orig_length = ap->length; + xfs_extlen_t minlen = mp->m_sb.sb_rextsize; + xfs_extlen_t raminlen; + bool rtlocked = false; + bool ignore_locality = false; + int error; + align = xfs_get_extsz_hint(ap->ip); +retry: prod = align / mp->m_sb.sb_rextsize; error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, @@ -93,6 +99,15 @@ xfs_bmap_rtalloc( ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); /* + * If we shifted the file offset downward to satisfy an extent size + * hint, increase minlen by that amount so that the allocator won't + * give us an allocation that's too short to cover at least one of the + * blocks that the caller asked for. + */ + if (ap->offset != orig_offset) + minlen += orig_offset - ap->offset; + + /* * If the offset & length are not perfectly aligned * then kill prod, it will just get us in trouble. */ @@ -116,10 +131,13 @@ xfs_bmap_rtalloc( /* * Lock out modifications to both the RT bitmap and summary inodes */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); - xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); + if (!rtlocked) { + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); + xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); + rtlocked = true; + } /* * If it's an allocation to an empty file at offset 0, @@ -141,33 +159,59 @@ xfs_bmap_rtalloc( /* * Realtime allocation, done through xfs_rtallocate_extent. 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0369eb22c1bb..e4c2da4566f1 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -690,6 +690,7 @@ xfs_inode_inherit_flags(
 	const struct xfs_inode	*pip)
 {
 	unsigned int		di_flags = 0;
+	xfs_failaddr_t		failaddr;
 	umode_t			mode = VFS_I(ip)->i_mode;
 
 	if (S_ISDIR(mode)) {
@@ -729,6 +730,24 @@ xfs_inode_inherit_flags(
 		di_flags |= XFS_DIFLAG_FILESTREAM;
 
 	ip->i_diflags |= di_flags;
+
+	/*
+	 * Inode verifiers on older kernels only check that the extent size
+	 * hint is an integer multiple of the rt extent size on realtime files.
+	 * They did not check the hint alignment on a directory with both
+	 * rtinherit and extszinherit flags set. If the misaligned hint is
+	 * propagated from a directory into a new realtime file, new file
+	 * allocations will fail due to math errors in the rt allocator and/or
+	 * trip the verifiers. Validate the hint settings in the new file so
+	 * that we don't let broken hints propagate.
+	 */
+	failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
+			VFS_I(ip)->i_mode, ip->i_diflags);
+	if (failaddr) {
+		ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+				   XFS_DIFLAG_EXTSZINHERIT);
+		ip->i_extsize = 0;
+	}
 }
 
 /* Propagate di_flags2 from a parent inode to a child inode. */
@@ -737,12 +756,22 @@ xfs_inode_inherit_flags2(
 	struct xfs_inode	*ip,
 	const struct xfs_inode	*pip)
 {
+	xfs_failaddr_t		failaddr;
+
 	if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
 		ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
 		ip->i_cowextsize = pip->i_cowextsize;
 	}
 	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
 		ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+
+	/* Don't let invalid cowextsize hints propagate. */
+	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
+			VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
+	if (failaddr) {
+		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+		ip->i_cowextsize = 0;
+	}
 }
 
 /*
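Both inherit helpers above follow the same copy-then-validate pattern: the child inode takes the parent's hint, the hint is run through the shared validator, and an invalid hint is zeroed so it never propagates further. A toy version of that pattern, with an invented validator standing in for the real one, might read:

/*
 * Sketch of propagate-then-validate: copy the parent's hint, check it,
 * and clear it on failure so a broken hint never spreads.  All names and
 * the "half the AG" rule used by the fake validator are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_inode {
	unsigned int cowextsize;    /* CoW extent size hint, in fs blocks */
	bool cowextsize_flag;
};

/* Pretend validator: a hint larger than half the "AG" is invalid. */
static bool hint_is_valid(unsigned int hint, unsigned int agblocks)
{
	return hint <= agblocks / 2;
}

static void inherit_cow_hint(struct toy_inode *child, const struct toy_inode *parent,
			     unsigned int agblocks)
{
	if (parent->cowextsize_flag) {
		child->cowextsize_flag = true;
		child->cowextsize = parent->cowextsize;
	}

	/* Don't let an invalid hint propagate into the new inode. */
	if (!hint_is_valid(child->cowextsize, agblocks)) {
		child->cowextsize_flag = false;
		child->cowextsize = 0;
	}
}

int main(void)
{
	struct toy_inode dir = { .cowextsize = 900, .cowextsize_flag = true };
	struct toy_inode file = { 0 };

	inherit_cow_hint(&file, &dir, 1000);   /* 900 > 1000/2, so the hint is dropped */
	printf("child hint = %u\n", file.cowextsize);
	return 0;
}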
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 3925bfcb2365..1fe4c1fc0aea 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1267,20 +1267,8 @@ out_error:
 }
 
 /*
- * extent size hint validation is somewhat cumbersome. Rules are:
- *
- * 1. extent size hint is only valid for directories and regular files
- * 2. FS_XFLAG_EXTSIZE is only valid for regular files
- * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.
- * 4. can only be changed on regular files if no extents are allocated
- * 5. can be changed on directories at any time
- * 6. extsize hint of 0 turns off hints, clears inode flags.
- * 7. Extent size must be a multiple of the appropriate block size.
- * 8. for non-realtime files, the extent size hint must be limited
- *    to half the AG size to avoid alignment extending the extent beyond the
- *    limits of the AG.
- *
- * Please keep this function in sync with xfs_scrub_inode_extsize.
+ * Validate a proposed extent size hint. For regular files, the hint can only
+ * be changed if no extents are allocated.
  */
 static int
 xfs_ioctl_setattr_check_extsize(
@@ -1288,86 +1276,65 @@ xfs_ioctl_setattr_check_extsize(
 	struct fileattr		*fa)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	xfs_extlen_t		size;
-	xfs_fsblock_t		extsize_fsb;
+	xfs_failaddr_t		failaddr;
+	uint16_t		new_diflags;
 
 	if (!fa->fsx_valid)
 		return 0;
 
 	if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents &&
-	    ((ip->i_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
+	    XFS_FSB_TO_B(mp, ip->i_extsize) != fa->fsx_extsize)
 		return -EINVAL;
 
-	if (fa->fsx_extsize == 0)
-		return 0;
-
-	extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
-	if (extsize_fsb > MAXEXTLEN)
+	if (fa->fsx_extsize & mp->m_blockmask)
 		return -EINVAL;
 
-	if (XFS_IS_REALTIME_INODE(ip) ||
-	    (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
-		size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
-	} else {
-		size = mp->m_sb.sb_blocksize;
-		if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
+	new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
+
+	/*
+	 * Inode verifiers on older kernels don't check that the extent size
+	 * hint is an integer multiple of the rt extent size on a directory
+	 * with both rtinherit and extszinherit flags set. Don't let sysadmins
+	 * misconfigure directories.
+	 */
+	if ((new_diflags & XFS_DIFLAG_RTINHERIT) &&
+	    (new_diflags & XFS_DIFLAG_EXTSZINHERIT)) {
+		unsigned int	rtextsize_bytes;
+
+		rtextsize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+		if (fa->fsx_extsize % rtextsize_bytes)
 			return -EINVAL;
 	}
 
-	if (fa->fsx_extsize % size)
-		return -EINVAL;
-
-	return 0;
+	failaddr = xfs_inode_validate_extsize(ip->i_mount,
+			XFS_B_TO_FSB(mp, fa->fsx_extsize),
+			VFS_I(ip)->i_mode, new_diflags);
+	return failaddr != NULL ? -EINVAL : 0;
 }
 
-/*
- * CoW extent size hint validation rules are:
- *
- * 1. CoW extent size hint can only be set if reflink is enabled on the fs.
- *    The inode does not have to have any shared blocks, but it must be a v3.
- * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files;
- *    for a directory, the hint is propagated to new files.
- * 3. Can be changed on files & directories at any time.
- * 4. CoW extsize hint of 0 turns off hints, clears inode flags.
- * 5. Extent size must be a multiple of the appropriate block size.
- * 6. The extent size hint must be limited to half the AG size to avoid
- *    alignment extending the extent beyond the limits of the AG.
- *
- * Please keep this function in sync with xfs_scrub_inode_cowextsize.
- */
 static int
 xfs_ioctl_setattr_check_cowextsize(
 	struct xfs_inode	*ip,
 	struct fileattr		*fa)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	xfs_extlen_t		size;
-	xfs_fsblock_t		cowextsize_fsb;
+	xfs_failaddr_t		failaddr;
+	uint64_t		new_diflags2;
+	uint16_t		new_diflags;
 
 	if (!fa->fsx_valid)
 		return 0;
 
-	if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
-		return 0;
-
-	if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb))
+	if (fa->fsx_cowextsize & mp->m_blockmask)
 		return -EINVAL;
 
-	if (fa->fsx_cowextsize == 0)
-		return 0;
+	new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
+	new_diflags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
 
-	cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
-	if (cowextsize_fsb > MAXEXTLEN)
-		return -EINVAL;
-
-	size = mp->m_sb.sb_blocksize;
-	if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2)
-		return -EINVAL;
-
-	if (fa->fsx_cowextsize % size)
-		return -EINVAL;
-
-	return 0;
+	failaddr = xfs_inode_validate_cowextsize(ip->i_mount,
+			XFS_B_TO_FSB(mp, fa->fsx_cowextsize),
+			VFS_I(ip)->i_mode, new_diflags, new_diflags2);
+	return failaddr != NULL ? -EINVAL : 0;
 }
 
 static int
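The reworked ioctl helpers above keep only two byte-level checks locally: the hint must be block aligned, and it is converted from bytes to filesystem blocks before being handed to the shared validators. A minimal sketch of those two steps, using invented names and assuming a power-of-two block size, could be:

/*
 * Sketch only: block-alignment test via a mask, then a byte-to-block
 * conversion by shift.  For block-aligned inputs the rounding direction
 * of the conversion does not matter.
 */
#include <stdint.h>
#include <stdio.h>

/* Hypothetical mount geometry. */
struct toy_geometry {
	uint32_t blocksize;     /* bytes per block, power of two */
	uint32_t blocklog;      /* log2(blocksize) */
};

/* Nonzero if the hint is a whole number of blocks. */
static int hint_block_aligned(const struct toy_geometry *g, uint64_t hint_bytes)
{
	uint32_t blockmask = g->blocksize - 1;

	return (hint_bytes & blockmask) == 0;
}

/* Convert a block-aligned byte count to filesystem blocks. */
static uint64_t bytes_to_blocks(const struct toy_geometry *g, uint64_t bytes)
{
	return bytes >> g->blocklog;
}

int main(void)
{
	struct toy_geometry g = { .blocksize = 4096, .blocklog = 12 };
	uint64_t hint = 65536;  /* 16 blocks */

	if (hint_block_aligned(&g, hint))
		printf("hint = %llu blocks\n",
		       (unsigned long long)bytes_to_blocks(&g, hint));
	return 0;
}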
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 3c392b1512ac..7ec1a9207517 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -73,6 +73,8 @@ do {						\
 	xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__)
 #define xfs_notice_once(dev, fmt, ...)		\
 	xfs_printk_once(xfs_notice, dev, fmt, ##__VA_ARGS__)
+#define xfs_info_once(dev, fmt, ...)		\
+	xfs_printk_once(xfs_info, dev, fmt, ##__VA_ARGS__)
 
 void assfail(struct xfs_mount *mp, char *expr, char *f, int l);
 void asswarn(struct xfs_mount *mp, char *expr, char *f, int l);
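The new xfs_info_once() above is simply another one-shot wrapper layered on the existing xfs_printk_once() helper. Outside the kernel, the same print-only-once behaviour can be sketched with a static flag; this uses plain printf and the GNU-style ##__VA_ARGS__ extension (as the kernel headers themselves do), and all names are invented:

/* Sketch of a "log this message only once" macro using a static flag. */
#include <stdbool.h>
#include <stdio.h>

#define info_once(fmt, ...)                         \
do {                                                \
	static bool __printed;                      \
	if (!__printed) {                           \
		__printed = true;                   \
		printf(fmt "\n", ##__VA_ARGS__);    \
	}                                           \
} while (0)

int main(void)
{
	for (int i = 0; i < 3; i++)
		info_once("Correcting misaligned extent size hint in inode 0x%llx.",
			  0x85ULL);   /* prints only on the first pass */
	return 0;
}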