From 4f44657d74873735e93a50eb25014721a66aac19 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Fri, 5 Mar 2021 16:13:27 +0800 Subject: blk-cgroup: Fix the recursive blkg rwstat The current blkio.throttle.io_service_bytes_recursive doesn't work correctly. As an example, for the following blkcg hierarchy: (Made 1GB READ in test1, 512MB READ in test2) test / \ test1 test2 $ head -n 1 test/test1/blkio.throttle.io_service_bytes_recursive 8:0 Read 1073684480 $ head -n 1 test/test2/blkio.throttle.io_service_bytes_recursive 8:0 Read 537448448 $ head -n 1 test/blkio.throttle.io_service_bytes_recursive 8:0 Read 537448448 Clearly, above data of "test" reflects "test2" not "test1"+"test2". Do the correct summary in blkg_rwstat_recursive_sum(). Signed-off-by: Xunlei Pang Signed-off-by: Jens Axboe --- block/blk-cgroup-rwstat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c index 85d5790ac49b..3304e841df7c 100644 --- a/block/blk-cgroup-rwstat.c +++ b/block/blk-cgroup-rwstat.c @@ -109,6 +109,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, lockdep_assert_held(&blkg->q->queue_lock); + memset(sum, 0, sizeof(*sum)); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { struct blkg_rwstat *rwstat; @@ -122,7 +123,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, rwstat = (void *)pos_blkg + off; for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i); + sum->cnt[i] += blkg_rwstat_read_counter(rwstat, i); } rcu_read_unlock(); } -- cgit v1.2.3 From faa44c69daf9ccbd5b8a1aee13e0e0d037c0be17 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Mar 2021 18:09:19 +0900 Subject: block: Fix REQ_OP_ZONE_RESET_ALL handling Similarly to a single zone reset operation (REQ_OP_ZONE_RESET), execute REQ_OP_ZONE_RESET_ALL operations with REQ_SYNC set. Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 833978c02e60..8b9f3fc5a690 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -240,7 +240,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, */ if (op == REQ_OP_ZONE_RESET && blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { - bio->bi_opf = REQ_OP_ZONE_RESET_ALL; + bio->bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC; break; } -- cgit v1.2.3 From a8affc03a9b375e19bc81573de0c9108317d78c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Mar 2021 12:01:37 +0100 Subject: block: rename BIO_MAX_PAGES to BIO_MAX_VECS Ever since the addition of multipage bio_vecs BIO_MAX_PAGES has been horribly confusingly misnamed. Rename it to BIO_MAX_VECS to stop confusing users of the bio API. Signed-off-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20210311110137.1132391-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 14 +++++++------- block/blk-crypto-fallback.c | 2 +- block/blk-lib.c | 2 +- block/blk-map.c | 2 +- block/bounce.c | 6 +++--- drivers/block/drbd/drbd_int.h | 2 +- drivers/md/bcache/super.c | 2 +- drivers/md/dm-crypt.c | 8 ++++---- drivers/md/dm-writecache.c | 4 ++-- drivers/md/raid5-cache.c | 4 ++-- drivers/md/raid5-ppl.c | 2 +- drivers/nvme/target/passthru.c | 6 +++--- fs/block_dev.c | 6 +++--- fs/btrfs/extent_io.c | 2 +- fs/btrfs/scrub.c | 2 +- fs/crypto/bio.c | 6 +++--- fs/erofs/zdata.c | 2 +- fs/ext4/page-io.c | 2 +- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 4 ++-- fs/f2fs/segment.c | 2 +- fs/f2fs/segment.h | 4 ++-- fs/f2fs/super.c | 4 ++-- fs/gfs2/lops.c | 2 +- fs/iomap/buffered-io.c | 4 ++-- fs/iomap/direct-io.c | 4 ++-- fs/mpage.c | 2 +- fs/nilfs2/segbuf.c | 2 +- fs/squashfs/block.c | 2 +- fs/zonefs/super.c | 2 +- include/linux/bio.h | 4 ++-- 31 files changed, 56 insertions(+), 56 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index a1c4d2900c7a..26b7f721cda8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -33,7 +33,7 @@ static struct biovec_slab { { .nr_vecs = 16, .name = "biovec-16" }, { .nr_vecs = 64, .name = "biovec-64" }, { .nr_vecs = 128, .name = "biovec-128" }, - { .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" }, + { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" }, }; static struct biovec_slab *biovec_slab(unsigned short nr_vecs) @@ -46,7 +46,7 @@ static struct biovec_slab *biovec_slab(unsigned short nr_vecs) return &bvec_slabs[1]; case 65 ... 128: return &bvec_slabs[2]; - case 129 ... BIO_MAX_PAGES: + case 129 ... BIO_MAX_VECS: return &bvec_slabs[3]; default: BUG(); @@ -151,9 +151,9 @@ out: void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { - BIO_BUG_ON(nr_vecs > BIO_MAX_PAGES); + BIO_BUG_ON(nr_vecs > BIO_MAX_VECS); - if (nr_vecs == BIO_MAX_PAGES) + if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); else if (nr_vecs > BIO_INLINE_VECS) kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); @@ -186,15 +186,15 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, /* * Try a slab allocation first for all smaller allocations. If that * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. - * The mempool is sized to handle up to BIO_MAX_PAGES entries. + * The mempool is sized to handle up to BIO_MAX_VECS entries. 
*/ - if (*nr_vecs < BIO_MAX_PAGES) { + if (*nr_vecs < BIO_MAX_VECS) { struct bio_vec *bvl; bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) return bvl; - *nr_vecs = BIO_MAX_PAGES; + *nr_vecs = BIO_MAX_VECS; } return mempool_alloc(pool, gfp_mask); diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index c176b7af56a7..c322176a1e09 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -219,7 +219,7 @@ static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) bio_for_each_segment(bv, bio, iter) { num_sectors += bv.bv_len >> SECTOR_SHIFT; - if (++i == BIO_MAX_PAGES) + if (++i == BIO_MAX_VECS) break; } if (num_sectors < bio_sectors(bio)) { diff --git a/block/blk-lib.c b/block/blk-lib.c index 752f9c722062..7b256131b20b 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -296,7 +296,7 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) { sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512); - return min(pages, (sector_t)BIO_MAX_PAGES); + return min(pages, (sector_t)BIO_MAX_VECS); } static int __blkdev_issue_zero_pages(struct block_device *bdev, diff --git a/block/blk-map.c b/block/blk-map.c index 369e204d14d0..1ffef782fcf2 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -249,7 +249,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (!iov_iter_count(iter)) return -EINVAL; - bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); + bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_VECS)); if (!bio) return -ENOMEM; bio->bi_opf |= req_op(rq); diff --git a/block/bounce.c b/block/bounce.c index 87983a35079c..6c441f4f1cd4 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -229,10 +229,10 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) * - The point of cloning the biovec is to produce a bio with a biovec * the caller can modify: bi_idx and bi_bvec_done should be 0. * - * - The original bio could've had more than BIO_MAX_PAGES biovecs; if + * - The original bio could've had more than BIO_MAX_VECS biovecs; if * we tried to clone the whole thing bio_alloc_bioset() would fail. * But the clone should succeed as long as the number of biovecs we - * actually need to allocate is fewer than BIO_MAX_PAGES. + * actually need to allocate is fewer than BIO_MAX_VECS. * * - Lastly, bi_vcnt should not be looked at or relied upon by code * that does not own the bio - reason being drivers don't use it for @@ -299,7 +299,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, int sectors = 0; bio_for_each_segment(from, *bio_orig, iter) { - if (i++ < BIO_MAX_PAGES) + if (i++ < BIO_MAX_VECS) sectors += from.bv_len >> 9; if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn) bounce = true; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 7d9cc433b758..5d9181382ce1 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1324,7 +1324,7 @@ struct bm_extent { * A followup commit may allow even bigger BIO sizes, * once we thought that through. 
*/ #define DRBD_MAX_BIO_SIZE (1U << 20) -#if DRBD_MAX_BIO_SIZE > (BIO_MAX_PAGES << PAGE_SHIFT) +#if DRBD_MAX_BIO_SIZE > (BIO_MAX_VECS << PAGE_SHIFT) #error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE #endif #define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 71691f32959b..03e1fe4de53d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -965,7 +965,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, q->limits.max_hw_sectors = UINT_MAX; q->limits.max_sectors = UINT_MAX; q->limits.max_segment_size = UINT_MAX; - q->limits.max_segments = BIO_MAX_PAGES; + q->limits.max_segments = BIO_MAX_VECS; blk_queue_max_discard_sectors(q, UINT_MAX); q->limits.discard_granularity = 512; q->limits.io_min = block_size; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 11c105ecd165..b0ab080f2567 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -229,7 +229,7 @@ static DEFINE_SPINLOCK(dm_crypt_clients_lock); static unsigned dm_crypt_clients_n = 0; static volatile unsigned long dm_crypt_pages_per_client; #define DM_CRYPT_MEMORY_PERCENT 2 -#define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_PAGES * 16) +#define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_VECS * 16) static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); @@ -3246,7 +3246,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size, ARCH_KMALLOC_MINALIGN); - ret = mempool_init(&cc->page_pool, BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc); + ret = mempool_init(&cc->page_pool, BIO_MAX_VECS, crypt_page_alloc, crypt_page_free, cc); if (ret) { ti->error = "Cannot allocate page mempool"; goto bad; @@ -3373,9 +3373,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) /* * Check if bio is too large, split as needed. 
*/ - if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) && + if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_VECS << PAGE_SHIFT)) && (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size)) - dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT)); + dm_accept_partial_bio(bio, ((BIO_MAX_VECS << PAGE_SHIFT) >> SECTOR_SHIFT)); /* * Ensure that bio is a multiple of internal sector encryption size diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 844c4be11768..4f72b6f66c3a 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1892,10 +1892,10 @@ restart: list_add(&g->lru, &wbl.list); wbl.size++; g->write_in_progress = true; - g->wc_list_contiguous = BIO_MAX_PAGES; + g->wc_list_contiguous = BIO_MAX_VECS; f = g; e->wc_list_contiguous++; - if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) { + if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) { if (unlikely(wc->writeback_all)) { next_node = rb_next(&f->rb_node); if (likely(next_node)) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 4337ae0e6af2..0b5dcaabbc15 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -735,7 +735,7 @@ static void r5l_submit_current_io(struct r5l_log *log) static struct bio *r5l_bio_alloc(struct r5l_log *log) { - struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &log->bs); + struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS, &log->bs); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio_set_dev(bio, log->rdev->bdev); @@ -1634,7 +1634,7 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, { struct page *page; - ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, &log->bs); + ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_VECS, &log->bs); if (!ctx->ra_bio) return -ENOMEM; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index e8c118e05dfd..3ddc2aa0b530 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -496,7 +496,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) { struct bio *prev = bio; - bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, + bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS, &ppl_conf->bs); bio->bi_opf = prev->bi_opf; bio->bi_write_hint = prev->bi_write_hint; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 26c587ccd152..2798944899b7 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -50,9 +50,9 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) /* * nvmet_passthru_map_sg is limitted to using a single bio so limit - * the mdts based on BIO_MAX_PAGES as well + * the mdts based on BIO_MAX_VECS as well */ - max_hw_sectors = min_not_zero(BIO_MAX_PAGES << (PAGE_SHIFT - 9), + max_hw_sectors = min_not_zero(BIO_MAX_VECS << (PAGE_SHIFT - 9), max_hw_sectors); page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; @@ -191,7 +191,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) struct bio *bio; int i; - if (req->sg_cnt > BIO_MAX_PAGES) + if (req->sg_cnt > BIO_MAX_VECS) return -EINVAL; if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 03166b3dea4d..92ed7d5df677 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -432,7 +432,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES); 
+ nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); if (!nr_pages) { bool polled = false; @@ -500,8 +500,8 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (!iov_iter_count(iter)) return 0; - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES + 1); - if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); + if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dfb3ead1175..db8cb98c020c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3048,7 +3048,7 @@ struct bio *btrfs_bio_alloc(u64 first_byte) { struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 582df11d298a..6daa4309c974 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1428,7 +1428,7 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, if (!first_page->dev->bdev) goto out; - bio = btrfs_io_bio_alloc(BIO_MAX_PAGES); + bio = btrfs_io_bio_alloc(BIO_MAX_VECS); bio_set_dev(bio, first_page->dev->bdev); for (page_num = 0; page_num < sblock->page_count; page_num++) { diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index b048a0e38516..68a2de6b5a9b 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -52,7 +52,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, int num_pages = 0; /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */ - bio = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS); while (len) { unsigned int blocks_this_page = min(len, blocks_per_page); @@ -74,7 +74,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, len -= blocks_this_page; lblk += blocks_this_page; pblk += blocks_this_page; - if (num_pages == BIO_MAX_PAGES || !len || + if (num_pages == BIO_MAX_VECS || !len || !fscrypt_mergeable_bio(bio, inode, lblk)) { err = submit_bio_wait(bio); if (err) @@ -126,7 +126,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, return fscrypt_zeroout_range_inline_crypt(inode, lblk, pblk, len); - BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_PAGES); + BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS); nr_pages = min_t(unsigned int, ARRAY_SIZE(pages), (len + blocks_per_page - 1) >> blocks_per_page_bits); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 6cb356c4217b..3851e1a64f73 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1235,7 +1235,7 @@ submit_bio_retry: } if (!bio) { - bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); bio->bi_end_io = z_erofs_decompressqueue_endio; bio_set_dev(bio, sb->s_bdev); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 03a44a0de86a..f038d578d8d8 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -398,7 +398,7 @@ static void io_submit_init_bio(struct ext4_io_submit *io, * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). 
*/ - bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 174a0819ad96..be5415a0dbbc 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -292,7 +292,7 @@ void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); + f2fs_ra_meta_pages(sbi, index, BIO_MAX_VECS, META_POR, true); } static int __f2fs_write_meta_page(struct page *page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7c95818639a6..4e5257c763d0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -857,7 +857,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL); alloc_new: if (!bio) { - bio = __bio_alloc(fio, BIO_MAX_PAGES); + bio = __bio_alloc(fio, BIO_MAX_VECS); __attach_io_flag(fio); f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, fio->page->index, fio, GFP_NOIO); @@ -932,7 +932,7 @@ alloc_new: fio->retry = true; goto skip; } - io->bio = __bio_alloc(fio, BIO_MAX_PAGES); + io->bio = __bio_alloc(fio, BIO_MAX_VECS); f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host, bio_page->index, fio, GFP_NOIO); io->fio = *fio; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 993004f06a77..c2866561263e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4381,7 +4381,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) block_t total_node_blocks = 0; do { - readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, META_SIT, true); start = start_blk * sit_i->sents_per_block; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 229814b4f4a6..e9a7a637d688 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -851,7 +851,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * BIO_MAX_PAGES; + return 8 * BIO_MAX_VECS; else return 0; } @@ -868,7 +868,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - desired = BIO_MAX_PAGES; + desired = BIO_MAX_VECS; if (type == NODE) desired <<= 1; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7069793752f1..82592b19b4e0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -753,9 +753,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_io_size_bits: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) { + if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) { f2fs_warn(sbi, "Not support %d, larger than %d", - 1 << arg, BIO_MAX_PAGES); + 1 << arg, BIO_MAX_VECS); return -EINVAL; } F2FS_OPTION(sbi).write_io_size_bits = arg; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index dc1b93a877c6..a82f4747aa8d 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -267,7 +267,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, bio_end_io_t *end_io) { struct super_block *sb = sdp->sd_vfs; - struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift; bio_set_dev(bio, sb->s_bdev); diff --git a/fs/iomap/buffered-io.c 
b/fs/iomap/buffered-io.c index 7ffcd7ef33d4..414769a6ad11 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1221,7 +1221,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &iomap_ioend_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &iomap_ioend_bioset); bio_set_dev(bio, wpc->iomap.bdev); bio->bi_iter.bi_sector = sector; bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); @@ -1252,7 +1252,7 @@ iomap_chain_bio(struct bio *prev) { struct bio *new; - new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + new = bio_alloc(GFP_NOFS, BIO_MAX_VECS); bio_copy_dev(new, prev);/* also copies over blkcg information */ new->bi_iter.bi_sector = bio_end_sector(prev); new->bi_opf = prev->bi_opf; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index e2c4991833b8..bdd0d89bbf0a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -296,7 +296,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, */ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); - nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_PAGES); + nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { size_t n; if (dio->error) { @@ -338,7 +338,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, copied += n; nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, - BIO_MAX_PAGES); + BIO_MAX_VECS); iomap_dio_submit_bio(dio, iomap, bio, pos); pos += n; } while (nr_pages); diff --git a/fs/mpage.c b/fs/mpage.c index 961234d68779..334e7d09aa65 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -616,7 +616,7 @@ alloc_new: goto out; } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH); + BIO_MAX_VECS, GFP_NOFS|__GFP_HIGH); if (bio == NULL) goto confused; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 1e75417bfe6e..56872e93823d 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -399,7 +399,7 @@ static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, { wi->bio = NULL; wi->rest_blocks = segbuf->sb_sum.nblocks; - wi->max_pages = BIO_MAX_PAGES; + wi->max_pages = BIO_MAX_VECS; wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); wi->start = wi->end = 0; wi->blocknr = segbuf->sb_pseg_start; diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 45f44425d856..b9e87ebb1060 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -87,7 +87,7 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, int error, i; struct bio *bio; - if (page_count <= BIO_MAX_PAGES) + if (page_count <= BIO_MAX_VECS) bio = bio_alloc(GFP_NOIO, page_count); else bio = bio_kmalloc(GFP_NOIO, page_count); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index b6ff4a21abac..0fe76f376dee 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -684,7 +684,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); iov_iter_truncate(from, max); - nr_pages = iov_iter_npages(from, BIO_MAX_PAGES); + nr_pages = iov_iter_npages(from, BIO_MAX_VECS); if (!nr_pages) return 0; diff --git a/include/linux/bio.h b/include/linux/bio.h index 983ed2fe7c85..d0246c92a6e8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -20,11 +20,11 @@ #define BIO_BUG_ON #endif -#define BIO_MAX_PAGES 256U +#define BIO_MAX_VECS 256U static inline unsigned int bio_max_segs(unsigned int nr_segs) { - return 
min(nr_segs, BIO_MAX_PAGES); + return min(nr_segs, BIO_MAX_VECS); } #define bio_prio(bio) (bio)->bi_ioprio -- cgit v1.2.3 From 9ec491447b90ad6a4056a9656b13f0b3a1e83043 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 11 Mar 2021 16:19:17 +0100 Subject: block: Suppress uevent for hidden device when removed register_disk() suppresses uevents for devices with the GENHD_FL_HIDDEN flag, but enables uevents again at the end in order to announce the disk after possible partitions are created. When the device is removed, the uevents are still enabled and user land sees 'remove' messages for devices which were never 'add'ed to the system. KERNEL[95481.571887] remove /devices/virtual/nvme-fabrics/ctl/nvme5/nvme0c5n1 (block) Let's suppress the uevents for GENHD_FL_HIDDEN devices by not enabling the uevents at all. Signed-off-by: Daniel Wagner Reviewed-by: Christoph Hellwig Reviewed-by: Martin Wilck Link: https://lore.kernel.org/r/20210311151917.136091-1-dwagner@suse.de Signed-off-by: Jens Axboe --- block/genhd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index c55e8f0fced1..8c8f543572e6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -534,10 +534,8 @@ static void register_disk(struct device *parent, struct gendisk *disk, kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - if (disk->flags & GENHD_FL_HIDDEN) { - dev_set_uevent_suppress(ddev, 0); + if (disk->flags & GENHD_FL_HIDDEN) return; - } disk_scan_partitions(disk); -- cgit v1.2.3 From e5113505904ea1c1c0e1f92c1cfa91fbf4da1694 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 11 Mar 2021 16:25:46 +0900 Subject: block: Discard page cache of zone reset target range When a zone reset ioctl and a data read race for the same zone on a zoned block device, the data read leaves stale page cache even though the zone reset ioctl zero-clears all the zone data on the device. To avoid reading non-zero data from the stale page cache after a zone reset, discard the page cache of the reset target zones in blkdev_zone_mgmt_ioctl(). Introduce the helper function blkdev_truncate_zone_range() to discard the page cache, and ensure the page cache is discarded by calling the helper before and after the zone reset, in the same manner as fallocate does. This patch can be applied back to the stable kernel version v5.10.y. Rework is needed for older stable kernels. 
Signed-off-by: Shin'ichiro Kawasaki Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls") Cc: # 5.10+ Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210311072546.678999-1-shinichiro.kawasaki@wdc.com Signed-off-by: Jens Axboe --- block/blk-zoned.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 8b9f3fc5a690..c0276b42d9fb 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -318,6 +318,22 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, return 0; } +static int blkdev_truncate_zone_range(struct block_device *bdev, fmode_t mode, + const struct blk_zone_range *zrange) +{ + loff_t start, end; + + if (zrange->sector + zrange->nr_sectors <= zrange->sector || + zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) + /* Out of range */ + return -EINVAL; + + start = zrange->sector << SECTOR_SHIFT; + end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; + + return truncate_bdev_range(bdev, mode, start, end); +} + /* * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. * Called from blkdev_ioctl. @@ -329,6 +345,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, struct request_queue *q; struct blk_zone_range zrange; enum req_opf op; + int ret; if (!argp) return -EINVAL; @@ -352,6 +369,11 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case BLKRESETZONE: op = REQ_OP_ZONE_RESET; + + /* Invalidate the page cache, including dirty pages. */ + ret = blkdev_truncate_zone_range(bdev, mode, &zrange); + if (ret) + return ret; break; case BLKOPENZONE: op = REQ_OP_ZONE_OPEN; @@ -366,8 +388,20 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, return -ENOTTY; } - return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, - GFP_KERNEL); + ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, + GFP_KERNEL); + + /* + * Invalidate the page cache again for zone reset: writes can only be + * direct for zoned devices so concurrent writes would not add any page + * to the page cache after/during reset. The page cache may be filled + * again due to concurrent reads though and dropping the pages for + * these is fine. + */ + if (!ret && cmd == BLKRESETZONE) + ret = blkdev_truncate_zone_range(bdev, mode, &zrange); + + return ret; } static inline unsigned long *blk_alloc_zone_bitmap(int node, -- cgit v1.2.3 From a958937ff166fc60d1c3a721036f6ff41bfa2821 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Thu, 11 Feb 2021 09:38:07 -0500 Subject: block: recalculate segment count for multi-segment discards correctly When a stacked block device inserts a request into another block device using blk_insert_cloned_request, the request's nr_phys_segments field gets recalculated by a call to blk_recalc_rq_segments in blk_cloned_rq_check_limits. But blk_recalc_rq_segments does not know how to handle multi-segment discards. For disk types which can handle multi-segment discards like nvme, this results in discard requests which claim a single segment when it should report several, triggering a warning in nvme and causing nvme to fail the discard from the invalid state. WARNING: CPU: 5 PID: 191 at drivers/nvme/host/core.c:700 nvme_setup_discard+0x170/0x1e0 [nvme_core] ... 
nvme_setup_cmd+0x217/0x270 [nvme_core] nvme_loop_queue_rq+0x51/0x1b0 [nvme_loop] __blk_mq_try_issue_directly+0xe7/0x1b0 blk_mq_request_issue_directly+0x41/0x70 ? blk_account_io_start+0x40/0x50 dm_mq_queue_rq+0x200/0x3e0 blk_mq_dispatch_rq_list+0x10a/0x7d0 ? __sbitmap_queue_get+0x25/0x90 ? elv_rb_del+0x1f/0x30 ? deadline_remove_request+0x55/0xb0 ? dd_dispatch_request+0x181/0x210 __blk_mq_do_dispatch_sched+0x144/0x290 ? bio_attempt_discard_merge+0x134/0x1f0 __blk_mq_sched_dispatch_requests+0x129/0x180 blk_mq_sched_dispatch_requests+0x30/0x60 __blk_mq_run_hw_queue+0x47/0xe0 __blk_mq_delay_run_hw_queue+0x15b/0x170 blk_mq_sched_insert_requests+0x68/0xe0 blk_mq_flush_plug_list+0xf0/0x170 blk_finish_plug+0x36/0x50 xlog_cil_committed+0x19f/0x290 [xfs] xlog_cil_process_committed+0x57/0x80 [xfs] xlog_state_do_callback+0x1e0/0x2a0 [xfs] xlog_ioend_work+0x2f/0x80 [xfs] process_one_work+0x1b6/0x350 worker_thread+0x53/0x3e0 ? process_one_work+0x350/0x350 kthread+0x11b/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x22/0x30 This patch fixes blk_recalc_rq_segments to be aware of devices which can have multi-segment discards. It calculates the correct discard segment count by counting the number of bio as each discard bio is considered its own segment. Fixes: 1e739730c5b9 ("block: optionally merge discontiguous discard bios into a single request") Signed-off-by: David Jeffery Reviewed-by: Ming Lei Reviewed-by: Laurence Oberman Link: https://lore.kernel.org/r/20210211143807.GA115624@redhat Signed-off-by: Jens Axboe --- block/blk-merge.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index ffb4aa0ea68b..4d97fb6dd226 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -382,6 +382,14 @@ unsigned int blk_recalc_rq_segments(struct request *rq) switch (bio_op(rq->bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: + if (queue_max_discard_segments(rq->q) > 1) { + struct bio *bio = rq->bio; + + for_each_bio(bio) + nr_phys_segs++; + return nr_phys_segs; + } + return 1; case REQ_OP_WRITE_ZEROES: return 0; case REQ_OP_WRITE_SAME: -- cgit v1.2.3 From 7de55b7d6f09a2865279d3c41c0fbdbfdb87486a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 25 Mar 2021 00:47:26 +0900 Subject: block: support zone append bvecs Christoph reported that we'll likely trigger the WARN_ON_ONCE() checking that we're not submitting a bvec with REQ_OP_ZONE_APPEND in bio_iov_iter_get_pages() some time ago using zoned btrfs, but I couldn't reproduce it back then. Now Naohiro was able to trigger the bug as well with xfstests generic/095 on a zoned btrfs. There is nothing that prevents bvec submissions via REQ_OP_ZONE_APPEND if the hardware's zone append limit is met. 
Reported-by: Naohiro Aota Reported-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/10bd414d9326c90cd69029077db63b363854eee5.1616600835.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- block/bio.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 26b7f721cda8..963d1d406b3a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -949,7 +949,7 @@ void bio_release_pages(struct bio *bio, bool mark_dirty) } EXPORT_SYMBOL_GPL(bio_release_pages); -static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { WARN_ON_ONCE(bio->bi_max_vecs); @@ -959,11 +959,26 @@ static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) bio->bi_iter.bi_size = iter->count; bio_set_flag(bio, BIO_NO_PAGE_REF); bio_set_flag(bio, BIO_CLONED); +} +static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +{ + __bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, iter->count); return 0; } +static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter) +{ + struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct iov_iter i = *iter; + + iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9); + __bio_iov_bvec_set(bio, &i); + iov_iter_advance(iter, i.count); + return 0; +} + #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** @@ -1094,8 +1109,8 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) int ret = 0; if (iov_iter_is_bvec(iter)) { - if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) - return -EINVAL; + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + return bio_iov_bvec_set_append(bio, iter); return bio_iov_bvec_set(bio, iter); } -- cgit v1.2.3 From e82fc7855749aa197740a60ef22c492c41ea5d5f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 27 Mar 2021 15:13:09 +0800 Subject: block: don't create too many partitions Commit a33df75c6328 ("block: use an xarray for disk->part_tbl") drops the check on the maximum supported number of partitions, and allows partitions with bigger partition numbers to be added. However, ->bd_partno is defined as u8, so the partition index in the xarray table may not match ->bd_partno. Then delete_partition() may delete an unmatched partition, causing a use-after-free. Reviewed-by: Bart Van Assche Reported-by: syzbot+8fede7e30c7cee0de139@syzkaller.appspotmail.com Fixes: a33df75c6328 ("block: use an xarray for disk->part_tbl") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/partitions/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index 1a7558917c47..46f055bc7ecb 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -322,6 +322,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, const char *dname; int err; + /* + * disk_max_parts() won't be zero, either GENHD_FL_EXT_DEVT is set + * or 'minors' is passed to alloc_disk(). + */ + if (partno >= disk_max_parts(disk)) + return ERR_PTR(-EINVAL); + /* * Partitions are not supported on zoned block devices that are used as * such. 
-- cgit v1.2.3 From 3edf5346e4f2ce2fa0c94651a90a8dda169565ee Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Wed, 31 Mar 2021 07:53:59 -0400 Subject: block: only update parent bi_status when bio fail For multiple split bios, if one of the bios fails, the whole request should return an error to the application. But we found there is a race between bio_integrity_verify_fn and bio completion, which returns I/O success to the application even though one of the bios failed. The race is as follows: split bio(READ) kworker nvme_complete_rq blk_update_request //split error=0 bio_endio bio_integrity_endio queue_work(kintegrityd_wq, &bip->bip_work); bio_integrity_verify_fn bio_endio //split bio __bio_chain_endio if (!parent->bi_status) nvme_irq blk_update_request //parent error=7 req_bio_endio bio->bi_status = 7 //parent bio parent->bi_status = 0 parent->bi_end_io() // return bi_status=0 The bio has been split into two: split and parent. When the split bio completes, it depends on the kworker to do the endio, while bio_integrity_verify_fn has been interrupted by the parent bio's completion irq handler. Then the parent bio->bi_status, which has been set in the irq handler, will be overwritten by the kworker. In fact, even without the above race, we also need to consider the concurrency between multiple split bio completions updating the same parent bi_status. Normally, multiple split bios will be issued to the same hctx and complete from the same irq vector. But if the queue map has been updated between multiple split bios, these bios may complete on different hw queues and different irq vectors. Then the concurrent updates of the parent bi_status may cause the final status to be wrong. Suggested-by: Keith Busch Signed-off-by: Yufen Yu Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210331115359.1125679-1-yuyufen@huawei.com Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 963d1d406b3a..50e579088aca 100644 --- a/block/bio.c +++ b/block/bio.c @@ -277,7 +277,7 @@ static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; - if (!parent->bi_status) + if (bio->bi_status && !parent->bi_status) parent->bi_status = bio->bi_status; bio_put(bio); return parent; -- cgit v1.2.3 From f06c609645ecd043c79380fac94145926603fb33 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Apr 2021 19:17:46 +0200 Subject: block: remove the unused RQF_ALLOCED flag Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - include/linux/blkdev.h | 2 -- 2 files changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 9ebb344e2585..271f6596435b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -302,7 +302,6 @@ static const char *const rqf_name[] = { RQF_NAME(QUIET), RQF_NAME(ELVPRIV), RQF_NAME(IO_STAT), - RQF_NAME(ALLOCED), RQF_NAME(PM), RQF_NAME(HASHED), RQF_NAME(STATS), diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bc6bc8383b43..158aefae1030 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -85,8 +85,6 @@ typedef __u32 __bitwise req_flags_t; #define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) /* account into disk and partition IO statistics */ #define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) -/* request came from our alloc pool */ -#define RQF_ALLOCED ((__force req_flags_t)(1 << 14)) /* runtime pm request */ #define RQF_PM ((__force req_flags_t)(1 << 15)) /* on IO scheduler merge hash */ -- cgit v1.2.3
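The effect of the __bio_chain_endio change in the "only update parent bi_status when bio fail" patch above is easiest to see with a small stand-alone illustration. The following user-space sketch is not kernel code; struct fake_bio and chain_endio_new() are simplified stand-ins that only mimic the status-propagation logic. It replays the interleaving described in the commit message: with the old check, a successfully completing child bio can clobber an error already recorded in the parent, while the new check only lets a failed child write to a still-clean parent.

#include <stdio.h>

struct fake_bio {
	int bi_status;             /* 0 = success, non-zero = error */
	struct fake_bio *parent;   /* chained parent, NULL for the top bio */
};

/* New behaviour: only a failed child updates a still-clean parent. */
static void chain_endio_new(struct fake_bio *bio)
{
	if (bio->bi_status && !bio->parent->bi_status)
		bio->parent->bi_status = bio->bi_status;
}

int main(void)
{
	struct fake_bio parent = { .bi_status = 0, .parent = NULL };
	struct fake_bio ok_child = { .bi_status = 0, .parent = &parent };

	/* Old check, with the racy interleaving from the commit message:
	 * the kworker samples the parent status first ... */
	int parent_was_clean = !parent.bi_status;      /* true, parent still clean */
	parent.bi_status = 7;                          /* ... IRQ path records an error ... */
	if (parent_was_clean)
		parent.bi_status = ok_child.bi_status; /* ... kworker clobbers it with 0 */
	printf("old check: parent status = %d (error lost)\n", parent.bi_status);

	/* New check: a successful child never writes, so the error survives. */
	parent.bi_status = 7;
	chain_endio_new(&ok_child);
	printf("new check: parent status = %d (error preserved)\n", parent.bi_status);
	return 0;
}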