From 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Thu, 13 Jan 2022 10:55:36 +0800 Subject: blk-mq: fix tag_get wait task can't be awakened In case of shared tags, there might be more than one hctx which allocates from the same tags, and each hctx is limited to allocate at most: hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U); tag idle detection is lazy, and may be delayed for 30sec, so there could be just one real active hctx(queue) but all others are actually idle and still accounted as active because of the lazy idle detection. Then if wake_batch is > hctx_max_depth, driver tag allocation may wait forever on this real active hctx. Fix this by recalculating wake_batch when inc or dec active_queues. Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps") Suggested-by: Ming Lei Suggested-by: John Garry Signed-off-by: Laibin Qiu Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220113025536.1479653-1-qiulaibin@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 40 +++++++++++++++++++++++++++++++++------- include/linux/sbitmap.h | 11 +++++++++++ lib/sbitmap.c | 25 ++++++++++++++++++++++--- 3 files changed, 66 insertions(+), 10 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e55a6834c9a6..845f74e8dd7b 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -16,6 +16,21 @@ #include "blk-mq-sched.h" #include "blk-mq-tag.h" +/* + * Recalculate wakeup batch when tag is shared by hctx. + */ +static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, + unsigned int users) +{ + if (!users) + return; + + sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, + users); + sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags, + users); +} + /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail @@ -24,18 +39,26 @@ */ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { + unsigned int users; + if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; - if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && - !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) - atomic_inc(&hctx->tags->active_queues); + if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || + test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) { + return true; + } } else { - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && - !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - atomic_inc(&hctx->tags->active_queues); + if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || + test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) { + return true; + } } + users = atomic_inc_return(&hctx->tags->active_queues); + + blk_mq_update_wake_batch(hctx->tags, users); + return true; } @@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; + unsigned int users; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; @@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) return; } - atomic_dec(&tags->active_queues); + users = atomic_dec_return(&tags->active_queues); + + blk_mq_update_wake_batch(tags, users); blk_mq_tag_wakeup_all(tags, false); } diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index fc0357a6e19b..95df357ec009 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) sbitmap_free(&sbq->sb); } +/** + * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch + * @sbq: Bitmap queue to recalculate wake batch. + * @users: Number of shares. + * + * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch + * by depth. This interface is for HCTX shared tags or queue shared tags. + */ +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, + unsigned int users); + /** * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. * @sbq: Bitmap queue to resize. diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 2709ab825499..6220fa67fb7e 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); -static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, - unsigned int depth) +static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, + unsigned int wake_batch) { - unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth); int i; if (sbq->wake_batch != wake_batch) { @@ -476,6 +475,26 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, } } +static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, + unsigned int depth) +{ + unsigned int wake_batch; + + wake_batch = sbq_calc_wake_batch(sbq, depth); + __sbitmap_queue_update_wake_batch(sbq, wake_batch); +} + +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, + unsigned int users) +{ + unsigned int wake_batch; + + wake_batch = clamp_val((sbq->sb.depth + users - 1) / + users, 4, SBQ_WAKE_BATCH); + __sbitmap_queue_update_wake_batch(sbq, wake_batch); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch); + void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) { sbitmap_queue_update_wake_batch(sbq, depth); -- cgit v1.2.3 From 413ec8057bc3d368574abd05dd27e747063b2f59 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 13 Jan 2022 00:14:32 +0000 Subject: loop: remove redundant initialization of pointer node The pointer node is being initialized with a value that is never read, it is being re-assigned the same value a little futher on. Remove the redundant initialization. Cleans up clang scan warning: drivers/block/loop.c:823:19: warning: Value stored to 'node' during its initialization is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20220113001432.1331871-1-colin.i.king@gmail.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index b1b05c45c07c..01cbbfc4e9e2 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -820,7 +820,7 @@ static inline int queue_on_root_worker(struct cgroup_subsys_state *css) static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) { - struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL; + struct rb_node **node, *parent = NULL; struct loop_worker *cur_worker, *worker = NULL; struct work_struct *work; struct list_head *cmd_list; -- cgit v1.2.3 From a6431e351c6ec5bb6800787d259b343088f369a3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 13 Jan 2022 00:05:45 +0000 Subject: aoe: remove redundant assignment on variable n The variable n is being bit-wise or'd with a value and reassigned before being returned. The update of n is redundant, replace the |= operator with | instead. Cleans up clang scan warning: drivers/block/aoe/aoecmd.c:125:9: warning: Although the value stored to 'n' is used in the enclosing expression, the value is never actually read from 'n' [deadcode.DeadStores] Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20220113000545.1307091-1-colin.i.king@gmail.com Signed-off-by: Jens Axboe --- drivers/block/aoe/aoecmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 588889bea7c3..6af111f568e4 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -122,7 +122,7 @@ newtag(struct aoedev *d) register ulong n; n = jiffies & 0xffff; - return n |= (++d->lasttag & 0x7fff) << 16; + return n | (++d->lasttag & 0x7fff) << 16; } static u32 -- cgit v1.2.3 From 00358933f66c44d511368a57eb421e172447cfb9 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 6 Jan 2022 18:53:16 +0900 Subject: brd: remove brd_devices_mutex mutex If brd_alloc() from brd_probe() is called before brd_alloc() from brd_init() is called, module loading will fail with -EEXIST error. To close this race, call __register_blkdev() just before leaving brd_init(). Then, we can remove brd_devices_mutex mutex, for brd_device list will no longer be accessed concurrently. Signed-off-by: Tetsuo Handa Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/6b074af7-c165-4fab-b7da-8270a4f6f6cd@i-love.sakura.ne.jp Signed-off-by: Jens Axboe --- drivers/block/brd.c | 73 ++++++++++++++++++++++------------------------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 8fe2e4289dae..6e3f2f0d2352 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -362,7 +362,6 @@ __setup("ramdisk_size=", ramdisk_size); * (should share code eventually). */ static LIST_HEAD(brd_devices); -static DEFINE_MUTEX(brd_devices_mutex); static struct dentry *brd_debugfs_dir; static int brd_alloc(int i) @@ -372,21 +371,14 @@ static int brd_alloc(int i) char buf[DISK_NAME_LEN]; int err = -ENOMEM; - mutex_lock(&brd_devices_mutex); - list_for_each_entry(brd, &brd_devices, brd_list) { - if (brd->brd_number == i) { - mutex_unlock(&brd_devices_mutex); + list_for_each_entry(brd, &brd_devices, brd_list) + if (brd->brd_number == i) return -EEXIST; - } - } brd = kzalloc(sizeof(*brd), GFP_KERNEL); - if (!brd) { - mutex_unlock(&brd_devices_mutex); + if (!brd) return -ENOMEM; - } brd->brd_number = i; list_add_tail(&brd->brd_list, &brd_devices); - mutex_unlock(&brd_devices_mutex); spin_lock_init(&brd->brd_lock); INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); @@ -429,9 +421,7 @@ static int brd_alloc(int i) out_cleanup_disk: blk_cleanup_disk(disk); out_free_dev: - mutex_lock(&brd_devices_mutex); list_del(&brd->brd_list); - mutex_unlock(&brd_devices_mutex); kfree(brd); return err; } @@ -441,15 +431,19 @@ static void brd_probe(dev_t dev) brd_alloc(MINOR(dev) / max_part); } -static void brd_del_one(struct brd_device *brd) +static void brd_cleanup(void) { - del_gendisk(brd->brd_disk); - blk_cleanup_disk(brd->brd_disk); - brd_free_pages(brd); - mutex_lock(&brd_devices_mutex); - list_del(&brd->brd_list); - mutex_unlock(&brd_devices_mutex); - kfree(brd); + struct brd_device *brd, *next; + + debugfs_remove_recursive(brd_debugfs_dir); + + list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { + del_gendisk(brd->brd_disk); + blk_cleanup_disk(brd->brd_disk); + brd_free_pages(brd); + list_del(&brd->brd_list); + kfree(brd); + } } static inline void brd_check_and_reset_par(void) @@ -473,9 +467,18 @@ static inline void brd_check_and_reset_par(void) static int __init brd_init(void) { - struct brd_device *brd, *next; int err, i; + brd_check_and_reset_par(); + + brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); + + for (i = 0; i < rd_nr; i++) { + err = brd_alloc(i); + if (err) + goto out_free; + } + /* * brd module now has a feature to instantiate underlying device * structure on-demand, provided that there is an access dev node. @@ -491,28 +494,16 @@ static int __init brd_init(void) * dynamically. */ - if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) - return -EIO; - - brd_check_and_reset_par(); - - brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); - - for (i = 0; i < rd_nr; i++) { - err = brd_alloc(i); - if (err) - goto out_free; + if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) { + err = -EIO; + goto out_free; } pr_info("brd: module loaded\n"); return 0; out_free: - unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); - debugfs_remove_recursive(brd_debugfs_dir); - - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) - brd_del_one(brd); + brd_cleanup(); pr_info("brd: module NOT loaded !!!\n"); return err; @@ -520,13 +511,9 @@ out_free: static void __exit brd_exit(void) { - struct brd_device *brd, *next; unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); - debugfs_remove_recursive(brd_debugfs_dir); - - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) - brd_del_one(brd); + brd_cleanup(); pr_info("brd: module unloaded\n"); } -- cgit v1.2.3 From e6a2e5116e07ce5acc8698785c29e9e47f010fd5 Mon Sep 17 00:00:00 2001 From: GuoYong Zheng Date: Mon, 17 Jan 2022 18:22:37 +0800 Subject: block: Remove unnecessary variable assignment The parameter "ret" should be zero when running to this line, no need to set to zero again, remove it. Signed-off-by: GuoYong Zheng Link: https://lore.kernel.org/r/1642414957-6785-1-git-send-email-zhenggy@chinatelecom.cn Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e20eadfcf5c8..bed4a2facd65 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -887,7 +887,6 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); - ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); -- cgit v1.2.3 From 850fd2abbe02eb2b52cbb1550adbcc89b36d65de Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 Jan 2022 20:34:01 +0800 Subject: block: cleanup q->srcu srcu structure has to be cleanup via cleanup_srcu_struct(), so fix it. Reported-by: syzbot+4f789823c1abc5accf13@syzkaller.appspotmail.com Fixes: 704b914f15fb ("blk-mq: move srcu from blk_mq_hw_ctx to request_queue") Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220111123401.520192-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index bed4a2facd65..9f32882ceb2f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -811,6 +811,9 @@ static void blk_release_queue(struct kobject *kobj) bioset_exit(&q->bio_split); + if (blk_queue_has_srcu(q)) + cleanup_srcu_struct(q->srcu); + ida_simple_remove(&blk_queue_ida, q->id); call_rcu(&q->rcu_head, blk_free_queue_rcu); } -- cgit v1.2.3 From fd9f4e62a39f09a7c014d7415c2b9d1390aa0504 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jan 2022 08:04:44 +0100 Subject: block: assign bi_bdev for cloned bios in blk_rq_prep_clone bio_clone_fast() sets the cloned bio to have the same ->bi_bdev as the source bio. This means that when request-based dm called setup_clone(), the cloned bio had its ->bi_bdev pointing to the dm device. After Commit 0b6e522cdc4a ("blk-mq: use ->bi_bdev for I/O accounting") __blk_account_io_start() started using the request's ->bio->bi_bdev for I/O accounting, if it was set. This caused IO going to the underlying devices to use the dm device for their I/O accounting. Set up the proper ->bi_bdev in blk_rq_prep_clone based on the whole device bdev for the queue the request is cloned onto. Fixes: 0b6e522cdc4a ("blk-mq: use ->bi_bdev for I/O accounting") Reported-by: Benjamin Marzinski Signed-off-by: Christoph Hellwig [hch: the commit message is mostly from a different patch from Benjamin] Reviewed-by: Ming Lei Reviewed-by: Benjamin Marzinski Link: https://lore.kernel.org/r/20220118070444.1241739-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index a6d4780580fc..b5e35e63adad 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2976,6 +2976,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, bio = bio_clone_fast(bio_src, gfp_mask, bs); if (!bio) goto free_and_out; + bio->bi_bdev = rq->q->disk->part0; if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; -- cgit v1.2.3 From 3ee859e384d453d6ac68bfd5971f630d9fa46ad3 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 9 Jan 2022 18:36:43 +0900 Subject: block: Fix wrong offset in bio_truncate() bio_truncate() clears the buffer outside of last block of bdev, however current bio_truncate() is using the wrong offset of page. So it can return the uninitialized data. This happened when both of truncated/corrupted FS and userspace (via bdev) are trying to read the last of bdev. Reported-by: syzbot+ac94ae5f68b84197f41c@syzkaller.appspotmail.com Signed-off-by: OGAWA Hirofumi Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/875yqt1c9g.fsf@mail.parknet.co.jp Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 0d400ba2dbd1..4312a8085396 100644 --- a/block/bio.c +++ b/block/bio.c @@ -569,7 +569,8 @@ static void bio_truncate(struct bio *bio, unsigned new_size) offset = new_size - done; else offset = 0; - zero_user(bv.bv_page, offset, bv.bv_len - offset); + zero_user(bv.bv_page, bv.bv_offset + offset, + bv.bv_len - offset); truncated = true; } done += bv.bv_len; -- cgit v1.2.3 From 46cdc45acb089c811d9a54fd50af33b96e5fae9d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Jan 2022 10:28:13 -0700 Subject: block: fix async_depth sysfs interface for mq-deadline A previous commit added this feature, but it inadvertently used the wrong variable to show/store the setting from/to, victimized by copy/paste. Fix it up so that the async_depth sysfs interface reads and writes from the right setting. Fixes: 07757588e507 ("block/mq-deadline: Reserve 25% of scheduler tags for synchronous requests") Link: https://bugzilla.kernel.org/show_bug.cgi?id=215485 Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/mq-deadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 85d919bf60c7..3ed5eaf3446a 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -865,7 +865,7 @@ SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); -SHOW_INT(deadline_async_depth_show, dd->front_merges); +SHOW_INT(deadline_async_depth_show, dd->async_depth); SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); #undef SHOW_INT #undef SHOW_JIFFIES @@ -895,7 +895,7 @@ STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MA STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); -STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX); +STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION #undef STORE_INT -- cgit v1.2.3