author    Linus Torvalds  2024-05-13 13:03:54 -0700
committer Linus Torvalds  2024-05-13 13:03:54 -0700
commit    0c9f4ac808b017a0013cee92a30de980550145d5 (patch)
tree      94eedbb9ef4815df9dc8d1dd6424fc92a2fbcd7a /block
parent    9961a785944601e32f185ea696347b22ffda634c (diff)
parent    a3166c51702bb00b8f8b84022090cbab8f37be1a (diff)
Merge tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:

 - Add a partscan attribute in sysfs, fixing an issue with systemd relying on an internal interface that went away

 - Attempt #2 at making long-running discards interruptible. The previous attempt went into 6.9, but we ended up mostly reverting it as it had issues

 - Remove the old ida_simple API in bcache

 - Support for zoned write plugging, greatly improving the performance on zoned devices

 - Remove the old throttle low interface, which has been experimental since 2017, never made it beyond that stage, and isn't being used

 - Remove page->index debugging checks in brd, as they haven't caught anything and this prepares us for removing page->index from struct page

 - MD pull request from Song

 - Don't schedule block workers on isolated CPUs

* tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux: (84 commits)
  blk-throttle: delay initialization until configuration
  blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW
  block: fix that util can be greater than 100%
  block: support to account io_ticks precisely
  block: add plug while submitting IO
  bcache: fix variable length array abuse in btree_iter
  bcache: Remove usage of the deprecated ida_simple_xx() API
  md: Revert "md: Fix overflow in is_mddev_idle"
  blk-lib: check for kill signal in ioctl BLKDISCARD
  block: add a bio_await_chain helper
  block: add a blk_alloc_discard_bio helper
  block: add a bio_chain_and_submit helper
  block: move discard checks into the ioctl handler
  block: remove the discard_granularity check in __blkdev_issue_discard
  block/ioctl: prefer different overflow check
  null_blk: Fix the WARNING: modpost: missing MODULE_DESCRIPTION()
  block: fix and simplify blkdevparts= cmdline parsing
  block: refine the EOF check in blkdev_iomap_begin
  block: add a partscan sysfs attribute for disks
  block: add a disk_has_partscan helper
  ...
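As a quick illustration of the first item, here is a minimal userspace sketch for reading the new disk attribute. It assumes, based on the commit subjects above, that the attribute appears as /sys/block/<disk>/partscan and holds 0 or 1; it is not taken from the patch itself.

#include <stdio.h>

/*
 * Sketch: read the new "partscan" disk attribute.  The path and the 0/1
 * semantics (partition scanning disabled/enabled) are assumptions based
 * on the commit subjects above.
 */
int main(int argc, char **argv)
{
	char path[256];
	int value;
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <disk>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/sys/block/%s/partscan", argv[1]);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%d", &value) != 1)
		value = -1;
	fclose(f);
	printf("%s: partscan=%d\n", argv[1], value);
	return 0;
}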
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig                 |   16
-rw-r--r--  block/Makefile                |    1
-rw-r--r--  block/bio.c                   |   50
-rw-r--r--  block/blk-cgroup-rwstat.c     |   18
-rw-r--r--  block/blk-cgroup.c            |    9
-rw-r--r--  block/blk-core.c              |   26
-rw-r--r--  block/blk-flush.c             |    2
-rw-r--r--  block/blk-lib.c               |   68
-rw-r--r--  block/blk-merge.c             |   25
-rw-r--r--  block/blk-mq-debugfs-zoned.c  |   22
-rw-r--r--  block/blk-mq-debugfs.c        |    3
-rw-r--r--  block/blk-mq-debugfs.h        |    6
-rw-r--r--  block/blk-mq.c                |  184
-rw-r--r--  block/blk-mq.h                |   31
-rw-r--r--  block/blk-settings.c          |   46
-rw-r--r--  block/blk-stat.c              |    3
-rw-r--r--  block/blk-sysfs.c             |   10
-rw-r--r--  block/blk-throttle.c          | 1019
-rw-r--r--  block/blk-throttle.h          |   46
-rw-r--r--  block/blk-zoned.c             | 1508
-rw-r--r--  block/blk.h                   |   97
-rw-r--r--  block/elevator.c              |   46
-rw-r--r--  block/elevator.h              |    1
-rw-r--r--  block/fops.c                  |   31
-rw-r--r--  block/genhd.c                 |   32
-rw-r--r--  block/ioctl.c                 |   42
-rw-r--r--  block/mq-deadline.c           |  204
-rw-r--r--  block/partitions/cmdline.c    |   49
-rw-r--r--  block/partitions/core.c       |    5
29 files changed, 1959 insertions(+), 1641 deletions(-)
diff --git a/block/Kconfig b/block/Kconfig
index 1de4682d48cc..dc12af58dbae 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -100,7 +100,6 @@ config BLK_DEV_WRITE_MOUNTED
config BLK_DEV_ZONED
bool "Zoned block device support"
- select MQ_IOSCHED_DEADLINE
help
Block layer zoned block device support. This option enables
support for ZAC/ZBC/ZNS host-managed and host-aware zoned block
@@ -120,17 +119,6 @@ config BLK_DEV_THROTTLING
See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
-config BLK_DEV_THROTTLING_LOW
- bool "Block throttling .low limit interface support (EXPERIMENTAL)"
- depends on BLK_DEV_THROTTLING
- help
- Add .low limit interface for block throttling. The low limit is a best
- effort limit to prioritize cgroups. Depending on the setting, the limit
- can be used to protect cgroups in terms of bandwidth/iops and better
- utilize disk resource.
-
- Note, this is an experimental interface and could be changed someday.
-
config BLK_WBT
bool "Enable support for block device writeback throttling"
help
@@ -198,10 +186,6 @@ config BLK_DEBUG_FS
Unless you are building a kernel for a tiny system, you should
say Y here.
-config BLK_DEBUG_FS_ZONED
- bool
- default BLK_DEBUG_FS && BLK_DEV_ZONED
-
config BLK_SED_OPAL
bool "Logic for interfacing with Opal enabled SEDs"
depends on KEYS
diff --git a/block/Makefile b/block/Makefile
index 46ada9dc8bbf..168150b9c510 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -33,7 +33,6 @@ obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
-obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
diff --git a/block/bio.c b/block/bio.c
index d24420ed1c4c..53f608028c78 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -345,18 +345,29 @@ void bio_chain(struct bio *bio, struct bio *parent)
}
EXPORT_SYMBOL(bio_chain);
-struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
- unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
+/**
+ * bio_chain_and_submit - submit a bio after chaining it to another one
+ * @prev: bio to chain and submit
+ * @new: bio to chain to
+ *
+ * If @prev is non-NULL, chain it to @new and submit it.
+ *
+ * Return: @new.
+ */
+struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new)
{
- struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
-
- if (bio) {
- bio_chain(bio, new);
- submit_bio(bio);
+ if (prev) {
+ bio_chain(prev, new);
+ submit_bio(prev);
}
-
return new;
}
+
+struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
+ unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
+{
+ return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp));
+}
EXPORT_SYMBOL_GPL(blk_next_bio);
static void bio_alloc_rescue(struct work_struct *work)
@@ -1384,6 +1395,26 @@ int submit_bio_wait(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio_wait);
+static void bio_wait_end_io(struct bio *bio)
+{
+ complete(bio->bi_private);
+ bio_put(bio);
+}
+
+/*
+ * bio_await_chain - ends @bio and waits for every chained bio to complete
+ */
+void bio_await_chain(struct bio *bio)
+{
+ DECLARE_COMPLETION_ONSTACK_MAP(done,
+ bio->bi_bdev->bd_disk->lockdep_map);
+
+ bio->bi_private = &done;
+ bio->bi_end_io = bio_wait_end_io;
+ bio_endio(bio);
+ blk_wait_io(&done);
+}
+
void __bio_advance(struct bio *bio, unsigned bytes)
{
if (bio_integrity(bio))
@@ -1576,6 +1607,8 @@ again:
if (!bio_integrity_endio(bio))
return;
+ blk_zone_bio_endio(bio);
+
rq_qos_done_bio(bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
@@ -1596,7 +1629,6 @@ again:
goto again;
}
- blk_throtl_bio_endio(bio);
/* release cgroup info */
bio_uninit(bio);
if (bio->bi_end_io)
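The new bio_chain_and_submit() helper above builds on the long-standing bio_chain() primitive: chaining a bio to a parent defers the parent's completion until the chained bio has also completed, which is why a caller can keep only the final, not-yet-submitted bio of a whole chain and wait on that. A minimal sketch of the primitive itself, with made-up function and variable names:

#include <linux/bio.h>

/*
 * Sketch (illustrative names only): after bio_chain(child, parent), the
 * parent's completion does not run until both bios have completed.
 * The caller still owns 'parent' and must bio_put() it afterwards.
 */
static int example_chain_two(struct bio *parent, struct bio *child)
{
	bio_chain(child, parent);	/* parent now also waits for child */
	submit_bio(child);
	return submit_bio_wait(parent);	/* returns once both are done */
}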
diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c
index 3304e841df7c..a55fb0c53558 100644
--- a/block/blk-cgroup-rwstat.c
+++ b/block/blk-cgroup-rwstat.c
@@ -9,25 +9,19 @@ int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
{
int i, ret;
- for (i = 0; i < BLKG_RWSTAT_NR; i++) {
- ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
- if (ret) {
- while (--i >= 0)
- percpu_counter_destroy(&rwstat->cpu_cnt[i]);
- return ret;
- }
+ ret = percpu_counter_init_many(rwstat->cpu_cnt, 0, gfp, BLKG_RWSTAT_NR);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
atomic64_set(&rwstat->aux_cnt[i], 0);
- }
return 0;
}
EXPORT_SYMBOL_GPL(blkg_rwstat_init);
void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
{
- int i;
-
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+ percpu_counter_destroy_many(rwstat->cpu_cnt, BLKG_RWSTAT_NR);
}
EXPORT_SYMBOL_GPL(blkg_rwstat_exit);
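percpu_counter_init_many() and percpu_counter_destroy_many() operate on a contiguous array of counters and undo any partial initialization themselves on failure, which is what makes the open-coded unwind loop above unnecessary. A standalone sketch of the same pattern; the struct and function names are illustrative only:

#include <linux/percpu_counter.h>

#define EXAMPLE_NR_COUNTERS	4

struct example_stats {
	struct percpu_counter cnt[EXAMPLE_NR_COUNTERS];
};

/* One call initializes every counter to 0; on failure nothing is left
 * half-initialized, so no unwind loop is needed here either. */
static int example_stats_init(struct example_stats *s, gfp_t gfp)
{
	return percpu_counter_init_many(s->cnt, 0, gfp, EXAMPLE_NR_COUNTERS);
}

static void example_stats_exit(struct example_stats *s)
{
	percpu_counter_destroy_many(s->cnt, EXAMPLE_NR_COUNTERS);
}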
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 059467086b13..4b1a35ab0ea4 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -218,8 +218,7 @@ static void blkg_async_bio_workfn(struct work_struct *work)
/* as long as there are pending bios, @blkg can't go away */
spin_lock(&blkg->async_bio_lock);
- bio_list_merge(&bios, &blkg->async_bios);
- bio_list_init(&blkg->async_bios);
+ bio_list_merge_init(&bios, &blkg->async_bios);
spin_unlock(&blkg->async_bio_lock);
/* start plug only when bio_list contains at least 2 bios */
@@ -1444,14 +1443,8 @@ int blkcg_init_disk(struct gendisk *disk)
if (ret)
goto err_destroy_all;
- ret = blk_throtl_init(disk);
- if (ret)
- goto err_ioprio_exit;
-
return 0;
-err_ioprio_exit:
- blk_ioprio_exit(disk);
err_destroy_all:
blkg_destroy_all(disk);
return ret;
diff --git a/block/blk-core.c b/block/blk-core.c
index b795ac177281..01186333c88e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -591,8 +591,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_NOTSUPP;
/* The bio sector must point to the start of a sequential zone */
- if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) ||
- !bio_zone_is_seq(bio))
+ if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector))
return BLK_STS_IOERR;
/*
@@ -604,7 +603,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_IOERR;
/* Make sure the BIO is small enough and will not get split */
- if (nr_sectors > q->limits.max_zone_append_sectors)
+ if (nr_sectors > queue_max_zone_append_sectors(q))
return BLK_STS_IOERR;
bio->bi_opf |= REQ_NOMERGE;
@@ -649,11 +648,13 @@ static void __submit_bio(struct bio *bio)
static void __submit_bio_noacct(struct bio *bio)
{
struct bio_list bio_list_on_stack[2];
+ struct blk_plug plug;
BUG_ON(bio->bi_next);
bio_list_init(&bio_list_on_stack[0]);
current->bio_list = bio_list_on_stack;
+ blk_start_plug(&plug);
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@ -687,19 +688,23 @@ static void __submit_bio_noacct(struct bio *bio)
bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
+ blk_finish_plug(&plug);
current->bio_list = NULL;
}
static void __submit_bio_noacct_mq(struct bio *bio)
{
struct bio_list bio_list[2] = { };
+ struct blk_plug plug;
current->bio_list = bio_list;
+ blk_start_plug(&plug);
do {
__submit_bio(bio);
} while ((bio = bio_list_pop(&bio_list[0])));
+ blk_finish_plug(&plug);
current->bio_list = NULL;
}
@@ -910,12 +915,6 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
- /*
- * As the requests that require a zone lock are not plugged in the
- * first place, directly accessing the plug instead of using
- * blk_mq_plug() should not have any consequences during flushing for
- * zoned devices.
- */
blk_flush_plug(current->plug, false);
/*
@@ -987,10 +986,11 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
unsigned long stamp;
again:
stamp = READ_ONCE(part->bd_stamp);
- if (unlikely(time_after(now, stamp))) {
- if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
- __part_stat_add(part, io_ticks, end ? now - stamp : 1);
- }
+ if (unlikely(time_after(now, stamp)) &&
+ likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
+ (end || part_in_flight(part)))
+ __part_stat_add(part, io_ticks, now - stamp);
+
if (part->bd_partno) {
part = bdev_whole(part);
goto again;
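This update_io_ticks() change, together with the per-request in_flight accounting added in blk-mq.c below, is what the "account io_ticks precisely" and "util can be greater than 100%" items in the commit list refer to: io_ticks now advances only while I/O is actually in flight. A small userspace sketch of how %util is derived from it, assuming the documented /sys/block/<disk>/stat layout in which io_ticks (milliseconds spent doing I/O) is the tenth field:

#include <stdio.h>
#include <unistd.h>

/* Sketch: read io_ticks (10th field of /sys/block/<disk>/stat, see
 * Documentation/block/stat.rst) and derive a utilization percentage. */
static long long read_io_ticks(const char *disk)
{
	char path[256];
	long long v[10];
	FILE *f;
	int i;

	snprintf(path, sizeof(path), "/sys/block/%s/stat", disk);
	f = fopen(path, "r");
	if (!f)
		return -1;
	for (i = 0; i < 10; i++) {
		if (fscanf(f, "%lld", &v[i]) != 1) {
			fclose(f);
			return -1;
		}
	}
	fclose(f);
	return v[9];			/* io_ticks, in milliseconds */
}

int main(int argc, char **argv)
{
	long long t0, t1;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <disk>\n", argv[0]);
		return 1;
	}
	t0 = read_io_ticks(argv[1]);
	sleep(1);
	t1 = read_io_ticks(argv[1]);
	if (t0 < 0 || t1 < 0)
		return 1;
	/* busy milliseconds over a ~1000 ms window */
	printf("%s: %.1f%% util\n", argv[1], (t1 - t0) / 10.0);
	return 0;
}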
diff --git a/block/blk-flush.c b/block/blk-flush.c
index b0f314f4bc14..c17cf8ed8113 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,6 +130,8 @@ static void blk_flush_restore_request(struct request *rq)
* original @rq->bio. Restore it.
*/
rq->bio = rq->biotail;
+ if (rq->bio)
+ rq->__sector = rq->bio->bi_iter.bi_sector;
/* make @rq a normal request */
rq->rq_flags &= ~RQF_FLUSH_SEQ;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index a6954eafb8c8..442da9dad042 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -35,51 +35,39 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT;
}
-int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
+struct bio *blk_alloc_discard_bio(struct block_device *bdev,
+ sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask)
{
- struct bio *bio = *biop;
- sector_t bs_mask;
-
- if (bdev_read_only(bdev))
- return -EPERM;
- if (!bdev_max_discard_sectors(bdev))
- return -EOPNOTSUPP;
-
- /* In case the discard granularity isn't set by buggy device driver */
- if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) {
- pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n",
- bdev);
- return -EOPNOTSUPP;
- }
-
- bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
- if ((sector | nr_sects) & bs_mask)
- return -EINVAL;
+ sector_t bio_sects = min(*nr_sects, bio_discard_limit(bdev, *sector));
+ struct bio *bio;
- if (!nr_sects)
- return -EINVAL;
+ if (!bio_sects)
+ return NULL;
- while (nr_sects) {
- sector_t req_sects =
- min(nr_sects, bio_discard_limit(bdev, sector));
+ bio = bio_alloc(bdev, 0, REQ_OP_DISCARD, gfp_mask);
+ if (!bio)
+ return NULL;
+ bio->bi_iter.bi_sector = *sector;
+ bio->bi_iter.bi_size = bio_sects << SECTOR_SHIFT;
+ *sector += bio_sects;
+ *nr_sects -= bio_sects;
+ /*
+ * We can loop for a long time in here if someone does full device
+ * discards (like mkfs). Be nice and allow us to schedule out to avoid
+ * softlocking if preempt is disabled.
+ */
+ cond_resched();
+ return bio;
+}
- bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask);
- bio->bi_iter.bi_sector = sector;
- bio->bi_iter.bi_size = req_sects << 9;
- sector += req_sects;
- nr_sects -= req_sects;
-
- /*
- * We can loop for a long time in here, if someone does
- * full device discards (like mkfs). Be nice and allow
- * us to schedule out to avoid softlocking if preempt
- * is disabled.
- */
- cond_resched();
- }
+int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
+{
+ struct bio *bio;
- *biop = bio;
+ while ((bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
+ gfp_mask)))
+ *biop = bio_chain_and_submit(*biop, bio);
return 0;
}
EXPORT_SYMBOL(__blkdev_issue_discard);
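Combined with bio_chain_and_submit() and bio_await_chain() from bio.c above, blk_alloc_discard_bio() lets a synchronous caller such as the BLKDISCARD ioctl issue one bio at a time and bail out when a fatal signal arrives (the "check for kill signal in ioctl BLKDISCARD" item). A simplified, hedged sketch of that caller-side pattern; it assumes the code lives inside block/, where the internal helpers are declared, and is not a verbatim copy of the ioctl code:

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/sched/signal.h>
#include "blk.h"	/* blk_alloc_discard_bio(), bio_await_chain() */

/* Sketch: discard [sector, sector + nr_sects) synchronously, giving up
 * early if the task receives a fatal signal while bios are being issued. */
static int example_sync_discard(struct block_device *bdev, sector_t sector,
				sector_t nr_sects)
{
	struct bio *prev = NULL, *bio;
	struct blk_plug plug;
	int err = 0;

	blk_start_plug(&plug);
	for (;;) {
		if (fatal_signal_pending(current)) {
			/* wait for everything already submitted, then stop */
			if (prev)
				bio_await_chain(prev);
			err = -EINTR;
			break;
		}
		bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
					    GFP_KERNEL);
		if (!bio)
			break;
		/* submits 'prev' chained to 'bio', returns 'bio' */
		prev = bio_chain_and_submit(prev, bio);
	}
	if (!err && prev) {
		err = submit_bio_wait(prev);
		if (err == -EOPNOTSUPP)
			err = 0;
		bio_put(prev);
	}
	blk_finish_plug(&plug);
	return err;
}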
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4e3483a16b75..8534c35e0497 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -377,6 +377,7 @@ struct bio *__bio_split_to_limits(struct bio *bio,
blkcg_bio_issue_init(split);
bio_chain(split, bio);
trace_block_split(split, bio->bi_iter.bi_sector);
+ WARN_ON_ONCE(bio_zone_write_plugging(bio));
submit_bio_noacct(bio);
return split;
}
@@ -779,6 +780,8 @@ static void blk_account_io_merge_request(struct request *req)
if (blk_do_io_stat(req)) {
part_stat_lock();
part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_local_dec(req->part,
+ in_flight[op_is_write(req_op(req))]);
part_stat_unlock();
}
}
@@ -972,13 +975,7 @@ static void blk_account_io_merge_bio(struct request *req)
part_stat_unlock();
}
-enum bio_merge_status {
- BIO_MERGE_OK,
- BIO_MERGE_NONE,
- BIO_MERGE_FAILED,
-};
-
-static enum bio_merge_status bio_attempt_back_merge(struct request *req,
+enum bio_merge_status bio_attempt_back_merge(struct request *req,
struct bio *bio, unsigned int nr_segs)
{
const blk_opf_t ff = bio_failfast(bio);
@@ -994,6 +991,9 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req,
blk_update_mixed_merge(req, bio, false);
+ if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ blk_zone_write_plug_bio_merged(bio);
+
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
@@ -1009,6 +1009,14 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
{
const blk_opf_t ff = bio_failfast(bio);
+ /*
+ * A front merge for writes to sequential zones of a zoned block device
+ * can happen only if the user submitted writes out of order. Do not
+ * merge such write to let it fail.
+ */
+ if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ return BIO_MERGE_FAILED;
+
if (!ll_front_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
@@ -1107,10 +1115,9 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
- struct blk_plug *plug;
+ struct blk_plug *plug = current->plug;
struct request *rq;
- plug = blk_mq_plug(bio);
if (!plug || rq_list_empty(plug->mq_list))
return false;
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
deleted file mode 100644
index a77b099c34b7..000000000000
--- a/block/blk-mq-debugfs-zoned.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2017 Western Digital Corporation or its affiliates.
- */
-
-#include <linux/blkdev.h>
-#include "blk-mq-debugfs.h"
-
-int queue_zone_wlock_show(void *data, struct seq_file *m)
-{
- struct request_queue *q = data;
- unsigned int i;
-
- if (!q->disk->seq_zones_wlock)
- return 0;
-
- for (i = 0; i < q->disk->nr_zones; i++)
- if (test_bit(i, q->disk->seq_zones_wlock))
- seq_printf(m, "%u\n", i);
-
- return 0;
-}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 94668e72ab09..770c0c2b72fa 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -160,7 +160,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
{ "pm_only", 0600, queue_pm_only_show, NULL },
{ "state", 0600, queue_state_show, queue_state_write },
- { "zone_wlock", 0400, queue_zone_wlock_show, NULL },
+ { "zone_wplugs", 0400, queue_zone_wplugs_show, NULL },
{ },
};
@@ -256,7 +256,6 @@ static const char *const rqf_name[] = {
RQF_NAME(HASHED),
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
- RQF_NAME(ZONE_WRITE_LOCKED),
RQF_NAME(TIMED_OUT),
RQF_NAME(RESV),
};
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 9c7d4b6117d4..c80e453e3014 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -83,10 +83,10 @@ static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
}
#endif
-#ifdef CONFIG_BLK_DEBUG_FS_ZONED
-int queue_zone_wlock_show(void *data, struct seq_file *m);
+#if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS)
+int queue_zone_wplugs_show(void *data, struct seq_file *m);
#else
-static inline int queue_zone_wlock_show(void *data, struct seq_file *m)
+static inline int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
return 0;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 32afb87efbd0..8e01e4b32e10 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -28,6 +28,7 @@
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>
+#include <linux/sched/isolation.h>
#include <trace/events/block.h>
@@ -690,6 +691,8 @@ static void blk_mq_finish_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ blk_zone_finish_request(rq);
+
if (rq->rq_flags & RQF_USE_SCHED) {
q->elevator->type->ops.finish_request(rq);
/*
@@ -761,31 +764,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
}
EXPORT_SYMBOL(blk_dump_rq_flags);
-static void req_bio_endio(struct request *rq, struct bio *bio,
- unsigned int nbytes, blk_status_t error)
-{
- if (unlikely(error)) {
- bio->bi_status = error;
- } else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
- /*
- * Partial zone append completions cannot be supported as the
- * BIO fragments may end up not being written sequentially.
- */
- if (bio->bi_iter.bi_size != nbytes)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_iter.bi_sector = rq->__sector;
- }
-
- bio_advance(bio, nbytes);
-
- if (unlikely(rq->rq_flags & RQF_QUIET))
- bio_set_flag(bio, BIO_QUIET);
- /* don't actually finish bio if it's part of flush sequence */
- if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
- bio_endio(bio);
-}
-
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (req->part && blk_do_io_stat(req)) {
@@ -845,8 +823,7 @@ static void blk_complete_request(struct request *req)
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- if (req_op(req) == REQ_OP_ZONE_APPEND)
- bio->bi_iter.bi_sector = req->__sector;
+ blk_zone_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
@@ -889,6 +866,8 @@ static void blk_complete_request(struct request *req)
bool blk_update_request(struct request *req, blk_status_t error,
unsigned int nr_bytes)
{
+ bool is_flush = req->rq_flags & RQF_FLUSH_SEQ;
+ bool quiet = req->rq_flags & RQF_QUIET;
int total_bytes;
trace_block_rq_complete(req, error, nr_bytes);
@@ -909,9 +888,8 @@ bool blk_update_request(struct request *req, blk_status_t error,
if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
__blk_crypto_rq_put_keyslot(req);
- if (unlikely(error && !blk_rq_is_passthrough(req) &&
- !(req->rq_flags & RQF_QUIET)) &&
- !test_bit(GD_DEAD, &req->q->disk->state)) {
+ if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) &&
+ !test_bit(GD_DEAD, &req->q->disk->state)) {
blk_print_req_error(req, error);
trace_block_rq_error(req, error, nr_bytes);
}
@@ -923,12 +901,33 @@ bool blk_update_request(struct request *req, blk_status_t error,
struct bio *bio = req->bio;
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
- if (bio_bytes == bio->bi_iter.bi_size)
+ if (unlikely(error))
+ bio->bi_status = error;
+
+ if (bio_bytes == bio->bi_iter.bi_size) {
req->bio = bio->bi_next;
+ } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) {
+ /*
+ * Partial zone append completions cannot be supported
+ * as the BIO fragments may end up not being written
+ * sequentially.
+ */
+ bio->bi_status = BLK_STS_IOERR;
+ }
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- req_bio_endio(req, bio, bio_bytes, error);
+ if (unlikely(quiet))
+ bio_set_flag(bio, BIO_QUIET);
+
+ bio_advance(bio, bio_bytes);
+
+ /* Don't actually finish bio if it's part of flush sequence */
+ if (!bio->bi_iter.bi_size) {
+ blk_zone_update_request_bio(req, bio);
+ if (!is_flush)
+ bio_endio(bio);
+ }
total_bytes += bio_bytes;
nr_bytes -= bio_bytes;
@@ -997,6 +996,8 @@ static inline void blk_account_io_done(struct request *req, u64 now)
update_io_ticks(req->part, jiffies, true);
part_stat_inc(req->part, ios[sgrp]);
part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+ part_stat_local_dec(req->part,
+ in_flight[op_is_write(req_op(req))]);
part_stat_unlock();
}
}
@@ -1019,6 +1020,8 @@ static inline void blk_account_io_start(struct request *req)
part_stat_lock();
update_io_ticks(req->part, jiffies, false);
+ part_stat_local_inc(req->part,
+ in_flight[op_is_write(req_op(req))]);
part_stat_unlock();
}
}
@@ -1330,11 +1333,6 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head)
blk_account_io_start(rq);
- /*
- * As plugging can be enabled for passthrough requests on a zoned
- * device, directly accessing the plug instead of using blk_mq_plug()
- * should not have any consequences.
- */
if (current->plug && !at_head) {
blk_add_rq_to_plug(current->plug, rq);
return;
@@ -1921,19 +1919,6 @@ static void blk_mq_handle_dev_resource(struct request *rq,
__blk_mq_requeue_request(rq);
}
-static void blk_mq_handle_zone_resource(struct request *rq,
- struct list_head *zone_list)
-{
- /*
- * If we end up here it is because we cannot dispatch a request to a
- * specific zone due to LLD level zone-write locking or other zone
- * related resource not being available. In this case, set the request
- * aside in zone_list for retrying it later.
- */
- list_add(&rq->queuelist, zone_list);
- __blk_mq_requeue_request(rq);
-}
-
enum prep_dispatch {
PREP_DISPATCH_OK,
PREP_DISPATCH_NO_TAG,
@@ -2019,7 +2004,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
struct request *rq;
int queued;
blk_status_t ret = BLK_STS_OK;
- LIST_HEAD(zone_list);
bool needs_resource = false;
if (list_empty(list))
@@ -2061,23 +2045,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
case BLK_STS_DEV_RESOURCE:
blk_mq_handle_dev_resource(rq, list);
goto out;
- case BLK_STS_ZONE_RESOURCE:
- /*
- * Move the request to zone_list and keep going through
- * the dispatch list to find more requests the drive can
- * accept.
- */
- blk_mq_handle_zone_resource(rq, &zone_list);
- needs_resource = true;
- break;
default:
blk_mq_end_request(rq, ret);
}
} while (!list_empty(list));
out:
- if (!list_empty(&zone_list))
- list_splice_tail_init(&zone_list, list);
-
/* If we didn't flush the entire list, we could have told the driver
* there was more coming, but that turned out to be a lie.
*/
@@ -2164,6 +2136,15 @@ static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
}
/*
+ * ->next_cpu is always calculated from hctx->cpumask, so simply use
+ * it for speeding up the check
+ */
+static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx)
+{
+ return hctx->next_cpu >= nr_cpu_ids;
+}
+
+/*
* It'd be great if the workqueue API had a way to pass
* in a mask and had some smarts for more clever placement.
* For now we just round-robin here, switching for every
@@ -2174,7 +2155,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
bool tried = false;
int next_cpu = hctx->next_cpu;
- if (hctx->queue->nr_hw_queues == 1)
+ /* Switch to unbound if no allowable CPUs in this hctx */
+ if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx))
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
@@ -2948,22 +2930,37 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
void blk_mq_submit_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- struct blk_plug *plug = blk_mq_plug(bio);
+ struct blk_plug *plug = current->plug;
const int is_sync = op_is_sync(bio->bi_opf);
struct blk_mq_hw_ctx *hctx;
unsigned int nr_segs = 1;
struct request *rq;
blk_status_t ret;
+ /*
+ * If the plug has a cached request for this queue, try to use it.
+ */
+ rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
+
+ /*
+ * A BIO that was released from a zone write plug has already been
+ * through the preparation in this function, already holds a reference
+ * on the queue usage counter, and is the only write BIO in-flight for
+ * the target zone. Go straight to preparing a request for it.
+ */
+ if (bio_zone_write_plugging(bio)) {
+ nr_segs = bio->__bi_nr_segments;
+ if (rq)
+ blk_queue_exit(q);
+ goto new_request;
+ }
+
bio = blk_queue_bounce(bio, q);
/*
- * If the plug has a cached request for this queue, try use it.
- *
* The cached request already holds a q_usage_counter reference and we
* don't have to acquire a new one if we use it.
*/
- rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
if (!rq) {
if (unlikely(bio_queue_enter(bio)))
return;
@@ -2980,6 +2977,10 @@ void blk_mq_submit_bio(struct bio *bio)
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
+ if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
+ goto queue_exit;
+
+new_request:
if (!rq) {
rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
if (unlikely(!rq))
@@ -3002,6 +3003,9 @@ void blk_mq_submit_bio(struct bio *bio)
return;
}
+ if (bio_zone_write_plugging(bio))
+ blk_zone_write_plug_init_request(rq);
+
if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
return;
@@ -3483,14 +3487,30 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
return data.has_rq;
}
-static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
- struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx,
+ unsigned int this_cpu)
{
- if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu)
- return false;
- if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
- return false;
- return true;
+ enum hctx_type type = hctx->type;
+ int cpu;
+
+ /*
+ * hctx->cpumask has to rule out isolated CPUs, but userspace still
+ * might submit IOs on these isolated CPUs, so use the queue map to
+ * check if all CPUs mapped to this hctx are offline
+ */
+ for_each_online_cpu(cpu) {
+ struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue,
+ type, cpu);
+
+ if (h != hctx)
+ continue;
+
+ /* this hctx has at least one online CPU */
+ if (this_cpu != cpu)
+ return true;
+ }
+
+ return false;
}
static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
@@ -3498,8 +3518,7 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
struct blk_mq_hw_ctx, cpuhp_online);
- if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
- !blk_mq_last_cpu_in_hctx(cpu, hctx))
+ if (blk_mq_hctx_has_online_cpu(hctx, cpu))
return 0;
/*
@@ -3907,6 +3926,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
}
queue_for_each_hw_ctx(q, hctx, i) {
+ int cpu;
+
/*
* If no software queues are mapped to this hardware queue,
* disable it and free the request entries.
@@ -3934,6 +3955,15 @@ static void blk_mq_map_swqueue(struct request_queue *q)
sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
/*
+ * Rule out isolated CPUs from hctx->cpumask to avoid
+ * running block kworker on isolated CPUs
+ */
+ for_each_cpu(cpu, hctx->cpumask) {
+ if (cpu_is_isolated(cpu))
+ cpumask_clear_cpu(cpu, hctx->cpumask);
+ }
+
+ /*
* Initialize batch roundrobin counts
*/
hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index f75a9ecfebde..260beea8e332 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -365,37 +365,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
qmap->mq_map[cpu] = 0;
}
-/*
- * blk_mq_plug() - Get caller context plug
- * @bio : the bio being submitted by the caller context
- *
- * Plugging, by design, may delay the insertion of BIOs into the elevator in
- * order to increase BIO merging opportunities. This however can cause BIO
- * insertion order to change from the order in which submit_bio() is being
- * executed in the case of multiple contexts concurrently issuing BIOs to a
- * device, even if these context are synchronized to tightly control BIO issuing
- * order. While this is not a problem with regular block devices, this ordering
- * change can cause write BIO failures with zoned block devices as these
- * require sequential write patterns to zones. Prevent this from happening by
- * ignoring the plug state of a BIO issuing context if it is for a zoned block
- * device and the BIO to plug is a write operation.
- *
- * Return current->plug if the bio can be plugged and NULL otherwise
- */
-static inline struct blk_plug *blk_mq_plug( struct bio *bio)
-{
- /* Zoned block device write operation case: do not plug the BIO */
- if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio)))
- return NULL;
-
- /*
- * For regular block devices or read operations, use the context plug
- * which may be NULL if blk_start_plug() was not executed.
- */
- return current->plug;
-}
-
/* Free all requests on the list */
static inline void blk_mq_free_requests(struct list_head *list)
{
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9d6033e01f2e..ebba05a2bc7f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -411,24 +411,32 @@ EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
* blk_queue_max_zone_append_sectors - set max sectors for a single zone append
* @q: the request queue for the device
* @max_zone_append_sectors: maximum number of sectors to write per command
+ *
+ * Sets the maximum number of sectors allowed for zone append commands.
+ * Specifying 0 for @max_zone_append_sectors indicates that the queue does
+ * not natively support zone append operations and that the block layer must
+ * emulate these operations using regular writes.
**/
void blk_queue_max_zone_append_sectors(struct request_queue *q,
unsigned int max_zone_append_sectors)
{
- unsigned int max_sectors;
+ unsigned int max_sectors = 0;
if (WARN_ON(!blk_queue_is_zoned(q)))
return;
- max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
- max_sectors = min(q->limits.chunk_sectors, max_sectors);
+ if (max_zone_append_sectors) {
+ max_sectors = min(q->limits.max_hw_sectors,
+ max_zone_append_sectors);
+ max_sectors = min(q->limits.chunk_sectors, max_sectors);
- /*
- * Signal eventual driver bugs resulting in the max_zone_append sectors limit
- * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
- * or the max_hw_sectors limit not set.
- */
- WARN_ON(!max_sectors);
+ /*
+ * Signal eventual driver bugs resulting in the max_zone_append
+ * sectors limit being 0 due to the chunk_sectors limit (zone
+ * size) not set or the max_hw_sectors limit not set.
+ */
+ WARN_ON_ONCE(!max_sectors);
+ }
q->limits.max_zone_append_sectors = max_sectors;
}
@@ -755,8 +763,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
- t->max_zone_append_sectors = min(t->max_zone_append_sectors,
- b->max_zone_append_sectors);
+ t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
+ queue_limits_max_zone_append_sectors(b));
t->bounce = max(t->bounce, b->bounce);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
@@ -1044,22 +1052,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
/**
- * blk_queue_required_elevator_features - Set a queue required elevator features
- * @q: the request queue for the target device
- * @features: Required elevator features OR'ed together
- *
- * Tell the block layer that for the device controlled through @q, only the
- * only elevators that can be used are those that implement at least the set of
- * features specified by @features.
- */
-void blk_queue_required_elevator_features(struct request_queue *q,
- unsigned int features)
-{
- q->required_elevator_features = features;
-}
-EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
-
-/**
* blk_queue_can_use_dma_map_merging - configure queue for merging segments.
* @q: the request queue for the device
* @dev: the device pointer for dma
diff --git a/block/blk-stat.c b/block/blk-stat.c
index e42c263e53fb..eaf60097bbe1 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -57,9 +57,6 @@ void blk_stat_add(struct request *rq, u64 now)
value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
- if (req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE)
- blk_throtl_stat_add(rq, value);
-
rcu_read_lock();
cpu = get_cpu();
list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8c8f69d8ba48..f0f9314ab65c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -224,7 +224,7 @@ static ssize_t queue_zone_write_granularity_show(struct request_queue *q,
static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
{
- unsigned long long max_sectors = q->limits.max_zone_append_sectors;
+ unsigned long long max_sectors = queue_max_zone_append_sectors(q);
return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT);
}
@@ -516,10 +516,6 @@ QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
-#endif
-
/* legacy alias for logical_block_size: */
static struct queue_sysfs_entry queue_hw_sector_size_entry = {
.attr = {.name = "hw_sector_size", .mode = 0444 },
@@ -640,9 +636,6 @@ static struct attribute *queue_attrs[] = {
&queue_fua_entry.attr,
&queue_dax_entry.attr,
&queue_poll_delay_entry.attr,
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- &blk_throtl_sample_time_entry.attr,
-#endif
&queue_virt_boundary_mask_entry.attr,
&queue_dma_alignment_entry.attr,
NULL,
@@ -814,7 +807,6 @@ int blk_register_queue(struct gendisk *disk)
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(disk);
- blk_throtl_register(disk);
/* Now everything is ready and send out KOBJ_ADD uevent */
kobject_uevent(&disk->queue_kobj, KOBJ_ADD);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f4850a6f860b..80aaca18bfb0 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -25,18 +25,6 @@
#define DFL_THROTL_SLICE_HD (HZ / 10)
#define DFL_THROTL_SLICE_SSD (HZ / 50)
#define MAX_THROTL_SLICE (HZ)
-#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
-#define MIN_THROTL_BPS (320 * 1024)
-#define MIN_THROTL_IOPS (10)
-#define DFL_LATENCY_TARGET (-1L)
-#define DFL_IDLE_THRESHOLD (0)
-#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
-#define LATENCY_FILTERED_SSD (0)
-/*
- * For HD, very small latency comes from sequential IO. Such IO is helpless to
- * help determine if its IO is impacted by others, hence we ignore the IO
- */
-#define LATENCY_FILTERED_HD (1000L) /* 1ms */
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
@@ -70,19 +58,6 @@ struct throtl_data
/* Work for dispatching throttled bios */
struct work_struct dispatch_work;
- unsigned int limit_index;
- bool limit_valid[LIMIT_CNT];
-
- unsigned long low_upgrade_time;
- unsigned long low_downgrade_time;
-
- unsigned int scale;
-
- struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
- struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
- struct latency_bucket __percpu *latency_buckets[2];
- unsigned long last_calculate_time;
- unsigned long filtered_latency;
bool track_bio_latency;
};
@@ -126,84 +101,24 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
return container_of(sq, struct throtl_data, service_queue);
}
-/*
- * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
- * make the IO dispatch more smooth.
- * Scale up: linearly scale up according to elapsed time since upgrade. For
- * every throtl_slice, the limit scales up 1/2 .low limit till the
- * limit hits .max limit
- * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
- */
-static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
-{
- /* arbitrary value to avoid too big scale */
- if (td->scale < 4096 && time_after_eq(jiffies,
- td->low_upgrade_time + td->scale * td->throtl_slice))
- td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
-
- return low + (low >> 1) * td->scale;
-}
-
static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
{
struct blkcg_gq *blkg = tg_to_blkg(tg);
- struct throtl_data *td;
- uint64_t ret;
if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
return U64_MAX;
- td = tg->td;
- ret = tg->bps[rw][td->limit_index];
- if (ret == 0 && td->limit_index == LIMIT_LOW) {
- /* intermediate node or iops isn't 0 */
- if (!list_empty(&blkg->blkcg->css.children) ||
- tg->iops[rw][td->limit_index])
- return U64_MAX;
- else
- return MIN_THROTL_BPS;
- }
-
- if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
- tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
- uint64_t adjusted;
-
- adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
- ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
- }
- return ret;
+ return tg->bps[rw];
}
static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
{
struct blkcg_gq *blkg = tg_to_blkg(tg);
- struct throtl_data *td;
- unsigned int ret;
if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
return UINT_MAX;
- td = tg->td;
- ret = tg->iops[rw][td->limit_index];
- if (ret == 0 && tg->td->limit_index == LIMIT_LOW) {
- /* intermediate node or bps isn't 0 */
- if (!list_empty(&blkg->blkcg->css.children) ||
- tg->bps[rw][td->limit_index])
- return UINT_MAX;
- else
- return MIN_THROTL_IOPS;
- }
-
- if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
- tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
- uint64_t adjusted;
-
- adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
- if (adjusted > UINT_MAX)
- adjusted = UINT_MAX;
- ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
- }
- return ret;
+ return tg->iops[rw];
}
#define request_bucket_index(sectors) \
@@ -359,20 +274,10 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk,
}
RB_CLEAR_NODE(&tg->rb_node);
- tg->bps[READ][LIMIT_MAX] = U64_MAX;
- tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
- tg->iops[READ][LIMIT_MAX] = UINT_MAX;
- tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
- tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
- tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
- tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
- tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
- /* LIMIT_LOW will have default value 0 */
-
- tg->latency_target = DFL_LATENCY_TARGET;
- tg->latency_target_conf = DFL_LATENCY_TARGET;
- tg->idletime_threshold = DFL_IDLE_THRESHOLD;
- tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
+ tg->bps[READ] = U64_MAX;
+ tg->bps[WRITE] = U64_MAX;
+ tg->iops[READ] = UINT_MAX;
+ tg->iops[WRITE] = UINT_MAX;
return &tg->pd;
@@ -418,18 +323,15 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
static void tg_update_has_rules(struct throtl_grp *tg)
{
struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
- struct throtl_data *td = tg->td;
int rw;
for (rw = READ; rw <= WRITE; rw++) {
tg->has_rules_iops[rw] =
(parent_tg && parent_tg->has_rules_iops[rw]) ||
- (td->limit_valid[td->limit_index] &&
- tg_iops_limit(tg, rw) != UINT_MAX);
+ tg_iops_limit(tg, rw) != UINT_MAX;
tg->has_rules_bps[rw] =
(parent_tg && parent_tg->has_rules_bps[rw]) ||
- (td->limit_valid[td->limit_index] &&
- (tg_bps_limit(tg, rw) != U64_MAX));
+ tg_bps_limit(tg, rw) != U64_MAX;
}
}
@@ -443,49 +345,6 @@ static void throtl_pd_online(struct blkg_policy_data *pd)
tg_update_has_rules(tg);
}
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-static void blk_throtl_update_limit_valid(struct throtl_data *td)
-{
- struct cgroup_subsys_state *pos_css;
- struct blkcg_gq *blkg;
- bool low_valid = false;
-
- rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
-
- if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
- tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) {
- low_valid = true;
- break;
- }
- }
- rcu_read_unlock();
-
- td->limit_valid[LIMIT_LOW] = low_valid;
-}
-#else
-static inline void blk_throtl_update_limit_valid(struct throtl_data *td)
-{
-}
-#endif
-
-static void throtl_upgrade_state(struct throtl_data *td);
-static void throtl_pd_offline(struct blkg_policy_data *pd)
-{
- struct throtl_grp *tg = pd_to_tg(pd);
-
- tg->bps[READ][LIMIT_LOW] = 0;
- tg->bps[WRITE][LIMIT_LOW] = 0;
- tg->iops[READ][LIMIT_LOW] = 0;
- tg->iops[WRITE][LIMIT_LOW] = 0;
-
- blk_throtl_update_limit_valid(tg->td);
-
- if (!tg->td->limit_valid[tg->td->limit_index])
- throtl_upgrade_state(tg->td);
-}
-
static void throtl_pd_free(struct blkg_policy_data *pd)
{
struct throtl_grp *tg = pd_to_tg(pd);
@@ -1151,8 +1010,6 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
return nr_disp;
}
-static bool throtl_can_upgrade(struct throtl_data *td,
- struct throtl_grp *this_tg);
/**
* throtl_pending_timer_fn - timer function for service_queue->pending_timer
* @t: the pending_timer member of the throtl_service_queue being serviced
@@ -1189,9 +1046,6 @@ static void throtl_pending_timer_fn(struct timer_list *t)
if (!q->root_blkg)
goto out_unlock;
- if (throtl_can_upgrade(td, NULL))
- throtl_upgrade_state(td);
-
again:
parent_sq = sq->parent_sq;
dispatched = false;
@@ -1331,22 +1185,12 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
blkg_for_each_descendant_pre(blkg, pos_css,
global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) {
struct throtl_grp *this_tg = blkg_to_tg(blkg);
- struct throtl_grp *parent_tg;
tg_update_has_rules(this_tg);
/* ignore root/second level */
if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent ||
!blkg->parent->parent)
continue;
- parent_tg = blkg_to_tg(blkg->parent);
- /*
- * make sure all children has lower idle time threshold and
- * higher latency target
- */
- this_tg->idletime_threshold = min(this_tg->idletime_threshold,
- parent_tg->idletime_threshold);
- this_tg->latency_target = max(this_tg->latency_target,
- parent_tg->latency_target);
}
rcu_read_unlock();
@@ -1367,6 +1211,53 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
}
}
+static int blk_throtl_init(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+ struct throtl_data *td;
+ int ret;
+
+ td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
+ if (!td)
+ return -ENOMEM;
+
+ INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
+ throtl_service_queue_init(&td->service_queue);
+
+ /*
+ * Freeze queue before activating policy, to synchronize with IO path,
+ * which is protected by 'q_usage_counter'.
+ */
+ blk_mq_freeze_queue(disk->queue);
+ blk_mq_quiesce_queue(disk->queue);
+
+ q->td = td;
+ td->queue = q;
+
+ /* activate policy */
+ ret = blkcg_activate_policy(disk, &blkcg_policy_throtl);
+ if (ret) {
+ q->td = NULL;
+ kfree(td);
+ goto out;
+ }
+
+ if (blk_queue_nonrot(q))
+ td->throtl_slice = DFL_THROTL_SLICE_SSD;
+ else
+ td->throtl_slice = DFL_THROTL_SLICE_HD;
+ td->track_bio_latency = !queue_is_mq(q);
+ if (!td->track_bio_latency)
+ blk_stat_enable_accounting(q);
+
+out:
+ blk_mq_unquiesce_queue(disk->queue);
+ blk_mq_unfreeze_queue(disk->queue);
+
+ return ret;
+}
+
+
static ssize_t tg_set_conf(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off, bool is_u64)
{
@@ -1378,6 +1269,16 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
blkg_conf_init(&ctx, buf);
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ goto out_finish;
+
+ if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
+ ret = blk_throtl_init(ctx.bdev->bd_disk);
+ if (ret)
+ goto out_finish;
+ }
+
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
if (ret)
goto out_finish;
@@ -1444,25 +1345,25 @@ static int tg_print_rwstat_recursive(struct seq_file *sf, void *v)
static struct cftype throtl_legacy_files[] = {
{
.name = "throttle.read_bps_device",
- .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
+ .private = offsetof(struct throtl_grp, bps[READ]),
.seq_show = tg_print_conf_u64,
.write = tg_set_conf_u64,
},
{
.name = "throttle.write_bps_device",
- .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
+ .private = offsetof(struct throtl_grp, bps[WRITE]),
.seq_show = tg_print_conf_u64,
.write = tg_set_conf_u64,
},
{
.name = "throttle.read_iops_device",
- .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
+ .private = offsetof(struct throtl_grp, iops[READ]),
.seq_show = tg_print_conf_uint,
.write = tg_set_conf_uint,
},
{
.name = "throttle.write_iops_device",
- .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
+ .private = offsetof(struct throtl_grp, iops[WRITE]),
.seq_show = tg_print_conf_uint,
.write = tg_set_conf_uint,
},
@@ -1494,61 +1395,43 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
{
struct throtl_grp *tg = pd_to_tg(pd);
const char *dname = blkg_dev_name(pd->blkg);
- char bufs[4][21] = { "max", "max", "max", "max" };
u64 bps_dft;
unsigned int iops_dft;
- char idle_time[26] = "";
- char latency_time[26] = "";
if (!dname)
return 0;
- if (off == LIMIT_LOW) {
- bps_dft = 0;
- iops_dft = 0;
- } else {
- bps_dft = U64_MAX;
- iops_dft = UINT_MAX;
- }
+ bps_dft = U64_MAX;
+ iops_dft = UINT_MAX;
- if (tg->bps_conf[READ][off] == bps_dft &&
- tg->bps_conf[WRITE][off] == bps_dft &&
- tg->iops_conf[READ][off] == iops_dft &&
- tg->iops_conf[WRITE][off] == iops_dft &&
- (off != LIMIT_LOW ||
- (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD &&
- tg->latency_target_conf == DFL_LATENCY_TARGET)))
+ if (tg->bps_conf[READ] == bps_dft &&
+ tg->bps_conf[WRITE] == bps_dft &&
+ tg->iops_conf[READ] == iops_dft &&
+ tg->iops_conf[WRITE] == iops_dft)
return 0;
- if (tg->bps_conf[READ][off] != U64_MAX)
- snprintf(bufs[0], sizeof(bufs[0]), "%llu",
- tg->bps_conf[READ][off]);
- if (tg->bps_conf[WRITE][off] != U64_MAX)
- snprintf(bufs[1], sizeof(bufs[1]), "%llu",
- tg->bps_conf[WRITE][off]);
- if (tg->iops_conf[READ][off] != UINT_MAX)
- snprintf(bufs[2], sizeof(bufs[2]), "%u",
- tg->iops_conf[READ][off]);
- if (tg->iops_conf[WRITE][off] != UINT_MAX)
- snprintf(bufs[3], sizeof(bufs[3]), "%u",
- tg->iops_conf[WRITE][off]);
- if (off == LIMIT_LOW) {
- if (tg->idletime_threshold_conf == ULONG_MAX)
- strcpy(idle_time, " idle=max");
- else
- snprintf(idle_time, sizeof(idle_time), " idle=%lu",
- tg->idletime_threshold_conf);
+ seq_printf(sf, "%s", dname);
+ if (tg->bps_conf[READ] == U64_MAX)
+ seq_printf(sf, " rbps=max");
+ else
+ seq_printf(sf, " rbps=%llu", tg->bps_conf[READ]);
- if (tg->latency_target_conf == ULONG_MAX)
- strcpy(latency_time, " latency=max");
- else
- snprintf(latency_time, sizeof(latency_time),
- " latency=%lu", tg->latency_target_conf);
- }
+ if (tg->bps_conf[WRITE] == U64_MAX)
+ seq_printf(sf, " wbps=max");
+ else
+ seq_printf(sf, " wbps=%llu", tg->bps_conf[WRITE]);
+
+ if (tg->iops_conf[READ] == UINT_MAX)
+ seq_printf(sf, " riops=max");
+ else
+ seq_printf(sf, " riops=%u", tg->iops_conf[READ]);
- seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
- dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
- latency_time);
+ if (tg->iops_conf[WRITE] == UINT_MAX)
+ seq_printf(sf, " wiops=max");
+ else
+ seq_printf(sf, " wiops=%u", tg->iops_conf[WRITE]);
+
+ seq_printf(sf, "\n");
return 0;
}
@@ -1566,13 +1449,20 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
struct blkg_conf_ctx ctx;
struct throtl_grp *tg;
u64 v[4];
- unsigned long idle_time;
- unsigned long latency_time;
int ret;
- int index = of_cft(of)->private;
blkg_conf_init(&ctx, buf);
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ goto out_finish;
+
+ if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
+ ret = blk_throtl_init(ctx.bdev->bd_disk);
+ if (ret)
+ goto out_finish;
+ }
+
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
if (ret)
goto out_finish;
@@ -1580,13 +1470,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
tg = blkg_to_tg(ctx.blkg);
tg_update_carryover(tg);
- v[0] = tg->bps_conf[READ][index];
- v[1] = tg->bps_conf[WRITE][index];
- v[2] = tg->iops_conf[READ][index];
- v[3] = tg->iops_conf[WRITE][index];
+ v[0] = tg->bps[READ];
+ v[1] = tg->bps[WRITE];
+ v[2] = tg->iops[READ];
+ v[3] = tg->iops[WRITE];
- idle_time = tg->idletime_threshold_conf;
- latency_time = tg->latency_target_conf;
while (true) {
char tok[27]; /* wiops=18446744073709551616 */
char *p;
@@ -1618,60 +1506,16 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
v[2] = min_t(u64, val, UINT_MAX);
else if (!strcmp(tok, "wiops") && val > 1)
v[3] = min_t(u64, val, UINT_MAX);
- else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
- idle_time = val;
- else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
- latency_time = val;
else
goto out_finish;
}
- tg->bps_conf[READ][index] = v[0];
- tg->bps_conf[WRITE][index] = v[1];
- tg->iops_conf[READ][index] = v[2];
- tg->iops_conf[WRITE][index] = v[3];
+ tg->bps[READ] = v[0];
+ tg->bps[WRITE] = v[1];
+ tg->iops[READ] = v[2];
+ tg->iops[WRITE] = v[3];
- if (index == LIMIT_MAX) {
- tg->bps[READ][index] = v[0];
- tg->bps[WRITE][index] = v[1];
- tg->iops[READ][index] = v[2];
- tg->iops[WRITE][index] = v[3];
- }
- tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
- tg->bps_conf[READ][LIMIT_MAX]);
- tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
- tg->bps_conf[WRITE][LIMIT_MAX]);
- tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
- tg->iops_conf[READ][LIMIT_MAX]);
- tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
- tg->iops_conf[WRITE][LIMIT_MAX]);
- tg->idletime_threshold_conf = idle_time;
- tg->latency_target_conf = latency_time;
-
- /* force user to configure all settings for low limit */
- if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
- tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) ||
- tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD ||
- tg->latency_target_conf == DFL_LATENCY_TARGET) {
- tg->bps[READ][LIMIT_LOW] = 0;
- tg->bps[WRITE][LIMIT_LOW] = 0;
- tg->iops[READ][LIMIT_LOW] = 0;
- tg->iops[WRITE][LIMIT_LOW] = 0;
- tg->idletime_threshold = DFL_IDLE_THRESHOLD;
- tg->latency_target = DFL_LATENCY_TARGET;
- } else if (index == LIMIT_LOW) {
- tg->idletime_threshold = tg->idletime_threshold_conf;
- tg->latency_target = tg->latency_target_conf;
- }
-
- blk_throtl_update_limit_valid(tg->td);
- if (tg->td->limit_valid[LIMIT_LOW]) {
- if (index == LIMIT_LOW)
- tg->td->limit_index = LIMIT_LOW;
- } else
- tg->td->limit_index = LIMIT_MAX;
- tg_conf_updated(tg, index == LIMIT_LOW &&
- tg->td->limit_valid[LIMIT_LOW]);
+ tg_conf_updated(tg, false);
ret = 0;
out_finish:
blkg_conf_exit(&ctx);
@@ -1679,21 +1523,11 @@ out_finish:
}
static struct cftype throtl_files[] = {
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- {
- .name = "low",
- .flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = tg_print_limit,
- .write = tg_set_limit,
- .private = LIMIT_LOW,
- },
-#endif
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = tg_print_limit,
.write = tg_set_limit,
- .private = LIMIT_MAX,
},
{ } /* terminate */
};
@@ -1712,7 +1546,6 @@ struct blkcg_policy blkcg_policy_throtl = {
.pd_alloc_fn = throtl_pd_alloc,
.pd_init_fn = throtl_pd_init,
.pd_online_fn = throtl_pd_online,
- .pd_offline_fn = throtl_pd_offline,
.pd_free_fn = throtl_pd_free,
};
@@ -1722,6 +1555,9 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
struct cgroup_subsys_state *pos_css;
struct blkcg_gq *blkg;
+ if (!blk_throtl_activated(q))
+ return;
+
spin_lock_irq(&q->queue_lock);
/*
* queue_lock is held, rcu lock is not needed here technically.
@@ -1761,418 +1597,6 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
spin_unlock_irq(&q->queue_lock);
}
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
-{
- unsigned long rtime = jiffies, wtime = jiffies;
-
- if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
- rtime = tg->last_low_overflow_time[READ];
- if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
- wtime = tg->last_low_overflow_time[WRITE];
- return min(rtime, wtime);
-}
-
-static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
-{
- struct throtl_service_queue *parent_sq;
- struct throtl_grp *parent = tg;
- unsigned long ret = __tg_last_low_overflow_time(tg);
-
- while (true) {
- parent_sq = parent->service_queue.parent_sq;
- parent = sq_to_tg(parent_sq);
- if (!parent)
- break;
-
- /*
- * The parent doesn't have low limit, it always reaches low
- * limit. Its overflow time is useless for children
- */
- if (!parent->bps[READ][LIMIT_LOW] &&
- !parent->iops[READ][LIMIT_LOW] &&
- !parent->bps[WRITE][LIMIT_LOW] &&
- !parent->iops[WRITE][LIMIT_LOW])
- continue;
- if (time_after(__tg_last_low_overflow_time(parent), ret))
- ret = __tg_last_low_overflow_time(parent);
- }
- return ret;
-}
-
-static bool throtl_tg_is_idle(struct throtl_grp *tg)
-{
- /*
- * cgroup is idle if:
- * - single idle is too long, longer than a fixed value (in case user
- * configure a too big threshold) or 4 times of idletime threshold
- * - average think time is more than threshold
- * - IO latency is largely below threshold
- */
- unsigned long time;
- bool ret;
-
- time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
- ret = tg->latency_target == DFL_LATENCY_TARGET ||
- tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
- (blk_time_get_ns() >> 10) - tg->last_finish_time > time ||
- tg->avg_idletime > tg->idletime_threshold ||
- (tg->latency_target && tg->bio_cnt &&
- tg->bad_bio_cnt * 5 < tg->bio_cnt);
- throtl_log(&tg->service_queue,
- "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
- tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt,
- tg->bio_cnt, ret, tg->td->scale);
- return ret;
-}
-
-static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw)
-{
- struct throtl_service_queue *sq = &tg->service_queue;
- bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW];
-
- /*
- * if low limit is zero, low limit is always reached.
- * if low limit is non-zero, we can check if there is any request
- * is queued to determine if low limit is reached as we throttle
- * request according to limit.
- */
- return !limit || sq->nr_queued[rw];
-}
-
-static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
-{
- /*
- * cgroup reaches low limit when low limit of READ and WRITE are
- * both reached, it's ok to upgrade to next limit if cgroup reaches
- * low limit
- */
- if (throtl_low_limit_reached(tg, READ) &&
- throtl_low_limit_reached(tg, WRITE))
- return true;
-
- if (time_after_eq(jiffies,
- tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
- throtl_tg_is_idle(tg))
- return true;
- return false;
-}
-
-static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
-{
- while (true) {
- if (throtl_tg_can_upgrade(tg))
- return true;
- tg = sq_to_tg(tg->service_queue.parent_sq);
- if (!tg || !tg_to_blkg(tg)->parent)
- return false;
- }
- return false;
-}
-
-static bool throtl_can_upgrade(struct throtl_data *td,
- struct throtl_grp *this_tg)
-{
- struct cgroup_subsys_state *pos_css;
- struct blkcg_gq *blkg;
-
- if (td->limit_index != LIMIT_LOW)
- return false;
-
- if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
- return false;
-
- rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
-
- if (tg == this_tg)
- continue;
- if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
- continue;
- if (!throtl_hierarchy_can_upgrade(tg)) {
- rcu_read_unlock();
- return false;
- }
- }
- rcu_read_unlock();
- return true;
-}
-
-static void throtl_upgrade_check(struct throtl_grp *tg)
-{
- unsigned long now = jiffies;
-
- if (tg->td->limit_index != LIMIT_LOW)
- return;
-
- if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
- return;
-
- tg->last_check_time = now;
-
- if (!time_after_eq(now,
- __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
- return;
-
- if (throtl_can_upgrade(tg->td, NULL))
- throtl_upgrade_state(tg->td);
-}
-
-static void throtl_upgrade_state(struct throtl_data *td)
-{
- struct cgroup_subsys_state *pos_css;
- struct blkcg_gq *blkg;
-
- throtl_log(&td->service_queue, "upgrade to max");
- td->limit_index = LIMIT_MAX;
- td->low_upgrade_time = jiffies;
- td->scale = 0;
- rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
- struct throtl_service_queue *sq = &tg->service_queue;
-
- tg->disptime = jiffies - 1;
- throtl_select_dispatch(sq);
- throtl_schedule_next_dispatch(sq, true);
- }
- rcu_read_unlock();
- throtl_select_dispatch(&td->service_queue);
- throtl_schedule_next_dispatch(&td->service_queue, true);
- queue_work(kthrotld_workqueue, &td->dispatch_work);
-}
-
-static void throtl_downgrade_state(struct throtl_data *td)
-{
- td->scale /= 2;
-
- throtl_log(&td->service_queue, "downgrade, scale %d", td->scale);
- if (td->scale) {
- td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
- return;
- }
-
- td->limit_index = LIMIT_LOW;
- td->low_downgrade_time = jiffies;
-}
-
-static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
-{
- struct throtl_data *td = tg->td;
- unsigned long now = jiffies;
-
- /*
- * If cgroup is below low limit, consider downgrade and throttle other
- * cgroups
- */
- if (time_after_eq(now, tg_last_low_overflow_time(tg) +
- td->throtl_slice) &&
- (!throtl_tg_is_idle(tg) ||
- !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
- return true;
- return false;
-}
-
-static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
-{
- struct throtl_data *td = tg->td;
-
- if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice))
- return false;
-
- while (true) {
- if (!throtl_tg_can_downgrade(tg))
- return false;
- tg = sq_to_tg(tg->service_queue.parent_sq);
- if (!tg || !tg_to_blkg(tg)->parent)
- break;
- }
- return true;
-}
-
-static void throtl_downgrade_check(struct throtl_grp *tg)
-{
- uint64_t bps;
- unsigned int iops;
- unsigned long elapsed_time;
- unsigned long now = jiffies;
-
- if (tg->td->limit_index != LIMIT_MAX ||
- !tg->td->limit_valid[LIMIT_LOW])
- return;
- if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
- return;
- if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
- return;
-
- elapsed_time = now - tg->last_check_time;
- tg->last_check_time = now;
-
- if (time_before(now, tg_last_low_overflow_time(tg) +
- tg->td->throtl_slice))
- return;
-
- if (tg->bps[READ][LIMIT_LOW]) {
- bps = tg->last_bytes_disp[READ] * HZ;
- do_div(bps, elapsed_time);
- if (bps >= tg->bps[READ][LIMIT_LOW])
- tg->last_low_overflow_time[READ] = now;
- }
-
- if (tg->bps[WRITE][LIMIT_LOW]) {
- bps = tg->last_bytes_disp[WRITE] * HZ;
- do_div(bps, elapsed_time);
- if (bps >= tg->bps[WRITE][LIMIT_LOW])
- tg->last_low_overflow_time[WRITE] = now;
- }
-
- if (tg->iops[READ][LIMIT_LOW]) {
- iops = tg->last_io_disp[READ] * HZ / elapsed_time;
- if (iops >= tg->iops[READ][LIMIT_LOW])
- tg->last_low_overflow_time[READ] = now;
- }
-
- if (tg->iops[WRITE][LIMIT_LOW]) {
- iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
- if (iops >= tg->iops[WRITE][LIMIT_LOW])
- tg->last_low_overflow_time[WRITE] = now;
- }
-
- /*
- * If cgroup is below low limit, consider downgrade and throttle other
- * cgroups
- */
- if (throtl_hierarchy_can_downgrade(tg))
- throtl_downgrade_state(tg->td);
-
- tg->last_bytes_disp[READ] = 0;
- tg->last_bytes_disp[WRITE] = 0;
- tg->last_io_disp[READ] = 0;
- tg->last_io_disp[WRITE] = 0;
-}
-
-static void blk_throtl_update_idletime(struct throtl_grp *tg)
-{
- unsigned long now;
- unsigned long last_finish_time = tg->last_finish_time;
-
- if (last_finish_time == 0)
- return;
-
- now = blk_time_get_ns() >> 10;
- if (now <= last_finish_time ||
- last_finish_time == tg->checked_last_finish_time)
- return;
-
- tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
- tg->checked_last_finish_time = last_finish_time;
-}
-
-static void throtl_update_latency_buckets(struct throtl_data *td)
-{
- struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
- int i, cpu, rw;
- unsigned long last_latency[2] = { 0 };
- unsigned long latency[2];
-
- if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW])
- return;
- if (time_before(jiffies, td->last_calculate_time + HZ))
- return;
- td->last_calculate_time = jiffies;
-
- memset(avg_latency, 0, sizeof(avg_latency));
- for (rw = READ; rw <= WRITE; rw++) {
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
-
- for_each_possible_cpu(cpu) {
- struct latency_bucket *bucket;
-
- /* this isn't race free, but ok in practice */
- bucket = per_cpu_ptr(td->latency_buckets[rw],
- cpu);
- tmp->total_latency += bucket[i].total_latency;
- tmp->samples += bucket[i].samples;
- bucket[i].total_latency = 0;
- bucket[i].samples = 0;
- }
-
- if (tmp->samples >= 32) {
- int samples = tmp->samples;
-
- latency[rw] = tmp->total_latency;
-
- tmp->total_latency = 0;
- tmp->samples = 0;
- latency[rw] /= samples;
- if (latency[rw] == 0)
- continue;
- avg_latency[rw][i].latency = latency[rw];
- }
- }
- }
-
- for (rw = READ; rw <= WRITE; rw++) {
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- if (!avg_latency[rw][i].latency) {
- if (td->avg_buckets[rw][i].latency < last_latency[rw])
- td->avg_buckets[rw][i].latency =
- last_latency[rw];
- continue;
- }
-
- if (!td->avg_buckets[rw][i].valid)
- latency[rw] = avg_latency[rw][i].latency;
- else
- latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
- avg_latency[rw][i].latency) >> 3;
-
- td->avg_buckets[rw][i].latency = max(latency[rw],
- last_latency[rw]);
- td->avg_buckets[rw][i].valid = true;
- last_latency[rw] = td->avg_buckets[rw][i].latency;
- }
- }
-
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
- throtl_log(&td->service_queue,
- "Latency bucket %d: read latency=%ld, read valid=%d, "
- "write latency=%ld, write valid=%d", i,
- td->avg_buckets[READ][i].latency,
- td->avg_buckets[READ][i].valid,
- td->avg_buckets[WRITE][i].latency,
- td->avg_buckets[WRITE][i].valid);
-}
-#else
-static inline void throtl_update_latency_buckets(struct throtl_data *td)
-{
-}
-
-static void blk_throtl_update_idletime(struct throtl_grp *tg)
-{
-}
-
-static void throtl_downgrade_check(struct throtl_grp *tg)
-{
-}
-
-static void throtl_upgrade_check(struct throtl_grp *tg)
-{
-}
-
-static bool throtl_can_upgrade(struct throtl_data *td,
- struct throtl_grp *this_tg)
-{
- return false;
-}
-
-static void throtl_upgrade_state(struct throtl_data *td)
-{
-}
-#endif
-
bool __blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@ -2185,21 +1609,12 @@ bool __blk_throtl_bio(struct bio *bio)
struct throtl_data *td = tg->td;
rcu_read_lock();
-
spin_lock_irq(&q->queue_lock);
-
- throtl_update_latency_buckets(td);
-
- blk_throtl_update_idletime(tg);
-
sq = &tg->service_queue;
-again:
while (true) {
if (tg->last_low_overflow_time[rw] == 0)
tg->last_low_overflow_time[rw] = jiffies;
- throtl_downgrade_check(tg);
- throtl_upgrade_check(tg);
/* throtl is FIFO - if bios are already queued, should queue */
if (sq->nr_queued[rw])
break;
@@ -2207,10 +1622,6 @@ again:
/* if above limits, break to queue */
if (!tg_may_dispatch(tg, bio, NULL)) {
tg->last_low_overflow_time[rw] = jiffies;
- if (throtl_can_upgrade(td, tg)) {
- throtl_upgrade_state(td);
- goto again;
- }
break;
}
@@ -2270,215 +1681,25 @@ again:
}
out_unlock:
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- if (throttled || !td->track_bio_latency)
- bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
-#endif
spin_unlock_irq(&q->queue_lock);
rcu_read_unlock();
return throttled;
}
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-static void throtl_track_latency(struct throtl_data *td, sector_t size,
- enum req_op op, unsigned long time)
-{
- const bool rw = op_is_write(op);
- struct latency_bucket *latency;
- int index;
-
- if (!td || td->limit_index != LIMIT_LOW ||
- !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
- !blk_queue_nonrot(td->queue))
- return;
-
- index = request_bucket_index(size);
-
- latency = get_cpu_ptr(td->latency_buckets[rw]);
- latency[index].total_latency += time;
- latency[index].samples++;
- put_cpu_ptr(td->latency_buckets[rw]);
-}
-
-void blk_throtl_stat_add(struct request *rq, u64 time_ns)
-{
- struct request_queue *q = rq->q;
- struct throtl_data *td = q->td;
-
- throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
- time_ns >> 10);
-}
-
-void blk_throtl_bio_endio(struct bio *bio)
-{
- struct blkcg_gq *blkg;
- struct throtl_grp *tg;
- u64 finish_time_ns;
- unsigned long finish_time;
- unsigned long start_time;
- unsigned long lat;
- int rw = bio_data_dir(bio);
-
- blkg = bio->bi_blkg;
- if (!blkg)
- return;
- tg = blkg_to_tg(blkg);
- if (!tg->td->limit_valid[LIMIT_LOW])
- return;
-
- finish_time_ns = blk_time_get_ns();
- tg->last_finish_time = finish_time_ns >> 10;
-
- start_time = bio_issue_time(&bio->bi_issue) >> 10;
- finish_time = __bio_issue_time(finish_time_ns) >> 10;
- if (!start_time || finish_time <= start_time)
- return;
-
- lat = finish_time - start_time;
- /* this is only for bio based driver */
- if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY))
- throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue),
- bio_op(bio), lat);
-
- if (tg->latency_target && lat >= tg->td->filtered_latency) {
- int bucket;
- unsigned int threshold;
-
- bucket = request_bucket_index(bio_issue_size(&bio->bi_issue));
- threshold = tg->td->avg_buckets[rw][bucket].latency +
- tg->latency_target;
- if (lat > threshold)
- tg->bad_bio_cnt++;
- /*
- * Not race free, could get wrong count, which means cgroups
- * will be throttled
- */
- tg->bio_cnt++;
- }
-
- if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
- tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
- tg->bio_cnt /= 2;
- tg->bad_bio_cnt /= 2;
- }
-}
-#endif
-
-int blk_throtl_init(struct gendisk *disk)
-{
- struct request_queue *q = disk->queue;
- struct throtl_data *td;
- int ret;
-
- td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
- if (!td)
- return -ENOMEM;
- td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
- LATENCY_BUCKET_SIZE, __alignof__(u64));
- if (!td->latency_buckets[READ]) {
- kfree(td);
- return -ENOMEM;
- }
- td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
- LATENCY_BUCKET_SIZE, __alignof__(u64));
- if (!td->latency_buckets[WRITE]) {
- free_percpu(td->latency_buckets[READ]);
- kfree(td);
- return -ENOMEM;
- }
-
- INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
- throtl_service_queue_init(&td->service_queue);
-
- q->td = td;
- td->queue = q;
-
- td->limit_valid[LIMIT_MAX] = true;
- td->limit_index = LIMIT_MAX;
- td->low_upgrade_time = jiffies;
- td->low_downgrade_time = jiffies;
-
- /* activate policy */
- ret = blkcg_activate_policy(disk, &blkcg_policy_throtl);
- if (ret) {
- free_percpu(td->latency_buckets[READ]);
- free_percpu(td->latency_buckets[WRITE]);
- kfree(td);
- }
- return ret;
-}
-
void blk_throtl_exit(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
- BUG_ON(!q->td);
+ if (!blk_throtl_activated(q))
+ return;
+
del_timer_sync(&q->td->service_queue.pending_timer);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(disk, &blkcg_policy_throtl);
- free_percpu(q->td->latency_buckets[READ]);
- free_percpu(q->td->latency_buckets[WRITE]);
kfree(q->td);
}
-void blk_throtl_register(struct gendisk *disk)
-{
- struct request_queue *q = disk->queue;
- struct throtl_data *td;
- int i;
-
- td = q->td;
- BUG_ON(!td);
-
- if (blk_queue_nonrot(q)) {
- td->throtl_slice = DFL_THROTL_SLICE_SSD;
- td->filtered_latency = LATENCY_FILTERED_SSD;
- } else {
- td->throtl_slice = DFL_THROTL_SLICE_HD;
- td->filtered_latency = LATENCY_FILTERED_HD;
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
- td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
- }
- }
-#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
- /* if no low limit, use previous default */
- td->throtl_slice = DFL_THROTL_SLICE_HD;
-
-#else
- td->track_bio_latency = !queue_is_mq(q);
- if (!td->track_bio_latency)
- blk_stat_enable_accounting(q);
-#endif
-}
-
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
-{
- if (!q->td)
- return -EINVAL;
- return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
-}
-
-ssize_t blk_throtl_sample_time_store(struct request_queue *q,
- const char *page, size_t count)
-{
- unsigned long v;
- unsigned long t;
-
- if (!q->td)
- return -EINVAL;
- if (kstrtoul(page, 10, &v))
- return -EINVAL;
- t = msecs_to_jiffies(v);
- if (t == 0 || t > MAX_THROTL_SLICE)
- return -EINVAL;
- q->td->throtl_slice = t;
- return count;
-}
-#endif
-
static int __init throtl_init(void)
{
kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index bffbc9cfc8ab..393c3d134b96 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -58,12 +58,6 @@ enum tg_state_flags {
THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */
};
-enum {
- LIMIT_LOW,
- LIMIT_MAX,
- LIMIT_CNT,
-};
-
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
@@ -102,14 +96,14 @@ struct throtl_grp {
bool has_rules_iops[2];
/* internally used bytes per second rate limits */
- uint64_t bps[2][LIMIT_CNT];
+ uint64_t bps[2];
/* user configured bps limits */
- uint64_t bps_conf[2][LIMIT_CNT];
+ uint64_t bps_conf[2];
/* internally used IOPS limits */
- unsigned int iops[2][LIMIT_CNT];
+ unsigned int iops[2];
/* user configured IOPS limits */
- unsigned int iops_conf[2][LIMIT_CNT];
+ unsigned int iops_conf[2];
/* Number of bytes dispatched in current slice */
uint64_t bytes_disp[2];
@@ -132,22 +126,10 @@ struct throtl_grp {
unsigned long last_check_time;
- unsigned long latency_target; /* us */
- unsigned long latency_target_conf; /* us */
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
- unsigned long last_finish_time; /* ns / 1024 */
- unsigned long checked_last_finish_time; /* ns / 1024 */
- unsigned long avg_idletime; /* ns / 1024 */
- unsigned long idletime_threshold; /* us */
- unsigned long idletime_threshold_conf; /* us */
-
- unsigned int bio_cnt; /* total bios */
- unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
- unsigned long bio_cnt_reset_time;
-
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
@@ -168,23 +150,33 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
* Internal throttling interface
*/
#ifndef CONFIG_BLK_DEV_THROTTLING
-static inline int blk_throtl_init(struct gendisk *disk) { return 0; }
static inline void blk_throtl_exit(struct gendisk *disk) { }
-static inline void blk_throtl_register(struct gendisk *disk) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
static inline void blk_throtl_cancel_bios(struct gendisk *disk) { }
#else /* CONFIG_BLK_DEV_THROTTLING */
-int blk_throtl_init(struct gendisk *disk);
void blk_throtl_exit(struct gendisk *disk);
-void blk_throtl_register(struct gendisk *disk);
bool __blk_throtl_bio(struct bio *bio);
void blk_throtl_cancel_bios(struct gendisk *disk);
+static inline bool blk_throtl_activated(struct request_queue *q)
+{
+ return q->td != NULL;
+}
+
static inline bool blk_should_throtl(struct bio *bio)
{
- struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
+ struct throtl_grp *tg;
int rw = bio_data_dir(bio);
+ /*
+ * This is called under bio_queue_enter(), and it's synchronized with
+ * the activation of blk-throtl, which is protected by
+ * blk_mq_freeze_queue().
+ */
+ if (!blk_throtl_activated(bio->bi_bdev->bd_queue))
+ return false;
+
+ tg = blkg_to_tg(bio->bi_blkg);
if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
bio_set_flag(bio, BIO_CGROUP_ACCT);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index da0f4b2a8fa0..48e5e3bbb89c 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -7,6 +7,7 @@
*
* Copyright (c) 2016, Damien Le Moal
* Copyright (c) 2016, Western Digital
+ * Copyright (c) 2024, Western Digital Corporation or its affiliates.
*/
#include <linux/kernel.h>
@@ -16,8 +17,13 @@
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/mempool.h>
#include "blk.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-debugfs.h"
#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
@@ -32,6 +38,64 @@ static const char *const zone_cond_name[] = {
};
#undef ZONE_COND_NAME
+/*
+ * Per-zone write plug.
+ * @node: hlist_node structure for managing the plug using a hash table.
+ * @link: To list the plug in the zone write plug error list of the disk.
+ * @ref: Zone write plug reference counter. A zone write plug reference is
+ * always at least 1 when the plug is hashed in the disk plug hash table.
+ * The reference is incremented whenever a new BIO needing plugging is
+ * submitted and when a function needs to manipulate a plug. The
+ * reference count is decremented whenever a plugged BIO completes and
+ * when a function that referenced the plug returns. The initial
+ *	reference is dropped whenever the zone of the zone write plug is reset
+ *	or finished, and when the zone becomes full (last write BIO to the zone
+ * completes).
+ * @lock: Spinlock to atomically manipulate the plug.
+ * @flags: Flags indicating the plug state.
+ * @zone_no: The number of the zone the plug is managing.
+ * @wp_offset: The zone write pointer location relative to the start of the zone
+ * as a number of 512B sectors.
+ * @bio_list: The list of BIOs that are currently plugged.
+ * @bio_work: Work struct to handle issuing of plugged BIOs
+ * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
+ * @disk: The gendisk the plug belongs to.
+ */
+struct blk_zone_wplug {
+ struct hlist_node node;
+ struct list_head link;
+ atomic_t ref;
+ spinlock_t lock;
+ unsigned int flags;
+ unsigned int zone_no;
+ unsigned int wp_offset;
+ struct bio_list bio_list;
+ struct work_struct bio_work;
+ struct rcu_head rcu_head;
+ struct gendisk *disk;
+};
+
+/*
+ * Zone write plug flags bits:
+ * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
+ * that is, that write BIOs are being throttled due to a write BIO already
+ * being executed or the zone write plug bio list is not empty.
+ * - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be
+ * recovered with a report zone to update the zone write pointer offset.
+ * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
+ * from the disk hash table and that the initial reference to the zone
+ * write plug set when the plug was first added to the hash table has been
+ *   dropped. This flag is set when a zone is reset, finished, or becomes full,
+ *   to prevent new references to the zone write plug from being taken for
+ * newly incoming BIOs. A zone write plug flagged with this flag will be
+ * freed once all remaining references from BIOs or functions are dropped.
+ */
+#define BLK_ZONE_WPLUG_PLUGGED (1U << 0)
+#define BLK_ZONE_WPLUG_ERROR (1U << 1)
+#define BLK_ZONE_WPLUG_UNHASHED (1U << 2)
+
+#define BLK_ZONE_WPLUG_BUSY (BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR)
+
/**
* blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
* @zone_cond: BLK_ZONE_COND_XXX.
@@ -51,52 +115,6 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
-/*
- * Return true if a request is a write requests that needs zone write locking.
- */
-bool blk_req_needs_zone_write_lock(struct request *rq)
-{
- if (!rq->q->disk->seq_zones_wlock)
- return false;
-
- return blk_rq_is_seq_zoned_write(rq);
-}
-EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
-
-bool blk_req_zone_write_trylock(struct request *rq)
-{
- unsigned int zno = blk_rq_zone_no(rq);
-
- if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock))
- return false;
-
- WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
- rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
-
- return true;
-}
-EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);
-
-void __blk_req_zone_write_lock(struct request *rq)
-{
- if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
- rq->q->disk->seq_zones_wlock)))
- return;
-
- WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
- rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
-}
-EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
-
-void __blk_req_zone_write_unlock(struct request *rq)
-{
- rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
- if (rq->q->disk->seq_zones_wlock)
- WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
- rq->q->disk->seq_zones_wlock));
-}
-EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
-
/**
* bdev_nr_zones - Get number of zones
* @bdev: Target device
@@ -425,23 +443,1288 @@ fail:
return ret;
}
-void disk_free_zone_bitmaps(struct gendisk *disk)
+static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector)
+{
+ if (!disk->conv_zones_bitmap)
+ return false;
+ return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
+}
+
+static bool disk_insert_zone_wplug(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ struct blk_zone_wplug *zwplg;
+ unsigned long flags;
+ unsigned int idx =
+ hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);
+
+ /*
+ * Add the new zone write plug to the hash table, but carefully as we
+	 * are racing with other submission contexts, so we may already have a
+ * zone write plug for the same zone.
+ */
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
+ if (zwplg->zone_no == zwplug->zone_no) {
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+ return false;
+ }
+ }
+ hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+ return true;
+}
+
+static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
+ sector_t sector)
+{
+ unsigned int zno = disk_zone_no(disk, sector);
+ unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
+ struct blk_zone_wplug *zwplug;
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
+ if (zwplug->zone_no == zno &&
+ atomic_inc_not_zero(&zwplug->ref)) {
+ rcu_read_unlock();
+ return zwplug;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
+ struct blk_zone_wplug *zwplug =
+ container_of(rcu_head, struct blk_zone_wplug, rcu_head);
+
+ mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
+}
+
+static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
+{
+ if (atomic_dec_and_test(&zwplug->ref)) {
+ WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
+ WARN_ON_ONCE(!list_empty(&zwplug->link));
+ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));
+
+ call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
+ }
+}
+
+static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ /* If the zone write plug was already removed, we are done. */
+ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
+ return false;
+
+ /* If the zone write plug is still busy, it cannot be removed. */
+ if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
+ return false;
+
+ /*
+ * Completions of BIOs with blk_zone_write_plug_bio_endio() may
+ * happen after handling a request completion with
+ * blk_zone_write_plug_finish_request() (e.g. with split BIOs
+	 * that are chained). In such a case, disk_zone_wplug_unplug_bio()
+ * should not attempt to remove the zone write plug until all BIO
+ * completions are seen. Check by looking at the zone write plug
+ * reference count, which is 2 when the plug is unused (one reference
+ * taken when the plug was allocated and another reference taken by the
+ * caller context).
+ */
+ if (atomic_read(&zwplug->ref) > 2)
+ return false;
+
+ /* We can remove zone write plugs for zones that are empty or full. */
+ return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity;
+}
+
+static void disk_remove_zone_wplug(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned long flags;
+
+ /* If the zone write plug was already removed, we have nothing to do. */
+ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
+ return;
+
+ /*
+ * Mark the zone write plug as unhashed and drop the extra reference we
+ * took when the plug was inserted in the hash table.
+ */
+ zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ hlist_del_init_rcu(&zwplug->node);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+ disk_put_zone_wplug(zwplug);
+}
+
+static void blk_zone_wplug_bio_work(struct work_struct *work);
+
+/*
+ * Get a reference on the write plug for the zone containing @sector.
+ * If the plug does not exist, it is allocated and hashed.
+ * Return a pointer to the zone write plug with the plug spinlock held.
+ */
+static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
+ sector_t sector, gfp_t gfp_mask,
+ unsigned long *flags)
+{
+ unsigned int zno = disk_zone_no(disk, sector);
+ struct blk_zone_wplug *zwplug;
+
+again:
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (zwplug) {
+ /*
+ * Check that a BIO completion or a zone reset or finish
+ * operation has not already removed the zone write plug from
+		 * the hash table and dropped its reference count. In such a case,
+		 * we need to get a new plug, so start over from the beginning.
+ */
+ spin_lock_irqsave(&zwplug->lock, *flags);
+ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
+ spin_unlock_irqrestore(&zwplug->lock, *flags);
+ disk_put_zone_wplug(zwplug);
+ goto again;
+ }
+ return zwplug;
+ }
+
+ /*
+ * Allocate and initialize a zone write plug with an extra reference
+ * so that it is not freed when the zone write plug becomes idle without
+ * the zone being full.
+ */
+ zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
+ if (!zwplug)
+ return NULL;
+
+ INIT_HLIST_NODE(&zwplug->node);
+ INIT_LIST_HEAD(&zwplug->link);
+ atomic_set(&zwplug->ref, 2);
+ spin_lock_init(&zwplug->lock);
+ zwplug->flags = 0;
+ zwplug->zone_no = zno;
+ zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
+ bio_list_init(&zwplug->bio_list);
+ INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
+ zwplug->disk = disk;
+
+ spin_lock_irqsave(&zwplug->lock, *flags);
+
+ /*
+ * Insert the new zone write plug in the hash table. This can fail only
+ * if another context already inserted a plug. Retry from the beginning
+ * in such case.
+ */
+ if (!disk_insert_zone_wplug(disk, zwplug)) {
+ spin_unlock_irqrestore(&zwplug->lock, *flags);
+ mempool_free(zwplug, disk->zone_wplugs_pool);
+ goto again;
+ }
+
+ return zwplug;
+}
+
+static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
+ struct bio *bio)
+{
+ struct request_queue *q = zwplug->disk->queue;
+
+ bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+ bio_io_error(bio);
+ disk_put_zone_wplug(zwplug);
+ blk_queue_exit(q);
+}
+
+/*
+ * Abort (fail) all plugged BIOs of a zone write plug.
+ */
+static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
+{
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&zwplug->bio_list)))
+ blk_zone_wplug_bio_io_error(zwplug, bio);
+}
+
+/*
+ * Abort (fail) all plugged BIOs of a zone write plug that are not aligned
+ * with the assumed write pointer location of the zone when the BIO will
+ * be unplugged.
+ */
+static void disk_zone_wplug_abort_unaligned(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned int zone_capacity = disk->zone_capacity;
+ unsigned int wp_offset = zwplug->wp_offset;
+ struct bio_list bl = BIO_EMPTY_LIST;
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&zwplug->bio_list))) {
+ if (wp_offset >= zone_capacity ||
+ (bio_op(bio) != REQ_OP_ZONE_APPEND &&
+ bio_offset_from_zone_start(bio) != wp_offset)) {
+ blk_zone_wplug_bio_io_error(zwplug, bio);
+ continue;
+ }
+
+ wp_offset += bio_sectors(bio);
+ bio_list_add(&bl, bio);
+ }
+
+ bio_list_merge(&zwplug->bio_list, &bl);
+}
+
+static inline void disk_zone_wplug_set_error(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned long flags;
+
+ if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
+ return;
+
+ /*
+ * At this point, we already have a reference on the zone write plug.
+ * However, since we are going to add the plug to the disk zone write
+ * plugs work list, increase its reference count. This reference will
+ * be dropped in disk_zone_wplugs_work() once the error state is
+ * handled, or in disk_zone_wplug_clear_error() if the zone is reset or
+ * finished.
+ */
+ zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
+ atomic_inc(&zwplug->ref);
+
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+}
+
+static inline void disk_zone_wplug_clear_error(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned long flags;
+
+ if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
+ return;
+
+ /*
+ * We are racing with the error handling work which drops the reference
+ * on the zone write plug after handling the error state. So remove the
+ * plug from the error list and drop its reference count only if the
+ * error handling has not yet started, that is, if the zone write plug
+ * is still listed.
+ */
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ if (!list_empty(&zwplug->link)) {
+ list_del_init(&zwplug->link);
+ zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
+ disk_put_zone_wplug(zwplug);
+ }
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+}
+
+/*
+ * Set a zone write plug write pointer offset to either 0 (zone reset case)
+ * or to the zone size (zone finish case). This aborts all plugged BIOs, which
+ * is fine to do as doing a zone reset or zone finish while writes are in-flight
+ * is a mistake from the user which will most likely cause all plugged BIOs to
+ * fail anyway.
+ */
+static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug,
+ unsigned int wp_offset)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ /*
+ * Make sure that a BIO completion or another zone reset or finish
+ * operation has not already removed the plug from the hash table.
+ */
+ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ return;
+ }
+
+ /* Update the zone write pointer and abort all plugged BIOs. */
+ zwplug->wp_offset = wp_offset;
+ disk_zone_wplug_abort(zwplug);
+
+ /*
+ * Updating the write pointer offset puts back the zone
+ * in a good state. So clear the error flag and decrement the
+ * error count if we were in error state.
+ */
+ disk_zone_wplug_clear_error(disk, zwplug);
+
+ /*
+ * The zone write plug now has no BIO plugged: remove it from the
+ * hash table so that it cannot be seen. The plug will be freed
+ * when the last reference is dropped.
+ */
+ if (disk_should_remove_zone_wplug(disk, zwplug))
+ disk_remove_zone_wplug(disk, zwplug);
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
+ unsigned int wp_offset)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ sector_t sector = bio->bi_iter.bi_sector;
+ struct blk_zone_wplug *zwplug;
+
+ /* Conventional zones cannot be reset nor finished. */
+ if (disk_zone_is_conv(disk, sector)) {
+ bio_io_error(bio);
+ return true;
+ }
+
+ /*
+ * If we have a zone write plug, set its write pointer offset to 0
+ * (reset case) or to the zone size (finish case). This will abort all
+ * BIOs plugged for the target zone. It is fine as resetting or
+ * finishing zones while writes are still in-flight will result in the
+ * writes failing anyway.
+ */
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (zwplug) {
+ disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
+ disk_put_zone_wplug(zwplug);
+ }
+
+ return false;
+}
+
+static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ struct blk_zone_wplug *zwplug;
+ sector_t sector;
+
+ /*
+ * Set the write pointer offset of all zone write plugs to 0. This will
+ * abort all plugged BIOs. It is fine as resetting zones while writes
+ * are still in-flight will result in the writes failing anyway.
+ */
+ for (sector = 0; sector < get_capacity(disk);
+ sector += disk->queue->limits.chunk_sectors) {
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (zwplug) {
+ disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
+ disk_put_zone_wplug(zwplug);
+ }
+ }
+
+ return false;
+}
+
+static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
+ struct bio *bio, unsigned int nr_segs)
+{
+ /*
+ * Grab an extra reference on the BIO request queue usage counter.
+	 * This reference will be reused to submit a request for the BIO on
+	 * blk-mq devices; it is dropped when the BIO is failed, or after the
+	 * BIO is issued in the case of BIO-based devices.
+ */
+ percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);
+
+ /*
+ * The BIO is being plugged and thus will have to wait for the on-going
+ * write and for all other writes already plugged. So polling makes
+ * no sense.
+ */
+ bio_clear_polled(bio);
+
+ /*
+ * Reuse the poll cookie field to store the number of segments when
+ * split to the hardware limits.
+ */
+ bio->__bi_nr_segments = nr_segs;
+
+ /*
+ * We always receive BIOs after they are split and ready to be issued.
+ * The block layer passes the parts of a split BIO in order, and the
+	 * user must also issue writes sequentially. So simply add the new BIO
+ * at the tail of the list to preserve the sequential write order.
+ */
+ bio_list_add(&zwplug->bio_list, bio);
+}
+
+/*
+ * Called from bio_attempt_back_merge() when a BIO was merged with a request.
+ */
+void blk_zone_write_plug_bio_merged(struct bio *bio)
+{
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+
+ /*
+ * If the BIO was already plugged, then we were called through
+ * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
+ * For this case, we already hold a reference on the zone write plug for
+ * the BIO and blk_zone_write_plug_init_request() will handle the
+ * zone write pointer offset update.
+ */
+ if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
+ return;
+
+ bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+ /*
+ * Get a reference on the zone write plug of the target zone and advance
+ * the zone write pointer offset. Given that this is a merge, we already
+ * have at least one request and one BIO referencing the zone write
+ * plug. So this should not fail.
+ */
+ zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
+ bio->bi_iter.bi_sector);
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+ zwplug->wp_offset += bio_sectors(bio);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+/*
+ * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
+ * already went through zone write plugging (either a new BIO or one that was
+ * unplugged).
+ */
+void blk_zone_write_plug_init_request(struct request *req)
+{
+ sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
+ struct request_queue *q = req->q;
+ struct gendisk *disk = q->disk;
+ unsigned int zone_capacity = disk->zone_capacity;
+ struct blk_zone_wplug *zwplug =
+ disk_get_zone_wplug(disk, blk_rq_pos(req));
+ unsigned long flags;
+ struct bio *bio;
+
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ /*
+ * Indicate that completion of this request needs to be handled with
+ * blk_zone_write_plug_finish_request(), which will drop the reference
+ * on the zone write plug we took above on entry to this function.
+ */
+ req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;
+
+ if (blk_queue_nomerges(q))
+ return;
+
+ /*
+ * Walk through the list of plugged BIOs to check if they can be merged
+ * into the back of the request.
+ */
+ spin_lock_irqsave(&zwplug->lock, flags);
+ while (zwplug->wp_offset < zone_capacity) {
+ bio = bio_list_peek(&zwplug->bio_list);
+ if (!bio)
+ break;
+
+ if (bio->bi_iter.bi_sector != req_back_sector ||
+ !blk_rq_merge_ok(req, bio))
+ break;
+
+ WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
+ !bio->__bi_nr_segments);
+
+ bio_list_pop(&zwplug->bio_list);
+ if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
+ BIO_MERGE_OK) {
+ bio_list_add_head(&zwplug->bio_list, bio);
+ break;
+ }
+
+ /*
+ * Drop the extra reference on the queue usage we got when
+ * plugging the BIO and advance the write pointer offset.
+ */
+ blk_queue_exit(q);
+ zwplug->wp_offset += bio_sectors(bio);
+
+ req_back_sector += bio_sectors(bio);
+ }
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+/*
+ * Check and prepare a BIO for submission by incrementing the write pointer
+ * offset of its zone write plug and changing zone append operations into
+ * regular write when zone append emulation is needed.
+ */
+static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
+ struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+
+ /*
+ * Check that the user is not attempting to write to a full zone.
+ * We know such BIO will fail, and that would potentially overflow our
+ * write pointer offset beyond the end of the zone.
+ */
+ if (zwplug->wp_offset >= disk->zone_capacity)
+ goto err;
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ /*
+ * Use a regular write starting at the current write pointer.
+ * Similarly to native zone append operations, do not allow
+ * merging.
+ */
+ bio->bi_opf &= ~REQ_OP_MASK;
+ bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
+ bio->bi_iter.bi_sector += zwplug->wp_offset;
+
+ /*
+ * Remember that this BIO is in fact a zone append operation
+ * so that we can restore its operation code on completion.
+ */
+ bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
+ } else {
+ /*
+ * Check for non-sequential writes early because we avoid a
+ * whole lot of error handling trouble if we don't send it off
+ * to the driver.
+ */
+ if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
+ goto err;
+ }
+
+ /* Advance the zone write pointer offset. */
+ zwplug->wp_offset += bio_sectors(bio);
+
+ return true;
+
+err:
+ /* We detected an invalid write BIO: schedule error recovery. */
+ disk_zone_wplug_set_error(disk, zwplug);
+ kblockd_schedule_work(&disk->zone_wplugs_work);
+ return false;
+}
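
A worked example of the zone append emulation above, using made-up sector values to illustrate the arithmetic:

/*
 * Illustrative example (sector values are made up, not from this patch):
 * a REQ_OP_ZONE_APPEND BIO is submitted with bi_sector set to the start
 * of its target zone, say sector 0x80000. If the zone write plug's
 * wp_offset is 0x300, blk_zone_wplug_prepare_bio() turns the BIO into a
 * REQ_OP_WRITE | REQ_NOMERGE at sector 0x80300 and flags it with
 * BIO_EMULATES_ZONE_APPEND. On completion, the operation code is restored
 * to REQ_OP_ZONE_APPEND and bi_sector (0x80300) is what the submitter
 * reads back as the append location, as with a native zone append.
 */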
+
+static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ sector_t sector = bio->bi_iter.bi_sector;
+ struct blk_zone_wplug *zwplug;
+ gfp_t gfp_mask = GFP_NOIO;
+ unsigned long flags;
+
+ /*
+ * BIOs must be fully contained within a zone so that we use the correct
+ * zone write plug for the entire BIO. For blk-mq devices, the block
+ * layer should already have done any splitting required to ensure this
+ * and this BIO should thus not be straddling zone boundaries. For
+ * BIO-based devices, it is the responsibility of the driver to split
+ * the bio before submitting it.
+ */
+ if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
+ bio_io_error(bio);
+ return true;
+ }
+
+ /* Conventional zones do not need write plugging. */
+ if (disk_zone_is_conv(disk, sector)) {
+ /* Zone append to conventional zones is not allowed. */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ bio_io_error(bio);
+ return true;
+ }
+ return false;
+ }
+
+ if (bio->bi_opf & REQ_NOWAIT)
+ gfp_mask = GFP_NOWAIT;
+
+ zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
+ if (!zwplug) {
+ bio_io_error(bio);
+ return true;
+ }
+
+ /* Indicate that this BIO is being handled using zone write plugging. */
+ bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+ /*
+ * If the zone is already plugged or has a pending error, add the BIO
+ * to the plug BIO list. Otherwise, plug and let the BIO execute.
+ */
+ if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
+ goto plug;
+
+ /*
+ * If an error is detected when preparing the BIO, add it to the BIO
+ * list so that error recovery can deal with it.
+ */
+ if (!blk_zone_wplug_prepare_bio(zwplug, bio))
+ goto plug;
+
+ zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ return false;
+
+plug:
+ zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
+ blk_zone_wplug_add_bio(zwplug, bio, nr_segs);
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ return true;
+}
+
+/**
+ * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
+ * @bio: The BIO being submitted
+ * @nr_segs: The number of physical segments of @bio
+ *
+ * Handle write, write zeroes and zone append operations requiring emulation
+ * using zone write plugging.
+ *
+ * Return true whenever @bio execution needs to be delayed through the zone
+ * write plug. Otherwise, return false to let the submission path process
+ * @bio normally.
+ */
+bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
+{
+ struct block_device *bdev = bio->bi_bdev;
+
+ if (!bdev->bd_disk->zone_wplugs_hash)
+ return false;
+
+ /*
+ * If the BIO already has the plugging flag set, then it was already
+ * handled through this path and this is a submission from the zone
+ * plug bio submit work.
+ */
+ if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
+ return false;
+
+ /*
+	 * We do not need to do anything special for empty flush BIOs, e.g.
+	 * BIOs such as those issued by blkdev_issue_flush(). This is because it is
+ * the responsibility of the user to first wait for the completion of
+ * write operations for flush to have any effect on the persistence of
+ * the written data.
+ */
+ if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
+ return false;
+
+ /*
+ * Regular writes and write zeroes need to be handled through the target
+ * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
+ * which may need to go through the flush machinery depending on the
+ * target device capabilities. Plugging such writes is fine as the flush
+ * machinery operates at the request level, below the plug, and
+ * completion of the flush sequence will go through the regular BIO
+ * completion, which will handle zone write plugging.
+ * Zone append operations for devices that requested emulation must
+ * also be plugged so that these BIOs can be changed into regular
+ * write BIOs.
+ * Zone reset, reset all and finish commands need special treatment
+ * to correctly track the write pointer offset of zones. These commands
+ * are not plugged as we do not need serialization with write
+ * operations. It is the responsibility of the user to not issue reset
+ * and finish commands when write operations are in flight.
+ */
+ switch (bio_op(bio)) {
+ case REQ_OP_ZONE_APPEND:
+ if (!bdev_emulates_zone_append(bdev))
+ return false;
+ fallthrough;
+ case REQ_OP_WRITE:
+ case REQ_OP_WRITE_ZEROES:
+ return blk_zone_wplug_handle_write(bio, nr_segs);
+ case REQ_OP_ZONE_RESET:
+ return blk_zone_wplug_handle_reset_or_finish(bio, 0);
+ case REQ_OP_ZONE_FINISH:
+ return blk_zone_wplug_handle_reset_or_finish(bio,
+ bdev_zone_sectors(bdev));
+ case REQ_OP_ZONE_RESET_ALL:
+ return blk_zone_wplug_handle_reset_all(bio);
+ default:
+ return false;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
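
As an illustrative sketch only (not from this patch), a zoned BIO-based driver that relies on zone write plugging could call blk_zone_plug_bio() from its ->submit_bio() handler roughly as below; my_dev_submit_bio() and my_dev_issue_bio() are hypothetical names, and 0 is passed for nr_segs since a BIO-based driver has no precomputed segment count at this point.

static void my_dev_submit_bio(struct bio *bio)
{
	/*
	 * Let zone write plugging decide whether this BIO must wait for
	 * other writes to the same zone. When true is returned, the BIO
	 * has been taken over by zone write plugging (queued for later
	 * submission from the plug bio_work, or completed with an error),
	 * so the driver must not touch it anymore.
	 */
	if (blk_zone_plug_bio(bio, 0))
		return;

	/* The BIO was not plugged: issue it as usual. */
	my_dev_issue_bio(bio);
}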
+
+static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ /*
+ * Take a reference on the zone write plug and schedule the submission
+ * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
+ * reference we take here.
+ */
+ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
+ atomic_inc(&zwplug->ref);
+ queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
+}
+
+static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ /*
+ * If we had an error, schedule error recovery. The recovery work
+ * will restart submission of plugged BIOs.
+ */
+ if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ kblockd_schedule_work(&disk->zone_wplugs_work);
+ return;
+ }
+
+ /* Schedule submission of the next plugged BIO if we have one. */
+ if (!bio_list_empty(&zwplug->bio_list)) {
+ disk_zone_wplug_schedule_bio_work(disk, zwplug);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ return;
+ }
+
+ zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+
+ /*
+	 * If the zone is full (it was fully written or finished), or empty
+ * (it was reset), remove its zone write plug from the hash table.
+ */
+ if (disk_should_remove_zone_wplug(disk, zwplug))
+ disk_remove_zone_wplug(disk, zwplug);
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+void blk_zone_write_plug_bio_endio(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ struct blk_zone_wplug *zwplug =
+ disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ /* Make sure we do not see this BIO again by clearing the plug flag. */
+ bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+ /*
+ * If this is a regular write emulating a zone append operation,
+ * restore the original operation code.
+ */
+ if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
+ bio->bi_opf &= ~REQ_OP_MASK;
+ bio->bi_opf |= REQ_OP_ZONE_APPEND;
+ }
+
+ /*
+ * If the BIO failed, mark the plug as having an error to trigger
+ * recovery.
+ */
+ if (bio->bi_status != BLK_STS_OK) {
+ spin_lock_irqsave(&zwplug->lock, flags);
+ disk_zone_wplug_set_error(disk, zwplug);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ }
+
+ /* Drop the reference we took when the BIO was issued. */
+ disk_put_zone_wplug(zwplug);
+
+ /*
+ * For BIO-based devices, blk_zone_write_plug_finish_request()
+ * is not called. So we need to schedule execution of the next
+ * plugged BIO here.
+ */
+ if (bio->bi_bdev->bd_has_submit_bio)
+ disk_zone_wplug_unplug_bio(disk, zwplug);
+
+ /* Drop the reference we took when entering this function. */
+ disk_put_zone_wplug(zwplug);
+}
+
+void blk_zone_write_plug_finish_request(struct request *req)
+{
+ struct gendisk *disk = req->q->disk;
+ struct blk_zone_wplug *zwplug;
+
+ zwplug = disk_get_zone_wplug(disk, req->__sector);
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;
+
+ /*
+ * Drop the reference we took when the request was initialized in
+ * blk_zone_write_plug_init_request().
+ */
+ disk_put_zone_wplug(zwplug);
+
+ disk_zone_wplug_unplug_bio(disk, zwplug);
+
+ /* Drop the reference we took when entering this function. */
+ disk_put_zone_wplug(zwplug);
+}
+
+static void blk_zone_wplug_bio_work(struct work_struct *work)
+{
+ struct blk_zone_wplug *zwplug =
+ container_of(work, struct blk_zone_wplug, bio_work);
+ struct block_device *bdev;
+ unsigned long flags;
+ struct bio *bio;
+
+ /*
+ * Submit the next plugged BIO. If we do not have any, clear
+ * the plugged flag.
+ */
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ bio = bio_list_pop(&zwplug->bio_list);
+ if (!bio) {
+ zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ goto put_zwplug;
+ }
+
+ if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
+ /* Error recovery will decide what to do with the BIO. */
+ bio_list_add_head(&zwplug->bio_list, bio);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ goto put_zwplug;
+ }
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ bdev = bio->bi_bdev;
+ submit_bio_noacct_nocheck(bio);
+
+ /*
+ * blk-mq devices will reuse the extra reference on the request queue
+ * usage counter we took when the BIO was plugged, but the submission
+ * path for BIO-based devices will not do that. So drop this extra
+ * reference here.
+ */
+ if (bdev->bd_has_submit_bio)
+ blk_queue_exit(bdev->bd_disk->queue);
+
+put_zwplug:
+ /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
+ disk_put_zone_wplug(zwplug);
+}
+
+static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
+{
+ switch (zone->cond) {
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ return zone->wp - zone->start;
+ case BLK_ZONE_COND_FULL:
+ return zone->len;
+ case BLK_ZONE_COND_EMPTY:
+ return 0;
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ default:
+ /*
+ * Conventional, offline and read-only zones do not have a valid
+ * write pointer.
+ */
+ return UINT_MAX;
+ }
+}
+
+static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
+ unsigned int idx, void *data)
+{
+ struct blk_zone *zonep = data;
+
+ *zonep = *zone;
+ return 0;
+}
+
+static void disk_zone_wplug_handle_error(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ sector_t zone_start_sector =
+ bdev_zone_sectors(disk->part0) * zwplug->zone_no;
+ unsigned int noio_flag;
+ struct blk_zone zone;
+ unsigned long flags;
+ int ret;
+
+ /* Get the current zone information from the device. */
+ noio_flag = memalloc_noio_save();
+ ret = disk->fops->report_zones(disk, zone_start_sector, 1,
+ blk_zone_wplug_report_zone_cb, &zone);
+ memalloc_noio_restore(noio_flag);
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ /*
+	 * A zone reset or finish may have cleared the error already. In such a
+ * case, do nothing as the report zones may have seen the "old" write
+ * pointer value before the reset/finish operation completed.
+ */
+ if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
+ goto unlock;
+
+ zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
+
+ if (ret != 1) {
+ /*
+ * We failed to get the zone information, meaning that something
+ * is likely really wrong with the device. Abort all remaining
+		 * plugged BIOs as otherwise we could end up waiting forever on
+ * plugged BIOs to complete if there is a queue freeze on-going.
+ */
+ disk_zone_wplug_abort(zwplug);
+ goto unplug;
+ }
+
+ /* Update the zone write pointer offset. */
+ zwplug->wp_offset = blk_zone_wp_offset(&zone);
+ disk_zone_wplug_abort_unaligned(disk, zwplug);
+
+ /* Restart BIO submission if we still have any BIO left. */
+ if (!bio_list_empty(&zwplug->bio_list)) {
+ disk_zone_wplug_schedule_bio_work(disk, zwplug);
+ goto unlock;
+ }
+
+unplug:
+ zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+ if (disk_should_remove_zone_wplug(disk, zwplug))
+ disk_remove_zone_wplug(disk, zwplug);
+
+unlock:
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+static void disk_zone_wplugs_work(struct work_struct *work)
+{
+ struct gendisk *disk =
+ container_of(work, struct gendisk, zone_wplugs_work);
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+
+ while (!list_empty(&disk->zone_wplugs_err_list)) {
+ zwplug = list_first_entry(&disk->zone_wplugs_err_list,
+ struct blk_zone_wplug, link);
+ list_del_init(&zwplug->link);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+ disk_zone_wplug_handle_error(disk, zwplug);
+ disk_put_zone_wplug(zwplug);
+
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ }
+
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+}
+
+static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
+{
+ return 1U << disk->zone_wplugs_hash_bits;
+}
+
+void disk_init_zone_resources(struct gendisk *disk)
+{
+ spin_lock_init(&disk->zone_wplugs_lock);
+ INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
+ INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
+}
+
+/*
+ * For the size of a disk zone write plug hash table, use the size of the
+ * zone write plug mempool, which is the maximum of the disk open zones and
+ * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
+ * 9 bits. For a disk that has no limits, mempool size defaults to 128.
+ */
+#define BLK_ZONE_WPLUG_MAX_HASH_BITS 9
+#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128
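
For a sense of scale (illustrative numbers only), the sizing used by disk_alloc_zone_resources() below works out as follows:

/*
 * With the default pool size of 128 plugs, ilog2(128) + 1 = 8 hash bits,
 * that is, 1 << 8 = 256 hlist heads. A device advertising 1024 open zones
 * would give ilog2(1024) + 1 = 11, clamped to
 * BLK_ZONE_WPLUG_MAX_HASH_BITS = 9, that is, 512 hlist heads, matching
 * the 4KB of hlist_head entries mentioned above (on 64-bit).
 */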
+
+static int disk_alloc_zone_resources(struct gendisk *disk,
+ unsigned int pool_size)
+{
+ unsigned int i;
+
+ disk->zone_wplugs_hash_bits =
+ min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);
+
+ disk->zone_wplugs_hash =
+ kcalloc(disk_zone_wplugs_hash_size(disk),
+ sizeof(struct hlist_head), GFP_KERNEL);
+ if (!disk->zone_wplugs_hash)
+ return -ENOMEM;
+
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
+ INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);
+
+ disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
+ sizeof(struct blk_zone_wplug));
+ if (!disk->zone_wplugs_pool)
+ goto free_hash;
+
+ disk->zone_wplugs_wq =
+ alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
+ pool_size, disk->disk_name);
+ if (!disk->zone_wplugs_wq)
+ goto destroy_pool;
+
+ return 0;
+
+destroy_pool:
+ mempool_destroy(disk->zone_wplugs_pool);
+ disk->zone_wplugs_pool = NULL;
+free_hash:
+ kfree(disk->zone_wplugs_hash);
+ disk->zone_wplugs_hash = NULL;
+ disk->zone_wplugs_hash_bits = 0;
+ return -ENOMEM;
+}
+
+static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
+{
+ struct blk_zone_wplug *zwplug;
+ unsigned int i;
+
+ if (!disk->zone_wplugs_hash)
+ return;
+
+ /* Free all the zone write plugs we have. */
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
+ while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
+ zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
+ struct blk_zone_wplug, node);
+ atomic_inc(&zwplug->ref);
+ disk_remove_zone_wplug(disk, zwplug);
+ disk_put_zone_wplug(zwplug);
+ }
+ }
+
+ kfree(disk->zone_wplugs_hash);
+ disk->zone_wplugs_hash = NULL;
+ disk->zone_wplugs_hash_bits = 0;
+}
+
+void disk_free_zone_resources(struct gendisk *disk)
+{
+ cancel_work_sync(&disk->zone_wplugs_work);
+
+ if (disk->zone_wplugs_wq) {
+ destroy_workqueue(disk->zone_wplugs_wq);
+ disk->zone_wplugs_wq = NULL;
+ }
+
+ disk_destroy_zone_wplugs_hash_table(disk);
+
+ /*
+ * Wait for the zone write plugs to be RCU-freed before
+	 * destroying the mempool.
+ */
+ rcu_barrier();
+
+ mempool_destroy(disk->zone_wplugs_pool);
+ disk->zone_wplugs_pool = NULL;
+
kfree(disk->conv_zones_bitmap);
disk->conv_zones_bitmap = NULL;
- kfree(disk->seq_zones_wlock);
- disk->seq_zones_wlock = NULL;
+ disk->zone_capacity = 0;
+ disk->nr_zones = 0;
+}
+
+static inline bool disk_need_zone_resources(struct gendisk *disk)
+{
+ /*
+ * All mq zoned devices need zone resources so that the block layer
+ * can automatically handle write BIO plugging. BIO-based device drivers
+ * (e.g. DM devices) are normally responsible for handling zone write
+ * ordering and do not need zone resources, unless the driver requires
+ * zone append emulation.
+ */
+ return queue_is_mq(disk->queue) ||
+ queue_emulates_zone_append(disk->queue);
+}
+
+static int disk_revalidate_zone_resources(struct gendisk *disk,
+ unsigned int nr_zones)
+{
+ struct queue_limits *lim = &disk->queue->limits;
+ unsigned int pool_size;
+
+ if (!disk_need_zone_resources(disk))
+ return 0;
+
+ /*
+ * If the device has no limit on the maximum number of open and active
+ * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
+ */
+ pool_size = max(lim->max_open_zones, lim->max_active_zones);
+ if (!pool_size)
+ pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);
+
+ if (!disk->zone_wplugs_hash)
+ return disk_alloc_zone_resources(disk, pool_size);
+
+ return 0;
}
struct blk_revalidate_zone_args {
struct gendisk *disk;
unsigned long *conv_zones_bitmap;
- unsigned long *seq_zones_wlock;
unsigned int nr_zones;
+ unsigned int zone_capacity;
sector_t sector;
};
/*
+ * Update the disk zone resources information and device queue limits.
+ * The disk queue is frozen when this is executed.
+ */
+static int disk_update_zone_resources(struct gendisk *disk,
+ struct blk_revalidate_zone_args *args)
+{
+ struct request_queue *q = disk->queue;
+ unsigned int nr_seq_zones, nr_conv_zones = 0;
+ unsigned int pool_size;
+ struct queue_limits lim;
+
+ disk->nr_zones = args->nr_zones;
+ disk->zone_capacity = args->zone_capacity;
+ swap(disk->conv_zones_bitmap, args->conv_zones_bitmap);
+ if (disk->conv_zones_bitmap)
+ nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap,
+ disk->nr_zones);
+ if (nr_conv_zones >= disk->nr_zones) {
+ pr_warn("%s: Invalid number of conventional zones %u / %u\n",
+ disk->disk_name, nr_conv_zones, disk->nr_zones);
+ return -ENODEV;
+ }
+
+ if (!disk->zone_wplugs_pool)
+ return 0;
+
+ /*
+ * If the device has no limit on the maximum number of open and active
+ * zones, set its max open zone limit to the mempool size to indicate
+ * to the user that there is a potential performance impact due to
+ * dynamic zone write plug allocation when simultaneously writing to
+ * more zones than the size of the mempool.
+ */
+ lim = queue_limits_start_update(q);
+
+ nr_seq_zones = disk->nr_zones - nr_conv_zones;
+ pool_size = max(lim.max_open_zones, lim.max_active_zones);
+ if (!pool_size)
+ pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);
+
+ mempool_resize(disk->zone_wplugs_pool, pool_size);
+
+ if (!lim.max_open_zones && !lim.max_active_zones) {
+ if (pool_size < nr_seq_zones)
+ lim.max_open_zones = pool_size;
+ else
+ lim.max_open_zones = 0;
+ }
+
+ return queue_limits_commit_update(q, &lim);
+}
+
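The limit-update logic above boils down to a few lines of arithmetic. The sketch below (illustrative only, with made-up open/active limits and zone counts) prints the mempool size and the max_open_zones value that would be advertised to the user.

#include <stdio.h>

#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128

static void show(unsigned int max_open, unsigned int max_active,
		 unsigned int nr_seq_zones)
{
	unsigned int pool_size = max_open > max_active ? max_open : max_active;
	unsigned int reported_open = max_open;

	if (!pool_size)
		pool_size = nr_seq_zones < BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE ?
			nr_seq_zones : BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE;

	/* No limits at all: advertise the pool size, unless it covers the disk. */
	if (!max_open && !max_active)
		reported_open = pool_size < nr_seq_zones ? pool_size : 0;

	printf("limits %u/%u, %u seq zones -> pool %u, max_open_zones %u\n",
	       max_open, max_active, nr_seq_zones, pool_size, reported_open);
}

int main(void)
{
	show(0, 14, 96256);	/* e.g. an SMR HDD with an active zone limit */
	show(0, 0, 96256);	/* no limits: mempool defaults to 128 */
	show(0, 0, 64);		/* small disk with fewer zones than the default */
	return 0;
}
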
+static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
+ struct blk_revalidate_zone_args *args)
+{
+ struct gendisk *disk = args->disk;
+ struct request_queue *q = disk->queue;
+
+ if (zone->capacity != zone->len) {
+ pr_warn("%s: Invalid conventional zone capacity\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+
+ if (!disk_need_zone_resources(disk))
+ return 0;
+
+ if (!args->conv_zones_bitmap) {
+ args->conv_zones_bitmap =
+ blk_alloc_zone_bitmap(q->node, args->nr_zones);
+ if (!args->conv_zones_bitmap)
+ return -ENOMEM;
+ }
+
+ set_bit(idx, args->conv_zones_bitmap);
+
+ return 0;
+}
+
+static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
+ struct blk_revalidate_zone_args *args)
+{
+ struct gendisk *disk = args->disk;
+ struct blk_zone_wplug *zwplug;
+ unsigned int wp_offset;
+ unsigned long flags;
+
+ /*
+ * Remember the capacity of the first sequential zone and check
+ * if it is constant for all zones.
+ */
+ if (!args->zone_capacity)
+ args->zone_capacity = zone->capacity;
+ if (zone->capacity != args->zone_capacity) {
+ pr_warn("%s: Invalid variable zone capacity\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+
+ /*
+	 * We need to track the write pointer of all zones that are neither
+	 * empty nor full, so make sure we have a zone write plug for any
+	 * such zone if the device has a zone write plug hash table.
+ */
+ if (!disk->zone_wplugs_hash)
+ return 0;
+
+ wp_offset = blk_zone_wp_offset(zone);
+ if (!wp_offset || wp_offset >= zone->capacity)
+ return 0;
+
+ zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
+ if (!zwplug)
+ return -ENOMEM;
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ disk_put_zone_wplug(zwplug);
+
+ return 0;
+}
+
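Purely as an illustration of the write pointer test above: only zones that are neither empty nor full need their write pointer tracked, so only those get a preallocated plug. A minimal userspace sketch with invented sector values:

#include <stdio.h>

struct zone {
	unsigned long long start;
	unsigned long long wp;
	unsigned long long capacity;
};

/* Mirror of the "neither empty nor full" check used during revalidation. */
static int zone_needs_wplug(const struct zone *z)
{
	unsigned long long wp_offset = z->wp - z->start;

	return wp_offset > 0 && wp_offset < z->capacity;
}

int main(void)
{
	struct zone zones[] = {
		{ .start = 0,       .wp = 0,       .capacity = 524288 }, /* empty */
		{ .start = 524288,  .wp = 600000,  .capacity = 524288 }, /* partial */
		{ .start = 1048576, .wp = 1572864, .capacity = 524288 }, /* full */
	};

	for (int i = 0; i < 3; i++)
		printf("zone %d: %s\n", i,
		       zone_needs_wplug(&zones[i]) ?
		       "needs a write plug" : "no plug needed");
	return 0;
}
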
+/*
* Helper function to check the validity of zones of a zoned block device.
*/
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
@@ -449,9 +1732,9 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
{
struct blk_revalidate_zone_args *args = data;
struct gendisk *disk = args->disk;
- struct request_queue *q = disk->queue;
sector_t capacity = get_capacity(disk);
- sector_t zone_sectors = q->limits.chunk_sectors;
+ sector_t zone_sectors = disk->queue->limits.chunk_sectors;
+ int ret;
/* Check for bad zones and holes in the zone report */
if (zone->start != args->sector) {
@@ -482,66 +1765,57 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENODEV;
}
+ if (!zone->capacity || zone->capacity > zone->len) {
+ pr_warn("%s: Invalid zone capacity\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+
/* Check zone type */
switch (zone->type) {
case BLK_ZONE_TYPE_CONVENTIONAL:
- if (!args->conv_zones_bitmap) {
- args->conv_zones_bitmap =
- blk_alloc_zone_bitmap(q->node, args->nr_zones);
- if (!args->conv_zones_bitmap)
- return -ENOMEM;
- }
- set_bit(idx, args->conv_zones_bitmap);
+ ret = blk_revalidate_conv_zone(zone, idx, args);
break;
case BLK_ZONE_TYPE_SEQWRITE_REQ:
- if (!args->seq_zones_wlock) {
- args->seq_zones_wlock =
- blk_alloc_zone_bitmap(q->node, args->nr_zones);
- if (!args->seq_zones_wlock)
- return -ENOMEM;
- }
+ ret = blk_revalidate_seq_zone(zone, idx, args);
break;
case BLK_ZONE_TYPE_SEQWRITE_PREF:
default:
pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
disk->disk_name, (int)zone->type, zone->start);
- return -ENODEV;
+ ret = -ENODEV;
}
- args->sector += zone->len;
- return 0;
+ if (!ret)
+ args->sector += zone->len;
+
+ return ret;
}
/**
- * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
+ * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
* @disk: Target disk
- * @update_driver_data: Callback to update driver data on the frozen disk
*
- * Helper function for low-level device drivers to check and (re) allocate and
- * initialize a disk request queue zone bitmaps. This functions should normally
- * be called within the disk ->revalidate method for blk-mq based drivers.
+ * Helper function for low-level device drivers to check, (re) allocate and
+ * initialize resources used for managing zoned disks. This function should
+ * normally be called by blk-mq based drivers when a zoned gendisk is probed
+ * and when the zone configuration of the gendisk changes (e.g. after a format).
* Before calling this function, the device driver must already have set the
* device zone size (chunk_sector limit) and the max zone append limit.
- * For BIO based drivers, this function cannot be used. BIO based device drivers
- * only need to set disk->nr_zones so that the sysfs exposed value is correct.
- * If the @update_driver_data callback function is not NULL, the callback is
- * executed with the device request queue frozen after all zones have been
- * checked.
+ * BIO based drivers can also use this function as long as the device queue
+ * can be safely frozen.
*/
-int blk_revalidate_disk_zones(struct gendisk *disk,
- void (*update_driver_data)(struct gendisk *disk))
+int blk_revalidate_disk_zones(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
sector_t zone_sectors = q->limits.chunk_sectors;
sector_t capacity = get_capacity(disk);
struct blk_revalidate_zone_args args = { };
unsigned int noio_flag;
- int ret;
+ int ret = -ENOMEM;
if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
return -EIO;
- if (WARN_ON_ONCE(!queue_is_mq(q)))
- return -EIO;
if (!capacity)
return -ENODEV;
@@ -556,7 +1830,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
return -ENODEV;
}
- if (!q->limits.max_zone_append_sectors) {
+ if (!queue_max_zone_append_sectors(q)) {
pr_warn("%s: Invalid 0 maximum zone append limit\n",
disk->disk_name);
return -ENODEV;
@@ -569,6 +1843,11 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
args.disk = disk;
args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
noio_flag = memalloc_noio_save();
+ ret = disk_revalidate_zone_resources(disk, args.nr_zones);
+ if (ret) {
+ memalloc_noio_restore(noio_flag);
+ return ret;
+ }
ret = disk->fops->report_zones(disk, 0, UINT_MAX,
blk_revalidate_zone_cb, &args);
if (!ret) {
@@ -588,26 +1867,59 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
}
/*
- * Install the new bitmaps and update nr_zones only once the queue is
- * stopped and all I/Os are completed (i.e. a scheduler is not
- * referencing the bitmaps).
+ * Set the new disk zone parameters only once the queue is frozen and
+ * all I/Os are completed.
*/
blk_mq_freeze_queue(q);
- if (ret > 0) {
- disk->nr_zones = args.nr_zones;
- swap(disk->seq_zones_wlock, args.seq_zones_wlock);
- swap(disk->conv_zones_bitmap, args.conv_zones_bitmap);
- if (update_driver_data)
- update_driver_data(disk);
- ret = 0;
- } else {
+ if (ret > 0)
+ ret = disk_update_zone_resources(disk, &args);
+ else
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
- disk_free_zone_bitmaps(disk);
- }
+ if (ret)
+ disk_free_zone_resources(disk);
blk_mq_unfreeze_queue(q);
- kfree(args.seq_zones_wlock);
kfree(args.conv_zones_bitmap);
+
return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
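For reference, the zone count set up by this function is a round-up division of the disk capacity by the power-of-two zone size, done with a shift. A tiny standalone check of that arithmetic (capacities below are invented):

#include <stdio.h>

static unsigned int ilog2ull(unsigned long long n)
{
	unsigned int bits = 0;

	while (n >>= 1)
		bits++;
	return bits;
}

int main(void)
{
	unsigned long long zone_sectors = 524288;	/* 256 MiB zones */
	unsigned long long capacities[] = {
		524288ULL * 96256,		/* exact multiple of the zone size */
		524288ULL * 96256 + 2048,	/* trailing runt zone */
	};

	for (int i = 0; i < 2; i++)
		printf("capacity %llu sectors -> %llu zones\n",
		       capacities[i],
		       (capacities[i] + zone_sectors - 1) >> ilog2ull(zone_sectors));
	return 0;
}
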
+
+#ifdef CONFIG_BLK_DEBUG_FS
+
+int queue_zone_wplugs_show(void *data, struct seq_file *m)
+{
+ struct request_queue *q = data;
+ struct gendisk *disk = q->disk;
+ struct blk_zone_wplug *zwplug;
+ unsigned int zwp_wp_offset, zwp_flags;
+ unsigned int zwp_zone_no, zwp_ref;
+ unsigned int zwp_bio_list_size, i;
+ unsigned long flags;
+
+ if (!disk->zone_wplugs_hash)
+ return 0;
+
+ rcu_read_lock();
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
+ hlist_for_each_entry_rcu(zwplug,
+ &disk->zone_wplugs_hash[i], node) {
+ spin_lock_irqsave(&zwplug->lock, flags);
+ zwp_zone_no = zwplug->zone_no;
+ zwp_flags = zwplug->flags;
+ zwp_ref = atomic_read(&zwplug->ref);
+ zwp_wp_offset = zwplug->wp_offset;
+ zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ seq_printf(m, "%u 0x%x %u %u %u\n",
+ zwp_zone_no, zwp_flags, zwp_ref,
+ zwp_wp_offset, zwp_bio_list_size);
+ }
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+#endif
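A hedged example of consuming this debugfs output from userspace: the path below assumes the attribute is registered as "zone_wplugs" under the standard per-disk debugfs directory (the registration lives in blk-mq-debugfs.c, not in this hunk), and it simply parses the five fields printed per plug.

#include <stdio.h>

int main(int argc, char **argv)
{
	unsigned int zone_no, flags, ref, wp_offset, bio_list_size;
	char path[256];
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <disk>\n", argv[0]);
		return 1;
	}

	/* Assumed location; needs debugfs mounted and CONFIG_BLK_DEBUG_FS. */
	snprintf(path, sizeof(path),
		 "/sys/kernel/debug/block/%s/zone_wplugs", argv[1]);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}

	/* One line per hashed zone write plug: "%u 0x%x %u %u %u". */
	while (fscanf(f, "%u %x %u %u %u", &zone_no, &flags, &ref,
		      &wp_offset, &bio_list_size) == 5)
		printf("zone %u: flags=0x%x ref=%u wp_offset=%u plugged_bios=%u\n",
		       zone_no, flags, ref, wp_offset, bio_list_size);

	fclose(f);
	return 0;
}
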
diff --git a/block/blk.h b/block/blk.h
index d9f584984bc4..6e94c10af798 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -38,6 +38,7 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
void blk_queue_start_drain(struct request_queue *q);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
void submit_bio_noacct_nocheck(struct bio *bio);
+void bio_await_chain(struct bio *bio);
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
@@ -269,6 +270,14 @@ static inline void bio_integrity_free(struct bio *bio)
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
+enum bio_merge_status {
+ BIO_MERGE_OK,
+ BIO_MERGE_NONE,
+ BIO_MERGE_FAILED,
+};
+
+enum bio_merge_status bio_attempt_back_merge(struct request *req,
+ struct bio *bio, unsigned int nr_segs);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
@@ -357,6 +366,7 @@ static inline bool blk_do_io_stat(struct request *rq)
}
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
+unsigned int part_in_flight(struct block_device *part);
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
@@ -378,17 +388,6 @@ static inline void ioc_clear_queue(struct request_queue *q)
}
#endif /* CONFIG_BLK_ICQ */
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
-extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
- const char *page, size_t count);
-extern void blk_throtl_bio_endio(struct bio *bio);
-extern void blk_throtl_stat_add(struct request *rq, u64 time);
-#else
-static inline void blk_throtl_bio_endio(struct bio *bio) { }
-static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
-#endif
-
struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
static inline bool blk_queue_may_bounce(struct request_queue *q)
@@ -407,13 +406,85 @@ static inline struct bio *blk_queue_bounce(struct bio *bio,
}
#ifdef CONFIG_BLK_DEV_ZONED
-void disk_free_zone_bitmaps(struct gendisk *disk);
+void disk_init_zone_resources(struct gendisk *disk);
+void disk_free_zone_resources(struct gendisk *disk);
+static inline bool bio_zone_write_plugging(struct bio *bio)
+{
+ return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
+}
+static inline bool bio_is_zone_append(struct bio *bio)
+{
+ return bio_op(bio) == REQ_OP_ZONE_APPEND ||
+ bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
+}
+void blk_zone_write_plug_bio_merged(struct bio *bio);
+void blk_zone_write_plug_init_request(struct request *rq);
+static inline void blk_zone_update_request_bio(struct request *rq,
+ struct bio *bio)
+{
+ /*
+ * For zone append requests, the request sector indicates the location
+ * at which the BIO data was written. Return this value to the BIO
+ * issuer through the BIO iter sector.
+ * For plugged zone writes, which include emulated zone append, we need
+ * the original BIO sector so that blk_zone_write_plug_bio_endio() can
+ * lookup the zone write plug.
+	 * look up the zone write plug.
+ if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
+ bio->bi_iter.bi_sector = rq->__sector;
+}
+void blk_zone_write_plug_bio_endio(struct bio *bio);
+static inline void blk_zone_bio_endio(struct bio *bio)
+{
+ /*
+ * For write BIOs to zoned devices, signal the completion of the BIO so
+ * that the next write BIO can be submitted by zone write plugging.
+ */
+ if (bio_zone_write_plugging(bio))
+ blk_zone_write_plug_bio_endio(bio);
+}
+
+void blk_zone_write_plug_finish_request(struct request *rq);
+static inline void blk_zone_finish_request(struct request *rq)
+{
+ if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ blk_zone_write_plug_finish_request(rq);
+}
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
unsigned long arg);
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg);
#else /* CONFIG_BLK_DEV_ZONED */
-static inline void disk_free_zone_bitmaps(struct gendisk *disk) {}
+static inline void disk_init_zone_resources(struct gendisk *disk)
+{
+}
+static inline void disk_free_zone_resources(struct gendisk *disk)
+{
+}
+static inline bool bio_zone_write_plugging(struct bio *bio)
+{
+ return false;
+}
+static inline bool bio_is_zone_append(struct bio *bio)
+{
+ return false;
+}
+static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
+{
+}
+static inline void blk_zone_write_plug_init_request(struct request *rq)
+{
+}
+static inline void blk_zone_update_request_bio(struct request *rq,
+ struct bio *bio)
+{
+}
+static inline void blk_zone_bio_endio(struct bio *bio)
+{
+}
+static inline void blk_zone_finish_request(struct request *rq)
+{
+}
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
unsigned int cmd, unsigned long arg)
{
diff --git a/block/elevator.c b/block/elevator.c
index 5ff093cb3cf8..f64ebd726e58 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -83,13 +83,6 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_bio_merge_ok);
-static inline bool elv_support_features(struct request_queue *q,
- const struct elevator_type *e)
-{
- return (q->required_elevator_features & e->elevator_features) ==
- q->required_elevator_features;
-}
-
/**
* elevator_match - Check whether @e's name or alias matches @name
* @e: Scheduler to test
@@ -120,7 +113,7 @@ static struct elevator_type *elevator_find_get(struct request_queue *q,
spin_lock(&elv_list_lock);
e = __elevator_find(name);
- if (e && (!elv_support_features(q, e) || !elevator_tryget(e)))
+ if (e && (!elevator_tryget(e)))
e = NULL;
spin_unlock(&elv_list_lock);
return e;
@@ -580,34 +573,8 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
}
/*
- * Get the first elevator providing the features required by the request queue.
- * Default to "none" if no matching elevator is found.
- */
-static struct elevator_type *elevator_get_by_features(struct request_queue *q)
-{
- struct elevator_type *e, *found = NULL;
-
- spin_lock(&elv_list_lock);
-
- list_for_each_entry(e, &elv_list, list) {
- if (elv_support_features(q, e)) {
- found = e;
- break;
- }
- }
-
- if (found && !elevator_tryget(found))
- found = NULL;
-
- spin_unlock(&elv_list_lock);
- return found;
-}
-
-/*
- * For a device queue that has no required features, use the default elevator
- * settings. Otherwise, use the first elevator available matching the required
- * features. If no suitable elevator is find or if the chosen elevator
- * initialization fails, fall back to the "none" elevator (no elevator).
+ * Use the default elevator settings. If the chosen elevator initialization
+ * fails, fall back to the "none" elevator (no elevator).
*/
void elevator_init_mq(struct request_queue *q)
{
@@ -622,10 +589,7 @@ void elevator_init_mq(struct request_queue *q)
if (unlikely(q->elevator))
return;
- if (!q->required_elevator_features)
- e = elevator_get_default(q);
- else
- e = elevator_get_by_features(q);
+ e = elevator_get_default(q);
if (!e)
return;
@@ -781,7 +745,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
list_for_each_entry(e, &elv_list, list) {
if (e == cur)
len += sprintf(name+len, "[%s] ", e->elevator_name);
- else if (elv_support_features(q, e))
+ else
len += sprintf(name+len, "%s ", e->elevator_name);
}
spin_unlock(&elv_list_lock);
diff --git a/block/elevator.h b/block/elevator.h
index 7ca3d7b6ed82..e9a050a96e53 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -74,7 +74,6 @@ struct elevator_type
struct elv_fs_entry *elevator_attrs;
const char *elevator_name;
const char *elevator_alias;
- const unsigned int elevator_features;
struct module *elevator_owner;
#ifdef CONFIG_BLK_DEBUG_FS
const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
diff --git a/block/fops.c b/block/fops.c
index af6c244314af..7a163f7fe2d8 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -44,18 +44,15 @@ static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
#define DIO_INLINE_BIO_VECS 4
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
- struct iov_iter *iter, unsigned int nr_pages)
+ struct iov_iter *iter, struct block_device *bdev,
+ unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
loff_t pos = iocb->ki_pos;
bool should_dirty = false;
struct bio bio;
ssize_t ret;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
@@ -161,9 +158,8 @@ static void blkdev_bio_end_io(struct bio *bio)
}
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- unsigned int nr_pages)
+ struct block_device *bdev, unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
@@ -172,9 +168,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@ -302,9 +295,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
struct iov_iter *iter,
+ struct block_device *bdev,
unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
bool is_read = iov_iter_rw(iter) == READ;
blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
@@ -312,9 +305,6 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@ -368,18 +358,23 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
+ if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
+ return -EINVAL;
+
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
- return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
- return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+ return __blkdev_direct_IO_simple(iocb, iter, bdev,
+ nr_pages);
+ return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
}
- return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
+ return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
}
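The alignment check consolidated at the top of blkdev_direct_IO is what userspace sees as the usual O_DIRECT contract: offset, length and buffer must be aligned to the logical block size, and misaligned requests fail with EINVAL before any bio is built. A small read-only probe (assumes a readable block device):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/fs.h>		/* BLKSSZGET */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd, lbs;
	void *buf;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <block device>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY | O_DIRECT);
	if (fd < 0 || ioctl(fd, BLKSSZGET, &lbs) < 0) {
		perror(argv[1]);
		return 1;
	}

	if (posix_memalign(&buf, lbs, lbs)) {
		perror("posix_memalign");
		return 1;
	}

	/* Aligned offset, length and buffer: accepted by the direct I/O path. */
	if (pread(fd, buf, lbs, 0) < 0)
		perror("aligned pread");

	/* Misaligned offset: rejected early with EINVAL. */
	if (pread(fd, buf, lbs, 1) < 0)
		perror("misaligned pread");

	free(buf);
	close(fd);
	return 0;
}
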
static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@ -390,7 +385,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->bdev = bdev;
iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
- if (iomap->offset >= isize)
+ if (offset >= isize)
return -EIO;
iomap->type = IOMAP_MAPPED;
iomap->addr = iomap->offset;
diff --git a/block/genhd.c b/block/genhd.c
index bb29a68e1d67..7f39fbe60753 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -118,7 +118,7 @@ static void part_stat_read_all(struct block_device *part,
}
}
-static unsigned int part_in_flight(struct block_device *part)
+unsigned int part_in_flight(struct block_device *part)
{
unsigned int inflight = 0;
int cpu;
@@ -345,9 +345,7 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
struct file *file;
int ret = 0;
- if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
- return -EINVAL;
- if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
+ if (!disk_has_partscan(disk))
return -EINVAL;
if (disk->open_partitions)
return -EBUSY;
@@ -503,8 +501,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
goto out_unregister_bdi;
	/* Make sure the first partition scan will proceed */
- if (get_capacity(disk) && !(disk->flags & GENHD_FL_NO_PART) &&
- !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
+ if (get_capacity(disk) && disk_has_partscan(disk))
set_bit(GD_NEED_PART_SCAN, &disk->state);
bdev_add(disk->part0, ddev->devt);
@@ -954,15 +951,10 @@ ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct block_device *bdev = dev_to_bdev(dev);
- struct request_queue *q = bdev_get_queue(bdev);
struct disk_stats stat;
unsigned int inflight;
- if (queue_is_mq(q))
- inflight = blk_mq_in_flight(q, bdev);
- else
- inflight = part_in_flight(bdev);
-
+ inflight = part_in_flight(bdev);
if (inflight) {
part_stat_lock();
update_io_ticks(bdev, jiffies, true);
@@ -1047,6 +1039,12 @@ static ssize_t diskseq_show(struct device *dev,
return sprintf(buf, "%llu\n", disk->diskseq);
}
+static ssize_t partscan_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
+}
+
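Since the new attribute is read-only and reports a single 0/1 value under the disk's sysfs directory, checking it from userspace is trivial; for example:

#include <stdio.h>

int main(int argc, char **argv)
{
	unsigned int partscan;
	char path[256];
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <disk>\n", argv[0]);
		return 1;
	}

	snprintf(path, sizeof(path), "/sys/block/%s/partscan", argv[1]);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%u", &partscan) != 1) {
		fprintf(stderr, "unexpected contents in %s\n", path);
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("%s: partition scanning %s\n", argv[1],
	       partscan ? "enabled" : "suppressed");
	return 0;
}
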
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
@@ -1060,6 +1058,7 @@ static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
+static DEVICE_ATTR(partscan, 0444, partscan_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
@@ -1106,6 +1105,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_events_async.attr,
&dev_attr_events_poll_msecs.attr,
&dev_attr_diskseq.attr,
+ &dev_attr_partscan.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
@@ -1182,7 +1182,7 @@ static void disk_release(struct device *dev)
disk_release_events(disk);
kfree(disk->random);
- disk_free_zone_bitmaps(disk);
+ disk_free_zone_resources(disk);
xa_destroy(&disk->part_tbl);
disk->queue->disk = NULL;
@@ -1251,11 +1251,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
xa_for_each(&gp->part_tbl, idx, hd) {
if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
continue;
- if (queue_is_mq(gp->queue))
- inflight = blk_mq_in_flight(gp->queue, hd);
- else
- inflight = part_in_flight(hd);
+ inflight = part_in_flight(hd);
if (inflight) {
part_stat_lock();
update_io_ticks(hd, jiffies, true);
@@ -1364,6 +1361,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (blkcg_init_disk(disk))
goto out_erase_part0;
+ disk_init_zone_resources(disk);
rand_initialize_disk(disk);
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
diff --git a/block/ioctl.c b/block/ioctl.c
index f505f9c341eb..c7db3bd2d653 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -33,7 +33,7 @@ static int blkpg_do_ioctl(struct block_device *bdev,
if (op == BLKPG_DEL_PARTITION)
return bdev_del_partition(disk, p.pno);
- if (p.start < 0 || p.length <= 0 || p.start + p.length < 0)
+ if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start)
return -EINVAL;
/* Check that the partition is aligned to the block size */
if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev)))
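The rewritten check in this hunk avoids relying on signed overflow (undefined behaviour in C) by rearranging the comparison against LLONG_MAX. A standalone sketch of the same test, with made-up partition values:

#include <limits.h>
#include <stdio.h>

/* Overflow-safe test: is start + length representable as a long long? */
static int range_invalid(long long start, long long length)
{
	return start < 0 || length <= 0 || LLONG_MAX - length < start;
}

int main(void)
{
	printf("%d\n", range_invalid(2048, 4096));		/* 0: fine */
	printf("%d\n", range_invalid(LLONG_MAX - 10, 4096));	/* 1: would wrap */
	printf("%d\n", range_invalid(2048, 0));			/* 1: empty range */
	return 0;
}
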
@@ -95,9 +95,12 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
unsigned long arg)
{
- uint64_t range[2];
- uint64_t start, len, end;
+ unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
struct inode *inode = bdev->bd_inode;
+ uint64_t range[2], start, len, end;
+ struct bio *prev = NULL, *bio;
+ sector_t sector, nr_sects;
+ struct blk_plug plug;
int err;
if (!(mode & BLK_OPEN_WRITE))
@@ -105,6 +108,8 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
+ if (bdev_read_only(bdev))
+ return -EPERM;
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
@@ -112,9 +117,9 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
start = range[0];
len = range[1];
- if (start & 511)
+ if (!len)
return -EINVAL;
- if (len & 511)
+ if ((start | len) & bs_mask)
return -EINVAL;
if (check_add_overflow(start, len, &end) ||
@@ -125,7 +130,32 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
if (err)
goto fail;
- err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+
+ sector = start >> SECTOR_SHIFT;
+ nr_sects = len >> SECTOR_SHIFT;
+
+ blk_start_plug(&plug);
+ while (1) {
+ if (fatal_signal_pending(current)) {
+ if (prev)
+ bio_await_chain(prev);
+ err = -EINTR;
+ goto out_unplug;
+ }
+ bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
+ GFP_KERNEL);
+ if (!bio)
+ break;
+ prev = bio_chain_and_submit(prev, bio);
+ }
+ if (prev) {
+ err = submit_bio_wait(prev);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ bio_put(prev);
+ }
+out_unplug:
+ blk_finish_plug(&plug);
fail:
filemap_invalidate_unlock(inode->i_mapping);
return err;
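For reference, the ioctl serviced here takes a {start, length} pair of byte offsets, both of which must now be aligned to the logical block size, and a long-running discard is built as a chain of bios that a fatal signal can interrupt (the call then fails with EINTR). A minimal, destructive usage example for a throwaway test device:

#include <fcntl.h>
#include <linux/fs.h>		/* BLKDISCARD */
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* range[0] = start, range[1] = length, in bytes. */
	uint64_t range[2] = { 0, 1024 * 1024 };
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <block device>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (ioctl(fd, BLKDISCARD, range) < 0)
		perror("BLKDISCARD");

	close(fd);
	return 0;
}
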
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 02a916ba62ee..94eede4fb9eb 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -102,7 +102,6 @@ struct deadline_data {
int prio_aging_expire;
spinlock_t lock;
- spinlock_t zone_lock;
};
/* Maps an I/O priority class to a deadline scheduler priority. */
@@ -129,36 +128,7 @@ static u8 dd_rq_ioclass(struct request *rq)
}
/*
- * get the request before `rq' in sector-sorted order
- */
-static inline struct request *
-deadline_earlier_request(struct request *rq)
-{
- struct rb_node *node = rb_prev(&rq->rb_node);
-
- if (node)
- return rb_entry_rq(node);
-
- return NULL;
-}
-
-/*
- * get the request after `rq' in sector-sorted order
- */
-static inline struct request *
-deadline_latter_request(struct request *rq)
-{
- struct rb_node *node = rb_next(&rq->rb_node);
-
- if (node)
- return rb_entry_rq(node);
-
- return NULL;
-}
-
-/*
- * Return the first request for which blk_rq_pos() >= @pos. For zoned devices,
- * return the first request after the start of the zone containing @pos.
+ * Return the first request for which blk_rq_pos() >= @pos.
*/
static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
enum dd_data_dir data_dir, sector_t pos)
@@ -170,14 +140,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
return NULL;
rq = rb_entry_rq(node);
- /*
- * A zoned write may have been requeued with a starting position that
- * is below that of the most recently dispatched request. Hence, for
- * zoned writes, start searching from the start of a zone.
- */
- if (blk_rq_is_seq_zoned_write(rq))
- pos = round_down(pos, rq->q->limits.chunk_sectors);
-
while (node) {
rq = rb_entry_rq(node);
if (blk_rq_pos(rq) >= pos) {
@@ -309,36 +271,6 @@ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio,
}
/*
- * Check if rq has a sequential request preceding it.
- */
-static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq)
-{
- struct request *prev = deadline_earlier_request(rq);
-
- if (!prev)
- return false;
-
- return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq);
-}
-
-/*
- * Skip all write requests that are sequential from @rq, even if we cross
- * a zone boundary.
- */
-static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
- struct request *rq)
-{
- sector_t pos = blk_rq_pos(rq);
-
- do {
- pos += blk_rq_sectors(rq);
- rq = deadline_latter_request(rq);
- } while (rq && blk_rq_pos(rq) == pos);
-
- return rq;
-}
-
-/*
* For the specified data direction, return the next request to
* dispatch using arrival ordered lists.
*/
@@ -346,40 +278,10 @@ static struct request *
deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
- struct request *rq, *rb_rq, *next;
- unsigned long flags;
-
if (list_empty(&per_prio->fifo_list[data_dir]))
return NULL;
- rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
- if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
- return rq;
-
- /*
- * Look for a write request that can be dispatched, that is one with
- * an unlocked target zone. For some HDDs, breaking a sequential
- * write stream can lead to lower throughput, so make sure to preserve
- * sequential write streams, even if that stream crosses into the next
- * zones and these zones are unlocked.
- */
- spin_lock_irqsave(&dd->zone_lock, flags);
- list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE],
- queuelist) {
- /* Check whether a prior request exists for the same zone. */
- rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq));
- if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq))
- rq = rb_rq;
- if (blk_req_can_dispatch_to_zone(rq) &&
- (blk_queue_nonrot(rq->q) ||
- !deadline_is_seq_write(dd, rq)))
- goto out;
- }
- rq = NULL;
-out:
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- return rq;
+ return rq_entry_fifo(per_prio->fifo_list[data_dir].next);
}
/*
@@ -390,36 +292,8 @@ static struct request *
deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
- struct request *rq;
- unsigned long flags;
-
- rq = deadline_from_pos(per_prio, data_dir,
- per_prio->latest_pos[data_dir]);
- if (!rq)
- return NULL;
-
- if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
- return rq;
-
- /*
- * Look for a write request that can be dispatched, that is one with
- * an unlocked target zone. For some HDDs, breaking a sequential
- * write stream can lead to lower throughput, so make sure to preserve
- * sequential write streams, even if that stream crosses into the next
- * zones and these zones are unlocked.
- */
- spin_lock_irqsave(&dd->zone_lock, flags);
- while (rq) {
- if (blk_req_can_dispatch_to_zone(rq))
- break;
- if (blk_queue_nonrot(rq->q))
- rq = deadline_latter_request(rq);
- else
- rq = deadline_skip_seq_writes(dd, rq);
- }
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- return rq;
+ return deadline_from_pos(per_prio, data_dir,
+ per_prio->latest_pos[data_dir]);
}
/*
@@ -525,10 +399,6 @@ dispatch_find_request:
rq = next_rq;
}
- /*
- * For a zoned block device, if we only have writes queued and none of
- * them can be dispatched, rq will be NULL.
- */
if (!rq)
return NULL;
@@ -549,10 +419,6 @@ done:
prio = ioprio_class_to_prio[ioprio_class];
dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
dd->per_prio[prio].stats.dispatched++;
- /*
- * If the request needs its target zone locked, do it.
- */
- blk_req_zone_write_lock(rq);
rq->rq_flags |= RQF_STARTED;
return rq;
}
@@ -722,7 +588,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
dd->fifo_batch = fifo_batch;
dd->prio_aging_expire = prio_aging_expire;
spin_lock_init(&dd->lock);
- spin_lock_init(&dd->zone_lock);
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
@@ -804,12 +669,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
lockdep_assert_held(&dd->lock);
- /*
- * This may be a requeue of a write request that has locked its
- * target zone. If it is the case, this releases the zone lock.
- */
- blk_req_zone_write_unlock(rq);
-
prio = ioprio_class_to_prio[ioprio_class];
per_prio = &dd->per_prio[prio];
if (!rq->elv.priv[0]) {
@@ -841,18 +700,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
*/
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
insert_before = &per_prio->fifo_list[data_dir];
-#ifdef CONFIG_BLK_DEV_ZONED
- /*
- * Insert zoned writes such that requests are sorted by
- * position per zone.
- */
- if (blk_rq_is_seq_zoned_write(rq)) {
- struct request *rq2 = deadline_latter_request(rq);
-
- if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq))
- insert_before = &rq2->queuelist;
- }
-#endif
list_add_tail(&rq->queuelist, insert_before);
}
}
@@ -887,33 +734,8 @@ static void dd_prepare_request(struct request *rq)
rq->elv.priv[0] = NULL;
}
-static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx)
-{
- struct deadline_data *dd = hctx->queue->elevator->elevator_data;
- enum dd_prio p;
-
- for (p = 0; p <= DD_PRIO_MAX; p++)
- if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE]))
- return true;
-
- return false;
-}
-
/*
* Callback from inside blk_mq_free_request().
- *
- * For zoned block devices, write unlock the target zone of
- * completed write requests. Do this while holding the zone lock
- * spinlock so that the zone is never unlocked while deadline_fifo_request()
- * or deadline_next_request() are executing. This function is called for
- * all requests, whether or not these requests complete successfully.
- *
- * For a zoned block device, __dd_dispatch_request() may have stopped
- * dispatching requests if all the queued requests are write requests directed
- * at zones that are already locked due to on-going write requests. To ensure
- * write request dispatch progress in this case, mark the queue as needing a
- * restart to ensure that the queue is run again after completion of the
- * request and zones being unlocked.
*/
static void dd_finish_request(struct request *rq)
{
@@ -928,21 +750,8 @@ static void dd_finish_request(struct request *rq)
* called dd_insert_requests(). Skip requests that bypassed I/O
* scheduling. See also blk_mq_request_bypass_insert().
*/
- if (!rq->elv.priv[0])
- return;
-
- atomic_inc(&per_prio->stats.completed);
-
- if (blk_queue_is_zoned(q)) {
- unsigned long flags;
-
- spin_lock_irqsave(&dd->zone_lock, flags);
- blk_req_zone_write_unlock(rq);
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- if (dd_has_write_work(rq->mq_hctx))
- blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
- }
+ if (rq->elv.priv[0])
+ atomic_inc(&per_prio->stats.completed);
}
static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
@@ -1266,7 +1075,6 @@ static struct elevator_type mq_deadline = {
.elevator_attrs = deadline_attrs,
.elevator_name = "mq-deadline",
.elevator_alias = "deadline",
- .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
.elevator_owner = THIS_MODULE,
};
MODULE_ALIAS("mq-deadline-iosched");
diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c
index c03bc105e575..152c85df92b2 100644
--- a/block/partitions/cmdline.c
+++ b/block/partitions/cmdline.c
@@ -70,8 +70,8 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
}
if (*partdef == '(') {
- int length;
- char *next = strchr(++partdef, ')');
+ partdef++;
+ char *next = strsep(&partdef, ")");
if (!next) {
pr_warn("cmdline partition format is invalid.");
@@ -79,11 +79,7 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
goto fail;
}
- length = min_t(int, next - partdef,
- sizeof(new_subpart->name) - 1);
- strscpy(new_subpart->name, partdef, length);
-
- partdef = ++next;
+ strscpy(new_subpart->name, next, sizeof(new_subpart->name));
} else
new_subpart->name[0] = '\0';
@@ -117,14 +113,12 @@ static void free_subpart(struct cmdline_parts *parts)
}
}
-static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
+static int parse_parts(struct cmdline_parts **parts, char *bdevdef)
{
int ret = -EINVAL;
char *next;
- int length;
struct cmdline_subpart **next_subpart;
struct cmdline_parts *newparts;
- char buf[BDEVNAME_SIZE + 32 + 4];
*parts = NULL;
@@ -132,28 +126,19 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
if (!newparts)
return -ENOMEM;
- next = strchr(bdevdef, ':');
+ next = strsep(&bdevdef, ":");
if (!next) {
pr_warn("cmdline partition has no block device.");
goto fail;
}
- length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1);
- strscpy(newparts->name, bdevdef, length);
+ strscpy(newparts->name, next, sizeof(newparts->name));
newparts->nr_subparts = 0;
next_subpart = &newparts->subpart;
- while (next && *(++next)) {
- bdevdef = next;
- next = strchr(bdevdef, ',');
-
- length = (!next) ? (sizeof(buf) - 1) :
- min_t(int, next - bdevdef, sizeof(buf) - 1);
-
- strscpy(buf, bdevdef, length);
-
- ret = parse_subpart(next_subpart, buf);
+ while ((next = strsep(&bdevdef, ","))) {
+ ret = parse_subpart(next_subpart, next);
if (ret)
goto fail;
@@ -199,24 +184,17 @@ static int cmdline_parts_parse(struct cmdline_parts **parts,
*parts = NULL;
- next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
+ pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
if (!buf)
return -ENOMEM;
next_parts = parts;
- while (next && *pbuf) {
- next = strchr(pbuf, ';');
- if (next)
- *next = '\0';
-
- ret = parse_parts(next_parts, pbuf);
+ while ((next = strsep(&pbuf, ";"))) {
+ ret = parse_parts(next_parts, next);
if (ret)
goto fail;
- if (next)
- pbuf = ++next;
-
next_parts = &(*next_parts)->next_parts;
}
@@ -250,7 +228,6 @@ static struct cmdline_parts *bdev_parts;
static int add_part(int slot, struct cmdline_subpart *subpart,
struct parsed_partitions *state)
{
- int label_min;
struct partition_meta_info *info;
char tmp[sizeof(info->volname) + 4];
@@ -262,9 +239,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart,
info = &state->parts[slot].info;
- label_min = min_t(int, sizeof(info->volname) - 1,
- sizeof(subpart->name));
- strscpy(info->volname, subpart->name, label_min);
+ strscpy(info->volname, subpart->name, sizeof(info->volname));
snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
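The strsep()-based rewrite walks the blkdevparts= string as nested lists split on ';' (devices), ':' (device name vs. partitions) and ',' (partitions). A compact userspace approximation of that tokenization, using an invented command line payload:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* Hypothetical blkdevparts= payload: two devices, comma-separated parts. */
	char *cmdline = strdup("mmcblk0:1M(boot),-(data);mmcblk1:512K(env)");
	char *buf = cmdline, *bdevdef;

	if (!cmdline)
		return 1;

	while ((bdevdef = strsep(&buf, ";"))) {
		char *name = strsep(&bdevdef, ":");
		char *partdef;

		if (!bdevdef) {
			fprintf(stderr, "%s: no partition definitions\n", name);
			continue;
		}
		printf("device %s:\n", name);
		while ((partdef = strsep(&bdevdef, ",")))
			printf("  part \"%s\"\n", partdef);
	}

	free(cmdline);
	return 0;
}
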
diff --git a/block/partitions/core.c b/block/partitions/core.c
index b11e88c82c8c..37b5f92d07fe 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -573,10 +573,7 @@ static int blk_add_partitions(struct gendisk *disk)
struct parsed_partitions *state;
int ret = -EAGAIN, p;
- if (disk->flags & GENHD_FL_NO_PART)
- return 0;
-
- if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
+ if (!disk_has_partscan(disk))
return 0;
state = check_partition(disk);