aboutsummaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig1
-rw-r--r--block/bdev.c10
-rw-r--r--block/bfq-cgroup.c34
-rw-r--r--block/bfq-iosched.c16
-rw-r--r--block/bfq-iosched.h8
-rw-r--r--block/bfq-wf2q.c5
-rw-r--r--block/bio-integrity.c2
-rw-r--r--block/bio.c142
-rw-r--r--block/blk-cgroup-rwstat.h8
-rw-r--r--block/blk-cgroup.c76
-rw-r--r--block/blk-cgroup.h12
-rw-r--r--block/blk-core.c89
-rw-r--r--block/blk-flush.c6
-rw-r--r--block/blk-ia-ranges.c65
-rw-r--r--block/blk-ioc.c2
-rw-r--r--block/blk-iocost.c22
-rw-r--r--block/blk-iolatency.c30
-rw-r--r--block/blk-ioprio.c57
-rw-r--r--block/blk-ioprio.h9
-rw-r--r--block/blk-lib.c6
-rw-r--r--block/blk-map.c7
-rw-r--r--block/blk-merge.c249
-rw-r--r--block/blk-mq-debugfs-zoned.c6
-rw-r--r--block/blk-mq-debugfs.c19
-rw-r--r--block/blk-mq-sysfs.c45
-rw-r--r--block/blk-mq-tag.c31
-rw-r--r--block/blk-mq-tag.h10
-rw-r--r--block/blk-mq.c111
-rw-r--r--block/blk-mq.h39
-rw-r--r--block/blk-rq-qos.c10
-rw-r--r--block/blk-rq-qos.h11
-rw-r--r--block/blk-settings.c11
-rw-r--r--block/blk-sysfs.c62
-rw-r--r--block/blk-throttle.c7
-rw-r--r--block/blk-wbt.c30
-rw-r--r--block/blk-zoned.c92
-rw-r--r--block/blk.h81
-rw-r--r--block/bounce.c39
-rw-r--r--block/bsg-lib.c6
-rw-r--r--block/bsg.c4
-rw-r--r--block/elevator.h2
-rw-r--r--block/fops.c38
-rw-r--r--block/genhd.c92
-rw-r--r--block/ioctl.c2
-rw-r--r--block/ioprio.c58
-rw-r--r--block/kyber-iosched.c8
-rw-r--r--block/mq-deadline.c4
-rw-r--r--block/partitions/check.h4
-rw-r--r--block/partitions/core.c23
49 files changed, 841 insertions, 860 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 50b17e260fa2..444c5ab3b67e 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -147,7 +147,6 @@ config BLK_CGROUP_FC_APPID
config BLK_CGROUP_IOCOST
bool "Enable support for cost model based cgroup IO controller"
depends on BLK_CGROUP
- select BLK_RQ_IO_DATA_LEN
select BLK_RQ_ALLOC_TIME
help
Enabling this option enables the .weight interface for cost
diff --git a/block/bdev.c b/block/bdev.c
index 5fe06c1f2def..ce05175e71ce 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -54,12 +54,10 @@ static void bdev_write_inode(struct block_device *bdev)
while (inode->i_state & I_DIRTY) {
spin_unlock(&inode->i_lock);
ret = write_inode_now(inode, true);
- if (ret) {
- char name[BDEVNAME_SIZE];
- pr_warn_ratelimited("VFS: Dirty inode writeback failed "
- "for block device %s (err=%d).\n",
- bdevname(bdev, name), ret);
- }
+ if (ret)
+ pr_warn_ratelimited(
+ "VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
+ bdev, ret);
spin_lock(&inode->i_lock);
}
spin_unlock(&inode->i_lock);
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 09574af83566..30b15a9a47c4 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -220,46 +220,46 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
}
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
- unsigned int op)
+ blk_opf_t opf)
{
- blkg_rwstat_add(&bfqg->stats.queued, op, 1);
+ blkg_rwstat_add(&bfqg->stats.queued, opf, 1);
bfqg_stats_end_empty_time(&bfqg->stats);
if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
}
-void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf)
{
- blkg_rwstat_add(&bfqg->stats.queued, op, -1);
+ blkg_rwstat_add(&bfqg->stats.queued, opf, -1);
}
-void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf)
{
- blkg_rwstat_add(&bfqg->stats.merged, op, 1);
+ blkg_rwstat_add(&bfqg->stats.merged, opf, 1);
}
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
- u64 io_start_time_ns, unsigned int op)
+ u64 io_start_time_ns, blk_opf_t opf)
{
struct bfqg_stats *stats = &bfqg->stats;
u64 now = ktime_get_ns();
if (now > io_start_time_ns)
- blkg_rwstat_add(&stats->service_time, op,
+ blkg_rwstat_add(&stats->service_time, opf,
now - io_start_time_ns);
if (io_start_time_ns > start_time_ns)
- blkg_rwstat_add(&stats->wait_time, op,
+ blkg_rwstat_add(&stats->wait_time, opf,
io_start_time_ns - start_time_ns);
}
#else /* CONFIG_BFQ_CGROUP_DEBUG */
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
- unsigned int op) { }
-void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
-void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
+ blk_opf_t opf) { }
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { }
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { }
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
- u64 io_start_time_ns, unsigned int op) { }
+ u64 io_start_time_ns, blk_opf_t opf) { }
void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
@@ -706,10 +706,10 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
}
/**
- * __bfq_bic_change_cgroup - move @bic to @cgroup.
+ * __bfq_bic_change_cgroup - move @bic to @bfqg.
* @bfqd: the queue descriptor.
* @bic: the bic to move.
- * @blkcg: the blk-cgroup to move to.
+ * @bfqg: the group to move to.
*
* Move bic to blkcg, assuming that bfqd->lock is held; which makes
* sure that the reference to cgroup is valid across the call (see
@@ -863,6 +863,7 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st)
* @bfqd: the device data structure with the root group.
* @entity: the entity to move, if entity is a leaf; or the parent entity
* of an active leaf entity to move, if entity is not a leaf.
+ * @ioprio_class: I/O priority class to reparent.
*/
static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
struct bfq_entity *entity,
@@ -892,6 +893,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
* @bfqd: the device data structure with the root group.
* @bfqg: the group to move from.
* @st: the service tree to start the search from.
+ * @ioprio_class: I/O priority class to reparent.
*/
static void bfq_reparent_active_queues(struct bfq_data *bfqd,
struct bfq_group *bfqg,
@@ -1471,8 +1473,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
return bfqq->bfqd->root_group;
}
-void bfqg_and_blkg_get(struct bfq_group *bfqg) {}
-
void bfqg_and_blkg_put(struct bfq_group *bfqg) {}
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index e6d7e6b01a05..c740b41fe0a4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -668,19 +668,19 @@ static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
* significantly affect service guarantees coming from the BFQ scheduling
* algorithm.
*/
-static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
struct bfq_data *bfqd = data->q->elevator->elevator_data;
struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
- struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL;
+ struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(opf)) : NULL;
int depth;
unsigned limit = data->q->nr_requests;
/* Sync reads have full depth available */
- if (op_is_sync(op) && !op_is_write(op)) {
+ if (op_is_sync(opf) && !op_is_write(opf)) {
depth = 0;
} else {
- depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+ depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
limit = (limit * depth) >> bfqd->full_depth_shift;
}
@@ -693,7 +693,7 @@ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
depth = 1;
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
- __func__, bfqd->wr_busy_queues, op_is_sync(op), depth);
+ __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth);
if (depth)
data->shallow_depth = depth;
}
@@ -6104,7 +6104,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
static void bfq_update_insert_stats(struct request_queue *q,
struct bfq_queue *bfqq,
bool idle_timer_disabled,
- unsigned int cmd_flags)
+ blk_opf_t cmd_flags)
{
if (!bfqq)
return;
@@ -6129,7 +6129,7 @@ static void bfq_update_insert_stats(struct request_queue *q,
static inline void bfq_update_insert_stats(struct request_queue *q,
struct bfq_queue *bfqq,
bool idle_timer_disabled,
- unsigned int cmd_flags) {}
+ blk_opf_t cmd_flags) {}
#endif /* CONFIG_BFQ_CGROUP_DEBUG */
static struct bfq_queue *bfq_init_rq(struct request *rq);
@@ -6141,7 +6141,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
struct bfq_data *bfqd = q->elevator->elevator_data;
struct bfq_queue *bfqq;
bool idle_timer_disabled = false;
- unsigned int cmd_flags;
+ blk_opf_t cmd_flags;
LIST_HEAD(free);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index ca8177d7bf7c..ad8e513d7e87 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -994,11 +994,11 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq);
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
- unsigned int op);
-void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
-void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op);
+ blk_opf_t opf);
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf);
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf);
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
- u64 io_start_time_ns, unsigned int op);
+ u64 io_start_time_ns, blk_opf_t opf);
void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index f8eb340381cf..983413cdefad 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1360,6 +1360,8 @@ left:
/**
* __bfq_lookup_next_entity - return the first eligible entity in @st.
* @st: the service tree.
+ * @in_service: whether or not there is an in-service entity for the sched_data
+ * this active tree belongs to.
*
* If there is no in-service entity for the sched_data st belongs to,
* then return the entity that will be set in service if:
@@ -1472,9 +1474,6 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
break;
}
- if (!entity)
- return NULL;
-
return entity;
}
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 32929c89ba8a..3f5685c00e36 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
iv = bip->bip_vec + bip->bip_vcnt;
if (bip->bip_vcnt &&
- bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
+ bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits,
&bip->bip_vec[bip->bip_vcnt - 1], offset))
return 0;
diff --git a/block/bio.c b/block/bio.c
index 51c99f2c5c90..3d3a2678fea2 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -239,7 +239,7 @@ static void bio_free(struct bio *bio)
* when IO has completed, or when the bio is released.
*/
void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
- unsigned short max_vecs, unsigned int opf)
+ unsigned short max_vecs, blk_opf_t opf)
{
bio->bi_next = NULL;
bio->bi_bdev = bdev;
@@ -292,7 +292,7 @@ EXPORT_SYMBOL(bio_init);
* preserved are the ones that are initialized by bio_alloc_bioset(). See
* comment in struct bio.
*/
-void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf)
+void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf)
{
bio_uninit(bio);
memset(bio, 0, BIO_RESET_BYTES);
@@ -341,7 +341,7 @@ void bio_chain(struct bio *bio, struct bio *parent)
EXPORT_SYMBOL(bio_chain);
struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
- unsigned int nr_pages, unsigned int opf, gfp_t gfp)
+ unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
{
struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
@@ -409,7 +409,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
}
static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
- unsigned short nr_vecs, unsigned int opf, gfp_t gfp,
+ unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
struct bio_set *bs)
{
struct bio_alloc_cache *cache;
@@ -468,7 +468,7 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
* Returns: Pointer to new bio on success, NULL on failure.
*/
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
- unsigned int opf, gfp_t gfp_mask,
+ blk_opf_t opf, gfp_t gfp_mask,
struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
@@ -965,7 +965,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
* would create a gap, disallow it.
*/
bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
- if (bvec_gap_to_prev(q, bvec, offset))
+ if (bvec_gap_to_prev(&q->limits, bvec, offset))
return 0;
}
@@ -1033,7 +1033,7 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page,
if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
return 0;
- if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
+ if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev)))
return 0;
return bio_add_hw_page(q, bio, page, len, offset,
@@ -1151,12 +1151,33 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}
-static void bio_put_pages(struct page **pages, size_t size, size_t off)
+static int bio_iov_add_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
{
- size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
+ bool same_page = false;
+
+ if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
+ __bio_add_page(bio, page, len, offset);
+ return 0;
+ }
- for (i = 0; i < nr; i++)
- put_page(pages[i]);
+ if (same_page)
+ put_page(page);
+ return 0;
+}
+
+static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
+{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ bool same_page = false;
+
+ if (bio_add_hw_page(q, bio, page, len, offset,
+ queue_max_zone_append_sectors(q), &same_page) != len)
+ return -EINVAL;
+ if (same_page)
+ put_page(page);
+ return 0;
}
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
@@ -1177,90 +1198,62 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
- bool same_page = false;
ssize_t size, left;
- unsigned len, i;
- size_t offset;
+ unsigned len, i = 0;
+ size_t offset, trim;
+ int ret = 0;
/*
* Move page array up in the allocated memory for the bio vecs as far as
* possible so that we can start filling biovecs from the beginning
* without overwriting the temporary page array.
- */
+ */
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
- size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+ /*
+ * Each segment in the iov is required to be a block size multiple.
+ * However, we may not be able to get the entire segment if it spans
+ * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
+ * result to ensure the bio's total size is correct. The remainder of
+ * the iov data will be picked up in the next bio iteration.
+ */
+ size = iov_iter_get_pages2(iter, pages, UINT_MAX - bio->bi_iter.bi_size,
+ nr_pages, &offset);
if (unlikely(size <= 0))
return size ? size : -EFAULT;
- for (left = size, i = 0; left > 0; left -= len, i++) {
- struct page *page = pages[i];
+ nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
- len = min_t(size_t, PAGE_SIZE - offset, left);
+ trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
+ iov_iter_revert(iter, trim);
- if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
- if (same_page)
- put_page(page);
- } else {
- if (WARN_ON_ONCE(bio_full(bio, len))) {
- bio_put_pages(pages + i, left, offset);
- return -EINVAL;
- }
- __bio_add_page(bio, page, len, offset);
- }
- offset = 0;
+ size -= trim;
+ if (unlikely(!size)) {
+ ret = -EFAULT;
+ goto out;
}
- iov_iter_advance(iter, size);
- return 0;
-}
-
-static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
-{
- unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
- unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
- struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
- struct page **pages = (struct page **)bv;
- ssize_t size, left;
- unsigned len, i;
- size_t offset;
- int ret = 0;
-
- if (WARN_ON_ONCE(!max_append_sectors))
- return 0;
-
- /*
- * Move page array up in the allocated memory for the bio vecs as far as
- * possible so that we can start filling biovecs from the beginning
- * without overwriting the temporary page array.
- */
- BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
- pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
-
- size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
- if (unlikely(size <= 0))
- return size ? size : -EFAULT;
-
for (left = size, i = 0; left > 0; left -= len, i++) {
struct page *page = pages[i];
- bool same_page = false;
len = min_t(size_t, PAGE_SIZE - offset, left);
- if (bio_add_hw_page(q, bio, page, len, offset,
- max_append_sectors, &same_page) != len) {
- bio_put_pages(pages + i, left, offset);
- ret = -EINVAL;
- break;
- }
- if (same_page)
- put_page(page);
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ ret = bio_iov_add_zone_append_page(bio, page, len,
+ offset);
+ if (ret)
+ break;
+ } else
+ bio_iov_add_page(bio, page, len, offset);
+
offset = 0;
}
- iov_iter_advance(iter, size - left);
+ iov_iter_revert(iter, left);
+out:
+ while (i < nr_pages)
+ put_page(pages[i++]);
+
return ret;
}
@@ -1298,10 +1291,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
}
do {
- if (bio_op(bio) == REQ_OP_ZONE_APPEND)
- ret = __bio_iov_append_get_pages(bio, iter);
- else
- ret = __bio_iov_iter_get_pages(bio, iter);
+ ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
/* don't account direct I/O as memory stall */
diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h
index 9f2723b34b75..022527b0b043 100644
--- a/block/blk-cgroup-rwstat.h
+++ b/block/blk-cgroup-rwstat.h
@@ -59,20 +59,20 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
* caller is responsible for synchronizing calls to this function.
*/
static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
- unsigned int op, uint64_t val)
+ blk_opf_t opf, uint64_t val)
{
struct percpu_counter *cnt;
- if (op_is_discard(op))
+ if (op_is_discard(opf))
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
- else if (op_is_write(op))
+ else if (op_is_write(opf))
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
else
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
- if (op_is_sync(op))
+ if (op_is_sync(opf))
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
else
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 764e740b0c0f..869af9d72bcf 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -846,6 +846,21 @@ static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
}
}
+static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
+ struct blkg_iostat *last)
+{
+ struct blkg_iostat delta;
+ unsigned long flags;
+
+ /* propagate percpu delta to global */
+ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
+ blkg_iostat_set(&delta, cur);
+ blkg_iostat_sub(&delta, last);
+ blkg_iostat_add(&blkg->iostat.cur, &delta);
+ blkg_iostat_add(last, &delta);
+ u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
+}
+
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct blkcg *blkcg = css_to_blkcg(css);
@@ -860,8 +875,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
struct blkcg_gq *parent = blkg->parent;
struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
- struct blkg_iostat cur, delta;
- unsigned long flags;
+ struct blkg_iostat cur;
unsigned int seq;
/* fetch the current per-cpu values */
@@ -870,23 +884,12 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
blkg_iostat_set(&cur, &bisc->cur);
} while (u64_stats_fetch_retry(&bisc->sync, seq));
- /* propagate percpu delta to global */
- flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
- blkg_iostat_set(&delta, &cur);
- blkg_iostat_sub(&delta, &bisc->last);
- blkg_iostat_add(&blkg->iostat.cur, &delta);
- blkg_iostat_add(&bisc->last, &delta);
- u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
+ blkcg_iostat_update(blkg, &cur, &bisc->last);
/* propagate global delta to parent (unless that's root) */
- if (parent && parent->parent) {
- flags = u64_stats_update_begin_irqsave(&parent->iostat.sync);
- blkg_iostat_set(&delta, &blkg->iostat.cur);
- blkg_iostat_sub(&delta, &blkg->iostat.last);
- blkg_iostat_add(&parent->iostat.cur, &delta);
- blkg_iostat_add(&blkg->iostat.last, &delta);
- u64_stats_update_end_irqrestore(&parent->iostat.sync, flags);
- }
+ if (parent && parent->parent)
+ blkcg_iostat_update(parent, &blkg->iostat.cur,
+ &blkg->iostat.last);
}
rcu_read_unlock();
@@ -1299,6 +1302,7 @@ int blkcg_init_queue(struct request_queue *q)
ret = blk_iolatency_init(q);
if (ret) {
blk_throtl_exit(q);
+ blk_ioprio_exit(q);
goto err_destroy_all;
}
@@ -1529,6 +1533,18 @@ void blkcg_deactivate_policy(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
+static void blkcg_free_all_cpd(struct blkcg_policy *pol)
+{
+ struct blkcg *blkcg;
+
+ list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+ if (blkcg->cpd[pol->plid]) {
+ pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+ blkcg->cpd[pol->plid] = NULL;
+ }
+ }
+}
+
/**
* blkcg_policy_register - register a blkcg policy
* @pol: blkcg policy to register
@@ -1593,14 +1609,9 @@ int blkcg_policy_register(struct blkcg_policy *pol)
return 0;
err_free_cpds:
- if (pol->cpd_free_fn) {
- list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
- if (blkcg->cpd[pol->plid]) {
- pol->cpd_free_fn(blkcg->cpd[pol->plid]);
- blkcg->cpd[pol->plid] = NULL;
- }
- }
- }
+ if (pol->cpd_free_fn)
+ blkcg_free_all_cpd(pol);
+
blkcg_policy[pol->plid] = NULL;
err_unlock:
mutex_unlock(&blkcg_pol_mutex);
@@ -1617,8 +1628,6 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register);
*/
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
- struct blkcg *blkcg;
-
mutex_lock(&blkcg_pol_register_mutex);
if (WARN_ON(blkcg_policy[pol->plid] != pol))
@@ -1633,14 +1642,9 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
/* remove cpds and unregister */
mutex_lock(&blkcg_pol_mutex);
- if (pol->cpd_free_fn) {
- list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
- if (blkcg->cpd[pol->plid]) {
- pol->cpd_free_fn(blkcg->cpd[pol->plid]);
- blkcg->cpd[pol->plid] = NULL;
- }
- }
- }
+ if (pol->cpd_free_fn)
+ blkcg_free_all_cpd(pol);
+
blkcg_policy[pol->plid] = NULL;
mutex_unlock(&blkcg_pol_mutex);
@@ -1696,7 +1700,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
* everybody is happy with their IO latencies.
*/
if (time_before64(old + NSEC_PER_SEC, now) &&
- atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
+ atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) {
u64 cur = atomic64_read(&blkg->delay_nsec);
u64 sub = min_t(u64, blkg->last_delay, now - old);
int cur_use = atomic_read(&blkg->use_delay);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index d4de0a35e066..d2724d1dd7c9 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -430,12 +430,8 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
* then check to see if we were the last delay so we can drop the
* congestion count on the cgroup.
*/
- while (old) {
- int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
- if (cur == old)
- break;
- old = cur;
- }
+ while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1))
+ ;
if (old == 0)
return 0;
@@ -458,7 +454,7 @@ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
int old = atomic_read(&blkg->use_delay);
/* We only want 1 person setting the congestion count for this blkg. */
- if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old)
+ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1))
atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
atomic64_set(&blkg->delay_nsec, delay);
@@ -475,7 +471,7 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
int old = atomic_read(&blkg->use_delay);
/* We only want 1 person clearing the congestion count for this blkg. */
- if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old)
+ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0))
atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
}
diff --git a/block/blk-core.c b/block/blk-core.c
index 27fb1357ad4b..a0d1104c5590 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -136,7 +136,7 @@ static const char *const blk_op_name[] = {
* string format. Useful in the debugging and tracing bio or request. For
* invalid REQ_OP_XXX it returns string "UNKNOWN".
*/
-inline const char *blk_op_str(unsigned int op)
+inline const char *blk_op_str(enum req_op op)
{
const char *op_str = "UNKNOWN";
@@ -285,49 +285,6 @@ void blk_queue_start_drain(struct request_queue *q)
}
/**
- * blk_cleanup_queue - shutdown a request queue
- * @q: request queue to shutdown
- *
- * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
- * put it. All future requests will be failed immediately with -ENODEV.
- *
- * Context: can sleep
- */
-void blk_cleanup_queue(struct request_queue *q)
-{
- /* cannot be called from atomic context */
- might_sleep();
-
- WARN_ON_ONCE(blk_queue_registered(q));
-
- /* mark @q DYING, no new request or merges will be allowed afterwards */
- blk_queue_flag_set(QUEUE_FLAG_DYING, q);
- blk_queue_start_drain(q);
-
- blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
- blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
-
- /*
- * Drain all requests queued before DYING marking. Set DEAD flag to
- * prevent that blk_mq_run_hw_queues() accesses the hardware queues
- * after draining finished.
- */
- blk_freeze_queue(q);
-
- blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
-
- blk_sync_queue(q);
- if (queue_is_mq(q)) {
- blk_mq_cancel_work_sync(q);
- blk_mq_exit_queue(q);
- }
-
- /* @q is and will stay empty, shutdown and put */
- blk_put_queue(q);
-}
-EXPORT_SYMBOL(blk_cleanup_queue);
-
-/**
* blk_queue_enter() - try to increase q->q_usage_counter
* @q: request queue pointer
* @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
@@ -420,7 +377,6 @@ static void blk_timeout_work(struct work_struct *work)
struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
{
struct request_queue *q;
- int ret;
q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
GFP_KERNEL | __GFP_ZERO, node_id);
@@ -435,17 +391,13 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
q->last_merge = NULL;
- q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
+ q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
if (q->id < 0)
goto fail_srcu;
- ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
- if (ret)
- goto fail_id;
-
q->stats = blk_alloc_queue_stats();
if (!q->stats)
- goto fail_split;
+ goto fail_id;
q->node = node_id;
@@ -482,10 +434,8 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
fail_stats:
blk_free_queue_stats(q->stats);
-fail_split:
- bioset_exit(&q->bio_split);
fail_id:
- ida_simple_remove(&blk_queue_ida, q->id);
+ ida_free(&blk_queue_ida, q->id);
fail_srcu:
if (alloc_srcu)
cleanup_srcu_struct(q->srcu);
@@ -504,12 +454,10 @@ fail_q:
*/
bool blk_get_queue(struct request_queue *q)
{
- if (likely(!blk_queue_dying(q))) {
- __blk_get_queue(q);
- return true;
- }
-
- return false;
+ if (unlikely(blk_queue_dying(q)))
+ return false;
+ kobject_get(&q->kobj);
+ return true;
}
EXPORT_SYMBOL(blk_get_queue);
@@ -608,16 +556,15 @@ static int blk_partition_remap(struct bio *bio)
static inline blk_status_t blk_check_zone_append(struct request_queue *q,
struct bio *bio)
{
- sector_t pos = bio->bi_iter.bi_sector;
int nr_sectors = bio_sectors(bio);
/* Only applicable to zoned block devices */
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bio->bi_bdev))
return BLK_STS_NOTSUPP;
/* The bio sector must point to the start of a sequential zone */
- if (pos & (blk_queue_zone_sectors(q) - 1) ||
- !blk_queue_zone_is_seq(q, pos))
+ if (bio->bi_iter.bi_sector & (bdev_zone_sectors(bio->bi_bdev) - 1) ||
+ !bio_zone_is_seq(bio))
return BLK_STS_IOERR;
/*
@@ -762,7 +709,7 @@ void submit_bio_noacct(struct bio *bio)
might_sleep();
- plug = blk_mq_plug(q, bio);
+ plug = blk_mq_plug(bio);
if (plug && plug->nowait)
bio->bi_opf |= REQ_NOWAIT;
@@ -818,11 +765,11 @@ void submit_bio_noacct(struct bio *bio)
case REQ_OP_ZONE_OPEN:
case REQ_OP_ZONE_CLOSE:
case REQ_OP_ZONE_FINISH:
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bio->bi_bdev))
goto not_supported;
break;
case REQ_OP_ZONE_RESET_ALL:
- if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
+ if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q))
goto not_supported;
break;
case REQ_OP_WRITE_ZEROES:
@@ -987,7 +934,7 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
again:
stamp = READ_ONCE(part->bd_stamp);
if (unlikely(time_after(now, stamp))) {
- if (likely(cmpxchg(&part->bd_stamp, stamp, now) == stamp))
+ if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
__part_stat_add(part, io_ticks, end ? now - stamp : 1);
}
if (part->bd_partno) {
@@ -997,7 +944,7 @@ again:
}
unsigned long bdev_start_io_acct(struct block_device *bdev,
- unsigned int sectors, unsigned int op,
+ unsigned int sectors, enum req_op op,
unsigned long start_time)
{
const int sgrp = op_stat_group(op);
@@ -1038,7 +985,7 @@ unsigned long bio_start_io_acct(struct bio *bio)
}
EXPORT_SYMBOL_GPL(bio_start_io_acct);
-void bdev_end_io_acct(struct block_device *bdev, unsigned int op,
+void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
unsigned long start_time)
{
const int sgrp = op_stat_group(op);
@@ -1247,7 +1194,7 @@ EXPORT_SYMBOL_GPL(blk_io_schedule);
int __init blk_dev_init(void)
{
- BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
+ BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
sizeof_field(struct request, cmd_flags));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
diff --git a/block/blk-flush.c b/block/blk-flush.c
index c68968724870..d20a0c6b2c66 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -94,7 +94,7 @@ enum {
};
static void blk_kick_flush(struct request_queue *q,
- struct blk_flush_queue *fq, unsigned int flags);
+ struct blk_flush_queue *fq, blk_opf_t flags);
static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
@@ -173,7 +173,7 @@ static void blk_flush_complete_seq(struct request *rq,
{
struct request_queue *q = rq->q;
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
- unsigned int cmd_flags;
+ blk_opf_t cmd_flags;
BUG_ON(rq->flush.seq & seq);
rq->flush.seq |= seq;
@@ -290,7 +290,7 @@ bool is_flush_rq(struct request *rq)
*
*/
static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
- unsigned int flags)
+ blk_opf_t flags)
{
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
struct request *first_rq =
diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c
index 47c89e65b57f..2bd1d311033b 100644
--- a/block/blk-ia-ranges.c
+++ b/block/blk-ia-ranges.c
@@ -102,31 +102,18 @@ static struct kobj_type blk_ia_ranges_ktype = {
* disk_register_independent_access_ranges - register with sysfs a set of
* independent access ranges
* @disk: Target disk
- * @new_iars: New set of independent access ranges
*
* Register with sysfs a set of independent access ranges for @disk.
- * If @new_iars is not NULL, this set of ranges is registered and the old set
- * specified by q->ia_ranges is unregistered. Otherwise, q->ia_ranges is
- * registered if it is not already.
*/
-int disk_register_independent_access_ranges(struct gendisk *disk,
- struct blk_independent_access_ranges *new_iars)
+int disk_register_independent_access_ranges(struct gendisk *disk)
{
+ struct blk_independent_access_ranges *iars = disk->ia_ranges;
struct request_queue *q = disk->queue;
- struct blk_independent_access_ranges *iars;
int i, ret;
lockdep_assert_held(&q->sysfs_dir_lock);
lockdep_assert_held(&q->sysfs_lock);
- /* If a new range set is specified, unregister the old one */
- if (new_iars) {
- if (q->ia_ranges)
- disk_unregister_independent_access_ranges(disk);
- q->ia_ranges = new_iars;
- }
-
- iars = q->ia_ranges;
if (!iars)
return 0;
@@ -138,7 +125,7 @@ int disk_register_independent_access_ranges(struct gendisk *disk,
ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype,
&q->kobj, "%s", "independent_access_ranges");
if (ret) {
- q->ia_ranges = NULL;
+ disk->ia_ranges = NULL;
kobject_put(&iars->kobj);
return ret;
}
@@ -164,7 +151,7 @@ int disk_register_independent_access_ranges(struct gendisk *disk,
void disk_unregister_independent_access_ranges(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
- struct blk_independent_access_ranges *iars = q->ia_ranges;
+ struct blk_independent_access_ranges *iars = disk->ia_ranges;
int i;
lockdep_assert_held(&q->sysfs_dir_lock);
@@ -182,7 +169,7 @@ void disk_unregister_independent_access_ranges(struct gendisk *disk)
kfree(iars);
}
- q->ia_ranges = NULL;
+ disk->ia_ranges = NULL;
}
static struct blk_independent_access_range *
@@ -210,6 +197,9 @@ static bool disk_check_ia_ranges(struct gendisk *disk,
sector_t sector = 0;
int i;
+ if (WARN_ON_ONCE(!iars->nr_ia_ranges))
+ return false;
+
/*
* While sorting the ranges in increasing LBA order, check that the
* ranges do not overlap, that there are no sector holes and that all
@@ -242,7 +232,7 @@ static bool disk_check_ia_ranges(struct gendisk *disk,
static bool disk_ia_ranges_changed(struct gendisk *disk,
struct blk_independent_access_ranges *new)
{
- struct blk_independent_access_ranges *old = disk->queue->ia_ranges;
+ struct blk_independent_access_ranges *old = disk->ia_ranges;
int i;
if (!old)
@@ -298,25 +288,15 @@ void disk_set_independent_access_ranges(struct gendisk *disk,
{
struct request_queue *q = disk->queue;
- if (WARN_ON_ONCE(iars && !iars->nr_ia_ranges)) {
+ mutex_lock(&q->sysfs_dir_lock);
+ mutex_lock(&q->sysfs_lock);
+ if (iars && !disk_check_ia_ranges(disk, iars)) {
kfree(iars);
iars = NULL;
}
-
- mutex_lock(&q->sysfs_dir_lock);
- mutex_lock(&q->sysfs_lock);
-
- if (iars) {
- if (!disk_check_ia_ranges(disk, iars)) {
- kfree(iars);
- iars = NULL;
- goto reg;
- }
-
- if (!disk_ia_ranges_changed(disk, iars)) {
- kfree(iars);
- goto unlock;
- }
+ if (iars && !disk_ia_ranges_changed(disk, iars)) {
+ kfree(iars);
+ goto unlock;
}
/*
@@ -324,17 +304,12 @@ void disk_set_independent_access_ranges(struct gendisk *disk,
* revalidation. If that is the case, we need to unregister the old
* set of independent access ranges and register the new set. If the
* queue is not registered, registration of the device request queue
- * will register the independent access ranges, so only swap in the
- * new set and free the old one.
+ * will register the independent access ranges.
*/
-reg:
- if (blk_queue_registered(q)) {
- disk_register_independent_access_ranges(disk, iars);
- } else {
- swap(q->ia_ranges, iars);
- kfree(iars);
- }
-
+ disk_unregister_independent_access_ranges(disk);
+ disk->ia_ranges = iars;
+ if (blk_queue_registered(q))
+ disk_register_independent_access_ranges(disk);
unlock:
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index df9cfe4ca532..63fc02042408 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -247,6 +247,8 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn);
#endif
+ ioc->ioprio = IOPRIO_DEFAULT;
+
return ioc;
}
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 33a11ba971ea..7936e5f5821c 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2769,7 +2769,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
return;
- switch (req_op(rq) & REQ_OP_MASK) {
+ switch (req_op(rq)) {
case REQ_OP_READ:
pidx = QOS_RLAT;
rw = READ;
@@ -2886,15 +2886,21 @@ static int blk_iocost_init(struct request_queue *q)
* called before policy activation completion, can't assume that the
* target bio has an iocg associated and need to test for NULL iocg.
*/
- rq_qos_add(q, rqos);
+ ret = rq_qos_add(q, rqos);
+ if (ret)
+ goto err_free_ioc;
+
ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
- if (ret) {
- rq_qos_del(q, rqos);
- free_percpu(ioc->pcpu_stat);
- kfree(ioc);
- return ret;
- }
+ if (ret)
+ goto err_del_qos;
return 0;
+
+err_del_qos:
+ rq_qos_del(q, rqos);
+err_free_ioc:
+ free_percpu(ioc->pcpu_stat);
+ kfree(ioc);
+ return ret;
}
static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 9568bf8dfe82..e285152345a2 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -401,7 +401,6 @@ static void check_scale_change(struct iolatency_grp *iolat)
unsigned int cur_cookie;
unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
u64 scale_lat;
- unsigned int old;
int direction = 0;
if (lat_to_blkg(iolat)->parent == NULL)
@@ -422,11 +421,10 @@ static void check_scale_change(struct iolatency_grp *iolat)
else
return;
- old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);
-
- /* Somebody beat us to the punch, just bail. */
- if (old != our_cookie)
+ if (!atomic_try_cmpxchg(&iolat->scale_cookie, &our_cookie, cur_cookie)) {
+ /* Somebody beat us to the punch, just bail. */
return;
+ }
if (direction < 0 && iolat->min_lat_nsec) {
u64 samples_thresh;
@@ -633,8 +631,8 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
window_start = atomic64_read(&iolat->window_start);
if (now > window_start &&
(now - window_start) >= iolat->cur_win_nsec) {
- if (atomic64_cmpxchg(&iolat->window_start,
- window_start, now) == window_start)
+ if (atomic64_try_cmpxchg(&iolat->window_start,
+ &window_start, now))
iolatency_check_latencies(iolat, now);
}
}
@@ -773,19 +771,23 @@ int blk_iolatency_init(struct request_queue *q)
rqos->ops = &blkcg_iolatency_ops;
rqos->q = q;
- rq_qos_add(q, rqos);
-
+ ret = rq_qos_add(q, rqos);
+ if (ret)
+ goto err_free;
ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
- if (ret) {
- rq_qos_del(q, rqos);
- kfree(blkiolat);
- return ret;
- }
+ if (ret)
+ goto err_qos_del;
timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn);
return 0;
+
+err_qos_del:
+ rq_qos_del(q, rqos);
+err_free:
+ kfree(blkiolat);
+ return ret;
}
static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 79e797f5d194..c00060a02c6e 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -62,7 +62,6 @@ struct ioprio_blkg {
struct ioprio_blkcg {
struct blkcg_policy_data cpd;
enum prio_policy prio_policy;
- bool prio_set;
};
static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
@@ -113,7 +112,6 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
if (ret < 0)
return ret;
blkcg->prio_policy = ret;
- blkcg->prio_set = true;
return nbytes;
}
@@ -183,26 +181,20 @@ static struct blkcg_policy ioprio_policy = {
.pd_free_fn = ioprio_free_pd,
};
-struct blk_ioprio {
- struct rq_qos rqos;
-};
-
-static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
- struct bio *bio)
+void blkcg_set_ioprio(struct bio *bio)
{
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
u16 prio;
- if (!blkcg->prio_set)
+ if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE)
return;
/*
* Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
* correspond to a lower priority. Hence, the max_t() below selects
* the lower priority of bi_ioprio and the cgroup I/O priority class.
- * If the cgroup policy has been set to POLICY_NO_CHANGE == 0, the
- * bio I/O priority is not modified. If the bio I/O priority equals
- * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio.
+ * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O
+ * priority is assigned to the bio.
*/
prio = max_t(u16, bio->bi_ioprio,
IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
@@ -210,49 +202,14 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
bio->bi_ioprio = prio;
}
-static void blkcg_ioprio_exit(struct rq_qos *rqos)
+void blk_ioprio_exit(struct request_queue *q)
{
- struct blk_ioprio *blkioprio_blkg =
- container_of(rqos, typeof(*blkioprio_blkg), rqos);
-
- blkcg_deactivate_policy(rqos->q, &ioprio_policy);
- kfree(blkioprio_blkg);
+ blkcg_deactivate_policy(q, &ioprio_policy);
}
-static struct rq_qos_ops blkcg_ioprio_ops = {
- .track = blkcg_ioprio_track,
- .exit = blkcg_ioprio_exit,
-};
-
int blk_ioprio_init(struct request_queue *q)
{
- struct blk_ioprio *blkioprio_blkg;
- struct rq_qos *rqos;
- int ret;
-
- blkioprio_blkg = kzalloc(sizeof(*blkioprio_blkg), GFP_KERNEL);
- if (!blkioprio_blkg)
- return -ENOMEM;
-
- ret = blkcg_activate_policy(q, &ioprio_policy);
- if (ret) {
- kfree(blkioprio_blkg);
- return ret;
- }
-
- rqos = &blkioprio_blkg->rqos;
- rqos->id = RQ_QOS_IOPRIO;
- rqos->ops = &blkcg_ioprio_ops;
- rqos->q = q;
-
- /*
- * Registering the rq-qos policy after activating the blk-cgroup
- * policy guarantees that ioprio_blkcg_from_bio(bio) != NULL in the
- * rq-qos callbacks.
- */
- rq_qos_add(q, rqos);
-
- return 0;
+ return blkcg_activate_policy(q, &ioprio_policy);
}
static int __init ioprio_init(void)
diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h
index a7785c2f1aea..5a1eb550e178 100644
--- a/block/blk-ioprio.h
+++ b/block/blk-ioprio.h
@@ -6,14 +6,23 @@
#include <linux/kconfig.h>
struct request_queue;
+struct bio;
#ifdef CONFIG_BLK_CGROUP_IOPRIO
int blk_ioprio_init(struct request_queue *q);
+void blk_ioprio_exit(struct request_queue *q);
+void blkcg_set_ioprio(struct bio *bio);
#else
static inline int blk_ioprio_init(struct request_queue *q)
{
return 0;
}
+static inline void blk_ioprio_exit(struct request_queue *q)
+{
+}
+static inline void blkcg_set_ioprio(struct bio *bio)
+{
+}
#endif
#endif /* _BLK_IOPRIO_H_ */
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 09b7e1200c0f..67e6dbc1ae81 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -48,10 +48,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
/* In case the discard granularity isn't set by buggy device driver */
if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) {
- char dev_name[BDEVNAME_SIZE];
-
- bdevname(bdev, dev_name);
- pr_err_ratelimited("%s: Error: discard_granularity is 0.\n", dev_name);
+ pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n",
+ bdev);
return -EOPNOTSUPP;
}
diff --git a/block/blk-map.c b/block/blk-map.c
index df8b066cd548..7196a6b64c80 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -254,7 +254,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
size_t offs, added = 0;
int npages;
- bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs);
+ bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs);
if (unlikely(bytes <= 0)) {
ret = bytes ? bytes : -EFAULT;
goto out_unmap;
@@ -284,7 +284,6 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
bytes -= n;
offs = 0;
}
- iov_iter_advance(iter, added);
}
/*
* release the pages we didn't map into the bio, if any
@@ -293,8 +292,10 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
put_page(pages[j++]);
kvfree(pages);
/* couldn't stuff something into bio? */
- if (bytes)
+ if (bytes) {
+ iov_iter_revert(iter, bytes);
break;
+ }
}
ret = blk_rq_append_bio(rq, bio);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 7771dacc99cb..ff04e9290715 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -82,7 +82,7 @@ static inline bool bio_will_gap(struct request_queue *q,
bio_get_first_bvec(next, &nb);
if (biovec_phys_mergeable(q, &pb, &nb))
return false;
- return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+ return __bvec_gap_to_prev(&q->limits, &pb, nb.bv_offset);
}
static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
@@ -95,23 +95,30 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
return bio_will_gap(req->q, NULL, bio, req->bio);
}
-static struct bio *blk_bio_discard_split(struct request_queue *q,
- struct bio *bio,
- struct bio_set *bs,
- unsigned *nsegs)
+/*
+ * The max size one bio can handle is UINT_MAX because bvec_iter.bi_size
+ * is defined as 'unsigned int'; in addition it has to be aligned to the
+ * logical block size, which is the minimum unit accepted by hardware.
+ */
+static unsigned int bio_allowed_max_sectors(struct queue_limits *lim)
+{
+ return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
+}
+
+static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
+ unsigned *nsegs, struct bio_set *bs)
{
unsigned int max_discard_sectors, granularity;
- int alignment;
sector_t tmp;
unsigned split_sectors;
*nsegs = 1;
/* Zero-sector (unknown) and one-sector granularities are the same. */
- granularity = max(q->limits.discard_granularity >> 9, 1U);
+ granularity = max(lim->discard_granularity >> 9, 1U);
- max_discard_sectors = min(q->limits.max_discard_sectors,
- bio_allowed_max_sectors(q));
+ max_discard_sectors =
+ min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
max_discard_sectors -= max_discard_sectors % granularity;
if (unlikely(!max_discard_sectors)) {
@@ -128,9 +135,8 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
* If the next starting sector would be misaligned, stop the discard at
* the previous aligned sector.
*/
- alignment = (q->limits.discard_alignment >> 9) % granularity;
-
- tmp = bio->bi_iter.bi_sector + split_sectors - alignment;
+ tmp = bio->bi_iter.bi_sector + split_sectors -
+ ((lim->discard_alignment >> 9) % granularity);
tmp = sector_div(tmp, granularity);
if (split_sectors > tmp)
@@ -139,18 +145,15 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
return bio_split(bio, split_sectors, GFP_NOIO, bs);
}
-static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
- struct bio *bio, struct bio_set *bs, unsigned *nsegs)
+static struct bio *bio_split_write_zeroes(struct bio *bio,
+ struct queue_limits *lim, unsigned *nsegs, struct bio_set *bs)
{
*nsegs = 0;
-
- if (!q->limits.max_write_zeroes_sectors)
+ if (!lim->max_write_zeroes_sectors)
return NULL;
-
- if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
+ if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
return NULL;
-
- return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
+ return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
}
/*
@@ -161,28 +164,30 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
* requests that are submitted to a block device if the start of a bio is not
* aligned to a physical block boundary.
*/
-static inline unsigned get_max_io_size(struct request_queue *q,
- struct bio *bio)
+static inline unsigned get_max_io_size(struct bio *bio,
+ struct queue_limits *lim)
{
- unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0);
- unsigned max_sectors = sectors;
- unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
- unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
- unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1);
-
- max_sectors += start_offset;
- max_sectors &= ~(pbs - 1);
- if (max_sectors > start_offset)
- return max_sectors - start_offset;
-
- return sectors & ~(lbs - 1);
+ unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
+ unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
+ unsigned max_sectors = lim->max_sectors, start, end;
+
+ if (lim->chunk_sectors) {
+ max_sectors = min(max_sectors,
+ blk_chunk_sectors_left(bio->bi_iter.bi_sector,
+ lim->chunk_sectors));
+ }
+
+ start = bio->bi_iter.bi_sector & (pbs - 1);
+ end = (start + max_sectors) & ~(pbs - 1);
+ if (end > start)
+ return end - start;
+ return max_sectors & ~(lbs - 1);
}
-static inline unsigned get_max_segment_size(const struct request_queue *q,
- struct page *start_page,
- unsigned long offset)
+static inline unsigned get_max_segment_size(struct queue_limits *lim,
+ struct page *start_page, unsigned long offset)
{
- unsigned long mask = queue_segment_boundary(q);
+ unsigned long mask = lim->seg_boundary_mask;
offset = mask & (page_to_phys(start_page) + offset);
@@ -191,21 +196,21 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
* on 32bit arch, use queue's max segment size when that happens.
*/
return min_not_zero(mask - offset + 1,
- (unsigned long)queue_max_segment_size(q));
+ (unsigned long)lim->max_segment_size);
}
/**
* bvec_split_segs - verify whether or not a bvec should be split in the middle
- * @q: [in] request queue associated with the bio associated with @bv
+ * @lim: [in] queue limits to split based on
* @bv: [in] bvec to examine
* @nsegs: [in,out] Number of segments in the bio being built. Incremented
* by the number of segments from @bv that may be appended to that
* bio without exceeding @max_segs
- * @sectors: [in,out] Number of sectors in the bio being built. Incremented
- * by the number of sectors from @bv that may be appended to that
- * bio without exceeding @max_sectors
+ * @bytes: [in,out] Number of bytes in the bio being built. Incremented
+ * by the number of bytes from @bv that may be appended to that
+ * bio without exceeding @max_bytes
* @max_segs: [in] upper bound for *@nsegs
- * @max_sectors: [in] upper bound for *@sectors
+ * @max_bytes: [in] upper bound for *@bytes
*
* When splitting a bio, it can happen that a bvec is encountered that is too
* big to fit in a single segment and hence that it has to be split in the
@@ -214,18 +219,17 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
* *@nsegs segments and *@sectors sectors would make that bio unacceptable for
* the block driver.
*/
-static bool bvec_split_segs(const struct request_queue *q,
- const struct bio_vec *bv, unsigned *nsegs,
- unsigned *sectors, unsigned max_segs,
- unsigned max_sectors)
+static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
+ unsigned *nsegs, unsigned *bytes, unsigned max_segs,
+ unsigned max_bytes)
{
- unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9;
+ unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
unsigned len = min(bv->bv_len, max_len);
unsigned total_len = 0;
unsigned seg_size = 0;
while (len && *nsegs < max_segs) {
- seg_size = get_max_segment_size(q, bv->bv_page,
+ seg_size = get_max_segment_size(lim, bv->bv_page,
bv->bv_offset + total_len);
seg_size = min(seg_size, len);
@@ -233,27 +237,28 @@ static bool bvec_split_segs(const struct request_queue *q,
total_len += seg_size;
len -= seg_size;
- if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
+ if ((bv->bv_offset + total_len) & lim->virt_boundary_mask)
break;
}
- *sectors += total_len >> 9;
+ *bytes += total_len;
/* tell the caller to split the bvec if it is too big to fit */
return len > 0 || bv->bv_len > max_len;
}
/**
- * blk_bio_segment_split - split a bio in two bios
- * @q: [in] request queue pointer
+ * bio_split_rw - split a bio in two bios
* @bio: [in] bio to be split
- * @bs: [in] bio set to allocate the clone from
+ * @lim: [in] queue limits to split based on
* @segs: [out] number of segments in the bio with the first half of the sectors
+ * @bs: [in] bio set to allocate the clone from
+ * @max_bytes: [in] maximum number of bytes per bio
*
* Clone @bio, update the bi_iter of the clone to represent the first sectors
* of @bio and update @bio->bi_iter to represent the remaining sectors. The
* following is guaranteed for the cloned bio:
- * - That it has at most get_max_io_size(@q, @bio) sectors.
+ * - That it has at most @max_bytes worth of data
* - That it has at most queue_max_segments(@q) segments.
*
* Except for discard requests the cloned bio will point at the bi_io_vec of
@@ -262,33 +267,30 @@ static bool bvec_split_segs(const struct request_queue *q,
* responsible for ensuring that @bs is only destroyed after processing of the
* split bio has finished.
*/
-static struct bio *blk_bio_segment_split(struct request_queue *q,
- struct bio *bio,
- struct bio_set *bs,
- unsigned *segs)
+static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim,
+ unsigned *segs, struct bio_set *bs, unsigned max_bytes)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
- unsigned nsegs = 0, sectors = 0;
- const unsigned max_sectors = get_max_io_size(q, bio);
- const unsigned max_segs = queue_max_segments(q);
+ unsigned nsegs = 0, bytes = 0;
bio_for_each_bvec(bv, bio, iter) {
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
*/
- if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
+ if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
goto split;
- if (nsegs < max_segs &&
- sectors + (bv.bv_len >> 9) <= max_sectors &&
+ if (nsegs < lim->max_segments &&
+ bytes + bv.bv_len <= max_bytes &&
bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
nsegs++;
- sectors += bv.bv_len >> 9;
- } else if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
- max_sectors)) {
- goto split;
+ bytes += bv.bv_len;
+ } else {
+ if (bvec_split_segs(lim, &bv, &nsegs, &bytes,
+ lim->max_segments, max_bytes))
+ goto split;
}
bvprv = bv;
@@ -301,43 +303,51 @@ split:
*segs = nsegs;
/*
+ * Individual bvecs might not be logical block aligned. Round down the
+ * split size so that each bio is properly block size aligned, even if
+ * we do not use the full hardware limits.
+ */
+ bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
+
+ /*
* Bio splitting may cause subtle trouble such as hang when doing sync
* iopoll in direct IO routine. Given performance gain of iopoll for
* big IO can be trival, disable iopoll when split needed.
*/
bio_clear_polled(bio);
- return bio_split(bio, sectors, GFP_NOIO, bs);
+ return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs);
}
/**
- * __blk_queue_split - split a bio and submit the second half
- * @q: [in] request_queue new bio is being queued at
- * @bio: [in, out] bio to be split
- * @nr_segs: [out] number of segments in the first bio
+ * __bio_split_to_limits - split a bio to fit the queue limits
+ * @bio: bio to be split
+ * @lim: queue limits to split based on
+ * @nr_segs: returns the number of segments in the returned bio
*
- * Split a bio into two bios, chain the two bios, submit the second half and
- * store a pointer to the first half in *@bio. If the second bio is still too
- * big it will be split by a recursive call to this function. Since this
- * function may allocate a new bio from q->bio_split, it is the responsibility
- * of the caller to ensure that q->bio_split is only released after processing
- * of the split bio has finished.
+ * Check if @bio needs splitting based on the queue limits, and if so split off
+ * a bio fitting the limits from the beginning of @bio and return it. @bio is
+ * shortened to the remainder and re-submitted.
+ *
+ * The split bio is allocated from the bio_split bio_set of the disk that owns
+ * @bio->bi_bdev, which is provided by the block layer.
*/
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
+struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
unsigned int *nr_segs)
{
- struct bio *split = NULL;
+ struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
+ struct bio *split;
- switch (bio_op(*bio)) {
+ switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
- split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
+ split = bio_split_discard(bio, lim, nr_segs, bs);
break;
case REQ_OP_WRITE_ZEROES:
- split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
- nr_segs);
+ split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
break;
default:
- split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
+ split = bio_split_rw(bio, lim, nr_segs, bs,
+ get_max_io_size(bio, lim) << SECTOR_SHIFT);
break;
}
@@ -345,37 +355,41 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
/* there isn't chance to merge the splitted bio */
split->bi_opf |= REQ_NOMERGE;
- bio_chain(split, *bio);
- trace_block_split(split, (*bio)->bi_iter.bi_sector);
- submit_bio_noacct(*bio);
- *bio = split;
+ blkcg_bio_issue_init(split);
+ bio_chain(split, bio);
+ trace_block_split(split, bio->bi_iter.bi_sector);
+ submit_bio_noacct(bio);
+ return split;
}
+ return bio;
}
/**
- * blk_queue_split - split a bio and submit the second half
- * @bio: [in, out] bio to be split
+ * bio_split_to_limits - split a bio to fit the queue limits
+ * @bio: bio to be split
+ *
+ * Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and
+ * if so split off a bio fitting the limits from the beginning of @bio and
+ * return it. @bio is shortened to the remainder and re-submitted.
*
- * Split a bio into two bios, chains the two bios, submit the second half and
- * store a pointer to the first half in *@bio. Since this function may allocate
- * a new bio from q->bio_split, it is the responsibility of the caller to ensure
- * that q->bio_split is only released after processing of the split bio has
- * finished.
+ * The split bio is allocated from the bio_split bio_set of the disk that owns
+ * @bio->bi_bdev, which is provided by the block layer.
*/
-void blk_queue_split(struct bio **bio)
+struct bio *bio_split_to_limits(struct bio *bio)
{
- struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
+ struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
unsigned int nr_segs;
- if (blk_may_split(q, *bio))
- __blk_queue_split(q, bio, &nr_segs);
+ if (bio_may_exceed_limits(bio, lim))
+ return __bio_split_to_limits(bio, lim, &nr_segs);
+ return bio;
}
-EXPORT_SYMBOL(blk_queue_split);
+EXPORT_SYMBOL(bio_split_to_limits);
unsigned int blk_recalc_rq_segments(struct request *rq)
{
unsigned int nr_phys_segs = 0;
- unsigned int nr_sectors = 0;
+ unsigned int bytes = 0;
struct req_iterator iter;
struct bio_vec bv;
@@ -395,10 +409,12 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
return 1;
case REQ_OP_WRITE_ZEROES:
return 0;
+ default:
+ break;
}
rq_for_each_bvec(bv, rq, iter)
- bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors,
+ bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes,
UINT_MAX, UINT_MAX);
return nr_phys_segs;
}
@@ -429,8 +445,8 @@ static unsigned blk_bvec_map_sg(struct request_queue *q,
while (nbytes > 0) {
unsigned offset = bvec->bv_offset + total;
- unsigned len = min(get_max_segment_size(q, bvec->bv_page,
- offset), nbytes);
+ unsigned len = min(get_max_segment_size(&q->limits,
+ bvec->bv_page, offset), nbytes);
struct page *page = bvec->bv_page;
/*
@@ -559,17 +575,18 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
sector_t offset)
{
struct request_queue *q = rq->q;
+ unsigned int max_sectors;
if (blk_rq_is_passthrough(rq))
return q->limits.max_hw_sectors;
+ max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
if (!q->limits.chunk_sectors ||
req_op(rq) == REQ_OP_DISCARD ||
req_op(rq) == REQ_OP_SECURE_ERASE)
- return blk_queue_get_max_sectors(q, req_op(rq));
-
- return min(blk_max_size_offset(q, offset, 0),
- blk_queue_get_max_sectors(q, req_op(rq)));
+ return max_sectors;
+ return min(max_sectors,
+ blk_chunk_sectors_left(offset, q->limits.chunk_sectors));
}
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
@@ -699,7 +716,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
*/
void blk_rq_set_mixed_merge(struct request *rq)
{
- unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
+ blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK;
struct bio *bio;
if (rq->rq_flags & RQF_MIXED_MERGE)
@@ -915,7 +932,7 @@ enum bio_merge_status {
static enum bio_merge_status bio_attempt_back_merge(struct request *req,
struct bio *bio, unsigned int nr_segs)
{
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
+ const blk_opf_t ff = bio->bi_opf & REQ_FAILFAST_MASK;
if (!ll_back_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
@@ -939,7 +956,7 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req,
static enum bio_merge_status bio_attempt_front_merge(struct request *req,
struct bio *bio, unsigned int nr_segs)
{
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
+ const blk_opf_t ff = bio->bi_opf & REQ_FAILFAST_MASK;
if (!ll_front_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
@@ -1040,7 +1057,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
struct blk_plug *plug;
struct request *rq;
- plug = blk_mq_plug(q, bio);
+ plug = blk_mq_plug(bio);
if (!plug || rq_list_empty(plug->mq_list))
return false;
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
index 038cb627c868..a77b099c34b7 100644
--- a/block/blk-mq-debugfs-zoned.c
+++ b/block/blk-mq-debugfs-zoned.c
@@ -11,11 +11,11 @@ int queue_zone_wlock_show(void *data, struct seq_file *m)
struct request_queue *q = data;
unsigned int i;
- if (!q->seq_zones_wlock)
+ if (!q->disk->seq_zones_wlock)
return 0;
- for (i = 0; i < q->nr_zones; i++)
- if (test_bit(i, q->seq_zones_wlock))
+ for (i = 0; i < q->disk->nr_zones; i++)
+ if (test_bit(i, q->disk->seq_zones_wlock))
seq_printf(m, "%u\n", i);
return 0;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4d1ce9ef4318..8559cea7f300 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -116,7 +116,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(NOXMERGES),
QUEUE_FLAG_NAME(ADD_RANDOM),
QUEUE_FLAG_NAME(SAME_FORCE),
- QUEUE_FLAG_NAME(DEAD),
QUEUE_FLAG_NAME(INIT_DONE),
QUEUE_FLAG_NAME(STABLE_WRITES),
QUEUE_FLAG_NAME(POLL),
@@ -151,11 +150,10 @@ static ssize_t queue_state_write(void *data, const char __user *buf,
char opbuf[16] = { }, *op;
/*
- * The "state" attribute is removed after blk_cleanup_queue() has called
- * blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set to avoid
- * triggering a use-after-free.
+ * The "state" attribute is removed when the queue is removed. Don't
+ * allow setting the state on a dying queue to avoid a use-after-free.
*/
- if (blk_queue_dead(q))
+ if (blk_queue_dying(q))
return -ENOENT;
if (count >= sizeof(opbuf)) {
@@ -306,7 +304,7 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
{
const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
- const unsigned int op = req_op(rq);
+ const enum req_op op = req_op(rq);
const char *op_str = blk_op_str(op);
seq_printf(m, "%p {.op=", rq);
@@ -315,8 +313,8 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
else
seq_printf(m, "%s", op_str);
seq_puts(m, ", .cmd_flags=");
- blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name,
- ARRAY_SIZE(cmd_flag_name));
+ blk_flags_show(m, (__force unsigned int)(rq->cmd_flags & ~REQ_OP_MASK),
+ cmd_flag_name, ARRAY_SIZE(cmd_flag_name));
seq_puts(m, ", .rq_flags=");
blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
ARRAY_SIZE(rqf_name));
@@ -377,7 +375,7 @@ struct show_busy_params {
* e.g. due to a concurrent blk_mq_finish_request() call. Returns true to
* keep iterating requests.
*/
-static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
+static bool hctx_show_busy_rq(struct request *rq, void *data)
{
const struct show_busy_params *params = data;
@@ -730,6 +728,9 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
char name[20];
int i;
+ if (!q->debugfs_dir)
+ return;
+
snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index c08426975856..93997d297d42 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -203,23 +203,6 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
return ret;
}
-void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
-{
- struct blk_mq_hw_ctx *hctx;
- unsigned long i;
-
- lockdep_assert_held(&q->sysfs_dir_lock);
-
- queue_for_each_hw_ctx(q, hctx, i)
- blk_mq_unregister_hctx(hctx);
-
- kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
- kobject_del(q->mq_kobj);
- kobject_put(&dev->kobj);
-
- q->mq_sysfs_init_done = false;
-}
-
void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
{
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
@@ -252,16 +235,16 @@ void blk_mq_sysfs_init(struct request_queue *q)
}
}
-int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
+int blk_mq_sysfs_register(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
struct blk_mq_hw_ctx *hctx;
unsigned long i, j;
int ret;
- WARN_ON_ONCE(!q->kobj.parent);
lockdep_assert_held(&q->sysfs_dir_lock);
- ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
+ ret = kobject_add(q->mq_kobj, &disk_to_dev(disk)->kobj, "mq");
if (ret < 0)
goto out;
@@ -286,11 +269,27 @@ unreg:
kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
kobject_del(q->mq_kobj);
- kobject_put(&dev->kobj);
return ret;
}
-void blk_mq_sysfs_unregister(struct request_queue *q)
+void blk_mq_sysfs_unregister(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ lockdep_assert_held(&q->sysfs_dir_lock);
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_unregister_hctx(hctx);
+
+ kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
+ kobject_del(q->mq_kobj);
+
+ q->mq_sysfs_init_done = false;
+}
+
+void blk_mq_sysfs_unregister_hctxs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned long i;
@@ -306,7 +305,7 @@ unlock:
mutex_unlock(&q->sysfs_dir_lock);
}
-int blk_mq_sysfs_register(struct request_queue *q)
+int blk_mq_sysfs_register_hctxs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned long i;
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2dcd738c6952..8e3b36d1cb57 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -37,29 +37,25 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
* to get tag when first time, the other shared-tag users could reserve
* budget for it.
*/
-bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
unsigned int users;
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
- if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
- test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
- return true;
- }
+ if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
+ return;
+ set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags);
} else {
- if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
- test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
- return true;
- }
+ if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+ return;
+ set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state);
}
users = atomic_inc_return(&hctx->tags->active_queues);
blk_mq_update_wake_batch(hctx->tags, users);
-
- return true;
}
/*
@@ -266,7 +262,6 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
struct blk_mq_hw_ctx *hctx = iter_data->hctx;
struct request_queue *q = iter_data->q;
struct blk_mq_tag_set *set = q->tag_set;
- bool reserved = iter_data->reserved;
struct blk_mq_tags *tags;
struct request *rq;
bool ret = true;
@@ -276,7 +271,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
else
tags = hctx->tags;
- if (!reserved)
+ if (!iter_data->reserved)
bitnr += tags->nr_reserved_tags;
/*
* We can hit rq == NULL here, because the tagging functions
@@ -287,7 +282,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
return true;
if (rq->q == q && (!hctx || rq->mq_hctx == hctx))
- ret = iter_data->fn(rq, iter_data->data, reserved);
+ ret = iter_data->fn(rq, iter_data->data);
blk_mq_put_rq_ref(rq);
return ret;
}
@@ -337,12 +332,11 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
struct bt_tags_iter_data *iter_data = data;
struct blk_mq_tags *tags = iter_data->tags;
- bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
struct request *rq;
bool ret = true;
bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS);
- if (!reserved)
+ if (!(iter_data->flags & BT_TAG_ITER_RESERVED))
bitnr += tags->nr_reserved_tags;
/*
@@ -358,7 +352,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
if (!(iter_data->flags & BT_TAG_ITER_STARTED) ||
blk_mq_request_started(rq))
- ret = iter_data->fn(rq, iter_data->data, reserved);
+ ret = iter_data->fn(rq, iter_data->data);
if (!iter_static_rqs)
blk_mq_put_rq_ref(rq);
return ret;
@@ -448,8 +442,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
-static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
- void *data, bool reserved)
+static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data)
{
unsigned *count = data;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 5668e28be0b7..91ff37e3b43d 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -47,15 +47,13 @@ enum {
BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1,
};
-extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
+extern void __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
-static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
- if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
- return false;
-
- return __blk_mq_tag_busy(hctx);
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_tag_busy(hctx);
}
static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 93d9d60980fb..5ee62b95f3e5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -42,6 +42,7 @@
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
+#include "blk-ioprio.h"
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
@@ -128,8 +129,7 @@ struct mq_inflight {
unsigned int inflight[2];
};
-static bool blk_mq_check_inflight(struct request *rq, void *priv,
- bool reserved)
+static bool blk_mq_check_inflight(struct request *rq, void *priv)
{
struct mq_inflight *mi = priv;
@@ -474,6 +474,9 @@ retry:
if (!(data->rq_flags & RQF_ELV))
blk_mq_tag_busy(data->hctx);
+ if (data->flags & BLK_MQ_REQ_RESERVED)
+ data->rq_flags |= RQF_RESV;
+
/*
* Try batched alloc if we want more than 1 tag.
*/
@@ -507,13 +510,13 @@ retry:
alloc_time_ns);
}
-struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
+struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
blk_mq_req_flags_t flags)
{
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
- .cmd_flags = op,
+ .cmd_flags = opf,
.nr_tags = 1,
};
struct request *rq;
@@ -537,12 +540,12 @@ out_queue_exit:
EXPORT_SYMBOL(blk_mq_alloc_request);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
- unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
+ blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
- .cmd_flags = op,
+ .cmd_flags = opf,
.nr_tags = 1,
};
u64 alloc_time_ns = 0;
@@ -588,6 +591,9 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
else
data.rq_flags |= RQF_ELV;
+ if (flags & BLK_MQ_REQ_RESERVED)
+ data.rq_flags |= RQF_RESV;
+
ret = -EWOULDBLOCK;
tag = blk_mq_get_tag(&data);
if (tag == BLK_MQ_NO_TAG)
@@ -654,7 +660,7 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
{
printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
rq->q->disk ? rq->q->disk->disk_name : "?",
- (unsigned long long) rq->cmd_flags);
+ (__force unsigned long long) rq->cmd_flags);
printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
(unsigned long long)blk_rq_pos(rq),
@@ -707,8 +713,9 @@ static void blk_print_req_error(struct request *req, blk_status_t status)
"phys_seg %u prio class %u\n",
blk_status_to_str(status),
req->q->disk ? req->q->disk->disk_name : "?",
- blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
- req->cmd_flags & ~REQ_OP_MASK,
+ blk_rq_pos(req), (__force u32)req_op(req),
+ blk_op_str(req_op(req)),
+ (__force u32)(req->cmd_flags & ~REQ_OP_MASK),
req->nr_phys_segments,
IOPRIO_PRIO_CLASS(req->ioprio));
}
@@ -1393,8 +1400,7 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q,
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
-static bool blk_mq_rq_inflight(struct request *rq, void *priv,
- bool reserved)
+static bool blk_mq_rq_inflight(struct request *rq, void *priv)
{
/*
* If we find a request that isn't idle we know the queue is busy
@@ -1420,13 +1426,13 @@ bool blk_mq_queue_inflight(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
-static void blk_mq_rq_timed_out(struct request *req, bool reserved)
+static void blk_mq_rq_timed_out(struct request *req)
{
req->rq_flags |= RQF_TIMED_OUT;
if (req->q->mq_ops->timeout) {
enum blk_eh_timer_return ret;
- ret = req->q->mq_ops->timeout(req, reserved);
+ ret = req->q->mq_ops->timeout(req);
if (ret == BLK_EH_DONE)
return;
WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
@@ -1463,7 +1469,7 @@ void blk_mq_put_rq_ref(struct request *rq)
__blk_mq_free_request(rq);
}
-static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved)
+static bool blk_mq_check_expired(struct request *rq, void *priv)
{
unsigned long *next = priv;
@@ -1475,7 +1481,7 @@ static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved)
* from blk_mq_check_expired().
*/
if (blk_mq_req_expired(rq, next))
- blk_mq_rq_timed_out(rq, reserved);
+ blk_mq_rq_timed_out(rq);
return true;
}
@@ -2085,14 +2091,10 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
return;
if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
- int cpu = get_cpu();
- if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+ if (cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
__blk_mq_run_hw_queue(hctx);
- put_cpu();
return;
}
-
- put_cpu();
}
kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
@@ -2156,7 +2158,7 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
* just causes lock contention inside the scheduler and pointless cache
* bouncing.
*/
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx);
+ struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT];
if (!blk_mq_hctx_stopped(hctx))
return hctx;
@@ -2783,6 +2785,14 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
return rq;
}
+static void bio_set_ioprio(struct bio *bio)
+{
+ /* Nobody set ioprio so far? Initialize it based on task's nice value */
+ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
+ bio->bi_ioprio = get_current_ioprio();
+ blkcg_set_ioprio(bio);
+}
+
/**
* blk_mq_submit_bio - Create and send a request to block device.
* @bio: Bio pointer.
@@ -2799,19 +2809,21 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
void blk_mq_submit_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- struct blk_plug *plug = blk_mq_plug(q, bio);
+ struct blk_plug *plug = blk_mq_plug(bio);
const int is_sync = op_is_sync(bio->bi_opf);
struct request *rq;
unsigned int nr_segs = 1;
blk_status_t ret;
- blk_queue_bounce(q, &bio);
- if (blk_may_split(q, bio))
- __blk_queue_split(q, &bio, &nr_segs);
+ bio = blk_queue_bounce(bio, q);
+ if (bio_may_exceed_limits(bio, &q->limits))
+ bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio_integrity_prep(bio))
return;
+ bio_set_ioprio(bio);
+
rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs);
if (!rq) {
if (!bio)
@@ -3276,7 +3288,7 @@ struct rq_iter_data {
bool has_rq;
};
-static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
+static bool blk_mq_has_request(struct request *rq, void *data)
{
struct rq_iter_data *iter_data = data;
@@ -3895,7 +3907,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
q->queuedata = queuedata;
ret = blk_mq_init_allocated_queue(set, q);
if (ret) {
- blk_cleanup_queue(q);
+ blk_put_queue(q);
return ERR_PTR(ret);
}
return q;
@@ -3907,6 +3919,35 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_init_queue);
+/**
+ * blk_mq_destroy_queue - shutdown a request queue
+ * @q: request queue to shutdown
+ *
+ * This shuts down a request queue allocated by blk_mq_init_queue() and drops
+ * the initial reference. All future requests will fail with -ENODEV.
+ *
+ * Context: can sleep
+ */
+void blk_mq_destroy_queue(struct request_queue *q)
+{
+ WARN_ON_ONCE(!queue_is_mq(q));
+ WARN_ON_ONCE(blk_queue_registered(q));
+
+ might_sleep();
+
+ blk_queue_flag_set(QUEUE_FLAG_DYING, q);
+ blk_queue_start_drain(q);
+ blk_freeze_queue(q);
+
+ blk_sync_queue(q);
+ blk_mq_cancel_work_sync(q);
+ blk_mq_exit_queue(q);
+
+ /* @q is and will stay empty, shutdown and put */
+ blk_put_queue(q);
+}
+EXPORT_SYMBOL(blk_mq_destroy_queue);
+
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
struct lock_class_key *lkclass)
{
@@ -3919,13 +3960,23 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
disk = __alloc_disk_node(q, set->numa_node, lkclass);
if (!disk) {
- blk_cleanup_queue(q);
+ blk_mq_destroy_queue(q);
return ERR_PTR(-ENOMEM);
}
+ set_bit(GD_OWNS_QUEUE, &disk->state);
return disk;
}
EXPORT_SYMBOL(__blk_mq_alloc_disk);
+struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
+ struct lock_class_key *lkclass)
+{
+ if (!blk_get_queue(q))
+ return NULL;
+ return __alloc_disk_node(q, NUMA_NO_NODE, lkclass);
+}
+EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue);
+
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
struct blk_mq_tag_set *set, struct request_queue *q,
int hctx_idx, int node)
@@ -4513,7 +4564,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
- blk_mq_sysfs_unregister(q);
+ blk_mq_sysfs_unregister_hctxs(q);
}
prev_nr_hw_queues = set->nr_hw_queues;
@@ -4544,7 +4595,7 @@ fallback:
reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
- blk_mq_sysfs_register(q);
+ blk_mq_sysfs_register_hctxs(q);
blk_mq_debugfs_register_hctxs(q);
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 2615bd58bad3..8ca453ac243d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -86,16 +86,16 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]);
}
-static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
+static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf)
{
enum hctx_type type = HCTX_TYPE_DEFAULT;
/*
* The caller ensures that if REQ_POLLED, poll must be enabled.
*/
- if (flags & REQ_POLLED)
+ if (opf & REQ_POLLED)
type = HCTX_TYPE_POLL;
- else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
+ else if ((opf & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
return type;
}
@@ -103,14 +103,14 @@ static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
/*
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
* @q: request queue
- * @flags: request command flags
+ * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED).
* @ctx: software queue cpu ctx
*/
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
- unsigned int flags,
+ blk_opf_t opf,
struct blk_mq_ctx *ctx)
{
- return ctx->hctxs[blk_mq_get_hctx_type(flags)];
+ return ctx->hctxs[blk_mq_get_hctx_type(opf)];
}
/*
@@ -118,9 +118,10 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
*/
extern void blk_mq_sysfs_init(struct request_queue *q);
extern void blk_mq_sysfs_deinit(struct request_queue *q);
-extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
-extern int blk_mq_sysfs_register(struct request_queue *q);
-extern void blk_mq_sysfs_unregister(struct request_queue *q);
+int blk_mq_sysfs_register(struct gendisk *disk);
+void blk_mq_sysfs_unregister(struct gendisk *disk);
+int blk_mq_sysfs_register_hctxs(struct request_queue *q);
+void blk_mq_sysfs_unregister_hctxs(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
void blk_mq_free_plug_rqs(struct blk_plug *plug);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
@@ -151,7 +152,7 @@ struct blk_mq_alloc_data {
struct request_queue *q;
blk_mq_req_flags_t flags;
unsigned int shallow_depth;
- unsigned int cmd_flags;
+ blk_opf_t cmd_flags;
req_flags_t rq_flags;
/* allocate multiple requests/tags in one go */
@@ -293,7 +294,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
/*
* blk_mq_plug() - Get caller context plug
- * @q: request queue
* @bio : the bio being submitted by the caller context
*
* Plugging, by design, may delay the insertion of BIOs into the elevator in
@@ -304,23 +304,22 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
* order. While this is not a problem with regular block devices, this ordering
* change can cause write BIO failures with zoned block devices as these
* require sequential write patterns to zones. Prevent this from happening by
- * ignoring the plug state of a BIO issuing context if the target request queue
- * is for a zoned block device and the BIO to plug is a write operation.
+ * ignoring the plug state of a BIO issuing context if it is for a zoned block
+ * device and the BIO to plug is a write operation.
*
* Return current->plug if the bio can be plugged and NULL otherwise
*/
-static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
- struct bio *bio)
+static inline struct blk_plug *blk_mq_plug( struct bio *bio)
{
+ /* Zoned block device write operation case: do not plug the BIO */
+ if (bdev_is_zoned(bio->bi_bdev) && op_is_write(bio_op(bio)))
+ return NULL;
+
/*
* For regular block devices or read operations, use the context plug
* which may be NULL if blk_start_plug() was not executed.
*/
- if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio)))
- return current->plug;
-
- /* Zoned block device write operation case: do not plug the BIO */
- return NULL;
+ return current->plug;
}
/* Free all requests on the list */
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index d3a75693adbf..88f0fe7dcf54 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -10,16 +10,10 @@ static bool atomic_inc_below(atomic_t *v, unsigned int below)
{
unsigned int cur = atomic_read(v);
- for (;;) {
- unsigned int old;
-
+ do {
if (cur >= below)
return false;
- old = atomic_cmpxchg(v, cur, cur + 1);
- if (old == cur)
- break;
- cur = old;
- }
+ } while (!atomic_try_cmpxchg(v, &cur, cur + 1));
return true;
}
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 0e46052b018a..08b856570ad1 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -86,7 +86,7 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
init_waitqueue_head(&rq_wait->wait);
}
-static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
+static inline int rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
{
/*
* No IO can be in-flight when adding rqos, so freeze queue, which
@@ -98,6 +98,8 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
blk_mq_freeze_queue(q);
spin_lock_irq(&q->queue_lock);
+ if (rq_qos_id(q, rqos->id))
+ goto ebusy;
rqos->next = q->rq_qos;
q->rq_qos = rqos;
spin_unlock_irq(&q->queue_lock);
@@ -109,6 +111,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
blk_mq_debugfs_register_rqos(rqos);
mutex_unlock(&q->debugfs_mutex);
}
+
+ return 0;
+ebusy:
+ spin_unlock_irq(&q->queue_lock);
+ blk_mq_unfreeze_queue(q);
+ return -EBUSY;
+
}
static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 6ccceb421ed2..8bb9eef5310e 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -893,18 +893,19 @@ static bool disk_has_partitions(struct gendisk *disk)
}
/**
- * blk_queue_set_zoned - configure a disk queue zoned model.
+ * disk_set_zoned - configure the zoned model for a disk
* @disk: the gendisk of the queue to configure
* @model: the zoned model to set
*
- * Set the zoned model of the request queue of @disk according to @model.
+ * Set the zoned model of @disk to @model.
+ *
* When @model is BLK_ZONED_HM (host managed), this should be called only
* if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option).
* If @model specifies BLK_ZONED_HA (host aware), the effective model used
* depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions
* on the disk.
*/
-void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
+void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
{
struct request_queue *q = disk->queue;
@@ -945,10 +946,10 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
blk_queue_zone_write_granularity(q,
queue_logical_block_size(q));
} else {
- blk_queue_clear_zone_settings(q);
+ disk_clear_zone_settings(disk);
}
}
-EXPORT_SYMBOL_GPL(blk_queue_set_zoned);
+EXPORT_SYMBOL_GPL(disk_set_zoned);
int bdev_alignment_offset(struct block_device *bdev)
{
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9b905e9443e4..e1f009aba6fd 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -274,6 +274,11 @@ static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page
return queue_var_show(q->limits.virt_boundary_mask, page);
}
+static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(queue_dma_alignment(q), page);
+}
+
#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \
static ssize_t \
queue_##name##_show(struct request_queue *q, char *page) \
@@ -320,17 +325,17 @@ static ssize_t queue_zoned_show(struct request_queue *q, char *page)
static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
{
- return queue_var_show(blk_queue_nr_zones(q), page);
+ return queue_var_show(disk_nr_zones(q->disk), page);
}
static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page)
{
- return queue_var_show(queue_max_open_zones(q), page);
+ return queue_var_show(bdev_max_open_zones(q->disk->part0), page);
}
static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page)
{
- return queue_var_show(queue_max_active_zones(q), page);
+ return queue_var_show(bdev_max_active_zones(q->disk->part0), page);
}
static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
@@ -606,6 +611,7 @@ QUEUE_RO_ENTRY(queue_dax, "dax");
QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
+QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
@@ -667,6 +673,7 @@ static struct attribute *queue_attrs[] = {
&blk_throtl_sample_time_entry.attr,
#endif
&queue_virt_boundary_mask_entry.attr,
+ &queue_dma_alignment_entry.attr,
NULL,
};
@@ -748,11 +755,6 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
* decremented with blk_put_queue(). Once the refcount reaches 0 this function
* is called.
*
- * For drivers that have a request_queue on a gendisk and added with
- * __device_add_disk() the refcount to request_queue will reach 0 with
- * the last put_disk() called by the driver. For drivers which don't use
- * __device_add_disk() this happens with blk_cleanup_queue().
- *
* Drivers exist which depend on the release of the request_queue to be
* synchronous, it should not be deferred.
*
@@ -774,17 +776,13 @@ static void blk_release_queue(struct kobject *kobj)
blk_free_queue_stats(q->stats);
kfree(q->poll_stat);
- blk_queue_free_zone_bitmaps(q);
-
if (queue_is_mq(q))
blk_mq_release(q);
- bioset_exit(&q->bio_split);
-
if (blk_queue_has_srcu(q))
cleanup_srcu_struct(q->srcu);
- ida_simple_remove(&blk_queue_ida, q->id);
+ ida_free(&blk_queue_ida, q->id);
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
@@ -793,7 +791,13 @@ static const struct sysfs_ops queue_sysfs_ops = {
.store = queue_attr_store,
};
+static const struct attribute_group *blk_queue_attr_groups[] = {
+ &queue_attr_group,
+ NULL
+};
+
struct kobj_type blk_queue_ktype = {
+ .default_groups = blk_queue_attr_groups,
.sysfs_ops = &queue_sysfs_ops,
.release = blk_release_queue,
};
@@ -804,32 +808,17 @@ struct kobj_type blk_queue_ktype = {
*/
int blk_register_queue(struct gendisk *disk)
{
- int ret;
- struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
-
- ret = blk_trace_init_sysfs(dev);
- if (ret)
- return ret;
+ int ret;
mutex_lock(&q->sysfs_dir_lock);
- ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
- if (ret < 0) {
- blk_trace_remove_sysfs(dev);
- goto unlock;
- }
-
- ret = sysfs_create_group(&q->kobj, &queue_attr_group);
- if (ret) {
- blk_trace_remove_sysfs(dev);
- kobject_del(&q->kobj);
- kobject_put(&dev->kobj);
+ ret = kobject_add(&q->kobj, &disk_to_dev(disk)->kobj, "queue");
+ if (ret < 0)
goto unlock;
- }
if (queue_is_mq(q))
- __blk_mq_register_dev(dev, q);
+ blk_mq_sysfs_register(disk);
mutex_lock(&q->sysfs_lock);
mutex_lock(&q->debugfs_mutex);
@@ -839,7 +828,7 @@ int blk_register_queue(struct gendisk *disk)
blk_mq_debugfs_register(q);
mutex_unlock(&q->debugfs_mutex);
- ret = disk_register_independent_access_ranges(disk, NULL);
+ ret = disk_register_independent_access_ranges(disk);
if (ret)
goto put_dev;
@@ -888,8 +877,6 @@ put_dev:
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
- blk_trace_remove_sysfs(dev);
- kobject_put(&dev->kobj);
return ret;
}
@@ -927,9 +914,8 @@ void blk_unregister_queue(struct gendisk *disk)
* structures that can be modified through sysfs.
*/
if (queue_is_mq(q))
- blk_mq_unregister_dev(disk_to_dev(disk), q);
+ blk_mq_sysfs_unregister(disk);
blk_crypto_sysfs_unregister(q);
- blk_trace_remove_sysfs(disk_to_dev(disk));
mutex_lock(&q->sysfs_lock);
elv_unregister_queue(q);
@@ -948,6 +934,4 @@ void blk_unregister_queue(struct gendisk *disk)
q->sched_debugfs_dir = NULL;
q->rqos_debugfs_dir = NULL;
mutex_unlock(&q->debugfs_mutex);
-
- kobject_put(&disk_to_dev(disk)->kobj);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 139b2d7a99e2..9f5fe62afff9 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2203,8 +2203,9 @@ out_unlock:
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_track_latency(struct throtl_data *td, sector_t size,
- int op, unsigned long time)
+ enum req_op op, unsigned long time)
{
+ const bool rw = op_is_write(op);
struct latency_bucket *latency;
int index;
@@ -2215,10 +2216,10 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
index = request_bucket_index(size);
- latency = get_cpu_ptr(td->latency_buckets[op]);
+ latency = get_cpu_ptr(td->latency_buckets[rw]);
latency[index].total_latency += time;
latency[index].samples++;
- put_cpu_ptr(td->latency_buckets[op]);
+ put_cpu_ptr(td->latency_buckets[rw]);
}
void blk_throtl_stat_add(struct request *rq, u64 time_ns)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 0c119be0e813..a9982000b667 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -451,7 +451,7 @@ static bool close_io(struct rq_wb *rwb)
#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO)
-static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
+static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
{
unsigned int limit;
@@ -462,7 +462,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
if (!rwb_enabled(rwb))
return UINT_MAX;
- if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD)
+ if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD)
return rwb->wb_background;
/*
@@ -473,9 +473,9 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
* the idle limit, or go to normal if we haven't had competing
* IO for a bit.
*/
- if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
+ if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
limit = rwb->rq_depth.max_depth;
- else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
+ else if ((opf & REQ_BACKGROUND) || close_io(rwb)) {
/*
* If less than 100ms since we completed unrelated IO,
* limit us to half the depth for background writeback.
@@ -490,13 +490,13 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
struct wbt_wait_data {
struct rq_wb *rwb;
enum wbt_flags wb_acct;
- unsigned long rw;
+ blk_opf_t opf;
};
static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
{
struct wbt_wait_data *data = private_data;
- return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw));
+ return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf));
}
static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
@@ -510,13 +510,13 @@ static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
* the timer to kick off queuing again.
*/
static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
- unsigned long rw)
+ blk_opf_t opf)
{
struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
struct wbt_wait_data data = {
.rwb = rwb,
.wb_acct = wb_acct,
- .rw = rw,
+ .opf = opf,
};
rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
@@ -670,7 +670,7 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
static int wbt_data_dir(const struct request *rq)
{
- const int op = req_op(rq);
+ const enum req_op op = req_op(rq);
if (op == REQ_OP_READ)
return READ;
@@ -820,6 +820,7 @@ int wbt_init(struct request_queue *q)
{
struct rq_wb *rwb;
int i;
+ int ret;
rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
if (!rwb)
@@ -846,7 +847,10 @@ int wbt_init(struct request_queue *q)
/*
* Assign rwb and add the stats callback.
*/
- rq_qos_add(q, &rwb->rqos);
+ ret = rq_qos_add(q, &rwb->rqos);
+ if (ret)
+ goto err_free;
+
blk_stat_add_callback(q, rwb->cb);
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
@@ -855,4 +859,10 @@ int wbt_init(struct request_queue *q)
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
return 0;
+
+err_free:
+ blk_stat_free_callback(rwb->cb);
+ kfree(rwb);
+ return ret;
+
}
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 38cd840d8838..a264621d4905 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -57,10 +57,10 @@ EXPORT_SYMBOL_GPL(blk_zone_cond_str);
*/
bool blk_req_needs_zone_write_lock(struct request *rq)
{
- if (!rq->q->seq_zones_wlock)
+ if (blk_rq_is_passthrough(rq))
return false;
- if (blk_rq_is_passthrough(rq))
+ if (!rq->q->disk->seq_zones_wlock)
return false;
switch (req_op(rq)) {
@@ -77,7 +77,7 @@ bool blk_req_zone_write_trylock(struct request *rq)
{
unsigned int zno = blk_rq_zone_no(rq);
- if (test_and_set_bit(zno, rq->q->seq_zones_wlock))
+ if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock))
return false;
WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
@@ -90,7 +90,7 @@ EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);
void __blk_req_zone_write_lock(struct request *rq)
{
if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
- rq->q->seq_zones_wlock)))
+ rq->q->disk->seq_zones_wlock)))
return;
WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
@@ -101,28 +101,29 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
void __blk_req_zone_write_unlock(struct request *rq)
{
rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
- if (rq->q->seq_zones_wlock)
+ if (rq->q->disk->seq_zones_wlock)
WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
- rq->q->seq_zones_wlock));
+ rq->q->disk->seq_zones_wlock));
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
/**
- * blkdev_nr_zones - Get number of zones
- * @disk: Target gendisk
+ * bdev_nr_zones - Get number of zones
+ * @bdev: Target device
*
* Return the total number of zones of a zoned block device. For a block
* device without zone capabilities, the number of zones is always 0.
*/
-unsigned int blkdev_nr_zones(struct gendisk *disk)
+unsigned int bdev_nr_zones(struct block_device *bdev)
{
- sector_t zone_sectors = blk_queue_zone_sectors(disk->queue);
+ sector_t zone_sectors = bdev_zone_sectors(bdev);
- if (!blk_queue_is_zoned(disk->queue))
+ if (!bdev_is_zoned(bdev))
return 0;
- return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
+ return (bdev_nr_sectors(bdev) + zone_sectors - 1) >>
+ ilog2(zone_sectors);
}
-EXPORT_SYMBOL_GPL(blkdev_nr_zones);
+EXPORT_SYMBOL_GPL(bdev_nr_zones);
/**
* blkdev_report_zones - Get zones information
@@ -149,8 +150,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
struct gendisk *disk = bdev->bd_disk;
sector_t capacity = get_capacity(disk);
- if (!blk_queue_is_zoned(bdev_get_queue(bdev)) ||
- WARN_ON_ONCE(!disk->fops->report_zones))
+ if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
return -EOPNOTSUPP;
if (!nr_zones || sector >= capacity)
@@ -189,27 +189,26 @@ static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
gfp_t gfp_mask)
{
- struct request_queue *q = bdev_get_queue(bdev);
- sector_t capacity = get_capacity(bdev->bd_disk);
- sector_t zone_sectors = blk_queue_zone_sectors(q);
+ struct gendisk *disk = bdev->bd_disk;
+ sector_t capacity = bdev_nr_sectors(bdev);
+ sector_t zone_sectors = bdev_zone_sectors(bdev);
unsigned long *need_reset;
struct bio *bio = NULL;
sector_t sector = 0;
int ret;
- need_reset = blk_alloc_zone_bitmap(q->node, q->nr_zones);
+ need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones);
if (!need_reset)
return -ENOMEM;
- ret = bdev->bd_disk->fops->report_zones(bdev->bd_disk, 0,
- q->nr_zones, blk_zone_need_reset_cb,
- need_reset);
+ ret = disk->fops->report_zones(disk, 0, disk->nr_zones,
+ blk_zone_need_reset_cb, need_reset);
if (ret < 0)
goto out_free_need_reset;
ret = 0;
while (sector < capacity) {
- if (!test_bit(blk_queue_zone_no(q, sector), need_reset)) {
+ if (!test_bit(disk_zone_no(disk, sector), need_reset)) {
sector += zone_sectors;
continue;
}
@@ -257,18 +256,17 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
* The operation to execute on each zone can be a zone reset, open, close
* or finish request.
*/
-int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
- sector_t sector, sector_t nr_sectors,
- gfp_t gfp_mask)
+int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
+ sector_t sector, sector_t nr_sectors, gfp_t gfp_mask)
{
struct request_queue *q = bdev_get_queue(bdev);
- sector_t zone_sectors = blk_queue_zone_sectors(q);
- sector_t capacity = get_capacity(bdev->bd_disk);
+ sector_t zone_sectors = bdev_zone_sectors(bdev);
+ sector_t capacity = bdev_nr_sectors(bdev);
sector_t end_sector = sector + nr_sectors;
struct bio *bio = NULL;
int ret = 0;
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bdev))
return -EOPNOTSUPP;
if (bdev_read_only(bdev))
@@ -350,7 +348,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
if (!q)
return -ENXIO;
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bdev))
return -ENOTTY;
if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
@@ -398,7 +396,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
void __user *argp = (void __user *)arg;
struct request_queue *q;
struct blk_zone_range zrange;
- enum req_opf op;
+ enum req_op op;
int ret;
if (!argp)
@@ -408,7 +406,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
if (!q)
return -ENXIO;
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bdev))
return -ENOTTY;
if (!(mode & FMODE_WRITE))
@@ -450,12 +448,12 @@ fail:
return ret;
}
-void blk_queue_free_zone_bitmaps(struct request_queue *q)
+void disk_free_zone_bitmaps(struct gendisk *disk)
{
- kfree(q->conv_zones_bitmap);
- q->conv_zones_bitmap = NULL;
- kfree(q->seq_zones_wlock);
- q->seq_zones_wlock = NULL;
+ kfree(disk->conv_zones_bitmap);
+ disk->conv_zones_bitmap = NULL;
+ kfree(disk->seq_zones_wlock);
+ disk->seq_zones_wlock = NULL;
}
struct blk_revalidate_zone_args {
@@ -605,15 +603,15 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
blk_mq_freeze_queue(q);
if (ret > 0) {
blk_queue_chunk_sectors(q, args.zone_sectors);
- q->nr_zones = args.nr_zones;
- swap(q->seq_zones_wlock, args.seq_zones_wlock);
- swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
+ disk->nr_zones = args.nr_zones;
+ swap(disk->seq_zones_wlock, args.seq_zones_wlock);
+ swap(disk->conv_zones_bitmap, args.conv_zones_bitmap);
if (update_driver_data)
update_driver_data(disk);
ret = 0;
} else {
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
- blk_queue_free_zone_bitmaps(q);
+ disk_free_zone_bitmaps(disk);
}
blk_mq_unfreeze_queue(q);
@@ -623,16 +621,18 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
-void blk_queue_clear_zone_settings(struct request_queue *q)
+void disk_clear_zone_settings(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
+
blk_mq_freeze_queue(q);
- blk_queue_free_zone_bitmaps(q);
+ disk_free_zone_bitmaps(disk);
blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
- q->nr_zones = 0;
- q->max_open_zones = 0;
- q->max_active_zones = 0;
+ disk->nr_zones = 0;
+ disk->max_open_zones = 0;
+ disk->max_active_zones = 0;
q->limits.chunk_sectors = 0;
q->limits.zone_write_granularity = 0;
q->limits.max_zone_append_sectors = 0;
diff --git a/block/blk.h b/block/blk.h
index 434017701403..d7142c4d2fef 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -31,11 +31,6 @@ extern struct kmem_cache *blk_requestq_srcu_cachep;
extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida;
-static inline void __blk_get_queue(struct request_queue *q)
-{
- kobject_get(&q->kobj);
-}
-
bool is_flush_rq(struct request *req);
struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
@@ -102,23 +97,23 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
return true;
}
-static inline bool __bvec_gap_to_prev(struct request_queue *q,
+static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
- return (offset & queue_virt_boundary(q)) ||
- ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+ return (offset & lim->virt_boundary_mask) ||
+ ((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask);
}
/*
* Check if adding a bio_vec after bprv with offset would create a gap in
* the SG list. Most drivers don't care about this, but some do.
*/
-static inline bool bvec_gap_to_prev(struct request_queue *q,
+static inline bool bvec_gap_to_prev(struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
- if (!queue_virt_boundary(q))
+ if (!lim->virt_boundary_mask)
return false;
- return __bvec_gap_to_prev(q, bprv, offset);
+ return __bvec_gap_to_prev(lim, bprv, offset);
}
static inline bool rq_mergeable(struct request *rq)
@@ -159,6 +154,19 @@ static inline bool blk_discard_mergable(struct request *req)
return false;
}
+static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
+ enum req_op op)
+{
+ if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
+ return min(q->limits.max_discard_sectors,
+ UINT_MAX >> SECTOR_SHIFT);
+
+ if (unlikely(op == REQ_OP_WRITE_ZEROES))
+ return q->limits.max_write_zeroes_sectors;
+
+ return q->limits.max_sectors;
+}
+
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
@@ -181,7 +189,8 @@ static inline bool integrity_req_gap_back_merge(struct request *req,
struct bio_integrity_payload *bip = bio_integrity(req->bio);
struct bio_integrity_payload *bip_next = bio_integrity(next);
- return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+ return bvec_gap_to_prev(&req->q->limits,
+ &bip->bip_vec[bip->bip_vcnt - 1],
bip_next->bip_vec[0].bv_offset);
}
@@ -191,7 +200,8 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_integrity_payload *bip_next = bio_integrity(req->bio);
- return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+ return bvec_gap_to_prev(&req->q->limits,
+ &bip->bip_vec[bip->bip_vcnt - 1],
bip_next->bip_vec[0].bv_offset);
}
@@ -280,7 +290,8 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
-static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
+static inline bool bio_may_exceed_limits(struct bio *bio,
+ struct queue_limits *lim)
{
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
@@ -299,12 +310,12 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
* to the performance impact of cloned bios themselves the loop below
* doesn't matter anyway.
*/
- return q->limits.chunk_sectors || bio->bi_vcnt != 1 ||
+ return lim->chunk_sectors || bio->bi_vcnt != 1 ||
bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
- unsigned int *nr_segs);
+struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
+ unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -337,16 +348,6 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
}
/*
- * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size
- * is defined as 'unsigned int', meantime it has to aligned to with logical
- * block size which is the minimum accepted unit by hardware.
- */
-static inline unsigned int bio_allowed_max_sectors(struct request_queue *q)
-{
- return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9;
-}
-
-/*
* Internal io_context interface
*/
struct io_cq *ioc_find_get_icq(struct request_queue *q);
@@ -370,7 +371,7 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { }
static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
#endif
-void __blk_queue_bounce(struct request_queue *q, struct bio **bio);
+struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
static inline bool blk_queue_may_bounce(struct request_queue *q)
{
@@ -379,10 +380,12 @@ static inline bool blk_queue_may_bounce(struct request_queue *q)
max_low_pfn >= max_pfn;
}
-static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
+static inline struct bio *blk_queue_bounce(struct bio *bio,
+ struct request_queue *q)
{
- if (unlikely(blk_queue_may_bounce(q) && bio_has_data(*bio)))
- __blk_queue_bounce(q, bio);
+ if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio)))
+ return __blk_queue_bounce(bio, q);
+ return bio;
}
#ifdef CONFIG_BLK_CGROUP_IOLATENCY
@@ -392,11 +395,11 @@ static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
#endif
#ifdef CONFIG_BLK_DEV_ZONED
-void blk_queue_free_zone_bitmaps(struct request_queue *q);
-void blk_queue_clear_zone_settings(struct request_queue *q);
+void disk_free_zone_bitmaps(struct gendisk *disk);
+void disk_clear_zone_settings(struct gendisk *disk);
#else
-static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {}
-static inline void blk_queue_clear_zone_settings(struct request_queue *q) {}
+static inline void disk_free_zone_bitmaps(struct gendisk *disk) {}
+static inline void disk_clear_zone_settings(struct gendisk *disk) {}
#endif
int blk_alloc_ext_minor(void);
@@ -411,6 +414,9 @@ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
sector_t length);
void blk_drop_partitions(struct gendisk *disk);
+struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
+ struct lock_class_key *lkclass);
+
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
@@ -436,13 +442,14 @@ extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs;
+extern struct attribute_group blk_trace_attr_group;
+
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
extern const struct address_space_operations def_blk_aops;
-int disk_register_independent_access_ranges(struct gendisk *disk,
- struct blk_independent_access_ranges *new_iars);
+int disk_register_independent_access_ranges(struct gendisk *disk);
void disk_unregister_independent_access_ranges(struct gendisk *disk);
#ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/block/bounce.c b/block/bounce.c
index 8f7b6fe3b4db..7cfcb242f9a1 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -199,32 +199,39 @@ err_put:
return NULL;
}
-void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
+struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q)
{
struct bio *bio;
- int rw = bio_data_dir(*bio_orig);
+ int rw = bio_data_dir(bio_orig);
struct bio_vec *to, from;
struct bvec_iter iter;
- unsigned i = 0;
+ unsigned i = 0, bytes = 0;
bool bounce = false;
- int sectors = 0;
+ int sectors;
- bio_for_each_segment(from, *bio_orig, iter) {
+ bio_for_each_segment(from, bio_orig, iter) {
if (i++ < BIO_MAX_VECS)
- sectors += from.bv_len >> 9;
+ bytes += from.bv_len;
if (PageHighMem(from.bv_page))
bounce = true;
}
if (!bounce)
- return;
+ return bio_orig;
- if (sectors < bio_sectors(*bio_orig)) {
- bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
- bio_chain(bio, *bio_orig);
- submit_bio_noacct(*bio_orig);
- *bio_orig = bio;
+ /*
+ * Individual bvecs might not be logical block aligned. Round down
+ * the split size so that each bio is properly block size aligned,
+ * even if we do not use the full hardware limits.
+ */
+ sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
+ SECTOR_SHIFT;
+ if (sectors < bio_sectors(bio_orig)) {
+ bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
+ bio_chain(bio, bio_orig);
+ submit_bio_noacct(bio_orig);
+ bio_orig = bio;
}
- bio = bounce_clone_bio(*bio_orig);
+ bio = bounce_clone_bio(bio_orig);
/*
* Bvec table can't be updated by bio_for_each_segment_all(),
@@ -247,7 +254,7 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
to->bv_page = bounce_page;
}
- trace_block_bio_bounce(*bio_orig);
+ trace_block_bio_bounce(bio_orig);
bio->bi_flags |= (1 << BIO_BOUNCED);
@@ -256,6 +263,6 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
else
bio->bi_end_io = bounce_end_io_write;
- bio->bi_private = *bio_orig;
- *bio_orig = bio;
+ bio->bi_private = bio_orig;
+ return bio;
}
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index acfe1357bf6c..d6f5dcdce748 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -324,14 +324,14 @@ void bsg_remove_queue(struct request_queue *q)
container_of(q->tag_set, struct bsg_set, tag_set);
bsg_unregister_queue(bset->bd);
- blk_cleanup_queue(q);
+ blk_mq_destroy_queue(q);
blk_mq_free_tag_set(&bset->tag_set);
kfree(bset);
}
}
EXPORT_SYMBOL_GPL(bsg_remove_queue);
-static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved)
+static enum blk_eh_timer_return bsg_timeout(struct request *rq)
{
struct bsg_set *bset =
container_of(rq->q->tag_set, struct bsg_set, tag_set);
@@ -399,7 +399,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
return q;
out_cleanup_queue:
- blk_cleanup_queue(q);
+ blk_mq_destroy_queue(q);
out_queue:
blk_mq_free_tag_set(set);
out_tag_set:
diff --git a/block/bsg.c b/block/bsg.c
index 882f56bff14f..2ab1351eb082 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -169,7 +169,7 @@ static void bsg_device_release(struct device *dev)
{
struct bsg_device *bd = container_of(dev, struct bsg_device, device);
- ida_simple_remove(&bsg_minor_ida, MINOR(bd->device.devt));
+ ida_free(&bsg_minor_ida, MINOR(bd->device.devt));
kfree(bd);
}
@@ -196,7 +196,7 @@ struct bsg_device *bsg_register_queue(struct request_queue *q,
bd->queue = q;
bd->sg_io_fn = sg_io_fn;
- ret = ida_simple_get(&bsg_minor_ida, 0, BSG_MAX_DEVS, GFP_KERNEL);
+ ret = ida_alloc_max(&bsg_minor_ida, BSG_MAX_DEVS - 1, GFP_KERNEL);
if (ret < 0) {
if (ret == -ENOSPC)
dev_err(parent, "bsg: too many bsg devices\n");
diff --git a/block/elevator.h b/block/elevator.h
index 16cd8bdedb7e..3f0593b3bf9d 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -34,7 +34,7 @@ struct elevator_mq_ops {
int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
void (*request_merged)(struct request_queue *, struct request *, enum elv_merge);
void (*requests_merged)(struct request_queue *, struct request *, struct request *);
- void (*limit_depth)(unsigned int, struct blk_mq_alloc_data *);
+ void (*limit_depth)(blk_opf_t, struct blk_mq_alloc_data *);
void (*prepare_request)(struct request *);
void (*finish_request)(struct request *);
void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
diff --git a/block/fops.c b/block/fops.c
index d6b3276a6c68..b90742595317 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -32,14 +32,21 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock,
return 0;
}
-static unsigned int dio_bio_write_op(struct kiocb *iocb)
+static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
{
- unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+ blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
/* avoid the need for a I/O completion work item */
- if (iocb->ki_flags & IOCB_DSYNC)
- op |= REQ_FUA;
- return op;
+ if (iocb_is_dsync(iocb))
+ opf |= REQ_FUA;
+ return opf;
+}
+
+static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
+ struct iov_iter *iter)
+{
+ return pos & (bdev_logical_block_size(bdev) - 1) ||
+ !bdev_iter_is_aligned(bdev, iter);
}
#define DIO_INLINE_BIO_VECS 4
@@ -54,8 +61,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct bio bio;
ssize_t ret;
- if ((pos | iov_iter_alignment(iter)) &
- (bdev_logical_block_size(bdev) - 1))
+ if (blkdev_dio_unaligned(bdev, pos, iter))
return -EINVAL;
if (nr_pages <= DIO_INLINE_BIO_VECS)
@@ -69,7 +75,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
if (iov_iter_rw(iter) == READ) {
bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ);
- if (iter_is_iovec(iter))
+ if (user_backed_iter(iter))
should_dirty = true;
} else {
bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
@@ -169,12 +175,11 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
struct blkdev_dio *dio;
struct bio *bio;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
- unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
+ blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
loff_t pos = iocb->ki_pos;
int ret = 0;
- if ((pos | iov_iter_alignment(iter)) &
- (bdev_logical_block_size(bdev) - 1))
+ if (blkdev_dio_unaligned(bdev, pos, iter))
return -EINVAL;
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
@@ -199,7 +204,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
dio->size = 0;
- if (is_read && iter_is_iovec(iter))
+ if (is_read && user_backed_iter(iter))
dio->flags |= DIO_SHOULD_DIRTY;
blk_start_plug(&plug);
@@ -292,14 +297,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
{
struct block_device *bdev = iocb->ki_filp->private_data;
bool is_read = iov_iter_rw(iter) == READ;
- unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
+ blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
struct bio *bio;
loff_t pos = iocb->ki_pos;
int ret = 0;
- if ((pos | iov_iter_alignment(iter)) &
- (bdev_logical_block_size(bdev) - 1))
+ if (blkdev_dio_unaligned(bdev, pos, iter))
return -EINVAL;
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
@@ -331,7 +335,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
dio->size = bio->bi_iter.bi_size;
if (is_read) {
- if (iter_is_iovec(iter)) {
+ if (user_backed_iter(iter)) {
dio->flags |= DIO_SHOULD_DIRTY;
bio_set_pages_dirty(bio);
}
@@ -417,7 +421,7 @@ const struct address_space_operations def_blk_aops = {
.write_end = blkdev_write_end,
.writepages = blkdev_writepages,
.direct_IO = blkdev_direct_IO,
- .migratepage = buffer_migrate_page_norefs,
+ .migrate_folio = buffer_migrate_folio_norefs,
.is_dirty_writeback = buffer_check_dirty_writeback,
};
diff --git a/block/genhd.c b/block/genhd.c
index 278227ba1d53..d36fabf0abc1 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -101,29 +101,6 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
}
EXPORT_SYMBOL_GPL(set_capacity_and_notify);
-/*
- * Format the device name of the indicated block device into the supplied buffer
- * and return a pointer to that same buffer for convenience.
- *
- * Note: do not use this in new code, use the %pg specifier to sprintf and
- * printk insted.
- */
-const char *bdevname(struct block_device *bdev, char *buf)
-{
- struct gendisk *hd = bdev->bd_disk;
- int partno = bdev->bd_partno;
-
- if (!partno)
- snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
- else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
- snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
- else
- snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
-
- return buf;
-}
-EXPORT_SYMBOL(bdevname);
-
static void part_stat_read_all(struct block_device *part,
struct disk_stats *stat)
{
@@ -617,6 +594,8 @@ void del_gendisk(struct gendisk *disk)
* Fail any new I/O.
*/
set_bit(GD_DEAD, &disk->state);
+ if (test_bit(GD_OWNS_QUEUE, &disk->state))
+ blk_queue_flag_set(QUEUE_FLAG_DYING, q);
set_capacity(disk, 0);
/*
@@ -663,11 +642,16 @@ void del_gendisk(struct gendisk *disk)
blk_mq_unquiesce_queue(q);
/*
- * Allow using passthrough request again after the queue is torn down.
+ * If the disk does not own the queue, allow using passthrough requests
+ * again. Else leave the queue frozen to fail all I/O.
*/
- blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
- __blk_mq_unfreeze_queue(q, true);
-
+ if (!test_bit(GD_OWNS_QUEUE, &disk->state)) {
+ blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
+ __blk_mq_unfreeze_queue(q, true);
+ } else {
+ if (queue_is_mq(q))
+ blk_mq_exit_queue(q);
+ }
}
EXPORT_SYMBOL(del_gendisk);
@@ -1127,6 +1111,9 @@ static struct attribute_group disk_attr_group = {
static const struct attribute_group *disk_attr_groups[] = {
&disk_attr_group,
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+ &blk_trace_attr_group,
+#endif
NULL
};
@@ -1151,10 +1138,24 @@ static void disk_release(struct device *dev)
might_sleep();
WARN_ON_ONCE(disk_live(disk));
+ /*
+ * To undo all the initialization from blk_mq_init_allocated_queue in
+ * case of a probe failure where add_disk is never called we have to
+ * call blk_mq_exit_queue here. We can't do this for the more common
+ * teardown case (yet) as the tagset can be gone by the time the disk
+ * is released once it was added.
+ */
+ if (queue_is_mq(disk->queue) &&
+ test_bit(GD_OWNS_QUEUE, &disk->state) &&
+ !test_bit(GD_ADDED, &disk->state))
+ blk_mq_exit_queue(disk->queue);
+
blkcg_exit_queue(disk->queue);
+ bioset_exit(&disk->bio_split);
disk_release_events(disk);
kfree(disk->random);
+ disk_free_zone_bitmaps(disk);
xa_destroy(&disk->part_tbl);
disk->queue->disk = NULL;
@@ -1338,16 +1339,16 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
{
struct gendisk *disk;
- if (!blk_get_queue(q))
- return NULL;
-
disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
if (!disk)
- goto out_put_queue;
+ return NULL;
+
+ if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0))
+ goto out_free_disk;
disk->bdi = bdi_alloc(node_id);
if (!disk->bdi)
- goto out_free_disk;
+ goto out_free_bioset;
/* bdev_alloc() might need the queue, set before the first call */
disk->queue = q;
@@ -1385,13 +1386,12 @@ out_destroy_part_tbl:
iput(disk->part0->bd_inode);
out_free_bdi:
bdi_put(disk->bdi);
+out_free_bioset:
+ bioset_exit(&disk->bio_split);
out_free_disk:
kfree(disk);
-out_put_queue:
- blk_put_queue(q);
return NULL;
}
-EXPORT_SYMBOL(__alloc_disk_node);
struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
{
@@ -1404,9 +1404,10 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
disk = __alloc_disk_node(q, node, lkclass);
if (!disk) {
- blk_cleanup_queue(q);
+ blk_put_queue(q);
return NULL;
}
+ set_bit(GD_OWNS_QUEUE, &disk->state);
return disk;
}
EXPORT_SYMBOL(__blk_alloc_disk);
@@ -1418,6 +1419,9 @@ EXPORT_SYMBOL(__blk_alloc_disk);
* This decrements the refcount for the struct gendisk. When this reaches 0
* we'll have disk_release() called.
*
+ * Note: for blk-mq disks, put_disk() must be called before freeing the tag_set
+ * when handling probe errors (that is before add_disk() is called).
+ *
* Context: Any context, but the last reference must not be dropped from
* atomic context.
*/
@@ -1428,22 +1432,6 @@ void put_disk(struct gendisk *disk)
}
EXPORT_SYMBOL(put_disk);
-/**
- * blk_cleanup_disk - shutdown a gendisk allocated by blk_alloc_disk
- * @disk: gendisk to shutdown
- *
- * Mark the queue hanging off @disk DYING, drain all pending requests, then mark
- * the queue DEAD, destroy and put it and the gendisk structure.
- *
- * Context: can sleep
- */
-void blk_cleanup_disk(struct gendisk *disk)
-{
- blk_cleanup_queue(disk->queue);
- put_disk(disk);
-}
-EXPORT_SYMBOL(blk_cleanup_disk);
-
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
char event[] = "DISK_RO=1";
diff --git a/block/ioctl.c b/block/ioctl.c
index 46949f1b0dba..60121e89052b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -495,7 +495,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
case BLKGETZONESZ:
return put_uint(argp, bdev_zone_sectors(bdev));
case BLKGETNRZONES:
- return put_uint(argp, blkdev_nr_zones(bdev->bd_disk));
+ return put_uint(argp, bdev_nr_zones(bdev));
case BLKROGET:
return put_int(argp, bdev_read_only(bdev) != 0);
case BLKSSZGET: /* get block device logical block size */
diff --git a/block/ioprio.c b/block/ioprio.c
index 2fe068fcaad5..32a456b45804 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -138,6 +138,32 @@ out:
return ret;
}
+/*
+ * If the task has set an I/O priority, use that. Otherwise, return
+ * the default I/O priority.
+ *
+ * Expected to be called for current task or with task_lock() held to keep
+ * io_context stable.
+ */
+int __get_task_ioprio(struct task_struct *p)
+{
+ struct io_context *ioc = p->io_context;
+ int prio;
+
+ if (p != current)
+ lockdep_assert_held(&p->alloc_lock);
+ if (ioc)
+ prio = ioc->ioprio;
+ else
+ prio = IOPRIO_DEFAULT;
+
+ if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
+ prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
+ task_nice_ioprio(p));
+ return prio;
+}
+EXPORT_SYMBOL_GPL(__get_task_ioprio);
+
static int get_task_ioprio(struct task_struct *p)
{
int ret;
@@ -145,22 +171,38 @@ static int get_task_ioprio(struct task_struct *p)
ret = security_task_getioprio(p);
if (ret)
goto out;
- ret = IOPRIO_DEFAULT;
+ task_lock(p);
+ ret = __get_task_ioprio(p);
+ task_unlock(p);
+out:
+ return ret;
+}
+
+/*
+ * Return raw IO priority value as set by userspace. We use this for
+ * ioprio_get(pid, IOPRIO_WHO_PROCESS) so that we keep historical behavior and
+ * also so that userspace can distinguish unset IO priority (which just gets
+ * overridden based on task's nice value) from IO priority set to some value.
+ */
+static int get_task_raw_ioprio(struct task_struct *p)
+{
+ int ret;
+
+ ret = security_task_getioprio(p);
+ if (ret)
+ goto out;
task_lock(p);
if (p->io_context)
ret = p->io_context->ioprio;
+ else
+ ret = IOPRIO_DEFAULT;
task_unlock(p);
out:
return ret;
}
-int ioprio_best(unsigned short aprio, unsigned short bprio)
+static int ioprio_best(unsigned short aprio, unsigned short bprio)
{
- if (!ioprio_valid(aprio))
- aprio = IOPRIO_DEFAULT;
- if (!ioprio_valid(bprio))
- bprio = IOPRIO_DEFAULT;
-
return min(aprio, bprio);
}
@@ -181,7 +223,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
else
p = find_task_by_vpid(who);
if (p)
- ret = get_task_ioprio(p);
+ ret = get_task_raw_ioprio(p);
break;
case IOPRIO_WHO_PGRP:
if (!who)
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 8f7c745b4a57..b05357bced99 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -195,9 +195,9 @@ struct kyber_hctx_data {
static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
void *key);
-static unsigned int kyber_sched_domain(unsigned int op)
+static unsigned int kyber_sched_domain(blk_opf_t opf)
{
- switch (op & REQ_OP_MASK) {
+ switch (opf & REQ_OP_MASK) {
case REQ_OP_READ:
return KYBER_READ;
case REQ_OP_WRITE:
@@ -553,13 +553,13 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd,
}
}
-static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
/*
* We use the scheduler tags as per-hardware queue queueing tokens.
* Async requests can be limited at this stage.
*/
- if (!op_is_sync(op)) {
+ if (!op_is_sync(opf)) {
struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
data->shallow_depth = kqd->async_depth;
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 1a9e835e816c..5639921dfa92 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -543,12 +543,12 @@ unlock:
* Called by __blk_mq_alloc_request(). The shallow_depth value set by this
* function is used by __blk_mq_get_tag().
*/
-static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
struct deadline_data *dd = data->q->elevator->elevator_data;
/* Do not throttle synchronous reads. */
- if (op_is_sync(op) && !op_is_write(op))
+ if (op_is_sync(opf) && !op_is_write(opf))
return;
/*
diff --git a/block/partitions/check.h b/block/partitions/check.h
index 4ffa2359b1a3..8d70a880c372 100644
--- a/block/partitions/check.h
+++ b/block/partitions/check.h
@@ -24,13 +24,13 @@ struct parsed_partitions {
};
typedef struct {
- struct page *v;
+ struct folio *v;
} Sector;
void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p);
static inline void put_dev_sector(Sector p)
{
- put_page(p.v);
+ folio_put(p.v);
}
static inline void
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 8a0ec929023b..fc1d70384825 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -9,7 +9,6 @@
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/vmalloc.h>
-#include <linux/blktrace_api.h>
#include <linux/raid/detect.h>
#include "check.h"
@@ -331,7 +330,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
case BLK_ZONED_HA:
pr_info("%s: disabling host aware zoned block device support due to partitions\n",
disk->disk_name);
- blk_queue_set_zoned(disk, BLK_ZONED_NONE);
+ disk_set_zoned(disk, BLK_ZONED_NONE);
break;
case BLK_ZONED_NONE:
break;
@@ -705,25 +704,19 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed);
void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p)
{
struct address_space *mapping = state->disk->part0->bd_inode->i_mapping;
- struct page *page;
+ struct folio *folio;
if (n >= get_capacity(state->disk)) {
state->access_beyond_eod = true;
- return NULL;
+ goto out;
}
- page = read_mapping_page(mapping,
- (pgoff_t)(n >> (PAGE_SHIFT - 9)), NULL);
- if (IS_ERR(page))
+ folio = read_mapping_folio(mapping, n >> PAGE_SECTORS_SHIFT, NULL);
+ if (IS_ERR(folio))
goto out;
- if (PageError(page))
- goto out_put_page;
-
- p->v = page;
- return (unsigned char *)page_address(page) +
- ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << SECTOR_SHIFT);
-out_put_page:
- put_page(page);
+
+ p->v = folio;
+ return folio_address(folio) + offset_in_folio(folio, n * SECTOR_SIZE);
out:
p->v = NULL;
return NULL;