aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds2018-06-04 07:58:06 -0700
committerLinus Torvalds2018-06-04 07:58:06 -0700
commitf459c34538f57661e0fd1d3eaf7c0b17125ae011 (patch)
tree3addc82d7f792c4533501978798dad0095293933
parent29dcea88779c856c7dc92040a0c01233263101d4 (diff)
parent32a50fabb334b2f0725de84bf248bd8c24c22b05 (diff)
Merge tag 'for-4.18/block-20180603' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe: - clean up how we pass around gfp_t and blk_mq_req_flags_t (Christoph) - prepare us to defer scheduler attach (Christoph) - clean up drivers handling of bounce buffers (Christoph) - fix timeout handling corner cases (Christoph/Bart/Keith) - bcache fixes (Coly) - prep work for bcachefs and some block layer optimizations (Kent). - convert users of bio_sets to using embedded structs (Kent). - fixes for the BFQ io scheduler (Paolo/Davide/Filippo) - lightnvm fixes and improvements (Matias, with contributions from Hans and Javier) - adding discard throttling to blk-wbt (me) - sbitmap blk-mq-tag handling (me/Omar/Ming). - remove the sparc jsflash block driver, acked by DaveM. - Kyber scheduler improvement from Jianchao, making it more friendly wrt merging. - conversion of symbolic proc permissions to octal, from Joe Perches. Previously the block parts were a mix of both. - nbd fixes (Josef and Kevin Vigor) - unify how we handle the various kinds of timestamps that the block core and utility code uses (Omar) - three NVMe pull requests from Keith and Christoph, bringing AEN to feature completeness, file backed namespaces, cq/sq lock split, and various fixes - various little fixes and improvements all over the map * tag 'for-4.18/block-20180603' of git://git.kernel.dk/linux-block: (196 commits) blk-mq: update nr_requests when switching to 'none' scheduler block: don't use blocking queue entered for recursive bio submits dm-crypt: fix warning in shutdown path lightnvm: pblk: take bitmap alloc. out of critical section lightnvm: pblk: kick writer on new flush points lightnvm: pblk: only try to recover lines with written smeta lightnvm: pblk: remove unnecessary bio_get/put lightnvm: pblk: add possibility to set write buffer size manually lightnvm: fix partial read error path lightnvm: proper error handling for pblk_bio_add_pages lightnvm: pblk: fix smeta write error path lightnvm: pblk: garbage collect lines with failed writes lightnvm: pblk: rework write error recovery path lightnvm: pblk: remove dead function lightnvm: pass flag on graceful teardown to targets lightnvm: pblk: check for chunk size before allocating it lightnvm: pblk: remove unnecessary argument lightnvm: pblk: remove unnecessary indirection lightnvm: pblk: return NVM_ error on failed submission lightnvm: pblk: warn in case of corrupted write buffer ...
-rw-r--r--Documentation/block/null_blk.txt9
-rw-r--r--Documentation/scsi/scsi_eh.txt15
-rw-r--r--MAINTAINERS2
-rw-r--r--arch/sparc/include/uapi/asm/jsflash.h40
-rw-r--r--block/bfq-cgroup.c40
-rw-r--r--block/bfq-iosched.c478
-rw-r--r--block/bfq-iosched.h30
-rw-r--r--block/bio-integrity.c29
-rw-r--r--block/bio.c189
-rw-r--r--block/blk-core.c120
-rw-r--r--block/blk-integrity.c12
-rw-r--r--block/blk-lib.c12
-rw-r--r--block/blk-merge.c29
-rw-r--r--block/blk-mq-debugfs.c1
-rw-r--r--block/blk-mq-sched.c46
-rw-r--r--block/blk-mq-sched.h2
-rw-r--r--block/blk-mq-sysfs.c6
-rw-r--r--block/blk-mq-tag.c14
-rw-r--r--block/blk-mq.c340
-rw-r--r--block/blk-mq.h42
-rw-r--r--block/blk-stat.c10
-rw-r--r--block/blk-stat.h45
-rw-r--r--block/blk-sysfs.c80
-rw-r--r--block/blk-throttle.c35
-rw-r--r--block/blk-timeout.c6
-rw-r--r--block/blk-wbt.c129
-rw-r--r--block/blk-wbt.h55
-rw-r--r--block/blk-zoned.c8
-rw-r--r--block/blk.h5
-rw-r--r--block/bounce.c52
-rw-r--r--block/bsg-lib.c6
-rw-r--r--block/bsg.c44
-rw-r--r--block/cfq-iosched.c66
-rw-r--r--block/deadline-iosched.c3
-rw-r--r--block/elevator.c101
-rw-r--r--block/genhd.c37
-rw-r--r--block/kyber-iosched.c199
-rw-r--r--block/mq-deadline.c3
-rw-r--r--block/partition-generic.c26
-rw-r--r--block/scsi_ioctl.c10
-rw-r--r--drivers/ata/libata-eh.c51
-rw-r--r--drivers/block/DAC960.c11
-rw-r--r--drivers/block/DAC960.h1
-rw-r--r--drivers/block/aoe/aoeblk.c11
-rw-r--r--drivers/block/aoe/aoecmd.c3
-rw-r--r--drivers/block/brd.c10
-rw-r--r--drivers/block/drbd/drbd_bitmap.c5
-rw-r--r--drivers/block/drbd/drbd_debugfs.c20
-rw-r--r--drivers/block/drbd/drbd_int.h10
-rw-r--r--drivers/block/drbd/drbd_main.c73
-rw-r--r--drivers/block/drbd/drbd_receiver.c6
-rw-r--r--drivers/block/drbd/drbd_req.c4
-rw-r--r--drivers/block/drbd/drbd_req.h2
-rw-r--r--drivers/block/floppy.c2
-rw-r--r--drivers/block/loop.c17
-rw-r--r--drivers/block/loop.h1
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c29
-rw-r--r--drivers/block/nbd.c77
-rw-r--r--drivers/block/null_blk.c36
-rw-r--r--drivers/block/paride/pd.c2
-rw-r--r--drivers/block/pktcdvd.c60
-rw-r--r--drivers/block/ps3disk.c2
-rw-r--r--drivers/block/rbd.c44
-rw-r--r--drivers/block/rsxx/core.c6
-rw-r--r--drivers/block/sx8.c2
-rw-r--r--drivers/block/virtio_blk.c8
-rw-r--r--drivers/block/xen-blkback/blkback.c2
-rw-r--r--drivers/block/xen-blkback/xenbus.c4
-rw-r--r--drivers/block/xen-blkfront.c7
-rw-r--r--drivers/cdrom/cdrom.c2
-rw-r--r--drivers/ide/ide-atapi.c2
-rw-r--r--drivers/ide/ide-cd.c2
-rw-r--r--drivers/ide/ide-cd_ioctl.c2
-rw-r--r--drivers/ide/ide-devsets.c2
-rw-r--r--drivers/ide/ide-disk.c2
-rw-r--r--drivers/ide/ide-ioctls.c4
-rw-r--r--drivers/ide/ide-park.c4
-rw-r--r--drivers/ide/ide-pm.c5
-rw-r--r--drivers/ide/ide-tape.c4
-rw-r--r--drivers/ide/ide-taskfile.c4
-rw-r--r--drivers/lightnvm/core.c10
-rw-r--r--drivers/lightnvm/pblk-cache.c10
-rw-r--r--drivers/lightnvm/pblk-core.c233
-rw-r--r--drivers/lightnvm/pblk-gc.c112
-rw-r--r--drivers/lightnvm/pblk-init.c172
-rw-r--r--drivers/lightnvm/pblk-map.c33
-rw-r--r--drivers/lightnvm/pblk-rb.c48
-rw-r--r--drivers/lightnvm/pblk-read.c146
-rw-r--r--drivers/lightnvm/pblk-recovery.c121
-rw-r--r--drivers/lightnvm/pblk-rl.c29
-rw-r--r--drivers/lightnvm/pblk-sysfs.c15
-rw-r--r--drivers/lightnvm/pblk-write.c269
-rw-r--r--drivers/lightnvm/pblk.h58
-rw-r--r--drivers/md/bcache/bcache.h14
-rw-r--r--drivers/md/bcache/bset.c13
-rw-r--r--drivers/md/bcache/bset.h2
-rw-r--r--drivers/md/bcache/btree.c4
-rw-r--r--drivers/md/bcache/io.c4
-rw-r--r--drivers/md/bcache/request.c18
-rw-r--r--drivers/md/bcache/super.c109
-rw-r--r--drivers/md/bcache/sysfs.c51
-rw-r--r--drivers/md/bcache/util.c35
-rw-r--r--drivers/md/bcache/util.h5
-rw-r--r--drivers/md/dm-bio-prison-v1.c13
-rw-r--r--drivers/md/dm-bio-prison-v2.c13
-rw-r--r--drivers/md/dm-cache-target.c25
-rw-r--r--drivers/md/dm-core.h4
-rw-r--r--drivers/md/dm-crypt.c59
-rw-r--r--drivers/md/dm-integrity.c15
-rw-r--r--drivers/md/dm-io.c29
-rw-r--r--drivers/md/dm-kcopyd.c22
-rw-r--r--drivers/md/dm-log-userspace-base.c19
-rw-r--r--drivers/md/dm-mpath.c3
-rw-r--r--drivers/md/dm-region-hash.c23
-rw-r--r--drivers/md/dm-rq.c4
-rw-r--r--drivers/md/dm-snap.c17
-rw-r--r--drivers/md/dm-thin.c32
-rw-r--r--drivers/md/dm-verity-fec.c55
-rw-r--r--drivers/md/dm-verity-fec.h8
-rw-r--r--drivers/md/dm-zoned-target.c13
-rw-r--r--drivers/md/dm.c55
-rw-r--r--drivers/md/md-faulty.c2
-rw-r--r--drivers/md/md-linear.c2
-rw-r--r--drivers/md/md-multipath.c17
-rw-r--r--drivers/md/md-multipath.h2
-rw-r--r--drivers/md/md.c61
-rw-r--r--drivers/md/md.h4
-rw-r--r--drivers/md/raid0.c5
-rw-r--r--drivers/md/raid1.c76
-rw-r--r--drivers/md/raid1.h6
-rw-r--r--drivers/md/raid10.c60
-rw-r--r--drivers/md/raid10.h6
-rw-r--r--drivers/md/raid5-cache.c43
-rw-r--r--drivers/md/raid5-ppl.c42
-rw-r--r--drivers/md/raid5.c12
-rw-r--r--drivers/md/raid5.h2
-rw-r--r--drivers/memstick/core/ms_block.c6
-rw-r--r--drivers/memstick/core/mspro_block.c6
-rw-r--r--drivers/message/fusion/mptsas.c2
-rw-r--r--drivers/mmc/core/block.c12
-rw-r--r--drivers/mmc/core/queue.c5
-rw-r--r--drivers/mtd/mtd_blkdevs.c20
-rw-r--r--drivers/nvme/host/core.c157
-rw-r--r--drivers/nvme/host/fabrics.c100
-rw-r--r--drivers/nvme/host/fabrics.h4
-rw-r--r--drivers/nvme/host/fc.c15
-rw-r--r--drivers/nvme/host/nvme.h9
-rw-r--r--drivers/nvme/host/pci.c258
-rw-r--r--drivers/nvme/host/rdma.c12
-rw-r--r--drivers/nvme/host/trace.h4
-rw-r--r--drivers/nvme/target/Makefile4
-rw-r--r--drivers/nvme/target/admin-cmd.c143
-rw-r--r--drivers/nvme/target/core.c128
-rw-r--r--drivers/nvme/target/discovery.c2
-rw-r--r--drivers/nvme/target/fabrics-cmd.c4
-rw-r--r--drivers/nvme/target/fc.c2
-rw-r--r--drivers/nvme/target/io-cmd-bdev.c (renamed from drivers/nvme/target/io-cmd.c)77
-rw-r--r--drivers/nvme/target/io-cmd-file.c304
-rw-r--r--drivers/nvme/target/loop.c52
-rw-r--r--drivers/nvme/target/nvmet.h51
-rw-r--r--drivers/s390/block/dasd.c6
-rw-r--r--drivers/sbus/char/Kconfig7
-rw-r--r--drivers/sbus/char/Makefile1
-rw-r--r--drivers/sbus/char/jsflash.c658
-rw-r--r--drivers/scsi/gdth.c2
-rw-r--r--drivers/scsi/libiscsi.c6
-rw-r--r--drivers/scsi/megaraid/megaraid_sas_base.c2
-rw-r--r--drivers/scsi/mvumi.c2
-rw-r--r--drivers/scsi/osd/osd_initiator.c24
-rw-r--r--drivers/scsi/osst.c2
-rw-r--r--drivers/scsi/qla4xxx/ql4_os.c2
-rw-r--r--drivers/scsi/scsi_error.c10
-rw-r--r--drivers/scsi/scsi_lib.c4
-rw-r--r--drivers/scsi/scsi_transport_fc.c16
-rw-r--r--drivers/scsi/scsi_transport_iscsi.c2
-rw-r--r--drivers/scsi/scsi_transport_sas.c19
-rw-r--r--drivers/scsi/scsi_transport_srp.c4
-rw-r--r--drivers/scsi/sg.c2
-rw-r--r--drivers/scsi/st.c2
-rw-r--r--drivers/scsi/ufs/ufshcd.c6
-rw-r--r--drivers/target/target_core_iblock.c16
-rw-r--r--drivers/target/target_core_iblock.h2
-rw-r--r--drivers/target/target_core_pscsi.c3
-rw-r--r--fs/block_dev.c24
-rw-r--r--fs/btrfs/extent_io.c25
-rw-r--r--fs/direct-io.c4
-rw-r--r--fs/exofs/ore.c10
-rw-r--r--fs/exofs/super.c2
-rw-r--r--fs/nfsd/blocklayout.c2
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_aops.h2
-rw-r--r--fs/xfs/xfs_super.c11
-rw-r--r--include/linux/bio.h42
-rw-r--r--include/linux/blk-mq.h3
-rw-r--r--include/linux/blk_types.h51
-rw-r--r--include/linux/blkdev.h110
-rw-r--r--include/linux/bsg-lib.h3
-rw-r--r--include/linux/bsg.h6
-rw-r--r--include/linux/elevator.h2
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/libata.h2
-rw-r--r--include/linux/lightnvm.h2
-rw-r--r--include/linux/mempool.h34
-rw-r--r--include/linux/nvme.h16
-rw-r--r--include/linux/pktcdvd.h2
-rw-r--r--include/linux/sbitmap.h36
-rw-r--r--include/scsi/osd_initiator.h6
-rw-r--r--include/scsi/scsi_host.h2
-rw-r--r--kernel/power/swap.c14
-rw-r--r--lib/sbitmap.c113
-rw-r--r--mm/backing-dev.c18
-rw-r--r--mm/mempool.c108
212 files changed, 4020 insertions, 3984 deletions
diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
index 733927a7b501..07f147381f32 100644
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -71,13 +71,16 @@ use_per_node_hctx=[0/1]: Default: 0
1: The multi-queue block layer is instantiated with a hardware dispatch
queue for each CPU node in the system.
-use_lightnvm=[0/1]: Default: 0
- Register device with LightNVM. Requires blk-mq and CONFIG_NVM to be enabled.
-
no_sched=[0/1]: Default: 0
0: nullb* use default blk-mq io scheduler.
1: nullb* doesn't use io scheduler.
+blocking=[0/1]: Default: 0
+ 0: Register as a non-blocking blk-mq driver device.
+ 1: Register as a blocking blk-mq driver device, null_blk will set
+ the BLK_MQ_F_BLOCKING flag, indicating that it sometimes/always
+ needs to block in its ->queue_rq() function.
+
shared_tags=[0/1]: Default: 0
0: Tag set is not shared.
1: Tag set shared between devices for blk-mq. Only makes sense with
diff --git a/Documentation/scsi/scsi_eh.txt b/Documentation/scsi/scsi_eh.txt
index 11e447bdb3a5..1b7436932a2b 100644
--- a/Documentation/scsi/scsi_eh.txt
+++ b/Documentation/scsi/scsi_eh.txt
@@ -82,24 +82,13 @@ function
1. invokes optional hostt->eh_timed_out() callback. Return value can
be one of
- - BLK_EH_HANDLED
- This indicates that eh_timed_out() dealt with the timeout.
- The command is passed back to the block layer and completed
- via __blk_complete_requests().
-
- *NOTE* After returning BLK_EH_HANDLED the SCSI layer is
- assumed to be finished with the command, and no other
- functions from the SCSI layer will be called. So this
- should typically only be returned if the eh_timed_out()
- handler raced with normal completion.
-
- BLK_EH_RESET_TIMER
This indicates that more time is required to finish the
command. Timer is restarted. This action is counted as a
retry and only allowed scmd->allowed + 1(!) times. Once the
- limit is reached, action for BLK_EH_NOT_HANDLED is taken instead.
+ limit is reached, action for BLK_EH_DONE is taken instead.
- - BLK_EH_NOT_HANDLED
+ - BLK_EH_DONE
eh_timed_out() callback did not handle the command.
Step #2 is taken.
diff --git a/MAINTAINERS b/MAINTAINERS
index 9c125f705f78..cbc6bbf4ef9c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9700,7 +9700,7 @@ S: Maintained
F: drivers/net/ethernet/netronome/
NETWORK BLOCK DEVICE (NBD)
-M: Josef Bacik <jbacik@fb.com>
+M: Josef Bacik <josef@toxicpanda.com>
S: Maintained
L: linux-block@vger.kernel.org
L: nbd@other.debian.org
diff --git a/arch/sparc/include/uapi/asm/jsflash.h b/arch/sparc/include/uapi/asm/jsflash.h
deleted file mode 100644
index 68c98a54281a..000000000000
--- a/arch/sparc/include/uapi/asm/jsflash.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * jsflash.h: OS Flash SIMM support for JavaStations.
- *
- * Copyright (C) 1999 Pete Zaitcev
- */
-
-#ifndef _SPARC_JSFLASH_H
-#define _SPARC_JSFLASH_H
-
-#ifndef _SPARC_TYPES_H
-#include <linux/types.h>
-#endif
-
-/*
- * Semantics of the offset is a full address.
- * Hardcode it or get it from probe ioctl.
- *
- * We use full bus address, so that we would be
- * automatically compatible with possible future systems.
- */
-
-#define JSFLASH_IDENT (('F'<<8)|54)
-struct jsflash_ident_arg {
- __u64 off; /* 0x20000000 is included */
- __u32 size;
- char name[32]; /* With trailing zero */
-};
-
-#define JSFLASH_ERASE (('F'<<8)|55)
-/* Put 0 as argument, may be flags or sector number... */
-
-#define JSFLASH_PROGRAM (('F'<<8)|56)
-struct jsflash_program_arg {
- __u64 data; /* char* for sparc and sparc64 */
- __u64 off;
- __u32 size;
-};
-
-#endif /* _SPARC_JSFLASH_H */
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index d819dc77fe65..a9e8633388f4 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -55,13 +55,13 @@ BFQG_FLAG_FNS(empty)
/* This should be called with the scheduler lock held. */
static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
{
- unsigned long long now;
+ u64 now;
if (!bfqg_stats_waiting(stats))
return;
- now = sched_clock();
- if (time_after64(now, stats->start_group_wait_time))
+ now = ktime_get_ns();
+ if (now > stats->start_group_wait_time)
blkg_stat_add(&stats->group_wait_time,
now - stats->start_group_wait_time);
bfqg_stats_clear_waiting(stats);
@@ -77,20 +77,20 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
return;
if (bfqg == curr_bfqg)
return;
- stats->start_group_wait_time = sched_clock();
+ stats->start_group_wait_time = ktime_get_ns();
bfqg_stats_mark_waiting(stats);
}
/* This should be called with the scheduler lock held. */
static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
{
- unsigned long long now;
+ u64 now;
if (!bfqg_stats_empty(stats))
return;
- now = sched_clock();
- if (time_after64(now, stats->start_empty_time))
+ now = ktime_get_ns();
+ if (now > stats->start_empty_time)
blkg_stat_add(&stats->empty_time,
now - stats->start_empty_time);
bfqg_stats_clear_empty(stats);
@@ -116,7 +116,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
if (bfqg_stats_empty(stats))
return;
- stats->start_empty_time = sched_clock();
+ stats->start_empty_time = ktime_get_ns();
bfqg_stats_mark_empty(stats);
}
@@ -125,9 +125,9 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
struct bfqg_stats *stats = &bfqg->stats;
if (bfqg_stats_idling(stats)) {
- unsigned long long now = sched_clock();
+ u64 now = ktime_get_ns();
- if (time_after64(now, stats->start_idle_time))
+ if (now > stats->start_idle_time)
blkg_stat_add(&stats->idle_time,
now - stats->start_idle_time);
bfqg_stats_clear_idling(stats);
@@ -138,7 +138,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
{
struct bfqg_stats *stats = &bfqg->stats;
- stats->start_idle_time = sched_clock();
+ stats->start_idle_time = ktime_get_ns();
bfqg_stats_mark_idling(stats);
}
@@ -171,18 +171,18 @@ void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
blkg_rwstat_add(&bfqg->stats.merged, op, 1);
}
-void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
- uint64_t io_start_time, unsigned int op)
+void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
+ u64 io_start_time_ns, unsigned int op)
{
struct bfqg_stats *stats = &bfqg->stats;
- unsigned long long now = sched_clock();
+ u64 now = ktime_get_ns();
- if (time_after64(now, io_start_time))
+ if (now > io_start_time_ns)
blkg_rwstat_add(&stats->service_time, op,
- now - io_start_time);
- if (time_after64(io_start_time, start_time))
+ now - io_start_time_ns);
+ if (io_start_time_ns > start_time_ns)
blkg_rwstat_add(&stats->wait_time, op,
- io_start_time - start_time);
+ io_start_time_ns - start_time_ns);
}
#else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
@@ -191,8 +191,8 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
unsigned int op) { }
void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
-void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
- uint64_t io_start_time, unsigned int op) { }
+void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
+ u64 io_start_time_ns, unsigned int op) { }
void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 771ae9730ac6..495b9ddb3355 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -49,9 +49,39 @@
*
* In particular, to provide these low-latency guarantees, BFQ
* explicitly privileges the I/O of two classes of time-sensitive
- * applications: interactive and soft real-time. This feature enables
- * BFQ to provide applications in these classes with a very low
- * latency. Finally, BFQ also features additional heuristics for
+ * applications: interactive and soft real-time. In more detail, BFQ
+ * behaves this way if the low_latency parameter is set (default
+ * configuration). This feature enables BFQ to provide applications in
+ * these classes with a very low latency.
+ *
+ * To implement this feature, BFQ constantly tries to detect whether
+ * the I/O requests in a bfq_queue come from an interactive or a soft
+ * real-time application. For brevity, in these cases, the queue is
+ * said to be interactive or soft real-time. In both cases, BFQ
+ * privileges the service of the queue, over that of non-interactive
+ * and non-soft-real-time queues. This privileging is performed,
+ * mainly, by raising the weight of the queue. So, for brevity, we
+ * call just weight-raising periods the time periods during which a
+ * queue is privileged, because deemed interactive or soft real-time.
+ *
+ * The detection of soft real-time queues/applications is described in
+ * detail in the comments on the function
+ * bfq_bfqq_softrt_next_start. On the other hand, the detection of an
+ * interactive queue works as follows: a queue is deemed interactive
+ * if it is constantly non empty only for a limited time interval,
+ * after which it does become empty. The queue may be deemed
+ * interactive again (for a limited time), if it restarts being
+ * constantly non empty, provided that this happens only after the
+ * queue has remained empty for a given minimum idle time.
+ *
+ * By default, BFQ computes automatically the above maximum time
+ * interval, i.e., the time interval after which a constantly
+ * non-empty queue stops being deemed interactive. Since a queue is
+ * weight-raised while it is deemed interactive, this maximum time
+ * interval happens to coincide with the (maximum) duration of the
+ * weight-raising for interactive queues.
+ *
+ * Finally, BFQ also features additional heuristics for
* preserving both a low latency and a high throughput on NCQ-capable,
* rotational or flash-based devices, and to get the job done quickly
* for applications consisting in many I/O-bound processes.
@@ -61,14 +91,14 @@
* all low-latency heuristics for that device, by setting low_latency
* to 0.
*
- * BFQ is described in [1], where also a reference to the initial, more
- * theoretical paper on BFQ can be found. The interested reader can find
- * in the latter paper full details on the main algorithm, as well as
- * formulas of the guarantees and formal proofs of all the properties.
- * With respect to the version of BFQ presented in these papers, this
- * implementation adds a few more heuristics, such as the one that
- * guarantees a low latency to soft real-time applications, and a
- * hierarchical extension based on H-WF2Q+.
+ * BFQ is described in [1], where also a reference to the initial,
+ * more theoretical paper on BFQ can be found. The interested reader
+ * can find in the latter paper full details on the main algorithm, as
+ * well as formulas of the guarantees and formal proofs of all the
+ * properties. With respect to the version of BFQ presented in these
+ * papers, this implementation adds a few more heuristics, such as the
+ * ones that guarantee a low latency to interactive and soft real-time
+ * applications, and a hierarchical extension based on H-WF2Q+.
*
* B-WF2Q+ is based on WF2Q+, which is described in [2], together with
* H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
@@ -218,56 +248,46 @@ static struct kmem_cache *bfq_pool;
#define BFQ_RATE_SHIFT 16
/*
- * By default, BFQ computes the duration of the weight raising for
- * interactive applications automatically, using the following formula:
- * duration = (R / r) * T, where r is the peak rate of the device, and
- * R and T are two reference parameters.
- * In particular, R is the peak rate of the reference device (see
- * below), and T is a reference time: given the systems that are
- * likely to be installed on the reference device according to its
- * speed class, T is about the maximum time needed, under BFQ and
- * while reading two files in parallel, to load typical large
- * applications on these systems (see the comments on
- * max_service_from_wr below, for more details on how T is obtained).
- * In practice, the slower/faster the device at hand is, the more/less
- * it takes to load applications with respect to the reference device.
- * Accordingly, the longer/shorter BFQ grants weight raising to
- * interactive applications.
- *
- * BFQ uses four different reference pairs (R, T), depending on:
- * . whether the device is rotational or non-rotational;
- * . whether the device is slow, such as old or portable HDDs, as well as
- * SD cards, or fast, such as newer HDDs and SSDs.
+ * When configured for computing the duration of the weight-raising
+ * for interactive queues automatically (see the comments at the
+ * beginning of this file), BFQ does it using the following formula:
+ * duration = (ref_rate / r) * ref_wr_duration,
+ * where r is the peak rate of the device, and ref_rate and
+ * ref_wr_duration are two reference parameters. In particular,
+ * ref_rate is the peak rate of the reference storage device (see
+ * below), and ref_wr_duration is about the maximum time needed, with
+ * BFQ and while reading two files in parallel, to load typical large
+ * applications on the reference device (see the comments on
+ * max_service_from_wr below, for more details on how ref_wr_duration
+ * is obtained). In practice, the slower/faster the device at hand
+ * is, the more/less it takes to load applications with respect to the
+ * reference device. Accordingly, the longer/shorter BFQ grants
+ * weight raising to interactive applications.
*
- * The device's speed class is dynamically (re)detected in
- * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ * BFQ uses two different reference pairs (ref_rate, ref_wr_duration),
+ * depending on whether the device is rotational or non-rotational.
*
- * In the following definitions, R_slow[0]/R_fast[0] and
- * T_slow[0]/T_fast[0] are the reference values for a slow/fast
- * rotational device, whereas R_slow[1]/R_fast[1] and
- * T_slow[1]/T_fast[1] are the reference values for a slow/fast
- * non-rotational device. Finally, device_speed_thresh are the
- * thresholds used to switch between speed classes. The reference
- * rates are not the actual peak rates of the devices used as a
- * reference, but slightly lower values. The reason for using these
- * slightly lower values is that the peak-rate estimator tends to
- * yield slightly lower values than the actual peak rate (it can yield
- * the actual peak rate only if there is only one process doing I/O,
- * and the process does sequential I/O).
+ * In the following definitions, ref_rate[0] and ref_wr_duration[0]
+ * are the reference values for a rotational device, whereas
+ * ref_rate[1] and ref_wr_duration[1] are the reference values for a
+ * non-rotational device. The reference rates are not the actual peak
+ * rates of the devices used as a reference, but slightly lower
+ * values. The reason for using slightly lower values is that the
+ * peak-rate estimator tends to yield slightly lower values than the
+ * actual peak rate (it can yield the actual peak rate only if there
+ * is only one process doing I/O, and the process does sequential
+ * I/O).
*
- * Both the reference peak rates and the thresholds are measured in
- * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ * The reference peak rates are measured in sectors/usec, left-shifted
+ * by BFQ_RATE_SHIFT.
*/
-static int R_slow[2] = {1000, 10700};
-static int R_fast[2] = {14000, 33000};
+static int ref_rate[2] = {14000, 33000};
/*
- * To improve readability, a conversion function is used to initialize the
- * following arrays, which entails that they can be initialized only in a
- * function.
+ * To improve readability, a conversion function is used to initialize
+ * the following array, which entails that the array can be
+ * initialized only in a function.
*/
-static int T_slow[2];
-static int T_fast[2];
-static int device_speed_thresh[2];
+static int ref_wr_duration[2];
/*
* BFQ uses the above-detailed, time-based weight-raising mechanism to
@@ -487,46 +507,6 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
}
/*
- * See the comments on bfq_limit_depth for the purpose of
- * the depths set in the function.
- */
-static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
-{
- bfqd->sb_shift = bt->sb.shift;
-
- /*
- * In-word depths if no bfq_queue is being weight-raised:
- * leaving 25% of tags only for sync reads.
- *
- * In next formulas, right-shift the value
- * (1U<<bfqd->sb_shift), instead of computing directly
- * (1U<<(bfqd->sb_shift - something)), to be robust against
- * any possible value of bfqd->sb_shift, without having to
- * limit 'something'.
- */
- /* no more than 50% of tags for async I/O */
- bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U);
- /*
- * no more than 75% of tags for sync writes (25% extra tags
- * w.r.t. async I/O, to prevent async I/O from starving sync
- * writes)
- */
- bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
-
- /*
- * In-word depths in case some bfq_queue is being weight-
- * raised: leaving ~63% of tags for sync reads. This is the
- * highest percentage for which, in our tests, application
- * start-up times didn't suffer from any regression due to tag
- * shortage.
- */
- /* no more than ~18% of tags for async I/O */
- bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U);
- /* no more than ~37% of tags for sync writes (~20% extra tags) */
- bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
-}
-
-/*
* Async I/O can easily starve sync I/O (both sync reads and sync
* writes), by consuming all tags. Similarly, storms of sync writes,
* such as those that sync(2) may trigger, can starve sync reads.
@@ -535,25 +515,11 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
*/
static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
- struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct bfq_data *bfqd = data->q->elevator->elevator_data;
- struct sbitmap_queue *bt;
if (op_is_sync(op) && !op_is_write(op))
return;
- if (data->flags & BLK_MQ_REQ_RESERVED) {
- if (unlikely(!tags->nr_reserved_tags)) {
- WARN_ON_ONCE(1);
- return;
- }
- bt = &tags->breserved_tags;
- } else
- bt = &tags->bitmap_tags;
-
- if (unlikely(bfqd->sb_shift != bt->sb.shift))
- bfq_update_depths(bfqd, bt);
-
data->shallow_depth =
bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
@@ -906,26 +872,30 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
if (bfqd->bfq_wr_max_time > 0)
return bfqd->bfq_wr_max_time;
- dur = bfqd->RT_prod;
+ dur = bfqd->rate_dur_prod;
do_div(dur, bfqd->peak_rate);
/*
- * Limit duration between 3 and 13 seconds. Tests show that
- * higher values than 13 seconds often yield the opposite of
- * the desired result, i.e., worsen responsiveness by letting
- * non-interactive and non-soft-real-time applications
- * preserve weight raising for a too long time interval.
+ * Limit duration between 3 and 25 seconds. The upper limit
+ * has been conservatively set after the following worst case:
+ * on a QEMU/KVM virtual machine
+ * - running in a slow PC
+ * - with a virtual disk stacked on a slow low-end 5400rpm HDD
+ * - serving a heavy I/O workload, such as the sequential reading
+ * of several files
+ * mplayer took 23 seconds to start, if constantly weight-raised.
+ *
+ * As for higher values than that accomodating the above bad
+ * scenario, tests show that higher values would often yield
+ * the opposite of the desired result, i.e., would worsen
+ * responsiveness by allowing non-interactive applications to
+ * preserve weight raising for too long.
*
* On the other end, lower values than 3 seconds make it
* difficult for most interactive tasks to complete their jobs
* before weight-raising finishes.
*/
- if (dur > msecs_to_jiffies(13000))
- dur = msecs_to_jiffies(13000);
- else if (dur < msecs_to_jiffies(3000))
- dur = msecs_to_jiffies(3000);
-
- return dur;
+ return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000));
}
/* switch back from soft real-time to interactive weight raising */
@@ -1393,15 +1363,6 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
}
/*
- * Return the farthest future time instant according to jiffies
- * macros.
- */
-static unsigned long bfq_greatest_from_now(void)
-{
- return jiffies + MAX_JIFFY_OFFSET;
-}
-
-/*
* Return the farthest past time instant according to jiffies
* macros.
*/
@@ -1545,7 +1506,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
in_burst = bfq_bfqq_in_large_burst(bfqq);
soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
!in_burst &&
- time_is_before_jiffies(bfqq->soft_rt_next_start);
+ time_is_before_jiffies(bfqq->soft_rt_next_start) &&
+ bfqq->dispatched == 0;
*interactive = !in_burst && idle_for_long_time;
wr_or_deserves_wr = bfqd->low_latency &&
(bfqq->wr_coeff > 1 ||
@@ -1858,6 +1820,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req,
return ELEVATOR_NO_MERGE;
}
+static struct bfq_queue *bfq_init_rq(struct request *rq);
+
static void bfq_request_merged(struct request_queue *q, struct request *req,
enum elv_merge type)
{
@@ -1866,7 +1830,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
blk_rq_pos(req) <
blk_rq_pos(container_of(rb_prev(&req->rb_node),
struct request, rb_node))) {
- struct bfq_queue *bfqq = RQ_BFQQ(req);
+ struct bfq_queue *bfqq = bfq_init_rq(req);
struct bfq_data *bfqd = bfqq->bfqd;
struct request *prev, *next_rq;
@@ -1891,14 +1855,25 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
}
}
+/*
+ * This function is called to notify the scheduler that the requests
+ * rq and 'next' have been merged, with 'next' going away. BFQ
+ * exploits this hook to address the following issue: if 'next' has a
+ * fifo_time lower that rq, then the fifo_time of rq must be set to
+ * the value of 'next', to not forget the greater age of 'next'.
+ *
+ * NOTE: in this function we assume that rq is in a bfq_queue, basing
+ * on that rq is picked from the hash table q->elevator->hash, which,
+ * in its turn, is filled only with I/O requests present in
+ * bfq_queues, while BFQ is in use for the request queue q. In fact,
+ * the function that fills this hash table (elv_rqhash_add) is called
+ * only by bfq_insert_request.
+ */
static void bfq_requests_merged(struct request_queue *q, struct request *rq,
struct request *next)
{
- struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
-
- if (!RB_EMPTY_NODE(&rq->rb_node))
- goto end;
- spin_lock_irq(&bfqq->bfqd->lock);
+ struct bfq_queue *bfqq = bfq_init_rq(rq),
+ *next_bfqq = bfq_init_rq(next);
/*
* If next and rq belong to the same bfq_queue and next is older
@@ -1920,11 +1895,6 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
if (bfqq->next_rq == next)
bfqq->next_rq = rq;
- bfq_remove_request(q, next);
- bfqg_stats_update_io_remove(bfqq_group(bfqq), next->cmd_flags);
-
- spin_unlock_irq(&bfqq->bfqd->lock);
-end:
bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
}
@@ -2506,37 +2476,15 @@ static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
/*
* Update parameters related to throughput and responsiveness, as a
* function of the estimated peak rate. See comments on
- * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
+ * bfq_calc_max_budget(), and on the ref_wr_duration array.
*/
static void update_thr_responsiveness_params(struct bfq_data *bfqd)
{
- int dev_type = blk_queue_nonrot(bfqd->queue);
-
- if (bfqd->bfq_user_max_budget == 0)
+ if (bfqd->bfq_user_max_budget == 0) {
bfqd->bfq_max_budget =
bfq_calc_max_budget(bfqd);
-
- if (bfqd->device_speed == BFQ_BFQD_FAST &&
- bfqd->peak_rate < device_speed_thresh[dev_type]) {
- bfqd->device_speed = BFQ_BFQD_SLOW;
- bfqd->RT_prod = R_slow[dev_type] *
- T_slow[dev_type];
- } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
- bfqd->peak_rate > device_speed_thresh[dev_type]) {
- bfqd->device_speed = BFQ_BFQD_FAST;
- bfqd->RT_prod = R_fast[dev_type] *
- T_fast[dev_type];
+ bfq_log(bfqd, "new max_budget = %d", bfqd->bfq_max_budget);
}
-
- bfq_log(bfqd,
-"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
- dev_type == 0 ? "ROT" : "NONROT",
- bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
- bfqd->device_speed == BFQ_BFQD_FAST ?
- (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
- (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
- (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
- BFQ_RATE_SHIFT);
}
static void bfq_reset_rate_computation(struct bfq_data *bfqd,
@@ -3266,23 +3214,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
bfq_bfqq_softrt_next_start(bfqd, bfqq);
else {
/*
- * The application is still waiting for the
- * completion of one or more requests:
- * prevent it from possibly being incorrectly
- * deemed as soft real-time by setting its
- * soft_rt_next_start to infinity. In fact,
- * without this assignment, the application
- * would be incorrectly deemed as soft
- * real-time if:
- * 1) it issued a new request before the
- * completion of all its in-flight
- * requests, and
- * 2) at that time, its soft_rt_next_start
- * happened to be in the past.
- */
- bfqq->soft_rt_next_start =
- bfq_greatest_from_now();
- /*
* Schedule an update of soft_rt_next_start to when
* the task may be discovered to be isochronous.
*/
@@ -4540,14 +4471,12 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
unsigned int cmd_flags) {}
#endif
-static void bfq_prepare_request(struct request *rq, struct bio *bio);
-
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head)
{
struct request_queue *q = hctx->queue;
struct bfq_data *bfqd = q->elevator->elevator_data;
- struct bfq_queue *bfqq = RQ_BFQQ(rq);
+ struct bfq_queue *bfqq;
bool idle_timer_disabled = false;
unsigned int cmd_flags;
@@ -4562,24 +4491,13 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
blk_mq_sched_request_inserted(rq);
spin_lock_irq(&bfqd->lock);
+ bfqq = bfq_init_rq(rq);
if (at_head || blk_rq_is_passthrough(rq)) {
if (at_head)
list_add(&rq->queuelist, &bfqd->dispatch);
else
list_add_tail(&rq->queuelist, &bfqd->dispatch);
- } else {
- if (WARN_ON_ONCE(!bfqq)) {
- /*
- * This should never happen. Most likely rq is
- * a requeued regular request, being
- * re-inserted without being first
- * re-prepared. Do a prepare, to avoid
- * failure.
- */
- bfq_prepare_request(rq, rq->bio);
- bfqq = RQ_BFQQ(rq);
- }
-
+ } else { /* bfqq is assumed to be non null here */
idle_timer_disabled = __bfq_insert_request(bfqd, rq);
/*
* Update bfqq, because, if a queue merge has occurred
@@ -4778,8 +4696,8 @@ static void bfq_finish_requeue_request(struct request *rq)
if (rq->rq_flags & RQF_STARTED)
bfqg_stats_update_completion(bfqq_group(bfqq),
- rq_start_time_ns(rq),
- rq_io_start_time_ns(rq),
+ rq->start_time_ns,
+ rq->io_start_time_ns,
rq->cmd_flags);
if (likely(rq->rq_flags & RQF_STARTED)) {
@@ -4922,11 +4840,48 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
}
/*
- * Allocate bfq data structures associated with this request.
+ * Only reset private fields. The actual request preparation will be
+ * performed by bfq_init_rq, when rq is either inserted or merged. See
+ * comments on bfq_init_rq for the reason behind this delayed
+ * preparation.
*/
static void bfq_prepare_request(struct request *rq, struct bio *bio)
{
+ /*
+ * Regardless of whether we have an icq attached, we have to
+ * clear the scheduler pointers, as they might point to
+ * previously allocated bic/bfqq structs.
+ */
+ rq->elv.priv[0] = rq->elv.priv[1] = NULL;
+}
+
+/*
+ * If needed, init rq, allocate bfq data structures associated with
+ * rq, and increment reference counters in the destination bfq_queue
+ * for rq. Return the destination bfq_queue for rq, or NULL is rq is
+ * not associated with any bfq_queue.
+ *
+ * This function is invoked by the functions that perform rq insertion
+ * or merging. One may have expected the above preparation operations
+ * to be performed in bfq_prepare_request, and not delayed to when rq
+ * is inserted or merged. The rationale behind this delayed
+ * preparation is that, after the prepare_request hook is invoked for
+ * rq, rq may still be transformed into a request with no icq, i.e., a
+ * request not associated with any queue. No bfq hook is invoked to
+ * signal this tranformation. As a consequence, should these
+ * preparation operations be performed when the prepare_request hook
+ * is invoked, and should rq be transformed one moment later, bfq
+ * would end up in an inconsistent state, because it would have
+ * incremented some queue counters for an rq destined to
+ * transformation, without any chance to correctly lower these
+ * counters back. In contrast, no transformation can still happen for
+ * rq after rq has been inserted or merged. So, it is safe to execute
+ * these preparation operations when rq is finally inserted or merged.
+ */
+static struct bfq_queue *bfq_init_rq(struct request *rq)
+{
struct request_queue *q = rq->q;
+ struct bio *bio = rq->bio;
struct bfq_data *bfqd = q->elevator->elevator_data;
struct bfq_io_cq *bic;
const int is_sync = rq_is_sync(rq);
@@ -4934,20 +4889,21 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio)
bool new_queue = false;
bool bfqq_already_existing = false, split = false;
+ if (unlikely(!rq->elv.icq))
+ return NULL;
+
/*
- * Even if we don't have an icq attached, we should still clear
- * the scheduler pointers, as they might point to previously
- * allocated bic/bfqq structs.
+ * Assuming that elv.priv[1] is set only if everything is set
+ * for this rq. This holds true, because this function is
+ * invoked only for insertion or merging, and, after such
+ * events, a request cannot be manipulated any longer before
+ * being removed from bfq.
*/
- if (!rq->elv.icq) {
- rq->elv.priv[0] = rq->elv.priv[1] = NULL;
- return;
- }
+ if (rq->elv.priv[1])
+ return rq->elv.priv[1];
bic = icq_to_bic(rq->elv.icq);
- spin_lock_irq(&bfqd->lock);
-
bfq_check_ioprio_change(bic, bio);
bfq_bic_update_cgroup(bic, bio);
@@ -5006,7 +4962,7 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio)
if (unlikely(bfq_bfqq_just_created(bfqq)))
bfq_handle_burst(bfqd, bfqq);
- spin_unlock_irq(&bfqd->lock);
+ return bfqq;
}
static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
@@ -5105,6 +5061,64 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
}
+/*
+ * See the comments on bfq_limit_depth for the purpose of
+ * the depths set in the function. Return minimum shallow depth we'll use.
+ */
+static unsigned int bfq_update_depths(struct bfq_data *bfqd,
+ struct sbitmap_queue *bt)
+{
+ unsigned int i, j, min_shallow = UINT_MAX;
+
+ /*
+ * In-word depths if no bfq_queue is being weight-raised:
+ * leaving 25% of tags only for sync reads.
+ *
+ * In next formulas, right-shift the value
+ * (1U<<bt->sb.shift), instead of computing directly
+ * (1U<<(bt->sb.shift - something)), to be robust against
+ * any possible value of bt->sb.shift, without having to
+ * limit 'something'.
+ */
+ /* no more than 50% of tags for async I/O */
+ bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U);
+ /*
+ * no more than 75% of tags for sync writes (25% extra tags
+ * w.r.t. async I/O, to prevent async I/O from starving sync
+ * writes)
+ */
+ bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U);
+
+ /*
+ * In-word depths in case some bfq_queue is being weight-
+ * raised: leaving ~63% of tags for sync reads. This is the
+ * highest percentage for which, in our tests, application
+ * start-up times didn't suffer from any regression due to tag
+ * shortage.
+ */
+ /* no more than ~18% of tags for async I/O */
+ bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U);
+ /* no more than ~37% of tags for sync writes (~20% extra tags) */
+ bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U);
+
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < 2; j++)
+ min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
+
+ return min_shallow;
+}
+
+static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
+{
+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+ struct blk_mq_tags *tags = hctx->sched_tags;
+ unsigned int min_shallow;
+
+ min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
+ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
+ return 0;
+}
+
static void bfq_exit_queue(struct elevator_queue *e)
{
struct bfq_data *bfqd = e->elevator_data;
@@ -5242,14 +5256,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->wr_busy_queues = 0;
/*
- * Begin by assuming, optimistically, that the device is a
- * high-speed one, and that its peak rate is equal to 2/3 of
- * the highest reference rate.
+ * Begin by assuming, optimistically, that the device peak
+ * rate is equal to 2/3 of the highest reference rate.
*/
- bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
- T_fast[blk_queue_nonrot(bfqd->queue)];
- bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
- bfqd->device_speed = BFQ_BFQD_FAST;
+ bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] *
+ ref_wr_duration[blk_queue_nonrot(bfqd->queue)];
+ bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
spin_lock_init(&bfqd->lock);
@@ -5526,6 +5538,7 @@ static struct elevator_type iosched_bfq_mq = {
.requests_merged = bfq_requests_merged,
.request_merged = bfq_request_merged,
.has_work = bfq_has_work,
+ .init_hctx = bfq_init_hctx,
.init_sched = bfq_init_queue,
.exit_sched = bfq_exit_queue,
},
@@ -5556,8 +5569,8 @@ static int __init bfq_init(void)
/*
* Times to load large popular applications for the typical
* systems installed on the reference devices (see the
- * comments before the definitions of the next two
- * arrays). Actually, we use slightly slower values, as the
+ * comments before the definition of the next
+ * array). Actually, we use slightly lower values, as the
* estimated peak rate tends to be smaller than the actual
* peak rate. The reason for this last fact is that estimates
* are computed over much shorter time intervals than the long
@@ -5566,25 +5579,8 @@ static int __init bfq_init(void)
* scheduler cannot rely on a peak-rate-evaluation workload to
* be run for a long time.
*/
- T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
- T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
- T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
- T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
-
- /*
- * Thresholds that determine the switch between speed classes
- * (see the comments before the definition of the array
- * device_speed_thresh). These thresholds are biased towards
- * transitions to the fast class. This is safer than the
- * opposite bias. In fact, a wrong transition to the slow
- * class results in short weight-raising periods, because the
- * speed of the device then tends to be higher that the
- * reference peak rate. On the opposite end, a wrong
- * transition to the fast class tends to increase
- * weight-raising periods, because of the opposite reason.
- */
- device_speed_thresh[0] = (4 * R_slow[0]) / 3;
- device_speed_thresh[1] = (4 * R_slow[1]) / 3;
+ ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */
+ ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */
ret = elv_register(&iosched_bfq_mq);
if (ret)
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index ae2f3dadec44..0f712e03b035 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -399,11 +399,6 @@ struct bfq_io_cq {
struct bfq_ttime saved_ttime;
};
-enum bfq_device_speed {
- BFQ_BFQD_FAST,
- BFQ_BFQD_SLOW,
-};
-
/**
* struct bfq_data - per-device data structure.
*
@@ -611,12 +606,11 @@ struct bfq_data {
/* Max service-rate for a soft real-time queue, in sectors/sec */
unsigned int bfq_wr_max_softrt_rate;
/*
- * Cached value of the product R*T, used for computing the
- * maximum duration of weight raising automatically.
+ * Cached value of the product ref_rate*ref_wr_duration, used
+ * for computing the maximum duration of weight raising
+ * automatically.
*/
- u64 RT_prod;
- /* device-speed class for the low-latency heuristic */
- enum bfq_device_speed device_speed;
+ u64 rate_dur_prod;
/* fallback dummy bfqq for extreme OOM conditions */
struct bfq_queue oom_bfqq;
@@ -636,12 +630,6 @@ struct bfq_data {
struct bfq_queue *bio_bfqq;
/*
- * Cached sbitmap shift, used to compute depth limits in
- * bfq_update_depths.
- */
- unsigned int sb_shift;
-
- /*
* Depth limits used in bfq_limit_depth (see comments on the
* function)
*/
@@ -732,9 +720,9 @@ struct bfqg_stats {
/* total time with empty current active q with other requests queued */
struct blkg_stat empty_time;
/* fields after this shouldn't be cleared on stat reset */
- uint64_t start_group_wait_time;
- uint64_t start_idle_time;
- uint64_t start_empty_time;
+ u64 start_group_wait_time;
+ u64 start_idle_time;
+ u64 start_empty_time;
uint16_t flags;
#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
};
@@ -856,8 +844,8 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
unsigned int op);
void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op);
-void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
- uint64_t io_start_time, unsigned int op);
+void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
+ u64 io_start_time_ns, unsigned int op);
void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 9cfdd6c83b5b..add7c7c85335 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -56,12 +56,12 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
struct bio_set *bs = bio->bi_pool;
unsigned inline_vecs;
- if (!bs || !bs->bio_integrity_pool) {
+ if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) {
bip = kmalloc(sizeof(struct bio_integrity_payload) +
sizeof(struct bio_vec) * nr_vecs, gfp_mask);
inline_vecs = nr_vecs;
} else {
- bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+ bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask);
inline_vecs = BIP_INLINE_VECS;
}
@@ -74,7 +74,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
unsigned long idx = 0;
bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
- bs->bvec_integrity_pool);
+ &bs->bvec_integrity_pool);
if (!bip->bip_vec)
goto err;
bip->bip_max_vcnt = bvec_nr_vecs(idx);
@@ -90,7 +90,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
return bip;
err:
- mempool_free(bip, bs->bio_integrity_pool);
+ mempool_free(bip, &bs->bio_integrity_pool);
return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(bio_integrity_alloc);
@@ -111,10 +111,10 @@ static void bio_integrity_free(struct bio *bio)
kfree(page_address(bip->bip_vec->bv_page) +
bip->bip_vec->bv_offset);
- if (bs && bs->bio_integrity_pool) {
- bvec_free(bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab);
+ if (bs && mempool_initialized(&bs->bio_integrity_pool)) {
+ bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab);
- mempool_free(bip, bs->bio_integrity_pool);
+ mempool_free(bip, &bs->bio_integrity_pool);
} else {
kfree(bip);
}
@@ -465,16 +465,15 @@ EXPORT_SYMBOL(bio_integrity_clone);
int bioset_integrity_create(struct bio_set *bs, int pool_size)
{
- if (bs->bio_integrity_pool)
+ if (mempool_initialized(&bs->bio_integrity_pool))
return 0;
- bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
- if (!bs->bio_integrity_pool)
+ if (mempool_init_slab_pool(&bs->bio_integrity_pool,
+ pool_size, bip_slab))
return -1;
- bs->bvec_integrity_pool = biovec_create_pool(pool_size);
- if (!bs->bvec_integrity_pool) {
- mempool_destroy(bs->bio_integrity_pool);
+ if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) {
+ mempool_exit(&bs->bio_integrity_pool);
return -1;
}
@@ -484,8 +483,8 @@ EXPORT_SYMBOL(bioset_integrity_create);
void bioset_integrity_free(struct bio_set *bs)
{
- mempool_destroy(bs->bio_integrity_pool);
- mempool_destroy(bs->bvec_integrity_pool);
+ mempool_exit(&bs->bio_integrity_pool);
+ mempool_exit(&bs->bvec_integrity_pool);
}
EXPORT_SYMBOL(bioset_integrity_free);
diff --git a/block/bio.c b/block/bio.c
index 53e0f0a1ed94..595663e0281a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -53,7 +53,7 @@ static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
* fs_bio_set is the bio_set containing bio and iovec memory pools used by
* IO code that does not need private memory pools.
*/
-struct bio_set *fs_bio_set;
+struct bio_set fs_bio_set;
EXPORT_SYMBOL(fs_bio_set);
/*
@@ -254,7 +254,7 @@ static void bio_free(struct bio *bio)
bio_uninit(bio);
if (bs) {
- bvec_free(bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
+ bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
/*
* If we have front padding, adjust the bio pointer before freeing
@@ -262,7 +262,7 @@ static void bio_free(struct bio *bio)
p = bio;
p -= bs->front_pad;
- mempool_free(p, bs->bio_pool);
+ mempool_free(p, &bs->bio_pool);
} else {
/* Bio was allocated by bio_kmalloc() */
kfree(bio);
@@ -454,7 +454,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
inline_vecs = nr_iovecs;
} else {
/* should not use nobvec bioset for nr_iovecs > 0 */
- if (WARN_ON_ONCE(!bs->bvec_pool && nr_iovecs > 0))
+ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) &&
+ nr_iovecs > 0))
return NULL;
/*
* generic_make_request() converts recursion to iteration; this
@@ -483,11 +484,11 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
bs->rescue_workqueue)
gfp_mask &= ~__GFP_DIRECT_RECLAIM;
- p = mempool_alloc(bs->bio_pool, gfp_mask);
+ p = mempool_alloc(&bs->bio_pool, gfp_mask);
if (!p && gfp_mask != saved_gfp) {
punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
- p = mempool_alloc(bs->bio_pool, gfp_mask);
+ p = mempool_alloc(&bs->bio_pool, gfp_mask);
}
front_pad = bs->front_pad;
@@ -503,11 +504,11 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
if (nr_iovecs > inline_vecs) {
unsigned long idx = 0;
- bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+ bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
if (!bvl && gfp_mask != saved_gfp) {
punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
- bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+ bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
}
if (unlikely(!bvl))
@@ -524,25 +525,25 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
return bio;
err_free:
- mempool_free(p, bs->bio_pool);
+ mempool_free(p, &bs->bio_pool);
return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);
-void zero_fill_bio(struct bio *bio)
+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
{
unsigned long flags;
struct bio_vec bv;
struct bvec_iter iter;
- bio_for_each_segment(bv, bio, iter) {
+ __bio_for_each_segment(bv, bio, iter, start) {
char *data = bvec_kmap_irq(&bv, &flags);
memset(data, 0, bv.bv_len);
flush_dcache_page(bv.bv_page);
bvec_kunmap_irq(data, &flags);
}
}
-EXPORT_SYMBOL(zero_fill_bio);
+EXPORT_SYMBOL(zero_fill_bio_iter);
/**
* bio_put - release a reference to a bio
@@ -970,27 +971,68 @@ void bio_advance(struct bio *bio, unsigned bytes)
}
EXPORT_SYMBOL(bio_advance);
+void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
+ struct bio *src, struct bvec_iter *src_iter)
+{
+ struct bio_vec src_bv, dst_bv;
+ void *src_p, *dst_p;
+ unsigned bytes;
+
+ while (src_iter->bi_size && dst_iter->bi_size) {
+ src_bv = bio_iter_iovec(src, *src_iter);
+ dst_bv = bio_iter_iovec(dst, *dst_iter);
+
+ bytes = min(src_bv.bv_len, dst_bv.bv_len);
+
+ src_p = kmap_atomic(src_bv.bv_page);
+ dst_p = kmap_atomic(dst_bv.bv_page);
+
+ memcpy(dst_p + dst_bv.bv_offset,
+ src_p + src_bv.bv_offset,
+ bytes);
+
+ kunmap_atomic(dst_p);
+ kunmap_atomic(src_p);
+
+ flush_dcache_page(dst_bv.bv_page);
+
+ bio_advance_iter(src, src_iter, bytes);
+ bio_advance_iter(dst, dst_iter, bytes);
+ }
+}
+EXPORT_SYMBOL(bio_copy_data_iter);
+
/**
- * bio_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
- * @src and @dst as linked lists of bios.
+ * bio_copy_data - copy contents of data buffers from one bio to another
+ * @src: source bio
+ * @dst: destination bio
*
* Stops when it reaches the end of either @src or @dst - that is, copies
* min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
*/
void bio_copy_data(struct bio *dst, struct bio *src)
{
- struct bvec_iter src_iter, dst_iter;
- struct bio_vec src_bv, dst_bv;
- void *src_p, *dst_p;
- unsigned bytes;
+ struct bvec_iter src_iter = src->bi_iter;
+ struct bvec_iter dst_iter = dst->bi_iter;
- src_iter = src->bi_iter;
- dst_iter = dst->bi_iter;
+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+}
+EXPORT_SYMBOL(bio_copy_data);
+
+/**
+ * bio_list_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @src: source bio list
+ * @dst: destination bio list
+ *
+ * Stops when it reaches the end of either the @src list or @dst list - that is,
+ * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
+ * bios).
+ */
+void bio_list_copy_data(struct bio *dst, struct bio *src)
+{
+ struct bvec_iter src_iter = src->bi_iter;
+ struct bvec_iter dst_iter = dst->bi_iter;
while (1) {
if (!src_iter.bi_size) {
@@ -1009,26 +1051,10 @@ void bio_copy_data(struct bio *dst, struct bio *src)
dst_iter = dst->bi_iter;
}
- src_bv = bio_iter_iovec(src, src_iter);
- dst_bv = bio_iter_iovec(dst, dst_iter);
-
- bytes = min(src_bv.bv_len, dst_bv.bv_len);
-
- src_p = kmap_atomic(src_bv.bv_page);
- dst_p = kmap_atomic(dst_bv.bv_page);
-
- memcpy(dst_p + dst_bv.bv_offset,
- src_p + src_bv.bv_offset,
- bytes);
-
- kunmap_atomic(dst_p);
- kunmap_atomic(src_p);
-
- bio_advance_iter(src, &src_iter, bytes);
- bio_advance_iter(dst, &dst_iter, bytes);
+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
}
}
-EXPORT_SYMBOL(bio_copy_data);
+EXPORT_SYMBOL(bio_list_copy_data);
struct bio_map_data {
int is_our_pages;
@@ -1584,6 +1610,7 @@ void bio_set_pages_dirty(struct bio *bio)
set_page_dirty_lock(page);
}
}
+EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
static void bio_release_pages(struct bio *bio)
{
@@ -1667,6 +1694,7 @@ void bio_check_pages_dirty(struct bio *bio)
bio_put(bio);
}
}
+EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
void generic_start_io_acct(struct request_queue *q, int rw,
unsigned long sectors, struct hd_struct *part)
@@ -1749,6 +1777,9 @@ again:
if (!bio_integrity_endio(bio))
return;
+ if (WARN_ONCE(bio->bi_next, "driver left bi_next not NULL"))
+ bio->bi_next = NULL;
+
/*
* Need to have a real endio function for chained bios, otherwise
* various corner cases will break (like stacking block devices that
@@ -1848,30 +1879,38 @@ EXPORT_SYMBOL_GPL(bio_trim);
* create memory pools for biovec's in a bio_set.
* use the global biovec slabs created for general use.
*/
-mempool_t *biovec_create_pool(int pool_entries)
+int biovec_init_pool(mempool_t *pool, int pool_entries)
{
struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX;
- return mempool_create_slab_pool(pool_entries, bp->slab);
+ return mempool_init_slab_pool(pool, pool_entries, bp->slab);
}
-void bioset_free(struct bio_set *bs)
+/*
+ * bioset_exit - exit a bioset initialized with bioset_init()
+ *
+ * May be called on a zeroed but uninitialized bioset (i.e. allocated with
+ * kzalloc()).
+ */
+void bioset_exit(struct bio_set *bs)
{
if (bs->rescue_workqueue)
destroy_workqueue(bs->rescue_workqueue);
+ bs->rescue_workqueue = NULL;
- mempool_destroy(bs->bio_pool);
- mempool_destroy(bs->bvec_pool);
+ mempool_exit(&bs->bio_pool);
+ mempool_exit(&bs->bvec_pool);
bioset_integrity_free(bs);
- bio_put_slab(bs);
-
- kfree(bs);
+ if (bs->bio_slab)
+ bio_put_slab(bs);
+ bs->bio_slab = NULL;
}
-EXPORT_SYMBOL(bioset_free);
+EXPORT_SYMBOL(bioset_exit);
/**
- * bioset_create - Create a bio_set
+ * bioset_init - Initialize a bio_set
+ * @bs: pool to initialize
* @pool_size: Number of bio and bio_vecs to cache in the mempool
* @front_pad: Number of bytes to allocate in front of the returned bio
* @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS
@@ -1890,16 +1929,12 @@ EXPORT_SYMBOL(bioset_free);
* dispatch queued requests when the mempool runs out of space.
*
*/
-struct bio_set *bioset_create(unsigned int pool_size,
- unsigned int front_pad,
- int flags)
+int bioset_init(struct bio_set *bs,
+ unsigned int pool_size,
+ unsigned int front_pad,
+ int flags)
{
unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
- struct bio_set *bs;
-
- bs = kzalloc(sizeof(*bs), GFP_KERNEL);
- if (!bs)
- return NULL;
bs->front_pad = front_pad;
@@ -1908,34 +1943,29 @@ struct bio_set *bioset_create(unsigned int pool_size,
INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
- if (!bs->bio_slab) {
- kfree(bs);
- return NULL;
- }
+ if (!bs->bio_slab)
+ return -ENOMEM;
- bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
- if (!bs->bio_pool)
+ if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
goto bad;
- if (flags & BIOSET_NEED_BVECS) {
- bs->bvec_pool = biovec_create_pool(pool_size);
- if (!bs->bvec_pool)
- goto bad;
- }
+ if ((flags & BIOSET_NEED_BVECS) &&
+ biovec_init_pool(&bs->bvec_pool, pool_size))
+ goto bad;
if (!(flags & BIOSET_NEED_RESCUER))
- return bs;
+ return 0;
bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
if (!bs->rescue_workqueue)
goto bad;
- return bs;
+ return 0;
bad:
- bioset_free(bs);
- return NULL;
+ bioset_exit(bs);
+ return -ENOMEM;
}
-EXPORT_SYMBOL(bioset_create);
+EXPORT_SYMBOL(bioset_init);
#ifdef CONFIG_BLK_CGROUP
@@ -2020,11 +2050,10 @@ static int __init init_bio(void)
bio_integrity_init();
biovec_init_slabs();
- fs_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (!fs_bio_set)
+ if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
panic("bio: can't allocate bios\n");
- if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
+ if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
panic("bio: can't create integrity pool\n");
return 0;
diff --git a/block/blk-core.c b/block/blk-core.c
index 85909b431eb0..3f56be15f17e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -196,15 +196,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
RB_CLEAR_NODE(&rq->rb_node);
rq->tag = -1;
rq->internal_tag = -1;
- rq->start_time = jiffies;
- set_start_time_ns(rq);
+ rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
- seqcount_init(&rq->gstate_seq);
- u64_stats_init(&rq->aborted_gstate_sync);
- /*
- * See comment of blk_mq_init_request
- */
- WRITE_ONCE(rq->gstate, MQ_RQ_GEN_INC);
}
EXPORT_SYMBOL(blk_rq_init);
@@ -280,6 +273,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
bio_advance(bio, nbytes);
/* don't actually finish bio if it's part of flush sequence */
+ /*
+ * XXX this code looks suspicious - it's not consistent with advancing
+ * req->bio in caller
+ */
if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
bio_endio(bio);
}
@@ -360,7 +357,6 @@ EXPORT_SYMBOL(blk_start_queue_async);
void blk_start_queue(struct request_queue *q)
{
lockdep_assert_held(q->queue_lock);
- WARN_ON(!in_interrupt() && !irqs_disabled());
WARN_ON_ONCE(q->mq_ops);
queue_flag_clear(QUEUE_FLAG_STOPPED, q);
@@ -996,18 +992,24 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
spinlock_t *lock)
{
struct request_queue *q;
+ int ret;
q = kmem_cache_alloc_node(blk_requestq_cachep,
gfp_mask | __GFP_ZERO, node_id);
if (!q)
return NULL;
+ INIT_LIST_HEAD(&q->queue_head);
+ q->last_merge = NULL;
+ q->end_sector = 0;
+ q->boundary_rq = NULL;
+
q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
if (q->id < 0)
goto fail_q;
- q->bio_split = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (!q->bio_split)
+ ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (ret)
goto fail_id;
q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
@@ -1079,7 +1081,7 @@ fail_bdi:
fail_stats:
bdi_put(q->backing_dev_info);
fail_split:
- bioset_free(q->bio_split);
+ bioset_exit(&q->bio_split);
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
fail_q:
@@ -1173,16 +1175,8 @@ int blk_init_allocated_queue(struct request_queue *q)
q->sg_reserved_size = INT_MAX;
- /* Protect q->elevator from elevator_change */
- mutex_lock(&q->sysfs_lock);
-
- /* init elevator */
- if (elevator_init(q, NULL)) {
- mutex_unlock(&q->sysfs_lock);
+ if (elevator_init(q))
goto out_exit_flush_rq;
- }
-
- mutex_unlock(&q->sysfs_lock);
return 0;
out_exit_flush_rq:
@@ -1334,6 +1328,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
* @op: operation and flags
* @bio: bio to allocate request for (can be %NULL)
* @flags: BLQ_MQ_REQ_* flags
+ * @gfp_mask: allocator flags
*
* Get a free request from @q. This function may fail under memory
* pressure or if @q is dead.
@@ -1343,7 +1338,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
* Returns request pointer on success, with @q->queue_lock *not held*.
*/
static struct request *__get_request(struct request_list *rl, unsigned int op,
- struct bio *bio, blk_mq_req_flags_t flags)
+ struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask)
{
struct request_queue *q = rl->q;
struct request *rq;
@@ -1352,8 +1347,6 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
struct io_cq *icq = NULL;
const bool is_sync = op_is_sync(op);
int may_queue;
- gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
- __GFP_DIRECT_RECLAIM;
req_flags_t rq_flags = RQF_ALLOCED;
lockdep_assert_held(q->queue_lock);
@@ -1517,8 +1510,9 @@ rq_starved:
* @op: operation and flags
* @bio: bio to allocate request for (can be %NULL)
* @flags: BLK_MQ_REQ_* flags.
+ * @gfp: allocator flags
*
- * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
+ * Get a free request from @q. If %BLK_MQ_REQ_NOWAIT is set in @flags,
* this function keeps retrying under memory pressure and fails iff @q is dead.
*
* Must be called with @q->queue_lock held and,
@@ -1526,7 +1520,7 @@ rq_starved:
* Returns request pointer on success, with @q->queue_lock *not held*.
*/
static struct request *get_request(struct request_queue *q, unsigned int op,
- struct bio *bio, blk_mq_req_flags_t flags)
+ struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp)
{
const bool is_sync = op_is_sync(op);
DEFINE_WAIT(wait);
@@ -1538,7 +1532,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op,
rl = blk_get_rl(q, bio); /* transferred to @rq on success */
retry:
- rq = __get_request(rl, op, bio, flags);
+ rq = __get_request(rl, op, bio, flags, gfp);
if (!IS_ERR(rq))
return rq;
@@ -1579,8 +1573,7 @@ static struct request *blk_old_get_request(struct request_queue *q,
unsigned int op, blk_mq_req_flags_t flags)
{
struct request *rq;
- gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
- __GFP_DIRECT_RECLAIM;
+ gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO;
int ret = 0;
WARN_ON_ONCE(q->mq_ops);
@@ -1592,7 +1585,7 @@ static struct request *blk_old_get_request(struct request_queue *q,
if (ret)
return ERR_PTR(ret);
spin_lock_irq(q->queue_lock);
- rq = get_request(q, op, NULL, flags);
+ rq = get_request(q, op, NULL, flags, gfp_mask);
if (IS_ERR(rq)) {
spin_unlock_irq(q->queue_lock);
blk_queue_exit(q);
@@ -1607,13 +1600,13 @@ static struct request *blk_old_get_request(struct request_queue *q,
}
/**
- * blk_get_request_flags - allocate a request
+ * blk_get_request - allocate a request
* @q: request queue to allocate a request for
* @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
* @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
*/
-struct request *blk_get_request_flags(struct request_queue *q, unsigned int op,
- blk_mq_req_flags_t flags)
+struct request *blk_get_request(struct request_queue *q, unsigned int op,
+ blk_mq_req_flags_t flags)
{
struct request *req;
@@ -1632,14 +1625,6 @@ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op,
return req;
}
-EXPORT_SYMBOL(blk_get_request_flags);
-
-struct request *blk_get_request(struct request_queue *q, unsigned int op,
- gfp_t gfp_mask)
-{
- return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ?
- 0 : BLK_MQ_REQ_NOWAIT);
-}
EXPORT_SYMBOL(blk_get_request);
/**
@@ -1660,7 +1645,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
blk_delete_timer(rq);
blk_clear_rq_complete(rq);
trace_block_rq_requeue(q, rq);
- wbt_requeue(q->rq_wb, &rq->issue_stat);
+ wbt_requeue(q->rq_wb, rq);
if (rq->rq_flags & RQF_QUEUED)
blk_queue_end_tag(q, rq);
@@ -1767,7 +1752,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
/* this is a bio leak */
WARN_ON(req->bio != NULL);
- wbt_done(q->rq_wb, &req->issue_stat);
+ wbt_done(q->rq_wb, req);
/*
* Request may not have originated from ll_rw_blk. if not,
@@ -2066,7 +2051,7 @@ get_rq:
* Returns with the queue unlocked.
*/
blk_queue_enter_live(q);
- req = get_request(q, bio->bi_opf, bio, 0);
+ req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
if (IS_ERR(req)) {
blk_queue_exit(q);
__wbt_done(q->rq_wb, wb_acct);
@@ -2078,7 +2063,7 @@ get_rq:
goto out_unlock;
}
- wbt_track(&req->issue_stat, wb_acct);
+ wbt_track(req, wb_acct);
/*
* After dropping the lock and possibly sleeping here, our request
@@ -2392,7 +2377,9 @@ blk_qc_t generic_make_request(struct bio *bio)
if (bio->bi_opf & REQ_NOWAIT)
flags = BLK_MQ_REQ_NOWAIT;
- if (blk_queue_enter(q, flags) < 0) {
+ if (bio_flagged(bio, BIO_QUEUE_ENTERED))
+ blk_queue_enter_live(q);
+ else if (blk_queue_enter(q, flags) < 0) {
if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT))
bio_wouldblock_error(bio);
else
@@ -2727,7 +2714,7 @@ void blk_account_io_completion(struct request *req, unsigned int bytes)
}
}
-void blk_account_io_done(struct request *req)
+void blk_account_io_done(struct request *req, u64 now)
{
/*
* Account IO completion. flush_rq isn't accounted as a
@@ -2735,11 +2722,12 @@ void blk_account_io_done(struct request *req)
* containing request is enough.
*/
if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
- unsigned long duration = jiffies - req->start_time;
+ unsigned long duration;
const int rw = rq_data_dir(req);
struct hd_struct *part;
int cpu;
+ duration = nsecs_to_jiffies(now - req->start_time_ns);
cpu = part_stat_lock();
part = req->part;
@@ -2970,10 +2958,8 @@ static void blk_dequeue_request(struct request *rq)
* and to it is freed is accounted as io that is in progress at
* the driver side.
*/
- if (blk_account_rq(rq)) {
+ if (blk_account_rq(rq))
q->in_flight[rq_is_sync(rq)]++;
- set_io_start_time_ns(rq);
- }
}
/**
@@ -2992,9 +2978,12 @@ void blk_start_request(struct request *req)
blk_dequeue_request(req);
if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
- blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req));
+ req->io_start_time_ns = ktime_get_ns();
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ req->throtl_size = blk_rq_sectors(req);
+#endif
req->rq_flags |= RQF_STATS;
- wbt_issue(req->q->rq_wb, &req->issue_stat);
+ wbt_issue(req->q->rq_wb, req);
}
BUG_ON(blk_rq_is_complete(req));
@@ -3092,8 +3081,10 @@ bool blk_update_request(struct request *req, blk_status_t error,
struct bio *bio = req->bio;
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
- if (bio_bytes == bio->bi_iter.bi_size)
+ if (bio_bytes == bio->bi_iter.bi_size) {
req->bio = bio->bi_next;
+ bio->bi_next = NULL;
+ }
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
@@ -3190,12 +3181,13 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);
void blk_finish_request(struct request *req, blk_status_t error)
{
struct request_queue *q = req->q;
+ u64 now = ktime_get_ns();
lockdep_assert_held(req->q->queue_lock);
WARN_ON_ONCE(q->mq_ops);
if (req->rq_flags & RQF_STATS)
- blk_stat_add(req);
+ blk_stat_add(req, now);
if (req->rq_flags & RQF_QUEUED)
blk_queue_end_tag(q, req);
@@ -3210,10 +3202,10 @@ void blk_finish_request(struct request *req, blk_status_t error)
if (req->rq_flags & RQF_DONTPREP)
blk_unprep_request(req);
- blk_account_io_done(req);
+ blk_account_io_done(req, now);
if (req->end_io) {
- wbt_done(req->q->rq_wb, &req->issue_stat);
+ wbt_done(req->q->rq_wb, req);
req->end_io(req, error);
} else {
if (blk_bidi_rq(req))
@@ -3519,7 +3511,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
struct bio *bio, *bio_src;
if (!bs)
- bs = fs_bio_set;
+ bs = &fs_bio_set;
__rq_for_each_bio(bio_src, rq_src) {
bio = bio_clone_fast(bio_src, gfp_mask, bs);
@@ -3630,7 +3622,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
blk_run_queue_async(q);
else
__blk_run_queue(q);
- spin_unlock(q->queue_lock);
+ spin_unlock_irq(q->queue_lock);
}
static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
@@ -3678,7 +3670,6 @@ EXPORT_SYMBOL(blk_check_plugged);
void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
struct request_queue *q;
- unsigned long flags;
struct request *rq;
LIST_HEAD(list);
unsigned int depth;
@@ -3698,11 +3689,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
q = NULL;
depth = 0;
- /*
- * Save and disable interrupts here, to avoid doing it for every
- * queue lock we have to take.
- */
- local_irq_save(flags);
while (!list_empty(&list)) {
rq = list_entry_rq(list.next);
list_del_init(&rq->queuelist);
@@ -3715,7 +3701,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
queue_unplugged(q, depth, from_schedule);
q = rq->q;
depth = 0;
- spin_lock(q->queue_lock);
+ spin_lock_irq(q->queue_lock);
}
/*
@@ -3742,8 +3728,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
*/
if (q)
queue_unplugged(q, depth, from_schedule);
-
- local_irq_restore(flags);
}
void blk_finish_plug(struct blk_plug *plug)
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index feb30570eaf5..6121611e1316 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -333,34 +333,34 @@ static ssize_t integrity_device_show(struct blk_integrity *bi, char *page)
}
static struct integrity_sysfs_entry integrity_format_entry = {
- .attr = { .name = "format", .mode = S_IRUGO },
+ .attr = { .name = "format", .mode = 0444 },
.show = integrity_format_show,
};
static struct integrity_sysfs_entry integrity_tag_size_entry = {
- .attr = { .name = "tag_size", .mode = S_IRUGO },
+ .attr = { .name = "tag_size", .mode = 0444 },
.show = integrity_tag_size_show,
};
static struct integrity_sysfs_entry integrity_interval_entry = {
- .attr = { .name = "protection_interval_bytes", .mode = S_IRUGO },
+ .attr = { .name = "protection_interval_bytes", .mode = 0444 },
.show = integrity_interval_show,
};
static struct integrity_sysfs_entry integrity_verify_entry = {
- .attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR },
+ .attr = { .name = "read_verify", .mode = 0644 },
.show = integrity_verify_show,
.store = integrity_verify_store,
};
static struct integrity_sysfs_entry integrity_generate_entry = {
- .attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR },
+ .attr = { .name = "write_generate", .mode = 0644 },
.show = integrity_generate_show,
.store = integrity_generate_store,
};
static struct integrity_sysfs_entry integrity_device_entry = {
- .attr = { .name = "device_is_integrity_capable", .mode = S_IRUGO },
+ .attr = { .name = "device_is_integrity_capable", .mode = 0444 },
.show = integrity_device_show,
};
diff --git a/block/blk-lib.c b/block/blk-lib.c
index a676084d4740..8faa70f26fcd 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -62,10 +62,16 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
unsigned int req_sects;
sector_t end_sect, tmp;
- /* Make sure bi_size doesn't overflow */
- req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9);
+ /*
+ * Issue in chunks of the user defined max discard setting,
+ * ensuring that bi_size doesn't overflow
+ */
+ req_sects = min_t(sector_t, nr_sects,
+ q->limits.max_discard_sectors);
+ if (req_sects > UINT_MAX >> 9)
+ req_sects = UINT_MAX >> 9;
- /**
+ /*
* If splitting a request, and the next starting sector would be
* misaligned, stop the discard at the previous aligned sector.
*/
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 782940c65d8a..aaec38cc37b8 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -188,16 +188,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
switch (bio_op(*bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
- split = blk_bio_discard_split(q, *bio, q->bio_split, &nsegs);
+ split = blk_bio_discard_split(q, *bio, &q->bio_split, &nsegs);
break;
case REQ_OP_WRITE_ZEROES:
- split = blk_bio_write_zeroes_split(q, *bio, q->bio_split, &nsegs);
+ split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, &nsegs);
break;
case REQ_OP_WRITE_SAME:
- split = blk_bio_write_same_split(q, *bio, q->bio_split, &nsegs);
+ split = blk_bio_write_same_split(q, *bio, &q->bio_split, &nsegs);
break;
default:
- split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs);
+ split = blk_bio_segment_split(q, *bio, &q->bio_split, &nsegs);
break;
}
@@ -210,6 +210,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
/* there isn't chance to merge the splitted bio */
split->bi_opf |= REQ_NOMERGE;
+ /*
+ * Since we're recursing into make_request here, ensure
+ * that we mark this bio as already having entered the queue.
+ * If not, and the queue is going away, we can get stuck
+ * forever on waiting for the queue reference to drop. But
+ * that will never happen, as we're already holding a
+ * reference to it.
+ */
+ bio_set_flag(*bio, BIO_QUEUE_ENTERED);
+
bio_chain(split, *bio);
trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
generic_make_request(*bio);
@@ -724,13 +734,12 @@ static struct request *attempt_merge(struct request_queue *q,
}
/*
- * At this point we have either done a back merge
- * or front merge. We need the smaller start_time of
- * the merged requests to be the current request
- * for accounting purposes.
+ * At this point we have either done a back merge or front merge. We
+ * need the smaller start_time_ns of the merged requests to be the
+ * current request for accounting purposes.
*/
- if (time_after(req->start_time, next->start_time))
- req->start_time = next->start_time;
+ if (next->start_time_ns < req->start_time_ns)
+ req->start_time_ns = next->start_time_ns;
req->biotail->bi_next = next->bio;
req->biotail = next->biotail;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3080e18cb859..ffa622366922 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -344,7 +344,6 @@ static const char *const rqf_name[] = {
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_LOCKED),
- RQF_NAME(MQ_TIMEOUT_EXPIRED),
RQF_NAME(MQ_POLL_SLEPT),
};
#undef RQF_NAME
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 25c14c58385c..56c493c6cd90 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -268,19 +268,16 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
/*
- * Reverse check our software queue for entries that we could potentially
- * merge with. Currently includes a hand-wavy stop count of 8, to not spend
- * too much time checking for merges.
+ * Iterate list of requests and see if we can merge this bio with any
+ * of them.
*/
-static bool blk_mq_attempt_merge(struct request_queue *q,
- struct blk_mq_ctx *ctx, struct bio *bio)
+bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
+ struct bio *bio)
{
struct request *rq;
int checked = 8;
- lockdep_assert_held(&ctx->lock);
-
- list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
+ list_for_each_entry_reverse(rq, list, queuelist) {
bool merged = false;
if (!checked--)
@@ -305,13 +302,30 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
continue;
}
- if (merged)
- ctx->rq_merged++;
return merged;
}
return false;
}
+EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
+
+/*
+ * Reverse check our software queue for entries that we could potentially
+ * merge with. Currently includes a hand-wavy stop count of 8, to not spend
+ * too much time checking for merges.
+ */
+static bool blk_mq_attempt_merge(struct request_queue *q,
+ struct blk_mq_ctx *ctx, struct bio *bio)
+{
+ lockdep_assert_held(&ctx->lock);
+
+ if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) {
+ ctx->rq_merged++;
+ return true;
+ }
+
+ return false;
+}
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
@@ -571,6 +585,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
if (!e) {
q->elevator = NULL;
+ q->nr_requests = q->tag_set->queue_depth;
return 0;
}
@@ -633,14 +648,3 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
blk_mq_sched_tags_teardown(q);
q->elevator = NULL;
}
-
-int blk_mq_sched_init(struct request_queue *q)
-{
- int ret;
-
- mutex_lock(&q->sysfs_lock);
- ret = elevator_init(q, NULL);
- mutex_unlock(&q->sysfs_lock);
-
- return ret;
-}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 1e9c9018ace1..0cb8f938dff9 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -33,8 +33,6 @@ int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx);
-int blk_mq_sched_init(struct request_queue *q);
-
static inline bool
blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index a54b4b070f1c..aafb44224c89 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -166,15 +166,15 @@ static struct attribute *default_ctx_attrs[] = {
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = {
- .attr = {.name = "nr_tags", .mode = S_IRUGO },
+ .attr = {.name = "nr_tags", .mode = 0444 },
.show = blk_mq_hw_sysfs_nr_tags_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = {
- .attr = {.name = "nr_reserved_tags", .mode = S_IRUGO },
+ .attr = {.name = "nr_reserved_tags", .mode = 0444 },
.show = blk_mq_hw_sysfs_nr_reserved_tags_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
- .attr = {.name = "cpu_list", .mode = S_IRUGO },
+ .attr = {.name = "cpu_list", .mode = 0444 },
.show = blk_mq_hw_sysfs_cpus_show,
};
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 336dde07b230..70356a2a11ab 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -134,6 +134,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
ws = bt_wait_ptr(bt, data->hctx);
drop_ctx = data->ctx == NULL;
do {
+ struct sbitmap_queue *bt_prev;
+
/*
* We're out of tags on this hardware queue, kick any
* pending IO submits before going to sleep waiting for
@@ -159,6 +161,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (data->ctx)
blk_mq_put_ctx(data->ctx);
+ bt_prev = bt;
io_schedule();
data->ctx = blk_mq_get_ctx(data->q);
@@ -170,6 +173,15 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
bt = &tags->bitmap_tags;
finish_wait(&ws->wait, &wait);
+
+ /*
+ * If destination hw queue is changed, fake wake up on
+ * previous queue for compensating the wake up miss, so
+ * other allocations on previous queue won't be starved.
+ */
+ if (bt != bt_prev)
+ sbitmap_queue_wake_up(bt_prev);
+
ws = bt_wait_ptr(bt, data->hctx);
} while (1);
@@ -259,7 +271,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* test and set the bit before assining ->rqs[].
*/
rq = tags->rqs[bitnr];
- if (rq)
+ if (rq && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
iter_data->fn(rq, iter_data->data, reserved);
return true;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9ce9cac16c3f..6332940ca118 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -309,7 +309,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
RB_CLEAR_NODE(&rq->rb_node);
rq->rq_disk = NULL;
rq->part = NULL;
- rq->start_time = jiffies;
+ rq->start_time_ns = ktime_get_ns();
+ rq->io_start_time_ns = 0;
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
@@ -328,11 +329,10 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
#ifdef CONFIG_BLK_CGROUP
rq->rl = NULL;
- set_start_time_ns(rq);
- rq->io_start_time_ns = 0;
#endif
data->ctx->rq_dispatched[op_is_sync(op)]++;
+ refcount_set(&rq->ref, 1);
return rq;
}
@@ -361,9 +361,11 @@ static struct request *blk_mq_get_request(struct request_queue *q,
/*
* Flush requests are special and go directly to the
- * dispatch list.
+ * dispatch list. Don't include reserved tags in the
+ * limiting, as it isn't useful.
*/
- if (!op_is_flush(op) && e->type->ops.mq.limit_depth)
+ if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
+ !(data->flags & BLK_MQ_REQ_RESERVED))
e->type->ops.mq.limit_depth(op, data);
}
@@ -464,13 +466,27 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
+static void __blk_mq_free_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+ const int sched_tag = rq->internal_tag;
+
+ if (rq->tag != -1)
+ blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
+ if (sched_tag != -1)
+ blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
+ blk_mq_sched_restart(hctx);
+ blk_queue_exit(q);
+}
+
void blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
- const int sched_tag = rq->internal_tag;
if (rq->rq_flags & RQF_ELVPRIV) {
if (e && e->type->ops.mq.finish_request)
@@ -488,27 +504,30 @@ void blk_mq_free_request(struct request *rq)
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
laptop_io_completion(q->backing_dev_info);
- wbt_done(q->rq_wb, &rq->issue_stat);
+ wbt_done(q->rq_wb, rq);
if (blk_rq_rl(rq))
blk_put_rl(blk_rq_rl(rq));
- blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
- if (rq->tag != -1)
- blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
- if (sched_tag != -1)
- blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
- blk_mq_sched_restart(hctx);
- blk_queue_exit(q);
+ WRITE_ONCE(rq->state, MQ_RQ_IDLE);
+ if (refcount_dec_and_test(&rq->ref))
+ __blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
- blk_account_io_done(rq);
+ u64 now = ktime_get_ns();
+
+ if (rq->rq_flags & RQF_STATS) {
+ blk_mq_poll_stats_start(rq->q);
+ blk_stat_add(rq, now);
+ }
+
+ blk_account_io_done(rq, now);
if (rq->end_io) {
- wbt_done(rq->q->rq_wb, &rq->issue_stat);
+ wbt_done(rq->q->rq_wb, rq);
rq->end_io(rq, error);
} else {
if (unlikely(blk_bidi_rq(rq)))
@@ -539,15 +558,12 @@ static void __blk_mq_complete_request(struct request *rq)
bool shared = false;
int cpu;
- WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
- blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE);
+ if (cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) !=
+ MQ_RQ_IN_FLIGHT)
+ return;
if (rq->internal_tag != -1)
blk_mq_sched_completed_request(rq);
- if (rq->rq_flags & RQF_STATS) {
- blk_mq_poll_stats_start(rq->q);
- blk_stat_add(rq);
- }
if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
rq->q->softirq_done_fn(rq);
@@ -589,36 +605,6 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
*srcu_idx = srcu_read_lock(hctx->srcu);
}
-static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
-{
- unsigned long flags;
-
- /*
- * blk_mq_rq_aborted_gstate() is used from the completion path and
- * can thus be called from irq context. u64_stats_fetch in the
- * middle of update on the same CPU leads to lockup. Disable irq
- * while updating.
- */
- local_irq_save(flags);
- u64_stats_update_begin(&rq->aborted_gstate_sync);
- rq->aborted_gstate = gstate;
- u64_stats_update_end(&rq->aborted_gstate_sync);
- local_irq_restore(flags);
-}
-
-static u64 blk_mq_rq_aborted_gstate(struct request *rq)
-{
- unsigned int start;
- u64 aborted_gstate;
-
- do {
- start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
- aborted_gstate = rq->aborted_gstate;
- } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
-
- return aborted_gstate;
-}
-
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
@@ -629,28 +615,9 @@ static u64 blk_mq_rq_aborted_gstate(struct request *rq)
**/
void blk_mq_complete_request(struct request *rq)
{
- struct request_queue *q = rq->q;
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
- int srcu_idx;
-
- if (unlikely(blk_should_fake_timeout(q)))
+ if (unlikely(blk_should_fake_timeout(rq->q)))
return;
-
- /*
- * If @rq->aborted_gstate equals the current instance, timeout is
- * claiming @rq and we lost. This is synchronized through
- * hctx_lock(). See blk_mq_timeout_work() for details.
- *
- * Completion path never blocks and we can directly use RCU here
- * instead of hctx_lock() which can be either RCU or SRCU.
- * However, that would complicate paths which want to synchronize
- * against us. Let stay in sync with the issue path so that
- * hctx_lock() covers both issue and completion paths.
- */
- hctx_lock(hctx, &srcu_idx);
- if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
- __blk_mq_complete_request(rq);
- hctx_unlock(hctx, srcu_idx);
+ __blk_mq_complete_request(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
@@ -669,32 +636,18 @@ void blk_mq_start_request(struct request *rq)
trace_block_rq_issue(q, rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
- blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq));
+ rq->io_start_time_ns = ktime_get_ns();
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ rq->throtl_size = blk_rq_sectors(rq);
+#endif
rq->rq_flags |= RQF_STATS;
- wbt_issue(q->rq_wb, &rq->issue_stat);
+ wbt_issue(q->rq_wb, rq);
}
WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
- /*
- * Mark @rq in-flight which also advances the generation number,
- * and register for timeout. Protect with a seqcount to allow the
- * timeout path to read both @rq->gstate and @rq->deadline
- * coherently.
- *
- * This is the only place where a request is marked in-flight. If
- * the timeout path reads an in-flight @rq->gstate, the
- * @rq->deadline it reads together under @rq->gstate_seq is
- * guaranteed to be the matching one.
- */
- preempt_disable();
- write_seqcount_begin(&rq->gstate_seq);
-
- blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
blk_add_timer(rq);
-
- write_seqcount_end(&rq->gstate_seq);
- preempt_enable();
+ WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
if (q->dma_drain_size && blk_rq_bytes(rq)) {
/*
@@ -707,11 +660,6 @@ void blk_mq_start_request(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_start_request);
-/*
- * When we reach here because queue is busy, it's safe to change the state
- * to IDLE without checking @rq->aborted_gstate because we should still be
- * holding the RCU read lock and thus protected against timeout.
- */
static void __blk_mq_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
@@ -719,10 +667,10 @@ static void __blk_mq_requeue_request(struct request *rq)
blk_mq_put_driver_tag(rq);
trace_block_rq_requeue(q, rq);
- wbt_requeue(q->rq_wb, &rq->issue_stat);
+ wbt_requeue(q->rq_wb, rq);
- if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) {
- blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
+ if (blk_mq_request_started(rq)) {
+ WRITE_ONCE(rq->state, MQ_RQ_IDLE);
if (q->dma_drain_size && blk_rq_bytes(rq))
rq->nr_phys_segments--;
}
@@ -820,101 +768,79 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);
-struct blk_mq_timeout_data {
- unsigned long next;
- unsigned int next_set;
- unsigned int nr_expired;
-};
-
static void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
- const struct blk_mq_ops *ops = req->q->mq_ops;
- enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
-
- req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
-
- if (ops->timeout)
- ret = ops->timeout(req, reserved);
+ if (req->q->mq_ops->timeout) {
+ enum blk_eh_timer_return ret;
- switch (ret) {
- case BLK_EH_HANDLED:
- __blk_mq_complete_request(req);
- break;
- case BLK_EH_RESET_TIMER:
- /*
- * As nothing prevents from completion happening while
- * ->aborted_gstate is set, this may lead to ignored
- * completions and further spurious timeouts.
- */
- blk_mq_rq_update_aborted_gstate(req, 0);
- blk_add_timer(req);
- break;
- case BLK_EH_NOT_HANDLED:
- break;
- default:
- printk(KERN_ERR "block: bad eh return: %d\n", ret);
- break;
+ ret = req->q->mq_ops->timeout(req, reserved);
+ if (ret == BLK_EH_DONE)
+ return;
+ WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
}
+
+ blk_add_timer(req);
}
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
- struct request *rq, void *priv, bool reserved)
+static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
{
- struct blk_mq_timeout_data *data = priv;
- unsigned long gstate, deadline;
- int start;
+ unsigned long deadline;
- might_sleep();
-
- if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED)
- return;
+ if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
+ return false;
- /* read coherent snapshots of @rq->state_gen and @rq->deadline */
- while (true) {
- start = read_seqcount_begin(&rq->gstate_seq);
- gstate = READ_ONCE(rq->gstate);
- deadline = blk_rq_deadline(rq);
- if (!read_seqcount_retry(&rq->gstate_seq, start))
- break;
- cond_resched();
- }
+ deadline = blk_rq_deadline(rq);
+ if (time_after_eq(jiffies, deadline))
+ return true;
- /* if in-flight && overdue, mark for abortion */
- if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
- time_after_eq(jiffies, deadline)) {
- blk_mq_rq_update_aborted_gstate(rq, gstate);
- data->nr_expired++;
- hctx->nr_expired++;
- } else if (!data->next_set || time_after(data->next, deadline)) {
- data->next = deadline;
- data->next_set = 1;
- }
+ if (*next == 0)
+ *next = deadline;
+ else if (time_after(*next, deadline))
+ *next = deadline;
+ return false;
}
-static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
+ unsigned long *next = priv;
+
/*
- * We marked @rq->aborted_gstate and waited for RCU. If there were
- * completions that we lost to, they would have finished and
- * updated @rq->gstate by now; otherwise, the completion path is
- * now guaranteed to see @rq->aborted_gstate and yield. If
- * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+ * Just do a quick check if it is expired before locking the request in
+ * so we're not unnecessarilly synchronizing across CPUs.
+ */
+ if (!blk_mq_req_expired(rq, next))
+ return;
+
+ /*
+ * We have reason to believe the request may be expired. Take a
+ * reference on the request to lock this request lifetime into its
+ * currently allocated context to prevent it from being reallocated in
+ * the event the completion by-passes this timeout handler.
+ *
+ * If the reference was already released, then the driver beat the
+ * timeout handler to posting a natural completion.
*/
- if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
- READ_ONCE(rq->gstate) == rq->aborted_gstate)
+ if (!refcount_inc_not_zero(&rq->ref))
+ return;
+
+ /*
+ * The request is now locked and cannot be reallocated underneath the
+ * timeout handler's processing. Re-verify this exact request is truly
+ * expired; if it is not expired, then the request was completed and
+ * reallocated as a new request.
+ */
+ if (blk_mq_req_expired(rq, next))
blk_mq_rq_timed_out(rq, reserved);
+ if (refcount_dec_and_test(&rq->ref))
+ __blk_mq_free_request(rq);
}
static void blk_mq_timeout_work(struct work_struct *work)
{
struct request_queue *q =
container_of(work, struct request_queue, timeout_work);
- struct blk_mq_timeout_data data = {
- .next = 0,
- .next_set = 0,
- .nr_expired = 0,
- };
+ unsigned long next = 0;
struct blk_mq_hw_ctx *hctx;
int i;
@@ -934,39 +860,10 @@ static void blk_mq_timeout_work(struct work_struct *work)
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
- /* scan for the expired ones and set their ->aborted_gstate */
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
-
- if (data.nr_expired) {
- bool has_rcu = false;
-
- /*
- * Wait till everyone sees ->aborted_gstate. The
- * sequential waits for SRCUs aren't ideal. If this ever
- * becomes a problem, we can add per-hw_ctx rcu_head and
- * wait in parallel.
- */
- queue_for_each_hw_ctx(q, hctx, i) {
- if (!hctx->nr_expired)
- continue;
-
- if (!(hctx->flags & BLK_MQ_F_BLOCKING))
- has_rcu = true;
- else
- synchronize_srcu(hctx->srcu);
-
- hctx->nr_expired = 0;
- }
- if (has_rcu)
- synchronize_rcu();
-
- /* terminate the ones we won */
- blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
- }
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
- if (data.next_set) {
- data.next = blk_rq_timeout(round_jiffies_up(data.next));
- mod_timer(&q->timeout, data.next);
+ if (next != 0) {
+ mod_timer(&q->timeout, next);
} else {
/*
* Request timeouts are handled as a forward rolling timer. If
@@ -1029,7 +926,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
spin_lock(&ctx->lock);
- if (unlikely(!list_empty(&ctx->rq_list))) {
+ if (!list_empty(&ctx->rq_list)) {
dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
list_del_init(&dispatch_data->rq->queuelist);
if (list_empty(&ctx->rq_list))
@@ -1716,15 +1613,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
blk_account_io_start(rq, true);
}
-static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx,
- struct request *rq)
-{
- spin_lock(&ctx->lock);
- __blk_mq_insert_request(hctx, rq, false);
- spin_unlock(&ctx->lock);
-}
-
static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
if (rq->tag != -1)
@@ -1882,7 +1770,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
return BLK_QC_T_NONE;
}
- wbt_track(&rq->issue_stat, wb_acct);
+ wbt_track(rq, wb_acct);
cookie = request_to_qc_t(data.hctx, rq);
@@ -1949,15 +1837,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
blk_mq_try_issue_directly(data.hctx, rq, &cookie);
- } else if (q->elevator) {
- blk_mq_put_ctx(data.ctx);
- blk_mq_bio_to_request(rq, bio);
- blk_mq_sched_insert_request(rq, false, true, true);
} else {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
- blk_mq_queue_io(data.hctx, data.ctx, rq);
- blk_mq_run_hw_queue(data.hctx, true);
+ blk_mq_sched_insert_request(rq, false, true, true);
}
return cookie;
@@ -2056,15 +1939,7 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
return ret;
}
- seqcount_init(&rq->gstate_seq);
- u64_stats_init(&rq->aborted_gstate_sync);
- /*
- * start gstate with gen 1 instead of 0, otherwise it will be equal
- * to aborted_gstate, and be identified timed out by
- * blk_mq_terminate_expired.
- */
- WRITE_ONCE(rq->gstate, MQ_RQ_GEN_INC);
-
+ WRITE_ONCE(rq->state, MQ_RQ_IDLE);
return 0;
}
@@ -2365,6 +2240,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
+ hctx->dispatch_from = NULL;
}
/*
@@ -2697,7 +2573,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
int ret;
- ret = blk_mq_sched_init(q);
+ ret = elevator_init_mq(q);
if (ret)
return ERR_PTR(ret);
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e1bb420dc5d6..89231e439b2f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -30,20 +30,6 @@ struct blk_mq_ctx {
struct kobject kobj;
} ____cacheline_aligned_in_smp;
-/*
- * Bits for request->gstate. The lower two bits carry MQ_RQ_* state value
- * and the upper bits the generation number.
- */
-enum mq_rq_state {
- MQ_RQ_IDLE = 0,
- MQ_RQ_IN_FLIGHT = 1,
- MQ_RQ_COMPLETE = 2,
-
- MQ_RQ_STATE_BITS = 2,
- MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1,
- MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS,
-};
-
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -107,33 +93,9 @@ void blk_mq_release(struct request_queue *q);
* blk_mq_rq_state() - read the current MQ_RQ_* state of a request
* @rq: target request.
*/
-static inline int blk_mq_rq_state(struct request *rq)
+static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
- return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
-}
-
-/**
- * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
- * @rq: target request.
- * @state: new state to set.
- *
- * Set @rq's state to @state. The caller is responsible for ensuring that
- * there are no other updaters. A request can transition into IN_FLIGHT
- * only from IDLE and doing so increments the generation number.
- */
-static inline void blk_mq_rq_update_state(struct request *rq,
- enum mq_rq_state state)
-{
- u64 old_val = READ_ONCE(rq->gstate);
- u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
-
- if (state == MQ_RQ_IN_FLIGHT) {
- WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
- new_val += MQ_RQ_GEN_INC;
- }
-
- /* avoid exposing interim values */
- WRITE_ONCE(rq->gstate, new_val);
+ return READ_ONCE(rq->state);
}
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
diff --git a/block/blk-stat.c b/block/blk-stat.c
index bd365a95fcf8..175c143ac5b9 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -47,19 +47,15 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
stat->nr_samples++;
}
-void blk_stat_add(struct request *rq)
+void blk_stat_add(struct request *rq, u64 now)
{
struct request_queue *q = rq->q;
struct blk_stat_callback *cb;
struct blk_rq_stat *stat;
int bucket;
- u64 now, value;
+ u64 value;
- now = __blk_stat_time(ktime_to_ns(ktime_get()));
- if (now < blk_stat_time(&rq->issue_stat))
- return;
-
- value = now - blk_stat_time(&rq->issue_stat);
+ value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
blk_throtl_stat_add(rq, value);
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 2dd36347252a..78399cdde9c9 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -8,21 +8,6 @@
#include <linux/rcupdate.h>
#include <linux/timer.h>
-/*
- * from upper:
- * 3 bits: reserved for other usage
- * 12 bits: size
- * 49 bits: time
- */
-#define BLK_STAT_RES_BITS 3
-#define BLK_STAT_SIZE_BITS 12
-#define BLK_STAT_RES_SHIFT (64 - BLK_STAT_RES_BITS)
-#define BLK_STAT_SIZE_SHIFT (BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS)
-#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SIZE_SHIFT) - 1)
-#define BLK_STAT_SIZE_MASK \
- (((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT)
-#define BLK_STAT_RES_MASK (~((1ULL << BLK_STAT_RES_SHIFT) - 1))
-
/**
* struct blk_stat_callback - Block statistics callback.
*
@@ -80,35 +65,7 @@ struct blk_stat_callback {
struct blk_queue_stats *blk_alloc_queue_stats(void);
void blk_free_queue_stats(struct blk_queue_stats *);
-void blk_stat_add(struct request *);
-
-static inline u64 __blk_stat_time(u64 time)
-{
- return time & BLK_STAT_TIME_MASK;
-}
-
-static inline u64 blk_stat_time(struct blk_issue_stat *stat)
-{
- return __blk_stat_time(stat->stat);
-}
-
-static inline sector_t blk_capped_size(sector_t size)
-{
- return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1);
-}
-
-static inline sector_t blk_stat_size(struct blk_issue_stat *stat)
-{
- return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT;
-}
-
-static inline void blk_stat_set_issue(struct blk_issue_stat *stat,
- sector_t size)
-{
- stat->stat = (stat->stat & BLK_STAT_RES_MASK) |
- (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) |
- (((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT);
-}
+void blk_stat_add(struct request *rq, u64 now);
/* record time/size info in request but not add a callback */
void blk_stat_enable_accounting(struct request_queue *q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d00d1b0ec109..94987b1f69e1 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -491,188 +491,198 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page,
return count;
}
+static ssize_t queue_fua_show(struct request_queue *q, char *page)
+{
+ return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags));
+}
+
static ssize_t queue_dax_show(struct request_queue *q, char *page)
{
return queue_var_show(blk_queue_dax(q), page);
}
static struct queue_sysfs_entry queue_requests_entry = {
- .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "nr_requests", .mode = 0644 },
.show = queue_requests_show,
.store = queue_requests_store,
};
static struct queue_sysfs_entry queue_ra_entry = {
- .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "read_ahead_kb", .mode = 0644 },
.show = queue_ra_show,
.store = queue_ra_store,
};
static struct queue_sysfs_entry queue_max_sectors_entry = {
- .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "max_sectors_kb", .mode = 0644 },
.show = queue_max_sectors_show,
.store = queue_max_sectors_store,
};
static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
- .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
+ .attr = {.name = "max_hw_sectors_kb", .mode = 0444 },
.show = queue_max_hw_sectors_show,
};
static struct queue_sysfs_entry queue_max_segments_entry = {
- .attr = {.name = "max_segments", .mode = S_IRUGO },
+ .attr = {.name = "max_segments", .mode = 0444 },
.show = queue_max_segments_show,
};
static struct queue_sysfs_entry queue_max_discard_segments_entry = {
- .attr = {.name = "max_discard_segments", .mode = S_IRUGO },
+ .attr = {.name = "max_discard_segments", .mode = 0444 },
.show = queue_max_discard_segments_show,
};
static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
- .attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
+ .attr = {.name = "max_integrity_segments", .mode = 0444 },
.show = queue_max_integrity_segments_show,
};
static struct queue_sysfs_entry queue_max_segment_size_entry = {
- .attr = {.name = "max_segment_size", .mode = S_IRUGO },
+ .attr = {.name = "max_segment_size", .mode = 0444 },
.show = queue_max_segment_size_show,
};
static struct queue_sysfs_entry queue_iosched_entry = {
- .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "scheduler", .mode = 0644 },
.show = elv_iosched_show,
.store = elv_iosched_store,
};
static struct queue_sysfs_entry queue_hw_sector_size_entry = {
- .attr = {.name = "hw_sector_size", .mode = S_IRUGO },
+ .attr = {.name = "hw_sector_size", .mode = 0444 },
.show = queue_logical_block_size_show,
};
static struct queue_sysfs_entry queue_logical_block_size_entry = {
- .attr = {.name = "logical_block_size", .mode = S_IRUGO },
+ .attr = {.name = "logical_block_size", .mode = 0444 },
.show = queue_logical_block_size_show,
};
static struct queue_sysfs_entry queue_physical_block_size_entry = {
- .attr = {.name = "physical_block_size", .mode = S_IRUGO },
+ .attr = {.name = "physical_block_size", .mode = 0444 },
.show = queue_physical_block_size_show,
};
static struct queue_sysfs_entry queue_chunk_sectors_entry = {
- .attr = {.name = "chunk_sectors", .mode = S_IRUGO },
+ .attr = {.name = "chunk_sectors", .mode = 0444 },
.show = queue_chunk_sectors_show,
};
static struct queue_sysfs_entry queue_io_min_entry = {
- .attr = {.name = "minimum_io_size", .mode = S_IRUGO },
+ .attr = {.name = "minimum_io_size", .mode = 0444 },
.show = queue_io_min_show,
};
static struct queue_sysfs_entry queue_io_opt_entry = {
- .attr = {.name = "optimal_io_size", .mode = S_IRUGO },
+ .attr = {.name = "optimal_io_size", .mode = 0444 },
.show = queue_io_opt_show,
};
static struct queue_sysfs_entry queue_discard_granularity_entry = {
- .attr = {.name = "discard_granularity", .mode = S_IRUGO },
+ .attr = {.name = "discard_granularity", .mode = 0444 },
.show = queue_discard_granularity_show,
};
static struct queue_sysfs_entry queue_discard_max_hw_entry = {
- .attr = {.name = "discard_max_hw_bytes", .mode = S_IRUGO },
+ .attr = {.name = "discard_max_hw_bytes", .mode = 0444 },
.show = queue_discard_max_hw_show,
};
static struct queue_sysfs_entry queue_discard_max_entry = {
- .attr = {.name = "discard_max_bytes", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "discard_max_bytes", .mode = 0644 },
.show = queue_discard_max_show,
.store = queue_discard_max_store,
};
static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
- .attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
+ .attr = {.name = "discard_zeroes_data", .mode = 0444 },
.show = queue_discard_zeroes_data_show,
};
static struct queue_sysfs_entry queue_write_same_max_entry = {
- .attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
+ .attr = {.name = "write_same_max_bytes", .mode = 0444 },
.show = queue_write_same_max_show,
};
static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
- .attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO },
+ .attr = {.name = "write_zeroes_max_bytes", .mode = 0444 },
.show = queue_write_zeroes_max_show,
};
static struct queue_sysfs_entry queue_nonrot_entry = {
- .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "rotational", .mode = 0644 },
.show = queue_show_nonrot,
.store = queue_store_nonrot,
};
static struct queue_sysfs_entry queue_zoned_entry = {
- .attr = {.name = "zoned", .mode = S_IRUGO },
+ .attr = {.name = "zoned", .mode = 0444 },
.show = queue_zoned_show,
};
static struct queue_sysfs_entry queue_nomerges_entry = {
- .attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "nomerges", .mode = 0644 },
.show = queue_nomerges_show,
.store = queue_nomerges_store,
};
static struct queue_sysfs_entry queue_rq_affinity_entry = {
- .attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "rq_affinity", .mode = 0644 },
.show = queue_rq_affinity_show,
.store = queue_rq_affinity_store,
};
static struct queue_sysfs_entry queue_iostats_entry = {
- .attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "iostats", .mode = 0644 },
.show = queue_show_iostats,
.store = queue_store_iostats,
};
static struct queue_sysfs_entry queue_random_entry = {
- .attr = {.name = "add_random", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "add_random", .mode = 0644 },
.show = queue_show_random,
.store = queue_store_random,
};
static struct queue_sysfs_entry queue_poll_entry = {
- .attr = {.name = "io_poll", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "io_poll", .mode = 0644 },
.show = queue_poll_show,
.store = queue_poll_store,
};
static struct queue_sysfs_entry queue_poll_delay_entry = {
- .attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "io_poll_delay", .mode = 0644 },
.show = queue_poll_delay_show,
.store = queue_poll_delay_store,
};
static struct queue_sysfs_entry queue_wc_entry = {
- .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "write_cache", .mode = 0644 },
.show = queue_wc_show,
.store = queue_wc_store,
};
+static struct queue_sysfs_entry queue_fua_entry = {
+ .attr = {.name = "fua", .mode = 0444 },
+ .show = queue_fua_show,
+};
+
static struct queue_sysfs_entry queue_dax_entry = {
- .attr = {.name = "dax", .mode = S_IRUGO },
+ .attr = {.name = "dax", .mode = 0444 },
.show = queue_dax_show,
};
static struct queue_sysfs_entry queue_wb_lat_entry = {
- .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "wbt_lat_usec", .mode = 0644 },
.show = queue_wb_lat_show,
.store = queue_wb_lat_store,
};
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static struct queue_sysfs_entry throtl_sample_time_entry = {
- .attr = {.name = "throttle_sample_time", .mode = S_IRUGO | S_IWUSR },
+ .attr = {.name = "throttle_sample_time", .mode = 0644 },
.show = blk_throtl_sample_time_show,
.store = blk_throtl_sample_time_store,
};
@@ -708,6 +718,7 @@ static struct attribute *default_attrs[] = {
&queue_random_entry.attr,
&queue_poll_entry.attr,
&queue_wc_entry.attr,
+ &queue_fua_entry.attr,
&queue_dax_entry.attr,
&queue_wb_lat_entry.attr,
&queue_poll_delay_entry.attr,
@@ -813,8 +824,7 @@ static void __blk_release_queue(struct work_struct *work)
if (q->mq_ops)
blk_mq_debugfs_unregister(q);
- if (q->bio_split)
- bioset_free(q->bio_split);
+ bioset_exit(&q->bio_split);
ida_simple_remove(&blk_queue_ida, q->id);
call_rcu(&q->rcu_head, blk_free_queue_rcu);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c5a131673733..82282e6fdcf8 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -36,8 +36,6 @@ static int throtl_quantum = 32;
*/
#define LATENCY_FILTERED_HD (1000L) /* 1ms */
-#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
-
static struct blkcg_policy blkcg_policy_throtl;
/* A workqueue to queue throttle related work */
@@ -821,7 +819,7 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
return false;
- return 1;
+ return true;
}
/* Trim the used slices and adjust slice start accordingly */
@@ -931,7 +929,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
if (wait)
*wait = jiffy_wait;
- return 0;
+ return false;
}
static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
@@ -974,7 +972,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
if (wait)
*wait = jiffy_wait;
- return 0;
+ return false;
}
/*
@@ -1024,7 +1022,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
tg_with_in_iops_limit(tg, bio, &iops_wait)) {
if (wait)
*wait = 0;
- return 1;
+ return true;
}
max_wait = max(bps_wait, iops_wait);
@@ -1035,7 +1033,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
if (time_before(tg->slice_end[rw], jiffies + max_wait))
throtl_extend_slice(tg, rw, jiffies + max_wait);
- return 0;
+ return false;
}
static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
@@ -1209,7 +1207,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
while (1) {
struct throtl_grp *tg = throtl_rb_first(parent_sq);
- struct throtl_service_queue *sq = &tg->service_queue;
+ struct throtl_service_queue *sq;
if (!tg)
break;
@@ -1221,6 +1219,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
nr_disp += throtl_dispatch_tg(tg);
+ sq = &tg->service_queue;
if (sq->nr_queued[0] || sq->nr_queued[1])
tg_update_disptime(tg);
@@ -2139,7 +2138,7 @@ static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
bio->bi_cg_private = tg;
blkg_get(tg_to_blkg(tg));
}
- blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
+ bio_issue_init(&bio->bi_issue, bio_sectors(bio));
#endif
}
@@ -2251,7 +2250,7 @@ out:
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
if (throttled || !td->track_bio_latency)
- bio->bi_issue_stat.stat |= SKIP_LATENCY;
+ bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
#endif
return throttled;
}
@@ -2281,8 +2280,7 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
struct request_queue *q = rq->q;
struct throtl_data *td = q->td;
- throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
- req_op(rq), time_ns >> 10);
+ throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10);
}
void blk_throtl_bio_endio(struct bio *bio)
@@ -2302,8 +2300,8 @@ void blk_throtl_bio_endio(struct bio *bio)
finish_time_ns = ktime_get_ns();
tg->last_finish_time = finish_time_ns >> 10;
- start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
- finish_time = __blk_stat_time(finish_time_ns) >> 10;
+ start_time = bio_issue_time(&bio->bi_issue) >> 10;
+ finish_time = __bio_issue_time(finish_time_ns) >> 10;
if (!start_time || finish_time <= start_time) {
blkg_put(tg_to_blkg(tg));
return;
@@ -2311,16 +2309,15 @@ void blk_throtl_bio_endio(struct bio *bio)
lat = finish_time - start_time;
/* this is only for bio based driver */
- if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
- throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
- bio_op(bio), lat);
+ if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY))
+ throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue),
+ bio_op(bio), lat);
if (tg->latency_target && lat >= tg->td->filtered_latency) {
int bucket;
unsigned int threshold;
- bucket = request_bucket_index(
- blk_stat_size(&bio->bi_issue_stat));
+ bucket = request_bucket_index(bio_issue_size(&bio->bi_issue));
threshold = tg->td->avg_buckets[rw][bucket].latency +
tg->latency_target;
if (lat > threshold)
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 652d4d4d3e97..4b8a48d48ba1 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -86,14 +86,11 @@ static void blk_rq_timed_out(struct request *req)
if (q->rq_timed_out_fn)
ret = q->rq_timed_out_fn(req);
switch (ret) {
- case BLK_EH_HANDLED:
- __blk_complete_request(req);
- break;
case BLK_EH_RESET_TIMER:
blk_add_timer(req);
blk_clear_rq_complete(req);
break;
- case BLK_EH_NOT_HANDLED:
+ case BLK_EH_DONE:
/*
* LLD handles this for now but in the future
* we can send a request msg to abort the command
@@ -214,7 +211,6 @@ void blk_add_timer(struct request *req)
req->timeout = q->rq_timeout;
blk_rq_set_deadline(req, jiffies + req->timeout);
- req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
/*
* Only the non-mq case needs to add the request to a protected list.
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index f92fc84b5e2c..4f89b28fa652 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -29,6 +29,26 @@
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
+static inline void wbt_clear_state(struct request *rq)
+{
+ rq->wbt_flags = 0;
+}
+
+static inline enum wbt_flags wbt_flags(struct request *rq)
+{
+ return rq->wbt_flags;
+}
+
+static inline bool wbt_is_tracked(struct request *rq)
+{
+ return rq->wbt_flags & WBT_TRACKED;
+}
+
+static inline bool wbt_is_read(struct request *rq)
+{
+ return rq->wbt_flags & WBT_READ;
+}
+
enum {
/*
* Default setting, we'll scale up (to 75% of QD max) or down (min 1)
@@ -101,9 +121,15 @@ static bool wb_recent_wait(struct rq_wb *rwb)
return time_before(jiffies, wb->dirty_sleep + HZ);
}
-static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
+ enum wbt_flags wb_acct)
{
- return &rwb->rq_wait[is_kswapd];
+ if (wb_acct & WBT_KSWAPD)
+ return &rwb->rq_wait[WBT_RWQ_KSWAPD];
+ else if (wb_acct & WBT_DISCARD)
+ return &rwb->rq_wait[WBT_RWQ_DISCARD];
+
+ return &rwb->rq_wait[WBT_RWQ_BG];
}
static void rwb_wake_all(struct rq_wb *rwb)
@@ -126,7 +152,7 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
if (!(wb_acct & WBT_TRACKED))
return;
- rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
+ rqw = get_rq_wait(rwb, wb_acct);
inflight = atomic_dec_return(&rqw->inflight);
/*
@@ -139,10 +165,13 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
}
/*
- * If the device does write back caching, drop further down
- * before we wake people up.
+ * For discards, our limit is always the background. For writes, if
+ * the device does write back caching, drop further down before we
+ * wake people up.
*/
- if (rwb->wc && !wb_recent_wait(rwb))
+ if (wb_acct & WBT_DISCARD)
+ limit = rwb->wb_background;
+ else if (rwb->wc && !wb_recent_wait(rwb))
limit = 0;
else
limit = rwb->wb_normal;
@@ -165,24 +194,24 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
* Called on completion of a request. Note that it's also called when
* a request is merged, when the request gets freed.
*/
-void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
+void wbt_done(struct rq_wb *rwb, struct request *rq)
{
if (!rwb)
return;
- if (!wbt_is_tracked(stat)) {
- if (rwb->sync_cookie == stat) {
+ if (!wbt_is_tracked(rq)) {
+ if (rwb->sync_cookie == rq) {
rwb->sync_issue = 0;
rwb->sync_cookie = NULL;
}
- if (wbt_is_read(stat))
+ if (wbt_is_read(rq))
wb_timestamp(rwb, &rwb->last_comp);
} else {
- WARN_ON_ONCE(stat == rwb->sync_cookie);
- __wbt_done(rwb, wbt_stat_to_mask(stat));
+ WARN_ON_ONCE(rq == rwb->sync_cookie);
+ __wbt_done(rwb, wbt_flags(rq));
}
- wbt_clear_state(stat);
+ wbt_clear_state(rq);
}
/*
@@ -479,6 +508,9 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
{
unsigned int limit;
+ if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD)
+ return rwb->wb_background;
+
/*
* At this point we know it's a buffered write. If this is
* kswapd trying to free memory, or REQ_SYNC is set, then
@@ -529,11 +561,12 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
* Block if we will exceed our limit, or if we are currently waiting for
* the timer to kick off queuing again.
*/
-static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
+static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
+ unsigned long rw, spinlock_t *lock)
__releases(lock)
__acquires(lock)
{
- struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
+ struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
DEFINE_WAIT(wait);
if (may_queue(rwb, rqw, &wait, rw))
@@ -559,21 +592,20 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
{
- const int op = bio_op(bio);
-
- /*
- * If not a WRITE, do nothing
- */
- if (op != REQ_OP_WRITE)
- return false;
-
- /*
- * Don't throttle WRITE_ODIRECT
- */
- if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE))
+ switch (bio_op(bio)) {
+ case REQ_OP_WRITE:
+ /*
+ * Don't throttle WRITE_ODIRECT
+ */
+ if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
+ (REQ_SYNC | REQ_IDLE))
+ return false;
+ /* fallthrough */
+ case REQ_OP_DISCARD:
+ return true;
+ default:
return false;
-
- return true;
+ }
}
/*
@@ -584,7 +616,7 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
*/
enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
{
- unsigned int ret = 0;
+ enum wbt_flags ret = 0;
if (!rwb_enabled(rwb))
return 0;
@@ -598,41 +630,42 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
return ret;
}
- __wbt_wait(rwb, bio->bi_opf, lock);
+ if (current_is_kswapd())
+ ret |= WBT_KSWAPD;
+ if (bio_op(bio) == REQ_OP_DISCARD)
+ ret |= WBT_DISCARD;
+
+ __wbt_wait(rwb, ret, bio->bi_opf, lock);
if (!blk_stat_is_active(rwb->cb))
rwb_arm_timer(rwb);
- if (current_is_kswapd())
- ret |= WBT_KSWAPD;
-
return ret | WBT_TRACKED;
}
-void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+void wbt_issue(struct rq_wb *rwb, struct request *rq)
{
if (!rwb_enabled(rwb))
return;
/*
- * Track sync issue, in case it takes a long time to complete. Allows
- * us to react quicker, if a sync IO takes a long time to complete.
- * Note that this is just a hint. 'stat' can go away when the
- * request completes, so it's important we never dereference it. We
- * only use the address to compare with, which is why we store the
- * sync_issue time locally.
+ * Track sync issue, in case it takes a long time to complete. Allows us
+ * to react quicker, if a sync IO takes a long time to complete. Note
+ * that this is just a hint. The request can go away when it completes,
+ * so it's important we never dereference it. We only use the address to
+ * compare with, which is why we store the sync_issue time locally.
*/
- if (wbt_is_read(stat) && !rwb->sync_issue) {
- rwb->sync_cookie = stat;
- rwb->sync_issue = blk_stat_time(stat);
+ if (wbt_is_read(rq) && !rwb->sync_issue) {
+ rwb->sync_cookie = rq;
+ rwb->sync_issue = rq->io_start_time_ns;
}
}
-void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+void wbt_requeue(struct rq_wb *rwb, struct request *rq)
{
if (!rwb_enabled(rwb))
return;
- if (stat == rwb->sync_cookie) {
+ if (rq == rwb->sync_cookie) {
rwb->sync_issue = 0;
rwb->sync_cookie = NULL;
}
@@ -701,7 +734,7 @@ static int wbt_data_dir(const struct request *rq)
if (op == REQ_OP_READ)
return READ;
- else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH)
+ else if (op_is_write(op))
return WRITE;
/* don't account */
@@ -713,8 +746,6 @@ int wbt_init(struct request_queue *q)
struct rq_wb *rwb;
int i;
- BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
-
rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
if (!rwb)
return -ENOMEM;
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index a232c98fbf4d..300df531d0a6 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -14,12 +14,16 @@ enum wbt_flags {
WBT_TRACKED = 1, /* write, tracked for throttling */
WBT_READ = 2, /* read */
WBT_KSWAPD = 4, /* write, from kswapd */
+ WBT_DISCARD = 8, /* discard */
- WBT_NR_BITS = 3, /* number of bits */
+ WBT_NR_BITS = 4, /* number of bits */
};
enum {
- WBT_NUM_RWQ = 2,
+ WBT_RWQ_BG = 0,
+ WBT_RWQ_KSWAPD,
+ WBT_RWQ_DISCARD,
+ WBT_NUM_RWQ,
};
/*
@@ -31,31 +35,6 @@ enum {
WBT_STATE_ON_MANUAL = 2,
};
-static inline void wbt_clear_state(struct blk_issue_stat *stat)
-{
- stat->stat &= ~BLK_STAT_RES_MASK;
-}
-
-static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat)
-{
- return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT;
-}
-
-static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct)
-{
- stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT;
-}
-
-static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
-{
- return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED;
-}
-
-static inline bool wbt_is_read(struct blk_issue_stat *stat)
-{
- return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ;
-}
-
struct rq_wait {
wait_queue_head_t wait;
atomic_t inflight;
@@ -84,7 +63,7 @@ struct rq_wb {
struct blk_stat_callback *cb;
- s64 sync_issue;
+ u64 sync_issue;
void *sync_cookie;
unsigned int wc;
@@ -109,14 +88,19 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
#ifdef CONFIG_BLK_WBT
+static inline void wbt_track(struct request *rq, enum wbt_flags flags)
+{
+ rq->wbt_flags |= flags;
+}
+
void __wbt_done(struct rq_wb *, enum wbt_flags);
-void wbt_done(struct rq_wb *, struct blk_issue_stat *);
+void wbt_done(struct rq_wb *, struct request *);
enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *);
int wbt_init(struct request_queue *);
void wbt_exit(struct request_queue *);
void wbt_update_limits(struct rq_wb *);
-void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
-void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
+void wbt_requeue(struct rq_wb *, struct request *);
+void wbt_issue(struct rq_wb *, struct request *);
void wbt_disable_default(struct request_queue *);
void wbt_enable_default(struct request_queue *);
@@ -127,10 +111,13 @@ u64 wbt_default_latency_nsec(struct request_queue *);
#else
+static inline void wbt_track(struct request *rq, enum wbt_flags flags)
+{
+}
static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
{
}
-static inline void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
+static inline void wbt_done(struct rq_wb *rwb, struct request *rq)
{
}
static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio,
@@ -148,10 +135,10 @@ static inline void wbt_exit(struct request_queue *q)
static inline void wbt_update_limits(struct rq_wb *rwb)
{
}
-static inline void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq)
{
}
-static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+static inline void wbt_issue(struct rq_wb *rwb, struct request *rq)
{
}
static inline void wbt_disable_default(struct request_queue *q)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 08e84ef2bc05..3d08dc84db16 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -328,7 +328,11 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
if (!rep.nr_zones)
return -EINVAL;
- zones = kcalloc(rep.nr_zones, sizeof(struct blk_zone), GFP_KERNEL);
+ if (rep.nr_zones > INT_MAX / sizeof(struct blk_zone))
+ return -ERANGE;
+
+ zones = kvmalloc(rep.nr_zones * sizeof(struct blk_zone),
+ GFP_KERNEL | __GFP_ZERO);
if (!zones)
return -ENOMEM;
@@ -350,7 +354,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
}
out:
- kfree(zones);
+ kvfree(zones);
return ret;
}
diff --git a/block/blk.h b/block/blk.h
index b034fd2460c4..8d23aea96ce9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -186,7 +186,7 @@ unsigned int blk_plug_queued_count(struct request_queue *q);
void blk_account_io_start(struct request *req, bool new_io);
void blk_account_io_completion(struct request *req, unsigned int bytes);
-void blk_account_io_done(struct request *req);
+void blk_account_io_done(struct request *req, u64 now);
/*
* EH timer and IO completion will both attempt to 'grab' the request, make
@@ -231,6 +231,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
}
+int elevator_init(struct request_queue *);
+int elevator_init_mq(struct request_queue *q);
+void elevator_exit(struct request_queue *, struct elevator_queue *);
int elv_register_queue(struct request_queue *q);
void elv_unregister_queue(struct request_queue *q);
diff --git a/block/bounce.c b/block/bounce.c
index dd0b93f2a871..fd31347b7836 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -28,28 +28,29 @@
#define POOL_SIZE 64
#define ISA_POOL_SIZE 16
-static struct bio_set *bounce_bio_set, *bounce_bio_split;
-static mempool_t *page_pool, *isa_page_pool;
+static struct bio_set bounce_bio_set, bounce_bio_split;
+static mempool_t page_pool, isa_page_pool;
#if defined(CONFIG_HIGHMEM)
static __init int init_emergency_pool(void)
{
+ int ret;
#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
if (max_pfn <= max_low_pfn)
return 0;
#endif
- page_pool = mempool_create_page_pool(POOL_SIZE, 0);
- BUG_ON(!page_pool);
+ ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0);
+ BUG_ON(ret);
pr_info("pool size: %d pages\n", POOL_SIZE);
- bounce_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- BUG_ON(!bounce_bio_set);
- if (bioset_integrity_create(bounce_bio_set, BIO_POOL_SIZE))
+ ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ BUG_ON(ret);
+ if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
BUG_ON(1);
- bounce_bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
- BUG_ON(!bounce_bio_split);
+ ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
+ BUG_ON(ret);
return 0;
}
@@ -63,14 +64,11 @@ __initcall(init_emergency_pool);
*/
static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
{
- unsigned long flags;
unsigned char *vto;
- local_irq_save(flags);
vto = kmap_atomic(to->bv_page);
memcpy(vto + to->bv_offset, vfrom, to->bv_len);
kunmap_atomic(vto);
- local_irq_restore(flags);
}
#else /* CONFIG_HIGHMEM */
@@ -94,12 +92,14 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
*/
int init_emergency_isa_pool(void)
{
- if (isa_page_pool)
+ int ret;
+
+ if (mempool_initialized(&isa_page_pool))
return 0;
- isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
- mempool_free_pages, (void *) 0);
- BUG_ON(!isa_page_pool);
+ ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa,
+ mempool_free_pages, (void *) 0);
+ BUG_ON(ret);
pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE);
return 0;
@@ -166,13 +166,13 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
static void bounce_end_io_write(struct bio *bio)
{
- bounce_end_io(bio, page_pool);
+ bounce_end_io(bio, &page_pool);
}
static void bounce_end_io_write_isa(struct bio *bio)
{
- bounce_end_io(bio, isa_page_pool);
+ bounce_end_io(bio, &isa_page_pool);
}
static void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
@@ -187,12 +187,12 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
static void bounce_end_io_read(struct bio *bio)
{
- __bounce_end_io_read(bio, page_pool);
+ __bounce_end_io_read(bio, &page_pool);
}
static void bounce_end_io_read_isa(struct bio *bio)
{
- __bounce_end_io_read(bio, isa_page_pool);
+ __bounce_end_io_read(bio, &isa_page_pool);
}
static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
@@ -217,13 +217,13 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
return;
if (!passthrough && sectors < bio_sectors(*bio_orig)) {
- bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
+ bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
bio_chain(bio, *bio_orig);
generic_make_request(*bio_orig);
*bio_orig = bio;
}
bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
- bounce_bio_set);
+ &bounce_bio_set);
bio_for_each_segment_all(to, bio, i) {
struct page *page = to->bv_page;
@@ -250,7 +250,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
bio->bi_flags |= (1 << BIO_BOUNCED);
- if (pool == page_pool) {
+ if (pool == &page_pool) {
bio->bi_end_io = bounce_end_io_write;
if (rw == READ)
bio->bi_end_io = bounce_end_io_read;
@@ -282,10 +282,10 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
if (!(q->bounce_gfp & GFP_DMA)) {
if (q->limits.bounce_pfn >= blk_max_pfn)
return;
- pool = page_pool;
+ pool = &page_pool;
} else {
- BUG_ON(!isa_page_pool);
- pool = isa_page_pool;
+ BUG_ON(!mempool_initialized(&isa_page_pool));
+ pool = &isa_page_pool;
}
/*
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index fc2e5ff2c4b9..9419def8c017 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -303,11 +303,9 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
* @name: device to give bsg device
* @job_fn: bsg job handler
* @dd_job_size: size of LLD data needed for each job
- * @release: @dev release function
*/
struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
- bsg_job_fn *job_fn, int dd_job_size,
- void (*release)(struct device *))
+ bsg_job_fn *job_fn, int dd_job_size)
{
struct request_queue *q;
int ret;
@@ -331,7 +329,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
blk_queue_softirq_done(q, bsg_softirq_done);
blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
- ret = bsg_register_queue(q, dev, name, &bsg_transport_ops, release);
+ ret = bsg_register_queue(q, dev, name, &bsg_transport_ops);
if (ret) {
printk(KERN_ERR "%s: bsg interface failed to "
"initialize - register queue\n", dev->kobj.name);
diff --git a/block/bsg.c b/block/bsg.c
index defa06c11858..132e657e2d91 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -226,8 +226,7 @@ bsg_map_hdr(struct request_queue *q, struct sg_io_v4 *hdr, fmode_t mode)
return ERR_PTR(ret);
rq = blk_get_request(q, hdr->dout_xfer_len ?
- REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
- GFP_KERNEL);
+ REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
if (IS_ERR(rq))
return rq;
@@ -249,7 +248,7 @@ bsg_map_hdr(struct request_queue *q, struct sg_io_v4 *hdr, fmode_t mode)
goto out;
}
- next_rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
+ next_rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
if (IS_ERR(next_rq)) {
ret = PTR_ERR(next_rq);
goto out;
@@ -650,18 +649,6 @@ static struct bsg_device *bsg_alloc_device(void)
return bd;
}
-static void bsg_kref_release_function(struct kref *kref)
-{
- struct bsg_class_device *bcd =
- container_of(kref, struct bsg_class_device, ref);
- struct device *parent = bcd->parent;
-
- if (bcd->release)
- bcd->release(bcd->parent);
-
- put_device(parent);
-}
-
static int bsg_put_device(struct bsg_device *bd)
{
int ret = 0, do_free;
@@ -694,7 +681,6 @@ static int bsg_put_device(struct bsg_device *bd)
kfree(bd);
out:
- kref_put(&q->bsg_dev.ref, bsg_kref_release_function);
if (do_free)
blk_put_queue(q);
return ret;
@@ -760,8 +746,6 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file)
*/
mutex_lock(&bsg_mutex);
bcd = idr_find(&bsg_minor_idr, iminor(inode));
- if (bcd)
- kref_get(&bcd->ref);
mutex_unlock(&bsg_mutex);
if (!bcd)
@@ -772,8 +756,6 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file)
return bd;
bd = bsg_add_device(inode, bcd->queue, file);
- if (IS_ERR(bd))
- kref_put(&bcd->ref, bsg_kref_release_function);
return bd;
}
@@ -913,25 +895,17 @@ void bsg_unregister_queue(struct request_queue *q)
sysfs_remove_link(&q->kobj, "bsg");
device_unregister(bcd->class_dev);
bcd->class_dev = NULL;
- kref_put(&bcd->ref, bsg_kref_release_function);
mutex_unlock(&bsg_mutex);
}
EXPORT_SYMBOL_GPL(bsg_unregister_queue);
int bsg_register_queue(struct request_queue *q, struct device *parent,
- const char *name, const struct bsg_ops *ops,
- void (*release)(struct device *))
+ const char *name, const struct bsg_ops *ops)
{
struct bsg_class_device *bcd;
dev_t dev;
int ret;
struct device *class_dev = NULL;
- const char *devname;
-
- if (name)
- devname = name;
- else
- devname = dev_name(parent);
/*
* we need a proper transport to send commands, not a stacked device
@@ -955,15 +929,12 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
bcd->minor = ret;
bcd->queue = q;
- bcd->parent = get_device(parent);
- bcd->release = release;
bcd->ops = ops;
- kref_init(&bcd->ref);
dev = MKDEV(bsg_major, bcd->minor);
- class_dev = device_create(bsg_class, parent, dev, NULL, "%s", devname);
+ class_dev = device_create(bsg_class, parent, dev, NULL, "%s", name);
if (IS_ERR(class_dev)) {
ret = PTR_ERR(class_dev);
- goto put_dev;
+ goto idr_remove;
}
bcd->class_dev = class_dev;
@@ -978,8 +949,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
unregister_class_dev:
device_unregister(class_dev);
-put_dev:
- put_device(parent);
+idr_remove:
idr_remove(&bsg_minor_idr, bcd->minor);
unlock:
mutex_unlock(&bsg_mutex);
@@ -993,7 +963,7 @@ int bsg_scsi_register_queue(struct request_queue *q, struct device *parent)
return -EINVAL;
}
- return bsg_register_queue(q, parent, NULL, &bsg_scsi_ops, NULL);
+ return bsg_register_queue(q, parent, dev_name(parent), &bsg_scsi_ops);
}
EXPORT_SYMBOL_GPL(bsg_scsi_register_queue);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9f342ef1ad42..82b6c27b3245 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -210,9 +210,9 @@ struct cfqg_stats {
/* total time with empty current active q with other requests queued */
struct blkg_stat empty_time;
/* fields after this shouldn't be cleared on stat reset */
- uint64_t start_group_wait_time;
- uint64_t start_idle_time;
- uint64_t start_empty_time;
+ u64 start_group_wait_time;
+ u64 start_idle_time;
+ u64 start_empty_time;
uint16_t flags;
#endif /* CONFIG_DEBUG_BLK_CGROUP */
#endif /* CONFIG_CFQ_GROUP_IOSCHED */
@@ -491,13 +491,13 @@ CFQG_FLAG_FNS(empty)
/* This should be called with the queue_lock held. */
static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
{
- unsigned long long now;
+ u64 now;
if (!cfqg_stats_waiting(stats))
return;
- now = sched_clock();
- if (time_after64(now, stats->start_group_wait_time))
+ now = ktime_get_ns();
+ if (now > stats->start_group_wait_time)
blkg_stat_add(&stats->group_wait_time,
now - stats->start_group_wait_time);
cfqg_stats_clear_waiting(stats);
@@ -513,20 +513,20 @@ static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
return;
if (cfqg == curr_cfqg)
return;
- stats->start_group_wait_time = sched_clock();
+ stats->start_group_wait_time = ktime_get_ns();
cfqg_stats_mark_waiting(stats);
}
/* This should be called with the queue_lock held. */
static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
{
- unsigned long long now;
+ u64 now;
if (!cfqg_stats_empty(stats))
return;
- now = sched_clock();
- if (time_after64(now, stats->start_empty_time))
+ now = ktime_get_ns();
+ if (now > stats->start_empty_time)
blkg_stat_add(&stats->empty_time,
now - stats->start_empty_time);
cfqg_stats_clear_empty(stats);
@@ -552,7 +552,7 @@ static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
if (cfqg_stats_empty(stats))
return;
- stats->start_empty_time = sched_clock();
+ stats->start_empty_time = ktime_get_ns();
cfqg_stats_mark_empty(stats);
}
@@ -561,9 +561,9 @@ static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
struct cfqg_stats *stats = &cfqg->stats;
if (cfqg_stats_idling(stats)) {
- unsigned long long now = sched_clock();
+ u64 now = ktime_get_ns();
- if (time_after64(now, stats->start_idle_time))
+ if (now > stats->start_idle_time)
blkg_stat_add(&stats->idle_time,
now - stats->start_idle_time);
cfqg_stats_clear_idling(stats);
@@ -576,7 +576,7 @@ static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
BUG_ON(cfqg_stats_idling(stats));
- stats->start_idle_time = sched_clock();
+ stats->start_idle_time = ktime_get_ns();
cfqg_stats_mark_idling(stats);
}
@@ -701,17 +701,19 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
}
static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
- uint64_t start_time, uint64_t io_start_time,
- unsigned int op)
+ u64 start_time_ns,
+ u64 io_start_time_ns,
+ unsigned int op)
{
struct cfqg_stats *stats = &cfqg->stats;
- unsigned long long now = sched_clock();
+ u64 now = ktime_get_ns();
- if (time_after64(now, io_start_time))
- blkg_rwstat_add(&stats->service_time, op, now - io_start_time);
- if (time_after64(io_start_time, start_time))
+ if (now > io_start_time_ns)
+ blkg_rwstat_add(&stats->service_time, op,
+ now - io_start_time_ns);
+ if (io_start_time_ns > start_time_ns)
blkg_rwstat_add(&stats->wait_time, op,
- io_start_time - start_time);
+ io_start_time_ns - start_time_ns);
}
/* @stats = 0 */
@@ -797,8 +799,9 @@ static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
unsigned int op) { }
static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
- uint64_t start_time, uint64_t io_start_time,
- unsigned int op) { }
+ u64 start_time_ns,
+ u64 io_start_time_ns,
+ unsigned int op) { }
#endif /* CONFIG_CFQ_GROUP_IOSCHED */
@@ -4225,8 +4228,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
cfqd->rq_in_driver--;
cfqq->dispatched--;
(RQ_CFQG(rq))->dispatched--;
- cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
- rq_io_start_time_ns(rq), rq->cmd_flags);
+ cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns,
+ rq->io_start_time_ns, rq->cmd_flags);
cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
@@ -4242,16 +4245,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
cfqq_type(cfqq));
st->ttime.last_end_request = now;
- /*
- * We have to do this check in jiffies since start_time is in
- * jiffies and it is not trivial to convert to ns. If
- * cfq_fifo_expire[1] ever comes close to 1 jiffie, this test
- * will become problematic but so far we are fine (the default
- * is 128 ms).
- */
- if (!time_after(rq->start_time +
- nsecs_to_jiffies(cfqd->cfq_fifo_expire[1]),
- jiffies))
+ if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now)
cfqd->last_delayed_sync = now;
}
@@ -4792,7 +4786,7 @@ USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, U
#undef USEC_STORE_FUNCTION
#define CFQ_ATTR(name) \
- __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
+ __ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store)
static struct elv_fs_entry cfq_attrs[] = {
CFQ_ATTR(quantum),
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 9de9f156e203..ef2f1f09e9b3 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -512,8 +512,7 @@ STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
#undef STORE_FUNCTION
#define DD_ATTR(name) \
- __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
- deadline_##name##_store)
+ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
static struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(read_expire),
diff --git a/block/elevator.c b/block/elevator.c
index e87e9b43aba0..fa828b5bfd4b 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -199,76 +199,46 @@ static void elevator_release(struct kobject *kobj)
kfree(e);
}
-int elevator_init(struct request_queue *q, char *name)
+/*
+ * Use the default elevator specified by config boot param for non-mq devices,
+ * or by config option. Don't try to load modules as we could be running off
+ * async and request_module() isn't allowed from async.
+ */
+int elevator_init(struct request_queue *q)
{
struct elevator_type *e = NULL;
- int err;
+ int err = 0;
/*
* q->sysfs_lock must be held to provide mutual exclusion between
* elevator_switch() and here.
*/
- lockdep_assert_held(&q->sysfs_lock);
-
+ mutex_lock(&q->sysfs_lock);
if (unlikely(q->elevator))
- return 0;
-
- INIT_LIST_HEAD(&q->queue_head);
- q->last_merge = NULL;
- q->end_sector = 0;
- q->boundary_rq = NULL;
-
- if (name) {
- e = elevator_get(q, name, true);
- if (!e)
- return -EINVAL;
- }
+ goto out_unlock;
- /*
- * Use the default elevator specified by config boot param for
- * non-mq devices, or by config option. Don't try to load modules
- * as we could be running off async and request_module() isn't
- * allowed from async.
- */
- if (!e && !q->mq_ops && *chosen_elevator) {
+ if (*chosen_elevator) {
e = elevator_get(q, chosen_elevator, false);
if (!e)
printk(KERN_ERR "I/O scheduler %s not found\n",
chosen_elevator);
}
+ if (!e)
+ e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
if (!e) {
- /*
- * For blk-mq devices, we default to using mq-deadline,
- * if available, for single queue devices. If deadline
- * isn't available OR we have multiple queues, default
- * to "none".
- */
- if (q->mq_ops) {
- if (q->nr_hw_queues == 1)
- e = elevator_get(q, "mq-deadline", false);
- if (!e)
- return 0;
- } else
- e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
-
- if (!e) {
- printk(KERN_ERR
- "Default I/O scheduler not found. " \
- "Using noop.\n");
- e = elevator_get(q, "noop", false);
- }
+ printk(KERN_ERR
+ "Default I/O scheduler not found. Using noop.\n");
+ e = elevator_get(q, "noop", false);
}
- if (e->uses_mq)
- err = blk_mq_init_sched(q, e);
- else
- err = e->ops.sq.elevator_init_fn(q, e);
+ err = e->ops.sq.elevator_init_fn(q, e);
if (err)
elevator_put(e);
+out_unlock:
+ mutex_unlock(&q->sysfs_lock);
return err;
}
-EXPORT_SYMBOL(elevator_init);
void elevator_exit(struct request_queue *q, struct elevator_queue *e)
{
@@ -281,7 +251,6 @@ void elevator_exit(struct request_queue *q, struct elevator_queue *e)
kobject_put(&e->kobj);
}
-EXPORT_SYMBOL(elevator_exit);
static inline void __elv_rqhash_del(struct request *rq)
{
@@ -1005,6 +974,40 @@ out:
}
/*
+ * For blk-mq devices, we default to using mq-deadline, if available, for single
+ * queue devices. If deadline isn't available OR we have multiple queues,
+ * default to "none".
+ */
+int elevator_init_mq(struct request_queue *q)
+{
+ struct elevator_type *e;
+ int err = 0;
+
+ if (q->nr_hw_queues != 1)
+ return 0;
+
+ /*
+ * q->sysfs_lock must be held to provide mutual exclusion between
+ * elevator_switch() and here.
+ */
+ mutex_lock(&q->sysfs_lock);
+ if (unlikely(q->elevator))
+ goto out_unlock;
+
+ e = elevator_get(q, "mq-deadline", false);
+ if (!e)
+ goto out_unlock;
+
+ err = blk_mq_init_sched(q, e);
+ if (err)
+ elevator_put(e);
+out_unlock:
+ mutex_unlock(&q->sysfs_lock);
+ return err;
+}
+
+
+/*
* switch to new_e io scheduler. be careful not to introduce deadlocks -
* we don't free the old io scheduler, before we have allocated what we
* need for the new one. this way we have a chance of going back to the old
diff --git a/block/genhd.c b/block/genhd.c
index c4513fe1adda..4d694035d343 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1139,28 +1139,25 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
}
-static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
-static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
-static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
-static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL);
-static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
-static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
-static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
-static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
- NULL);
-static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
-static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
-static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
-static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show,
- disk_badblocks_store);
+static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
+static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
+static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
+static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
+static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
+static DEVICE_ATTR(size, 0444, part_size_show, NULL);
+static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
+static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
+static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
+static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
- __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
+ __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
#endif
#ifdef CONFIG_FAIL_IO_TIMEOUT
static struct device_attribute dev_attr_fail_timeout =
- __ATTR(io-timeout-fail, S_IRUGO|S_IWUSR, part_timeout_show,
- part_timeout_store);
+ __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
#endif
static struct attribute *disk_attrs[] = {
@@ -1924,9 +1921,9 @@ static ssize_t disk_events_poll_msecs_store(struct device *dev,
return count;
}
-static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
-static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
-static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
+static const DEVICE_ATTR(events, 0444, disk_events_show, NULL);
+static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
+static const DEVICE_ATTR(events_poll_msecs, 0644,
disk_events_poll_msecs_show,
disk_events_poll_msecs_store);
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 0d6d25e32e1f..a1660bafc912 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -72,6 +72,19 @@ static const unsigned int kyber_batch_size[] = {
[KYBER_OTHER] = 8,
};
+/*
+ * There is a same mapping between ctx & hctx and kcq & khd,
+ * we use request->mq_ctx->index_hw to index the kcq in khd.
+ */
+struct kyber_ctx_queue {
+ /*
+ * Used to ensure operations on rq_list and kcq_map to be an atmoic one.
+ * Also protect the rqs on rq_list when merge.
+ */
+ spinlock_t lock;
+ struct list_head rq_list[KYBER_NUM_DOMAINS];
+} ____cacheline_aligned_in_smp;
+
struct kyber_queue_data {
struct request_queue *q;
@@ -99,6 +112,8 @@ struct kyber_hctx_data {
struct list_head rqs[KYBER_NUM_DOMAINS];
unsigned int cur_domain;
unsigned int batching;
+ struct kyber_ctx_queue *kcqs;
+ struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
atomic_t wait_index[KYBER_NUM_DOMAINS];
@@ -107,10 +122,8 @@ struct kyber_hctx_data {
static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
void *key);
-static int rq_sched_domain(const struct request *rq)
+static unsigned int kyber_sched_domain(unsigned int op)
{
- unsigned int op = rq->cmd_flags;
-
if ((op & REQ_OP_MASK) == REQ_OP_READ)
return KYBER_READ;
else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
@@ -284,6 +297,11 @@ static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}
+static int kyber_bucket_fn(const struct request *rq)
+{
+ return kyber_sched_domain(rq->cmd_flags);
+}
+
static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
struct kyber_queue_data *kqd;
@@ -297,7 +315,7 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
goto err;
kqd->q = q;
- kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
+ kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
KYBER_NUM_DOMAINS, kqd);
if (!kqd->cb)
goto err_kqd;
@@ -376,8 +394,18 @@ static void kyber_exit_sched(struct elevator_queue *e)
kfree(kqd);
}
+static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
+{
+ unsigned int i;
+
+ spin_lock_init(&kcq->lock);
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++)
+ INIT_LIST_HEAD(&kcq->rq_list[i]);
+}
+
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
+ struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
struct kyber_hctx_data *khd;
int i;
@@ -385,6 +413,24 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
if (!khd)
return -ENOMEM;
+ khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
+ sizeof(struct kyber_ctx_queue),
+ GFP_KERNEL, hctx->numa_node);
+ if (!khd->kcqs)
+ goto err_khd;
+
+ for (i = 0; i < hctx->nr_ctx; i++)
+ kyber_ctx_queue_init(&khd->kcqs[i]);
+
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+ if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
+ ilog2(8), GFP_KERNEL, hctx->numa_node)) {
+ while (--i >= 0)
+ sbitmap_free(&khd->kcq_map[i]);
+ goto err_kcqs;
+ }
+ }
+
spin_lock_init(&khd->lock);
for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
@@ -400,12 +446,26 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
khd->batching = 0;
hctx->sched_data = khd;
+ sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
+ kqd->async_depth);
return 0;
+
+err_kcqs:
+ kfree(khd->kcqs);
+err_khd:
+ kfree(khd);
+ return -ENOMEM;
}
static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
+ struct kyber_hctx_data *khd = hctx->sched_data;
+ int i;
+
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++)
+ sbitmap_free(&khd->kcq_map[i]);
+ kfree(khd->kcqs);
kfree(hctx->sched_data);
}
@@ -427,7 +487,7 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd,
nr = rq_get_domain_token(rq);
if (nr != -1) {
- sched_domain = rq_sched_domain(rq);
+ sched_domain = kyber_sched_domain(rq->cmd_flags);
sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
rq->mq_ctx->cpu);
}
@@ -446,11 +506,51 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
}
}
+static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+ struct kyber_hctx_data *khd = hctx->sched_data;
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
+ struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
+ unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
+ struct list_head *rq_list = &kcq->rq_list[sched_domain];
+ bool merged;
+
+ spin_lock(&kcq->lock);
+ merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
+ spin_unlock(&kcq->lock);
+ blk_mq_put_ctx(ctx);
+
+ return merged;
+}
+
static void kyber_prepare_request(struct request *rq, struct bio *bio)
{
rq_set_domain_token(rq, -1);
}
+static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
+ struct list_head *rq_list, bool at_head)
+{
+ struct kyber_hctx_data *khd = hctx->sched_data;
+ struct request *rq, *next;
+
+ list_for_each_entry_safe(rq, next, rq_list, queuelist) {
+ unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
+ struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
+ struct list_head *head = &kcq->rq_list[sched_domain];
+
+ spin_lock(&kcq->lock);
+ if (at_head)
+ list_move(&rq->queuelist, head);
+ else
+ list_move_tail(&rq->queuelist, head);
+ sbitmap_set_bit(&khd->kcq_map[sched_domain],
+ rq->mq_ctx->index_hw);
+ blk_mq_sched_request_inserted(rq);
+ spin_unlock(&kcq->lock);
+ }
+}
+
static void kyber_finish_request(struct request *rq)
{
struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
@@ -469,7 +569,7 @@ static void kyber_completed_request(struct request *rq)
* Check if this request met our latency goal. If not, quickly gather
* some statistics and start throttling.
*/
- sched_domain = rq_sched_domain(rq);
+ sched_domain = kyber_sched_domain(rq->cmd_flags);
switch (sched_domain) {
case KYBER_READ:
target = kqd->read_lat_nsec;
@@ -485,29 +585,48 @@ static void kyber_completed_request(struct request *rq)
if (blk_stat_is_active(kqd->cb))
return;
- now = __blk_stat_time(ktime_to_ns(ktime_get()));
- if (now < blk_stat_time(&rq->issue_stat))
+ now = ktime_get_ns();
+ if (now < rq->io_start_time_ns)
return;
- latency = now - blk_stat_time(&rq->issue_stat);
+ latency = now - rq->io_start_time_ns;
if (latency > target)
blk_stat_activate_msecs(kqd->cb, 10);
}
-static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
- struct blk_mq_hw_ctx *hctx)
+struct flush_kcq_data {
+ struct kyber_hctx_data *khd;
+ unsigned int sched_domain;
+ struct list_head *list;
+};
+
+static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
{
- LIST_HEAD(rq_list);
- struct request *rq, *next;
+ struct flush_kcq_data *flush_data = data;
+ struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
- blk_mq_flush_busy_ctxs(hctx, &rq_list);
- list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
- unsigned int sched_domain;
+ spin_lock(&kcq->lock);
+ list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
+ flush_data->list);
+ sbitmap_clear_bit(sb, bitnr);
+ spin_unlock(&kcq->lock);
- sched_domain = rq_sched_domain(rq);
- list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
- }
+ return true;
+}
+
+static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
+ unsigned int sched_domain,
+ struct list_head *list)
+{
+ struct flush_kcq_data data = {
+ .khd = khd,
+ .sched_domain = sched_domain,
+ .list = list,
+ };
+
+ sbitmap_for_each_set(&khd->kcq_map[sched_domain],
+ flush_busy_kcq, &data);
}
static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
@@ -570,26 +689,23 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
struct kyber_hctx_data *khd,
- struct blk_mq_hw_ctx *hctx,
- bool *flushed)
+ struct blk_mq_hw_ctx *hctx)
{
struct list_head *rqs;
struct request *rq;
int nr;
rqs = &khd->rqs[khd->cur_domain];
- rq = list_first_entry_or_null(rqs, struct request, queuelist);
/*
- * If there wasn't already a pending request and we haven't flushed the
- * software queues yet, flush the software queues and check again.
+ * If we already have a flushed request, then we just need to get a
+ * token for it. Otherwise, if there are pending requests in the kcqs,
+ * flush the kcqs, but only if we can get a token. If not, we should
+ * leave the requests in the kcqs so that they can be merged. Note that
+ * khd->lock serializes the flushes, so if we observed any bit set in
+ * the kcq_map, we will always get a request.
*/
- if (!rq && !*flushed) {
- kyber_flush_busy_ctxs(khd, hctx);
- *flushed = true;
- rq = list_first_entry_or_null(rqs, struct request, queuelist);
- }
-
+ rq = list_first_entry_or_null(rqs, struct request, queuelist);
if (rq) {
nr = kyber_get_domain_token(kqd, khd, hctx);
if (nr >= 0) {
@@ -598,6 +714,16 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
list_del_init(&rq->queuelist);
return rq;
}
+ } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
+ nr = kyber_get_domain_token(kqd, khd, hctx);
+ if (nr >= 0) {
+ kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
+ rq = list_first_entry(rqs, struct request, queuelist);
+ khd->batching++;
+ rq_set_domain_token(rq, nr);
+ list_del_init(&rq->queuelist);
+ return rq;
+ }
}
/* There were either no pending requests or no tokens. */
@@ -608,7 +734,6 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
struct kyber_hctx_data *khd = hctx->sched_data;
- bool flushed = false;
struct request *rq;
int i;
@@ -619,7 +744,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
* from the batch.
*/
if (khd->batching < kyber_batch_size[khd->cur_domain]) {
- rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
+ rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
if (rq)
goto out;
}
@@ -640,7 +765,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
else
khd->cur_domain++;
- rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
+ rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
if (rq)
goto out;
}
@@ -657,10 +782,12 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
int i;
for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
- if (!list_empty_careful(&khd->rqs[i]))
+ if (!list_empty_careful(&khd->rqs[i]) ||
+ sbitmap_any_bit_set(&khd->kcq_map[i]))
return true;
}
- return sbitmap_any_bit_set(&hctx->ctx_map);
+
+ return false;
}
#define KYBER_LAT_SHOW_STORE(op) \
@@ -831,7 +958,9 @@ static struct elevator_type kyber_sched = {
.init_hctx = kyber_init_hctx,
.exit_hctx = kyber_exit_hctx,
.limit_depth = kyber_limit_depth,
+ .bio_merge = kyber_bio_merge,
.prepare_request = kyber_prepare_request,
+ .insert_requests = kyber_insert_requests,
.finish_request = kyber_finish_request,
.requeue_request = kyber_finish_request,
.completed_request = kyber_completed_request,
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 8ec0ba9f5386..099a9e05854c 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -630,8 +630,7 @@ STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
#undef STORE_FUNCTION
#define DD_ATTR(name) \
- __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
- deadline_##name##_store)
+ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
static struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(read_expire),
diff --git a/block/partition-generic.c b/block/partition-generic.c
index db57cced9b98..3dcfd4ec0e11 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -179,18 +179,17 @@ ssize_t part_fail_store(struct device *dev,
}
#endif
-static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
-static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
-static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
-static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
-static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
-static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
- NULL);
-static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
-static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
+static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);
+static DEVICE_ATTR(start, 0444, part_start_show, NULL);
+static DEVICE_ATTR(size, 0444, part_size_show, NULL);
+static DEVICE_ATTR(ro, 0444, part_ro_show, NULL);
+static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL);
+static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
- __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
+ __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
#endif
static struct attribute *part_attrs[] = {
@@ -291,8 +290,7 @@ static ssize_t whole_disk_show(struct device *dev,
{
return 0;
}
-static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
- whole_disk_show, NULL);
+static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL);
/*
* Must be called either with bd_mutex held, before a disk can be opened or
@@ -518,7 +516,7 @@ rescan:
if (disk->fops->revalidate_disk)
disk->fops->revalidate_disk(disk);
- check_disk_size_change(disk, bdev);
+ check_disk_size_change(disk, bdev, true);
bdev->bd_invalidated = 0;
if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
return 0;
@@ -643,7 +641,7 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
return res;
set_capacity(disk, 0);
- check_disk_size_change(disk, bdev);
+ check_disk_size_change(disk, bdev, false);
bdev->bd_invalidated = 0;
/* tell userspace that the media / partition table may have changed */
kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 60b471f8621b..533f4aee8567 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -321,8 +321,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
at_head = 1;
ret = -ENOMEM;
- rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
- GFP_KERNEL);
+ rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
if (IS_ERR(rq))
return PTR_ERR(rq);
req = scsi_req(rq);
@@ -449,8 +448,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
}
- rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
- __GFP_RECLAIM);
+ rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
goto error_free_buffer;
@@ -501,7 +499,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
break;
}
- if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) {
+ if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO)) {
err = DRIVER_ERROR << 24;
goto error;
}
@@ -538,7 +536,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
struct request *rq;
int err;
- rq = blk_get_request(q, REQ_OP_SCSI_OUT, __GFP_RECLAIM);
+ rq = blk_get_request(q, REQ_OP_SCSI_OUT, 0);
if (IS_ERR(rq))
return PTR_ERR(rq);
rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 513b260bcff1..a2398e28c295 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -500,57 +500,6 @@ void ata_eh_release(struct ata_port *ap)
mutex_unlock(&ap->host->eh_mutex);
}
-/**
- * ata_scsi_timed_out - SCSI layer time out callback
- * @cmd: timed out SCSI command
- *
- * Handles SCSI layer timeout. We race with normal completion of
- * the qc for @cmd. If the qc is already gone, we lose and let
- * the scsi command finish (EH_HANDLED). Otherwise, the qc has
- * timed out and EH should be invoked. Prevent ata_qc_complete()
- * from finishing it by setting EH_SCHEDULED and return
- * EH_NOT_HANDLED.
- *
- * TODO: kill this function once old EH is gone.
- *
- * LOCKING:
- * Called from timer context
- *
- * RETURNS:
- * EH_HANDLED or EH_NOT_HANDLED
- */
-enum blk_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
-{
- struct Scsi_Host *host = cmd->device->host;
- struct ata_port *ap = ata_shost_to_port(host);
- unsigned long flags;
- struct ata_queued_cmd *qc;
- enum blk_eh_timer_return ret;
-
- DPRINTK("ENTER\n");
-
- if (ap->ops->error_handler) {
- ret = BLK_EH_NOT_HANDLED;
- goto out;
- }
-
- ret = BLK_EH_HANDLED;
- spin_lock_irqsave(ap->lock, flags);
- qc = ata_qc_from_tag(ap, ap->link.active_tag);
- if (qc) {
- WARN_ON(qc->scsicmd != cmd);
- qc->flags |= ATA_QCFLAG_EH_SCHEDULED;
- qc->err_mask |= AC_ERR_TIMEOUT;
- ret = BLK_EH_NOT_HANDLED;
- }
- spin_unlock_irqrestore(ap->lock, flags);
-
- out:
- DPRINTK("EXIT, ret=%d\n", ret);
- return ret;
-}
-EXPORT_SYMBOL(ata_scsi_timed_out);
-
static void ata_eh_unload(struct ata_port *ap)
{
struct ata_link *link;
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index f781eff7d23e..7c3887a7e534 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -1179,7 +1179,6 @@ static bool DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T
if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
return DAC960_Failure(Controller, "DMA mask out of range");
- Controller->BounceBufferLimit = DMA_BIT_MASK(32);
if ((hw_type == DAC960_PD_Controller) || (hw_type == DAC960_P_Controller)) {
CommandMailboxesSize = 0;
@@ -1380,11 +1379,8 @@ static bool DAC960_V2_EnableMemoryMailboxInterface(DAC960_Controller_T
dma_addr_t CommandMailboxDMA;
DAC960_V2_CommandStatus_T CommandStatus;
- if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(64)))
- Controller->BounceBufferLimit = DMA_BIT_MASK(64);
- else if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
- Controller->BounceBufferLimit = DMA_BIT_MASK(32);
- else
+ if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(64)) &&
+ pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
return DAC960_Failure(Controller, "DMA mask out of range");
/* This is a temporary dma mapping, used only in the scope of this function */
@@ -2540,7 +2536,6 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller)
continue;
}
Controller->RequestQueue[n] = RequestQueue;
- blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit);
RequestQueue->queuedata = Controller;
blk_queue_max_segments(RequestQueue, Controller->DriverScatterGatherLimit);
blk_queue_max_hw_sectors(RequestQueue, Controller->MaxBlocksPerCommand);
@@ -6594,7 +6589,7 @@ static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller)
DAC960_ProcDirectoryEntry);
proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller);
proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller);
- proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
+ proc_create_data("user_command", 0600, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
Controller->ControllerProcEntry = ControllerProcEntry;
}
diff --git a/drivers/block/DAC960.h b/drivers/block/DAC960.h
index 21aff470d268..1439e651928b 100644
--- a/drivers/block/DAC960.h
+++ b/drivers/block/DAC960.h
@@ -2295,7 +2295,6 @@ typedef struct DAC960_Controller
unsigned short MaxBlocksPerCommand;
unsigned short ControllerScatterGatherLimit;
unsigned short DriverScatterGatherLimit;
- u64 BounceBufferLimit;
unsigned int CombinedStatusBufferLength;
unsigned int InitialStatusLength;
unsigned int CurrentStatusLength;
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 6797e6c23c8a..429ebb84b592 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -159,14 +159,14 @@ static int aoe_debugfs_open(struct inode *inode, struct file *file)
return single_open(file, aoedisk_debugfs_show, inode->i_private);
}
-static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL);
-static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL);
-static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL);
+static DEVICE_ATTR(state, 0444, aoedisk_show_state, NULL);
+static DEVICE_ATTR(mac, 0444, aoedisk_show_mac, NULL);
+static DEVICE_ATTR(netif, 0444, aoedisk_show_netif, NULL);
static struct device_attribute dev_attr_firmware_version = {
- .attr = { .name = "firmware-version", .mode = S_IRUGO },
+ .attr = { .name = "firmware-version", .mode = 0444 },
.show = aoedisk_show_fwver,
};
-static DEVICE_ATTR(payload, S_IRUGO, aoedisk_show_payload, NULL);
+static DEVICE_ATTR(payload, 0444, aoedisk_show_payload, NULL);
static struct attribute *aoe_attrs[] = {
&dev_attr_state.attr,
@@ -388,7 +388,6 @@ aoeblk_gdalloc(void *vp)
d->aoemajor, d->aoeminor);
goto err_mempool;
}
- blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
spin_lock_irqsave(&d->lock, flags);
WARN_ON(!(d->flags & DEVFL_GD_NOW));
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 540bb60cd071..096882e54095 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1032,8 +1032,9 @@ bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt)
iter.bi_size = cnt;
__bio_for_each_segment(bv, bio, iter, iter) {
- char *p = page_address(bv.bv_page) + bv.bv_offset;
+ char *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
skb_copy_bits(skb, soff, p, bv.bv_len);
+ kunmap_atomic(p);
soff += bv.bv_len;
}
}
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 66cb0f857f64..bb976598ee43 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -331,15 +331,15 @@ static const struct block_device_operations brd_fops = {
* And now the modules code and kernel interface.
*/
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
-module_param(rd_nr, int, S_IRUGO);
+module_param(rd_nr, int, 0444);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
-module_param(rd_size, ulong, S_IRUGO);
+module_param(rd_size, ulong, 0444);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
static int max_part = 1;
-module_param(max_part, int, S_IRUGO);
+module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
MODULE_LICENSE("GPL");
@@ -402,6 +402,10 @@ static struct brd_device *brd_alloc(int i)
set_capacity(disk, rd_size * 2);
disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
+ /* Tell the block layer that this is not a rotational device */
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
+ blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
+
return brd;
out_free_queue:
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 9f4e6f502b84..11a85b740327 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -977,7 +977,7 @@ static void drbd_bm_endio(struct bio *bio)
bm_page_unlock_io(device, idx);
if (ctx->flags & BM_AIO_COPY_PAGES)
- mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
+ mempool_free(bio->bi_io_vec[0].bv_page, &drbd_md_io_page_pool);
bio_put(bio);
@@ -1014,7 +1014,8 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
bm_set_page_unchanged(b->bm_pages[page_nr]);
if (ctx->flags & BM_AIO_COPY_PAGES) {
- page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_RECLAIM);
+ page = mempool_alloc(&drbd_md_io_page_pool,
+ GFP_NOIO | __GFP_HIGHMEM);
copy_highpage(page, b->bm_pages[page_nr]);
bm_store_page_idx(page, page_nr);
} else
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index ab21976a87b2..5d5e8d6a8a56 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -481,9 +481,9 @@ void drbd_debugfs_resource_add(struct drbd_resource *resource)
goto fail;
resource->debugfs_res_connections = dentry;
- dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP,
- resource->debugfs_res, resource,
- &in_flight_summary_fops);
+ dentry = debugfs_create_file("in_flight_summary", 0440,
+ resource->debugfs_res, resource,
+ &in_flight_summary_fops);
if (IS_ERR_OR_NULL(dentry))
goto fail;
resource->debugfs_res_in_flight_summary = dentry;
@@ -645,16 +645,16 @@ void drbd_debugfs_connection_add(struct drbd_connection *connection)
goto fail;
connection->debugfs_conn = dentry;
- dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP,
- connection->debugfs_conn, connection,
- &connection_callback_history_fops);
+ dentry = debugfs_create_file("callback_history", 0440,
+ connection->debugfs_conn, connection,
+ &connection_callback_history_fops);
if (IS_ERR_OR_NULL(dentry))
goto fail;
connection->debugfs_conn_callback_history = dentry;
- dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP,
- connection->debugfs_conn, connection,
- &connection_oldest_requests_fops);
+ dentry = debugfs_create_file("oldest_requests", 0440,
+ connection->debugfs_conn, connection,
+ &connection_oldest_requests_fops);
if (IS_ERR_OR_NULL(dentry))
goto fail;
connection->debugfs_conn_oldest_requests = dentry;
@@ -824,7 +824,7 @@ void drbd_debugfs_device_add(struct drbd_device *device)
device->debugfs_minor = dentry;
#define DCF(name) do { \
- dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP, \
+ dentry = debugfs_create_file(#name, 0440, \
device->debugfs_vol, device, \
&device_ ## name ## _fops); \
if (IS_ERR_OR_NULL(dentry)) \
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 06ecee1b528e..21b4186add6f 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1405,8 +1405,8 @@ extern struct kmem_cache *drbd_request_cache;
extern struct kmem_cache *drbd_ee_cache; /* peer requests */
extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
-extern mempool_t *drbd_request_mempool;
-extern mempool_t *drbd_ee_mempool;
+extern mempool_t drbd_request_mempool;
+extern mempool_t drbd_ee_mempool;
/* drbd's page pool, used to buffer data received from the peer,
* or data requested by the peer.
@@ -1432,16 +1432,16 @@ extern wait_queue_head_t drbd_pp_wait;
* 128 should be plenty, currently we probably can get away with as few as 1.
*/
#define DRBD_MIN_POOL_PAGES 128
-extern mempool_t *drbd_md_io_page_pool;
+extern mempool_t drbd_md_io_page_pool;
/* We also need to make sure we get a bio
* when we need it for housekeeping purposes */
-extern struct bio_set *drbd_md_io_bio_set;
+extern struct bio_set drbd_md_io_bio_set;
/* to allocate from that set */
extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
/* And a bio_set for cloning */
-extern struct bio_set *drbd_io_bio_set;
+extern struct bio_set drbd_io_bio_set;
extern struct mutex resources_mutex;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 185f1ef00a7c..a233e71e58ff 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -124,11 +124,11 @@ struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache; /* peer requests */
struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
-mempool_t *drbd_request_mempool;
-mempool_t *drbd_ee_mempool;
-mempool_t *drbd_md_io_page_pool;
-struct bio_set *drbd_md_io_bio_set;
-struct bio_set *drbd_io_bio_set;
+mempool_t drbd_request_mempool;
+mempool_t drbd_ee_mempool;
+mempool_t drbd_md_io_page_pool;
+struct bio_set drbd_md_io_bio_set;
+struct bio_set drbd_io_bio_set;
/* I do not use a standard mempool, because:
1) I want to hand out the pre-allocated objects first.
@@ -153,10 +153,10 @@ struct bio *bio_alloc_drbd(gfp_t gfp_mask)
{
struct bio *bio;
- if (!drbd_md_io_bio_set)
+ if (!bioset_initialized(&drbd_md_io_bio_set))
return bio_alloc(gfp_mask, 1);
- bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+ bio = bio_alloc_bioset(gfp_mask, 1, &drbd_md_io_bio_set);
if (!bio)
return NULL;
return bio;
@@ -2097,16 +2097,11 @@ static void drbd_destroy_mempools(void)
/* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
- if (drbd_io_bio_set)
- bioset_free(drbd_io_bio_set);
- if (drbd_md_io_bio_set)
- bioset_free(drbd_md_io_bio_set);
- if (drbd_md_io_page_pool)
- mempool_destroy(drbd_md_io_page_pool);
- if (drbd_ee_mempool)
- mempool_destroy(drbd_ee_mempool);
- if (drbd_request_mempool)
- mempool_destroy(drbd_request_mempool);
+ bioset_exit(&drbd_io_bio_set);
+ bioset_exit(&drbd_md_io_bio_set);
+ mempool_exit(&drbd_md_io_page_pool);
+ mempool_exit(&drbd_ee_mempool);
+ mempool_exit(&drbd_request_mempool);
if (drbd_ee_cache)
kmem_cache_destroy(drbd_ee_cache);
if (drbd_request_cache)
@@ -2116,11 +2111,6 @@ static void drbd_destroy_mempools(void)
if (drbd_al_ext_cache)
kmem_cache_destroy(drbd_al_ext_cache);
- drbd_io_bio_set = NULL;
- drbd_md_io_bio_set = NULL;
- drbd_md_io_page_pool = NULL;
- drbd_ee_mempool = NULL;
- drbd_request_mempool = NULL;
drbd_ee_cache = NULL;
drbd_request_cache = NULL;
drbd_bm_ext_cache = NULL;
@@ -2133,18 +2123,7 @@ static int drbd_create_mempools(void)
{
struct page *page;
const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count;
- int i;
-
- /* prepare our caches and mempools */
- drbd_request_mempool = NULL;
- drbd_ee_cache = NULL;
- drbd_request_cache = NULL;
- drbd_bm_ext_cache = NULL;
- drbd_al_ext_cache = NULL;
- drbd_pp_pool = NULL;
- drbd_md_io_page_pool = NULL;
- drbd_md_io_bio_set = NULL;
- drbd_io_bio_set = NULL;
+ int i, ret;
/* caches */
drbd_request_cache = kmem_cache_create(
@@ -2168,26 +2147,26 @@ static int drbd_create_mempools(void)
goto Enomem;
/* mempools */
- drbd_io_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0);
- if (drbd_io_bio_set == NULL)
+ ret = bioset_init(&drbd_io_bio_set, BIO_POOL_SIZE, 0, 0);
+ if (ret)
goto Enomem;
- drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0,
- BIOSET_NEED_BVECS);
- if (drbd_md_io_bio_set == NULL)
+ ret = bioset_init(&drbd_md_io_bio_set, DRBD_MIN_POOL_PAGES, 0,
+ BIOSET_NEED_BVECS);
+ if (ret)
goto Enomem;
- drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
- if (drbd_md_io_page_pool == NULL)
+ ret = mempool_init_page_pool(&drbd_md_io_page_pool, DRBD_MIN_POOL_PAGES, 0);
+ if (ret)
goto Enomem;
- drbd_request_mempool = mempool_create_slab_pool(number,
- drbd_request_cache);
- if (drbd_request_mempool == NULL)
+ ret = mempool_init_slab_pool(&drbd_request_mempool, number,
+ drbd_request_cache);
+ if (ret)
goto Enomem;
- drbd_ee_mempool = mempool_create_slab_pool(number, drbd_ee_cache);
- if (drbd_ee_mempool == NULL)
+ ret = mempool_init_slab_pool(&drbd_ee_mempool, number, drbd_ee_cache);
+ if (ret)
goto Enomem;
/* drbd's page pool */
@@ -3010,7 +2989,7 @@ static int __init drbd_init(void)
goto fail;
err = -ENOMEM;
- drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
+ drbd_proc = proc_create_data("drbd", S_IFREG | 0444 , NULL, &drbd_proc_fops, NULL);
if (!drbd_proc) {
pr_err("unable to register proc file\n");
goto fail;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index c72dee0ef083..be9450f5ad1c 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -378,7 +378,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
return NULL;
- peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+ peer_req = mempool_alloc(&drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
if (!peer_req) {
if (!(gfp_mask & __GFP_NOWARN))
drbd_err(device, "%s: allocation failed\n", __func__);
@@ -409,7 +409,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
return peer_req;
fail:
- mempool_free(peer_req, drbd_ee_mempool);
+ mempool_free(peer_req, &drbd_ee_mempool);
return NULL;
}
@@ -426,7 +426,7 @@ void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *
peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
drbd_al_complete_io(device, &peer_req->i);
}
- mempool_free(peer_req, drbd_ee_mempool);
+ mempool_free(peer_req, &drbd_ee_mempool);
}
int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index a500e738d929..a47e4987ee46 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -55,7 +55,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio
{
struct drbd_request *req;
- req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+ req = mempool_alloc(&drbd_request_mempool, GFP_NOIO);
if (!req)
return NULL;
memset(req, 0, sizeof(*req));
@@ -184,7 +184,7 @@ void drbd_req_destroy(struct kref *kref)
}
}
- mempool_free(req, drbd_request_mempool);
+ mempool_free(req, &drbd_request_mempool);
}
static void wake_all_senders(struct drbd_connection *connection)
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index cb97b3b30962..94c654020f0f 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -269,7 +269,7 @@ enum drbd_req_state_bits {
static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
{
struct bio *bio;
- bio = bio_clone_fast(bio_src, GFP_NOIO, drbd_io_bio_set);
+ bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set);
req->private_bio = bio;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 8ec7235fc93b..8871b5044d9e 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4450,7 +4450,7 @@ static ssize_t floppy_cmos_show(struct device *dev,
return sprintf(buf, "%X\n", UDP->cmos);
}
-static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
+static DEVICE_ATTR(cmos, 0444, floppy_cmos_show, NULL);
static struct attribute *floppy_dev_attrs[] = {
&dev_attr_cmos.attr,
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 55cf554bc914..4838b0dbaad3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -732,7 +732,7 @@ static ssize_t loop_attr_do_show_##_name(struct device *d, \
return loop_attr_show(d, b, loop_attr_##_name##_show); \
} \
static struct device_attribute loop_attr_##_name = \
- __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
+ __ATTR(_name, 0444, loop_attr_do_show_##_name, NULL);
static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
{
@@ -809,16 +809,17 @@ static struct attribute_group loop_attribute_group = {
.attrs= loop_attrs,
};
-static int loop_sysfs_init(struct loop_device *lo)
+static void loop_sysfs_init(struct loop_device *lo)
{
- return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
- &loop_attribute_group);
+ lo->sysfs_inited = !sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
+ &loop_attribute_group);
}
static void loop_sysfs_exit(struct loop_device *lo)
{
- sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
- &loop_attribute_group);
+ if (lo->sysfs_inited)
+ sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
+ &loop_attribute_group);
}
static void loop_config_discard(struct loop_device *lo)
@@ -1677,9 +1678,9 @@ static const struct block_device_operations lo_fops = {
* And now the modules code and kernel interface.
*/
static int max_loop;
-module_param(max_loop, int, S_IRUGO);
+module_param(max_loop, int, 0444);
MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
-module_param(max_part, int, S_IRUGO);
+module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index b78de9879f4f..4d42c7af7de7 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -58,6 +58,7 @@ struct loop_device {
struct kthread_worker worker;
struct task_struct *worker_task;
bool use_dio;
+ bool sysfs_inited;
struct request_queue *lo_queue;
struct blk_mq_tag_set tag_set;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 769c551e3d71..c73626decb46 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2285,7 +2285,7 @@ static ssize_t mtip_hw_show_status(struct device *dev,
return size;
}
-static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL);
+static DEVICE_ATTR(status, 0444, mtip_hw_show_status, NULL);
/* debugsfs entries */
@@ -2566,10 +2566,9 @@ static int mtip_hw_debugfs_init(struct driver_data *dd)
return -1;
}
- debugfs_create_file("flags", S_IRUGO, dd->dfs_node, dd,
- &mtip_flags_fops);
- debugfs_create_file("registers", S_IRUGO, dd->dfs_node, dd,
- &mtip_regs_fops);
+ debugfs_create_file("flags", 0444, dd->dfs_node, dd, &mtip_flags_fops);
+ debugfs_create_file("registers", 0444, dd->dfs_node, dd,
+ &mtip_regs_fops);
return 0;
}
@@ -2726,15 +2725,11 @@ static void mtip_softirq_done_fn(struct request *rq)
blk_mq_end_request(rq, cmd->status);
}
-static void mtip_abort_cmd(struct request *req, void *data,
- bool reserved)
+static void mtip_abort_cmd(struct request *req, void *data, bool reserved)
{
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
struct driver_data *dd = data;
- if (!blk_mq_request_started(req))
- return;
-
dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag);
clear_bit(req->tag, dd->port->cmds_to_issue);
@@ -2742,14 +2737,10 @@ static void mtip_abort_cmd(struct request *req, void *data,
mtip_softirq_done_fn(req);
}
-static void mtip_queue_cmd(struct request *req, void *data,
- bool reserved)
+static void mtip_queue_cmd(struct request *req, void *data, bool reserved)
{
struct driver_data *dd = data;
- if (!blk_mq_request_started(req))
- return;
-
set_bit(req->tag, dd->port->cmds_to_issue);
blk_abort_request(req);
}
@@ -3720,7 +3711,8 @@ static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req,
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
cmd->status = BLK_STS_TIMEOUT;
- return BLK_EH_HANDLED;
+ blk_mq_complete_request(req);
+ return BLK_EH_DONE;
}
if (test_bit(req->tag, dd->port->cmds_to_issue))
@@ -3862,7 +3854,6 @@ skip_create_disk:
blk_queue_max_hw_sectors(dd->queue, 0xffff);
blk_queue_max_segment_size(dd->queue, 0x400000);
blk_queue_io_min(dd->queue, 4096);
- blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask);
/* Signal trim support */
if (dd->trim_supp == true) {
@@ -4273,7 +4264,7 @@ static int mtip_pci_probe(struct pci_dev *pdev,
if (!dd->isr_workq) {
dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance);
rv = -ENOMEM;
- goto block_initialize_err;
+ goto setmask_err;
}
memset(cpu_list, 0, sizeof(cpu_list));
@@ -4614,7 +4605,7 @@ static int __init mtip_init(void)
}
if (dfs_parent) {
dfs_device_status = debugfs_create_file("device_status",
- S_IRUGO, dfs_parent, NULL,
+ 0444, dfs_parent, NULL,
&mtip_device_status_fops);
if (IS_ERR_OR_NULL(dfs_device_status)) {
pr_err("Error creating device_status node\n");
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index afbc202ca6fd..3ed1ef8ee528 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -166,16 +166,19 @@ static ssize_t pid_show(struct device *dev,
}
static const struct device_attribute pid_attr = {
- .attr = { .name = "pid", .mode = S_IRUGO},
+ .attr = { .name = "pid", .mode = 0444},
.show = pid_show,
};
static void nbd_dev_remove(struct nbd_device *nbd)
{
struct gendisk *disk = nbd->disk;
+ struct request_queue *q;
+
if (disk) {
+ q = disk->queue;
del_gendisk(disk);
- blk_cleanup_queue(disk->queue);
+ blk_cleanup_queue(q);
blk_mq_free_tag_set(&nbd->tag_set);
disk->private_data = NULL;
put_disk(disk);
@@ -213,7 +216,15 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
}
if (!nsock->dead) {
kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
- atomic_dec(&nbd->config->live_connections);
+ if (atomic_dec_return(&nbd->config->live_connections) == 0) {
+ if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED,
+ &nbd->config->runtime_flags)) {
+ set_bit(NBD_DISCONNECTED,
+ &nbd->config->runtime_flags);
+ dev_info(nbd_to_dev(nbd),
+ "Disconnected due to user request.\n");
+ }
+ }
}
nsock->dead = true;
nsock->pending = NULL;
@@ -231,9 +242,22 @@ static void nbd_size_clear(struct nbd_device *nbd)
static void nbd_size_update(struct nbd_device *nbd)
{
struct nbd_config *config = nbd->config;
+ struct block_device *bdev = bdget_disk(nbd->disk, 0);
+
+ if (config->flags & NBD_FLAG_SEND_TRIM) {
+ nbd->disk->queue->limits.discard_granularity = config->blksize;
+ blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
+ }
blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
set_capacity(nbd->disk, config->bytesize >> 9);
+ if (bdev) {
+ if (bdev->bd_disk)
+ bd_set_size(bdev, config->bytesize);
+ else
+ bdev->bd_invalidated = 1;
+ bdput(bdev);
+ }
kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
}
@@ -243,6 +267,8 @@ static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
struct nbd_config *config = nbd->config;
config->blksize = blocksize;
config->bytesize = blocksize * nr_blocks;
+ if (nbd->task_recv != NULL)
+ nbd_size_update(nbd);
}
static void nbd_complete_rq(struct request *req)
@@ -286,13 +312,15 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
if (!refcount_inc_not_zero(&nbd->config_refs)) {
cmd->status = BLK_STS_TIMEOUT;
- return BLK_EH_HANDLED;
+ goto done;
}
config = nbd->config;
if (config->num_connections > 1) {
dev_err_ratelimited(nbd_to_dev(nbd),
- "Connection timed out, retrying\n");
+ "Connection timed out, retrying (%d/%d alive)\n",
+ atomic_read(&config->live_connections),
+ config->num_connections);
/*
* Hooray we have more connections, requeue this IO, the submit
* path will put it on a real connection.
@@ -314,7 +342,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
}
blk_mq_requeue_request(req, true);
nbd_config_put(nbd);
- return BLK_EH_NOT_HANDLED;
+ return BLK_EH_DONE;
}
} else {
dev_err_ratelimited(nbd_to_dev(nbd),
@@ -324,8 +352,9 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
cmd->status = BLK_STS_IOERR;
sock_shutdown(nbd);
nbd_config_put(nbd);
-
- return BLK_EH_HANDLED;
+done:
+ blk_mq_complete_request(req);
+ return BLK_EH_DONE;
}
/*
@@ -647,11 +676,8 @@ static void recv_work(struct work_struct *work)
static void nbd_clear_req(struct request *req, void *data, bool reserved)
{
- struct nbd_cmd *cmd;
+ struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
- if (!blk_mq_request_started(req))
- return;
- cmd = blk_mq_rq_to_pdu(req);
cmd->status = BLK_STS_IOERR;
blk_mq_complete_request(req);
}
@@ -714,10 +740,9 @@ static int wait_for_reconnect(struct nbd_device *nbd)
return 0;
if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
return 0;
- wait_event_timeout(config->conn_wait,
- atomic_read(&config->live_connections),
- config->dead_conn_timeout);
- return atomic_read(&config->live_connections);
+ return wait_event_timeout(config->conn_wait,
+ atomic_read(&config->live_connections) > 0,
+ config->dead_conn_timeout) > 0;
}
static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
@@ -950,10 +975,6 @@ static void nbd_bdev_reset(struct block_device *bdev)
if (bdev->bd_openers > 1)
return;
bd_set_size(bdev, 0);
- if (max_part > 0) {
- blkdev_reread_part(bdev);
- bdev->bd_invalidated = 1;
- }
}
static void nbd_parse_flags(struct nbd_device *nbd)
@@ -1040,6 +1061,8 @@ static void nbd_config_put(struct nbd_device *nbd)
nbd->config = NULL;
nbd->tag_set.timeout = 0;
+ nbd->disk->queue->limits.discard_granularity = 0;
+ blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
mutex_unlock(&nbd->config_lock);
@@ -1109,7 +1132,6 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
if (ret)
return ret;
- bd_set_size(bdev, config->bytesize);
if (max_part)
bdev->bd_invalidated = 1;
mutex_unlock(&nbd->config_lock);
@@ -1118,7 +1140,7 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
if (ret)
sock_shutdown(nbd);
mutex_lock(&nbd->config_lock);
- bd_set_size(bdev, 0);
+ nbd_bdev_reset(bdev);
/* user requested, ignore socket errors */
if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
ret = 0;
@@ -1269,6 +1291,9 @@ static int nbd_open(struct block_device *bdev, fmode_t mode)
refcount_set(&nbd->config_refs, 1);
refcount_inc(&nbd->refs);
mutex_unlock(&nbd->config_lock);
+ bdev->bd_invalidated = 1;
+ } else if (nbd_disconnected(nbd->config)) {
+ bdev->bd_invalidated = 1;
}
out:
mutex_unlock(&nbd_index_mutex);
@@ -1490,8 +1515,8 @@ static int nbd_dev_add(int index)
*/
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
- disk->queue->limits.discard_granularity = 512;
- blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
+ disk->queue->limits.discard_granularity = 0;
+ blk_queue_max_discard_sectors(disk->queue, 0);
blk_queue_max_segment_size(disk->queue, UINT_MAX);
blk_queue_max_segments(disk->queue, USHRT_MAX);
blk_queue_max_hw_sectors(disk->queue, 65536);
@@ -1755,6 +1780,7 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
}
mutex_lock(&nbd->config_lock);
nbd_disconnect(nbd);
+ nbd_clear_sock(nbd);
mutex_unlock(&nbd->config_lock);
if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
&nbd->config->runtime_flags))
@@ -2093,7 +2119,8 @@ static int __init nbd_init(void)
if (nbds_max > 1UL << (MINORBITS - part_shift))
return -EINVAL;
recv_workqueue = alloc_workqueue("knbd-recv",
- WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ WQ_MEM_RECLAIM | WQ_HIGHPRI |
+ WQ_UNBOUND, 0);
if (!recv_workqueue)
return -ENOMEM;
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index a76553293a31..2bdadd7f1454 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -157,23 +157,23 @@ enum {
};
static int g_no_sched;
-module_param_named(no_sched, g_no_sched, int, S_IRUGO);
+module_param_named(no_sched, g_no_sched, int, 0444);
MODULE_PARM_DESC(no_sched, "No io scheduler");
static int g_submit_queues = 1;
-module_param_named(submit_queues, g_submit_queues, int, S_IRUGO);
+module_param_named(submit_queues, g_submit_queues, int, 0444);
MODULE_PARM_DESC(submit_queues, "Number of submission queues");
static int g_home_node = NUMA_NO_NODE;
-module_param_named(home_node, g_home_node, int, S_IRUGO);
+module_param_named(home_node, g_home_node, int, 0444);
MODULE_PARM_DESC(home_node, "Home node for the device");
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
static char g_timeout_str[80];
-module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
+module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444);
static char g_requeue_str[80];
-module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), S_IRUGO);
+module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444);
#endif
static int g_queue_mode = NULL_Q_MQ;
@@ -203,27 +203,27 @@ static const struct kernel_param_ops null_queue_mode_param_ops = {
.get = param_get_int,
};
-device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, S_IRUGO);
+device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444);
MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
static int g_gb = 250;
-module_param_named(gb, g_gb, int, S_IRUGO);
+module_param_named(gb, g_gb, int, 0444);
MODULE_PARM_DESC(gb, "Size in GB");
static int g_bs = 512;
-module_param_named(bs, g_bs, int, S_IRUGO);
+module_param_named(bs, g_bs, int, 0444);
MODULE_PARM_DESC(bs, "Block size (in bytes)");
static int nr_devices = 1;
-module_param(nr_devices, int, S_IRUGO);
+module_param(nr_devices, int, 0444);
MODULE_PARM_DESC(nr_devices, "Number of devices to register");
static bool g_blocking;
-module_param_named(blocking, g_blocking, bool, S_IRUGO);
+module_param_named(blocking, g_blocking, bool, 0444);
MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
static bool shared_tags;
-module_param(shared_tags, bool, S_IRUGO);
+module_param(shared_tags, bool, 0444);
MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
static int g_irqmode = NULL_IRQ_SOFTIRQ;
@@ -239,19 +239,19 @@ static const struct kernel_param_ops null_irqmode_param_ops = {
.get = param_get_int,
};
-device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, S_IRUGO);
+device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444);
MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
static unsigned long g_completion_nsec = 10000;
-module_param_named(completion_nsec, g_completion_nsec, ulong, S_IRUGO);
+module_param_named(completion_nsec, g_completion_nsec, ulong, 0444);
MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
static int g_hw_queue_depth = 64;
-module_param_named(hw_queue_depth, g_hw_queue_depth, int, S_IRUGO);
+module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444);
MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
static bool g_use_per_node_hctx;
-module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, S_IRUGO);
+module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
static struct nullb_device *null_alloc_dev(void);
@@ -1365,7 +1365,8 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
{
pr_info("null: rq %p timed out\n", rq);
- return BLK_EH_HANDLED;
+ blk_mq_complete_request(rq);
+ return BLK_EH_DONE;
}
static int null_rq_prep_fn(struct request_queue *q, struct request *req)
@@ -1427,7 +1428,8 @@ static void null_request_fn(struct request_queue *q)
static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
{
pr_info("null: rq %p timed out\n", rq);
- return BLK_EH_HANDLED;
+ blk_mq_complete_request(rq);
+ return BLK_EH_DONE;
}
static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 27a44b97393a..8961b190e256 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -740,7 +740,7 @@ static int pd_special_command(struct pd_unit *disk,
{
struct request *rq;
- rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+ rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, 0);
if (IS_ERR(rq))
return PTR_ERR(rq);
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index c61d20c9f3f8..1a2c0101cfcb 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -97,8 +97,8 @@ static int pktdev_major;
static int write_congestion_on = PKT_WRITE_CONGESTION_ON;
static int write_congestion_off = PKT_WRITE_CONGESTION_OFF;
static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */
-static mempool_t *psd_pool;
-static struct bio_set *pkt_bio_set;
+static mempool_t psd_pool;
+static struct bio_set pkt_bio_set;
static struct class *class_pktcdvd = NULL; /* /sys/class/pktcdvd */
static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */
@@ -478,8 +478,8 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
if (!pd->dfs_d_root)
return;
- pd->dfs_f_info = debugfs_create_file("info", S_IRUGO,
- pd->dfs_d_root, pd, &debug_fops);
+ pd->dfs_f_info = debugfs_create_file("info", 0444,
+ pd->dfs_d_root, pd, &debug_fops);
}
static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd)
@@ -631,7 +631,7 @@ static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node)
static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node)
{
rb_erase(&node->rb_node, &pd->bio_queue);
- mempool_free(node, pd->rb_pool);
+ mempool_free(node, &pd->rb_pool);
pd->bio_queue_size--;
BUG_ON(pd->bio_queue_size < 0);
}
@@ -704,13 +704,13 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
int ret = 0;
rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
- REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM);
+ REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
if (IS_ERR(rq))
return PTR_ERR(rq);
if (cgc->buflen) {
ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
- __GFP_RECLAIM);
+ GFP_NOIO);
if (ret)
goto out;
}
@@ -1285,7 +1285,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
* Fill-in bvec with data from orig_bios.
*/
spin_lock(&pkt->lock);
- bio_copy_data(pkt->w_bio, pkt->orig_bios.head);
+ bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head);
pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
spin_unlock(&pkt->lock);
@@ -2303,14 +2303,14 @@ static void pkt_end_io_read_cloned(struct bio *bio)
psd->bio->bi_status = bio->bi_status;
bio_put(bio);
bio_endio(psd->bio);
- mempool_free(psd, psd_pool);
+ mempool_free(psd, &psd_pool);
pkt_bio_finished(pd);
}
static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
{
- struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, pkt_bio_set);
- struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO);
+ struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, &pkt_bio_set);
+ struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO);
psd->pd = pd;
psd->bio = bio;
@@ -2381,7 +2381,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
/*
* No matching packet found. Store the bio in the work queue.
*/
- node = mempool_alloc(pd->rb_pool, GFP_NOIO);
+ node = mempool_alloc(&pd->rb_pool, GFP_NOIO);
node->bio = bio;
spin_lock(&pd->lock);
BUG_ON(pd->bio_queue_size < 0);
@@ -2451,7 +2451,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio)
split = bio_split(bio, last_zone -
bio->bi_iter.bi_sector,
- GFP_NOIO, pkt_bio_set);
+ GFP_NOIO, &pkt_bio_set);
bio_chain(split, bio);
} else {
split = bio;
@@ -2707,9 +2707,9 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
if (!pd)
goto out_mutex;
- pd->rb_pool = mempool_create_kmalloc_pool(PKT_RB_POOL_SIZE,
- sizeof(struct pkt_rb_node));
- if (!pd->rb_pool)
+ ret = mempool_init_kmalloc_pool(&pd->rb_pool, PKT_RB_POOL_SIZE,
+ sizeof(struct pkt_rb_node));
+ if (ret)
goto out_mem;
INIT_LIST_HEAD(&pd->cdrw.pkt_free_list);
@@ -2766,7 +2766,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
out_mem2:
put_disk(disk);
out_mem:
- mempool_destroy(pd->rb_pool);
+ mempool_exit(&pd->rb_pool);
kfree(pd);
out_mutex:
mutex_unlock(&ctl_mutex);
@@ -2817,7 +2817,7 @@ static int pkt_remove_dev(dev_t pkt_dev)
blk_cleanup_queue(pd->disk->queue);
put_disk(pd->disk);
- mempool_destroy(pd->rb_pool);
+ mempool_exit(&pd->rb_pool);
kfree(pd);
/* This is safe: open() is still holding a reference. */
@@ -2914,14 +2914,14 @@ static int __init pkt_init(void)
mutex_init(&ctl_mutex);
- psd_pool = mempool_create_kmalloc_pool(PSD_POOL_SIZE,
- sizeof(struct packet_stacked_data));
- if (!psd_pool)
- return -ENOMEM;
- pkt_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0);
- if (!pkt_bio_set) {
- mempool_destroy(psd_pool);
- return -ENOMEM;
+ ret = mempool_init_kmalloc_pool(&psd_pool, PSD_POOL_SIZE,
+ sizeof(struct packet_stacked_data));
+ if (ret)
+ return ret;
+ ret = bioset_init(&pkt_bio_set, BIO_POOL_SIZE, 0, 0);
+ if (ret) {
+ mempool_exit(&psd_pool);
+ return ret;
}
ret = register_blkdev(pktdev_major, DRIVER_NAME);
@@ -2954,8 +2954,8 @@ out_misc:
out:
unregister_blkdev(pktdev_major, DRIVER_NAME);
out2:
- mempool_destroy(psd_pool);
- bioset_free(pkt_bio_set);
+ mempool_exit(&psd_pool);
+ bioset_exit(&pkt_bio_set);
return ret;
}
@@ -2968,8 +2968,8 @@ static void __exit pkt_exit(void)
pkt_sysfs_cleanup();
unregister_blkdev(pktdev_major, DRIVER_NAME);
- mempool_destroy(psd_pool);
- bioset_free(pkt_bio_set);
+ mempool_exit(&psd_pool);
+ bioset_exit(&pkt_bio_set);
}
MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives");
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 075662f2cf46..afe1508d82c6 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -465,8 +465,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
priv->queue = queue;
queue->queuedata = dev;
- blk_queue_bounce_limit(queue, BLK_BOUNCE_HIGH);
-
blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9);
blk_queue_segment_boundary(queue, -1UL);
blk_queue_dma_alignment(queue, dev->blk_size-1);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 33b36fea1d73..af354047ac4b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -424,7 +424,7 @@ static struct workqueue_struct *rbd_wq;
* single-major requires >= 0.75 version of userspace rbd utility.
*/
static bool single_major = true;
-module_param(single_major, bool, S_IRUGO);
+module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
@@ -468,11 +468,11 @@ static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}
-static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
-static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
-static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
-static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
-static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);
+static BUS_ATTR(add, 0200, NULL, rbd_add);
+static BUS_ATTR(remove, 0200, NULL, rbd_remove);
+static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major);
+static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major);
+static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL);
static struct attribute *rbd_bus_attrs[] = {
&bus_attr_add.attr,
@@ -4204,22 +4204,22 @@ static ssize_t rbd_image_refresh(struct device *dev,
return size;
}
-static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
-static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
-static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
-static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
-static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
-static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
-static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
-static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
-static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
-static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
-static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
-static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
-static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
-static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
-static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
-static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
+static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
+static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
+static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
+static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
+static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
+static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
+static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
+static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
+static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
+static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
+static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
+static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
+static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
+static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
+static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
+static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
static struct attribute *rbd_attrs[] = {
&dev_attr_size.attr,
diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
index 34997df132e2..09537bee387f 100644
--- a/drivers/block/rsxx/core.c
+++ b/drivers/block/rsxx/core.c
@@ -247,19 +247,19 @@ static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card)
if (IS_ERR_OR_NULL(card->debugfs_dir))
goto failed_debugfs_dir;
- debugfs_stats = debugfs_create_file("stats", S_IRUGO,
+ debugfs_stats = debugfs_create_file("stats", 0444,
card->debugfs_dir, card,
&debugfs_stats_fops);
if (IS_ERR_OR_NULL(debugfs_stats))
goto failed_debugfs_stats;
- debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO,
+ debugfs_pci_regs = debugfs_create_file("pci_regs", 0444,
card->debugfs_dir, card,
&debugfs_pci_regs_fops);
if (IS_ERR_OR_NULL(debugfs_pci_regs))
goto failed_debugfs_pci_regs;
- debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR,
+ debugfs_cram = debugfs_create_file("cram", 0644,
card->debugfs_dir, card,
&debugfs_cram_fops);
if (IS_ERR_OR_NULL(debugfs_cram))
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 08586dc14e85..4d90e5eba2f5 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -567,7 +567,7 @@ static struct carm_request *carm_get_special(struct carm_host *host)
if (!crq)
return NULL;
- rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, GFP_KERNEL);
+ rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, 0);
if (IS_ERR(rq)) {
spin_lock_irqsave(&host->lock, flags);
carm_put_request(host, crq);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 4a07593c2efd..23752dc99b00 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -298,7 +298,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
struct request *req;
int err;
- req = blk_get_request(q, REQ_OP_DRV_IN, GFP_KERNEL);
+ req = blk_get_request(q, REQ_OP_DRV_IN, 0);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -371,7 +371,7 @@ static ssize_t virtblk_serial_show(struct device *dev,
return err;
}
-static DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL);
+static DEVICE_ATTR(serial, 0444, virtblk_serial_show, NULL);
/* The queue's logical block size must be set before calling this */
static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
@@ -576,10 +576,10 @@ virtblk_cache_type_show(struct device *dev, struct device_attribute *attr,
}
static const struct device_attribute dev_attr_cache_type_ro =
- __ATTR(cache_type, S_IRUGO,
+ __ATTR(cache_type, 0444,
virtblk_cache_type_show, NULL);
static const struct device_attribute dev_attr_cache_type_rw =
- __ATTR(cache_type, S_IRUGO|S_IWUSR,
+ __ATTR(cache_type, 0644,
virtblk_cache_type_show, virtblk_cache_type_store);
static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq,
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 987d665e82de..b55b245e8052 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -98,7 +98,7 @@ MODULE_PARM_DESC(max_queues,
* backend, 4KB page granularity is used.
*/
unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
-module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
/*
* The LRU mechanism to clean the lists of persistent grants needs to
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 21c1be1eb226..66412eededda 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -367,7 +367,7 @@ int __init xen_blkif_interface_init(void)
out: \
return sprintf(buf, format, result); \
} \
- static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+ static DEVICE_ATTR(name, 0444, show_##name, NULL)
VBD_SHOW_ALLRING(oo_req, "%llu\n");
VBD_SHOW_ALLRING(rd_req, "%llu\n");
@@ -403,7 +403,7 @@ static const struct attribute_group xen_vbdstat_group = {
\
return sprintf(buf, format, ##args); \
} \
- static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+ static DEVICE_ATTR(name, 0444, show_##name, NULL)
VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
VBD_SHOW(mode, "%s\n", be->mode);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2a8e7813bd1a..ae00a82f350b 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -129,13 +129,12 @@ static const struct block_device_operations xlvbd_block_fops;
*/
static unsigned int xen_blkif_max_segments = 32;
-module_param_named(max_indirect_segments, xen_blkif_max_segments, uint,
- S_IRUGO);
+module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, 0444);
MODULE_PARM_DESC(max_indirect_segments,
"Maximum amount of segments in indirect requests (default is 32)");
static unsigned int xen_blkif_max_queues = 4;
-module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO);
+module_param_named(max_queues, xen_blkif_max_queues, uint, 0444);
MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");
/*
@@ -143,7 +142,7 @@ MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per v
* backend, 4KB page granularity is used.
*/
static unsigned int xen_blkif_max_ring_order;
-module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
#define BLK_RING_SIZE(info) \
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index bfc566d3f31a..9adc8c3eb0fa 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2192,7 +2192,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
len = nr * CD_FRAMESIZE_RAW;
- rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
+ rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
if (IS_ERR(rq)) {
ret = PTR_ERR(rq);
break;
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 0e6bc631a1ca..8b2b72b93885 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -92,7 +92,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
struct request *rq;
int error;
- rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+ rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
ide_req(rq)->type = ATA_PRIV_MISC;
rq->special = (char *)pc;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 5a8e8e3c22cd..32af6f063cb3 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -437,7 +437,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
bool delay = false;
rq = blk_get_request(drive->queue,
- write ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, __GFP_RECLAIM);
+ write ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
memcpy(scsi_req(rq)->cmd, cmd, BLK_MAX_CDB);
ide_req(rq)->type = ATA_PRIV_PC;
rq->rq_flags |= rq_flags;
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 2acca12b9c94..b1322400887b 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -304,7 +304,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
struct request *rq;
int ret;
- rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+ rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
ide_req(rq)->type = ATA_PRIV_MISC;
rq->rq_flags = RQF_QUIET;
blk_execute_rq(drive->queue, cd->disk, rq, 0);
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index 4e20747af32e..f4f8afdf8bbe 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -166,7 +166,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
if (!(setting->flags & DS_SYNC))
return setting->set(drive, arg);
- rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM);
+ rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
ide_req(rq)->type = ATA_PRIV_MISC;
scsi_req(rq)->cmd_len = 5;
scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC;
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index f1a7c58fe418..e3b4e659082d 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -478,7 +478,7 @@ static int set_multcount(ide_drive_t *drive, int arg)
if (drive->special_flags & IDE_SFLAG_SET_MULTMODE)
return -EBUSY;
- rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+ rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
ide_req(rq)->type = ATA_PRIV_TASKFILE;
drive->mult_req = arg;
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index 3661abb16a5f..af5119a73689 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -125,7 +125,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
if (NULL == (void *) arg) {
struct request *rq;
- rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+ rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
ide_req(rq)->type = ATA_PRIV_TASKFILE;
blk_execute_rq(drive->queue, NULL, rq, 0);
err = scsi_req(rq)->result ? -EIO : 0;
@@ -222,7 +222,7 @@ static int generic_drive_reset(ide_drive_t *drive)
struct request *rq;
int ret = 0;
- rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+ rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
ide_req(rq)->type = ATA_PRIV_MISC;
scsi_req(rq)->cmd_len = 1;
scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 6465bcc7cea6..622f0edb3945 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -32,7 +32,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
}
spin_unlock_irq(&hwif->lock);
- rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM);
+ rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
scsi_req(rq)->cmd[0] = REQ_PARK_HEADS;
scsi_req(rq)->cmd_len = 1;
ide_req(rq)->type = ATA_PRIV_MISC;
@@ -47,7 +47,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
* Make sure that *some* command is sent to the drive after the
* timeout has expired, so power management will be reenabled.
*/
- rq = blk_get_request(q, REQ_OP_DRV_IN, GFP_NOWAIT);
+ rq = blk_get_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_NOWAIT);
if (IS_ERR(rq))
goto out;
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index ad8a125defdd..59217aa1d1fb 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -19,7 +19,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
}
memset(&rqpm, 0, sizeof(rqpm));
- rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+ rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
ide_req(rq)->type = ATA_PRIV_PM_SUSPEND;
rq->special = &rqpm;
rqpm.pm_step = IDE_PM_START_SUSPEND;
@@ -90,8 +90,7 @@ int generic_ide_resume(struct device *dev)
}
memset(&rqpm, 0, sizeof(rqpm));
- rq = blk_get_request_flags(drive->queue, REQ_OP_DRV_IN,
- BLK_MQ_REQ_PREEMPT);
+ rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
ide_req(rq)->type = ATA_PRIV_PM_RESUME;
rq->special = &rqpm;
rqpm.pm_step = IDE_PM_START_RESUME;
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index fd57e8ccc47a..62c1a19a9aed 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -854,7 +854,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
BUG_ON(size < 0 || size % tape->blk_size);
- rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+ rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
ide_req(rq)->type = ATA_PRIV_MISC;
scsi_req(rq)->cmd[13] = cmd;
rq->rq_disk = tape->disk;
@@ -862,7 +862,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
if (size) {
ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size,
- __GFP_RECLAIM);
+ GFP_NOIO);
if (ret)
goto out_put;
}
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index abe0822dd429..c034cd965831 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -431,7 +431,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
rq = blk_get_request(drive->queue,
(cmd->tf_flags & IDE_TFLAG_WRITE) ?
- REQ_OP_DRV_OUT : REQ_OP_DRV_IN, __GFP_RECLAIM);
+ REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
ide_req(rq)->type = ATA_PRIV_TASKFILE;
/*
@@ -442,7 +442,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
*/
if (nsect) {
error = blk_rq_map_kern(drive->queue, rq, buf,
- nsect * SECTOR_SIZE, __GFP_RECLAIM);
+ nsect * SECTOR_SIZE, GFP_NOIO);
if (error)
goto put_req;
}
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 63171cdce270..60aa7bc5a630 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -431,7 +431,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
return 0;
err_sysfs:
if (tt->exit)
- tt->exit(targetdata);
+ tt->exit(targetdata, true);
err_init:
blk_cleanup_queue(tqueue);
tdisk->queue = NULL;
@@ -446,7 +446,7 @@ err_reserve:
return ret;
}
-static void __nvm_remove_target(struct nvm_target *t)
+static void __nvm_remove_target(struct nvm_target *t, bool graceful)
{
struct nvm_tgt_type *tt = t->type;
struct gendisk *tdisk = t->disk;
@@ -459,7 +459,7 @@ static void __nvm_remove_target(struct nvm_target *t)
tt->sysfs_exit(tdisk);
if (tt->exit)
- tt->exit(tdisk->private_data);
+ tt->exit(tdisk->private_data, graceful);
nvm_remove_tgt_dev(t->dev, 1);
put_disk(tdisk);
@@ -489,7 +489,7 @@ static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove)
mutex_unlock(&dev->mlock);
return 1;
}
- __nvm_remove_target(t);
+ __nvm_remove_target(t, true);
mutex_unlock(&dev->mlock);
return 0;
@@ -963,7 +963,7 @@ void nvm_unregister(struct nvm_dev *dev)
list_for_each_entry_safe(t, tmp, &dev->targets, list) {
if (t->dev->parent != dev)
continue;
- __nvm_remove_target(t);
+ __nvm_remove_target(t, false);
}
mutex_unlock(&dev->mlock);
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index 29a23111b31c..b1c6d7eb6115 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -44,13 +44,15 @@ retry:
goto out;
}
- if (unlikely(!bio_has_data(bio)))
- goto out;
-
pblk_ppa_set_empty(&w_ctx.ppa);
w_ctx.flags = flags;
- if (bio->bi_opf & REQ_PREFLUSH)
+ if (bio->bi_opf & REQ_PREFLUSH) {
w_ctx.flags |= PBLK_FLUSH_ENTRY;
+ pblk_write_kick(pblk);
+ }
+
+ if (unlikely(!bio_has_data(bio)))
+ goto out;
for (i = 0; i < nr_entries; i++) {
void *data = bio_data(bio);
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 94d5d97c9d8a..ed9cc977c8b3 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -40,7 +40,7 @@ static void pblk_line_mark_bb(struct work_struct *work)
}
kfree(ppa);
- mempool_free(line_ws, pblk->gen_ws_pool);
+ mempool_free(line_ws, &pblk->gen_ws_pool);
}
static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
@@ -102,7 +102,7 @@ static void pblk_end_io_erase(struct nvm_rq *rqd)
struct pblk *pblk = rqd->private;
__pblk_end_io_erase(pblk, rqd);
- mempool_free(rqd, pblk->e_rq_pool);
+ mempool_free(rqd, &pblk->e_rq_pool);
}
/*
@@ -237,15 +237,15 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type)
switch (type) {
case PBLK_WRITE:
case PBLK_WRITE_INT:
- pool = pblk->w_rq_pool;
+ pool = &pblk->w_rq_pool;
rq_size = pblk_w_rq_size;
break;
case PBLK_READ:
- pool = pblk->r_rq_pool;
+ pool = &pblk->r_rq_pool;
rq_size = pblk_g_rq_size;
break;
default:
- pool = pblk->e_rq_pool;
+ pool = &pblk->e_rq_pool;
rq_size = pblk_g_rq_size;
}
@@ -265,20 +265,22 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
case PBLK_WRITE:
kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap);
case PBLK_WRITE_INT:
- pool = pblk->w_rq_pool;
+ pool = &pblk->w_rq_pool;
break;
case PBLK_READ:
- pool = pblk->r_rq_pool;
+ pool = &pblk->r_rq_pool;
break;
case PBLK_ERASE:
- pool = pblk->e_rq_pool;
+ pool = &pblk->e_rq_pool;
break;
default:
pr_err("pblk: trying to free unknown rqd type\n");
return;
}
- nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
+ if (rqd->meta_list)
+ nvm_dev_dma_free(dev->parent, rqd->meta_list,
+ rqd->dma_meta_list);
mempool_free(rqd, pool);
}
@@ -292,7 +294,7 @@ void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
for (i = off; i < nr_pages + off; i++) {
bv = bio->bi_io_vec[i];
- mempool_free(bv.bv_page, pblk->page_bio_pool);
+ mempool_free(bv.bv_page, &pblk->page_bio_pool);
}
}
@@ -304,23 +306,23 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
int i, ret;
for (i = 0; i < nr_pages; i++) {
- page = mempool_alloc(pblk->page_bio_pool, flags);
+ page = mempool_alloc(&pblk->page_bio_pool, flags);
ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
if (ret != PBLK_EXPOSED_PAGE_SIZE) {
pr_err("pblk: could not add page to bio\n");
- mempool_free(page, pblk->page_bio_pool);
+ mempool_free(page, &pblk->page_bio_pool);
goto err;
}
}
return 0;
err:
- pblk_bio_free_pages(pblk, bio, 0, i - 1);
+ pblk_bio_free_pages(pblk, bio, (bio->bi_vcnt - i), i);
return -1;
}
-static void pblk_write_kick(struct pblk *pblk)
+void pblk_write_kick(struct pblk *pblk)
{
wake_up_process(pblk->writer_ts);
mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000));
@@ -342,13 +344,6 @@ void pblk_write_should_kick(struct pblk *pblk)
pblk_write_kick(pblk);
}
-void pblk_end_io_sync(struct nvm_rq *rqd)
-{
- struct completion *waiting = rqd->private;
-
- complete(waiting);
-}
-
static void pblk_wait_for_meta(struct pblk *pblk)
{
do {
@@ -380,7 +375,13 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
lockdep_assert_held(&line->lock);
- if (!vsc) {
+ if (line->w_err_gc->has_write_err) {
+ if (line->gc_group != PBLK_LINEGC_WERR) {
+ line->gc_group = PBLK_LINEGC_WERR;
+ move_list = &l_mg->gc_werr_list;
+ pblk_rl_werr_line_in(&pblk->rl);
+ }
+ } else if (!vsc) {
if (line->gc_group != PBLK_LINEGC_FULL) {
line->gc_group = PBLK_LINEGC_FULL;
move_list = &l_mg->gc_full_list;
@@ -467,16 +468,13 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
{
struct nvm_tgt_dev *dev = pblk->dev;
-#ifdef CONFIG_NVM_DEBUG
- int ret;
+ atomic_inc(&pblk->inflight_io);
- ret = pblk_check_io(pblk, rqd);
- if (ret)
- return ret;
+#ifdef CONFIG_NVM_DEBUG
+ if (pblk_check_io(pblk, rqd))
+ return NVM_IO_ERR;
#endif
- atomic_inc(&pblk->inflight_io);
-
return nvm_submit_io(dev, rqd);
}
@@ -484,16 +482,13 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
{
struct nvm_tgt_dev *dev = pblk->dev;
-#ifdef CONFIG_NVM_DEBUG
- int ret;
+ atomic_inc(&pblk->inflight_io);
- ret = pblk_check_io(pblk, rqd);
- if (ret)
- return ret;
+#ifdef CONFIG_NVM_DEBUG
+ if (pblk_check_io(pblk, rqd))
+ return NVM_IO_ERR;
#endif
- atomic_inc(&pblk->inflight_io);
-
return nvm_submit_io_sync(dev, rqd);
}
@@ -856,9 +851,10 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
atomic_dec(&pblk->inflight_io);
if (rqd.error) {
- if (dir == PBLK_WRITE)
+ if (dir == PBLK_WRITE) {
pblk_log_write_err(pblk, &rqd);
- else if (dir == PBLK_READ)
+ ret = 1;
+ } else if (dir == PBLK_READ)
pblk_log_read_err(pblk, &rqd);
}
@@ -1071,6 +1067,25 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
return 1;
}
+static int pblk_line_alloc_bitmaps(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+
+ line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+ if (!line->map_bitmap)
+ return -ENOMEM;
+
+ /* will be initialized using bb info from map_bitmap */
+ line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL);
+ if (!line->invalid_bitmap) {
+ kfree(line->map_bitmap);
+ line->map_bitmap = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
/* For now lines are always assumed full lines. Thus, smeta former and current
* lun bitmaps are omitted.
*/
@@ -1108,7 +1123,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) {
pr_debug("pblk: line smeta I/O failed. Retry\n");
- return 1;
+ return 0;
}
bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line);
@@ -1174,19 +1189,9 @@ static int pblk_prepare_new_line(struct pblk *pblk, struct pblk_line *line)
static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
{
struct pblk_line_meta *lm = &pblk->lm;
+ int blk_in_line = atomic_read(&line->blk_in_line);
int blk_to_erase;
- line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_ATOMIC);
- if (!line->map_bitmap)
- return -ENOMEM;
-
- /* will be initialized using bb info from map_bitmap */
- line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_ATOMIC);
- if (!line->invalid_bitmap) {
- kfree(line->map_bitmap);
- return -ENOMEM;
- }
-
/* Bad blocks do not need to be erased */
bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line);
@@ -1199,16 +1204,19 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
blk_to_erase = pblk_prepare_new_line(pblk, line);
line->state = PBLK_LINESTATE_FREE;
} else {
- blk_to_erase = atomic_read(&line->blk_in_line);
+ blk_to_erase = blk_in_line;
}
- if (line->state != PBLK_LINESTATE_FREE) {
- kfree(line->map_bitmap);
- kfree(line->invalid_bitmap);
+ if (blk_in_line < lm->min_blk_line) {
spin_unlock(&line->lock);
+ return -EAGAIN;
+ }
+
+ if (line->state != PBLK_LINESTATE_FREE) {
WARN(1, "pblk: corrupted line %d, state %d\n",
line->id, line->state);
- return -EAGAIN;
+ spin_unlock(&line->lock);
+ return -EINTR;
}
line->state = PBLK_LINESTATE_OPEN;
@@ -1241,13 +1249,16 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
}
spin_unlock(&l_mg->free_lock);
- pblk_rl_free_lines_dec(&pblk->rl, line, true);
+ ret = pblk_line_alloc_bitmaps(pblk, line);
+ if (ret)
+ return ret;
if (!pblk_line_init_bb(pblk, line, 0)) {
list_add(&line->list, &l_mg->free_list);
return -EINTR;
}
+ pblk_rl_free_lines_dec(&pblk->rl, line, true);
return 0;
}
@@ -1259,6 +1270,24 @@ void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line)
line->emeta = NULL;
}
+static void pblk_line_reinit(struct pblk_line *line)
+{
+ *line->vsc = cpu_to_le32(EMPTY_ENTRY);
+
+ line->map_bitmap = NULL;
+ line->invalid_bitmap = NULL;
+ line->smeta = NULL;
+ line->emeta = NULL;
+}
+
+void pblk_line_free(struct pblk_line *line)
+{
+ kfree(line->map_bitmap);
+ kfree(line->invalid_bitmap);
+
+ pblk_line_reinit(line);
+}
+
struct pblk_line *pblk_line_get(struct pblk *pblk)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -1292,10 +1321,14 @@ retry:
ret = pblk_line_prepare(pblk, line);
if (ret) {
- if (ret == -EAGAIN) {
+ switch (ret) {
+ case -EAGAIN:
+ list_add(&line->list, &l_mg->bad_list);
+ goto retry;
+ case -EINTR:
list_add(&line->list, &l_mg->corrupt_list);
goto retry;
- } else {
+ default:
pr_err("pblk: failed to prepare line %d\n", line->id);
list_add(&line->list, &l_mg->free_list);
l_mg->nr_free_lines++;
@@ -1321,11 +1354,14 @@ retry:
return NULL;
}
+ retry_line->map_bitmap = line->map_bitmap;
+ retry_line->invalid_bitmap = line->invalid_bitmap;
retry_line->smeta = line->smeta;
retry_line->emeta = line->emeta;
retry_line->meta_line = line->meta_line;
- pblk_line_free(pblk, line);
+ pblk_line_reinit(line);
+
l_mg->data_line = retry_line;
spin_unlock(&l_mg->free_lock);
@@ -1378,6 +1414,9 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
}
spin_unlock(&l_mg->free_lock);
+ if (pblk_line_alloc_bitmaps(pblk, line))
+ return NULL;
+
if (pblk_line_erase(pblk, line)) {
line = pblk_line_retry(pblk, line);
if (!line)
@@ -1449,7 +1488,7 @@ static void pblk_line_close_meta_sync(struct pblk *pblk)
flush_workqueue(pblk->close_wq);
}
-void pblk_pipeline_stop(struct pblk *pblk)
+void __pblk_pipeline_flush(struct pblk *pblk)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
int ret;
@@ -1474,6 +1513,11 @@ void pblk_pipeline_stop(struct pblk *pblk)
flush_workqueue(pblk->bb_wq);
pblk_line_close_meta_sync(pblk);
+}
+
+void __pblk_pipeline_stop(struct pblk *pblk)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
spin_lock(&l_mg->free_lock);
pblk->state = PBLK_STATE_STOPPED;
@@ -1482,6 +1526,12 @@ void pblk_pipeline_stop(struct pblk *pblk)
spin_unlock(&l_mg->free_lock);
}
+void pblk_pipeline_stop(struct pblk *pblk)
+{
+ __pblk_pipeline_flush(pblk);
+ __pblk_pipeline_stop(pblk);
+}
+
struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -1511,6 +1561,9 @@ retry_erase:
goto retry_erase;
}
+ if (pblk_line_alloc_bitmaps(pblk, new))
+ return NULL;
+
retry_setup:
if (!pblk_line_init_metadata(pblk, new, cur)) {
new = pblk_line_retry(pblk, new);
@@ -1550,19 +1603,6 @@ out:
return new;
}
-void pblk_line_free(struct pblk *pblk, struct pblk_line *line)
-{
- kfree(line->map_bitmap);
- kfree(line->invalid_bitmap);
-
- *line->vsc = cpu_to_le32(EMPTY_ENTRY);
-
- line->map_bitmap = NULL;
- line->invalid_bitmap = NULL;
- line->smeta = NULL;
- line->emeta = NULL;
-}
-
static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -1572,9 +1612,14 @@ static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line)
WARN_ON(line->state != PBLK_LINESTATE_GC);
line->state = PBLK_LINESTATE_FREE;
line->gc_group = PBLK_LINEGC_NONE;
- pblk_line_free(pblk, line);
- spin_unlock(&line->lock);
+ pblk_line_free(line);
+
+ if (line->w_err_gc->has_write_err) {
+ pblk_rl_werr_line_out(&pblk->rl);
+ line->w_err_gc->has_write_err = 0;
+ }
+ spin_unlock(&line->lock);
atomic_dec(&gc->pipeline_gc);
spin_lock(&l_mg->free_lock);
@@ -1593,7 +1638,7 @@ static void pblk_line_put_ws(struct work_struct *work)
struct pblk_line *line = line_put_ws->line;
__pblk_line_put(pblk, line);
- mempool_free(line_put_ws, pblk->gen_ws_pool);
+ mempool_free(line_put_ws, &pblk->gen_ws_pool);
}
void pblk_line_put(struct kref *ref)
@@ -1610,7 +1655,7 @@ void pblk_line_put_wq(struct kref *ref)
struct pblk *pblk = line->pblk;
struct pblk_line_ws *line_put_ws;
- line_put_ws = mempool_alloc(pblk->gen_ws_pool, GFP_ATOMIC);
+ line_put_ws = mempool_alloc(&pblk->gen_ws_pool, GFP_ATOMIC);
if (!line_put_ws)
return;
@@ -1737,11 +1782,34 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
spin_lock(&l_mg->close_lock);
spin_lock(&line->lock);
+
+ /* Update the in-memory start address for emeta, in case it has
+ * shifted due to write errors
+ */
+ if (line->emeta_ssec != line->cur_sec)
+ line->emeta_ssec = line->cur_sec;
+
list_add_tail(&line->list, &l_mg->emeta_list);
spin_unlock(&line->lock);
spin_unlock(&l_mg->close_lock);
pblk_line_should_sync_meta(pblk);
+
+
+}
+
+static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ unsigned int lba_list_size = lm->emeta_len[2];
+ struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
+ struct pblk_emeta *emeta = line->emeta;
+
+ w_err_gc->lba_list = pblk_malloc(lba_list_size,
+ l_mg->emeta_alloc_type, GFP_KERNEL);
+ memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf),
+ lba_list_size);
}
void pblk_line_close_ws(struct work_struct *work)
@@ -1750,9 +1818,16 @@ void pblk_line_close_ws(struct work_struct *work)
ws);
struct pblk *pblk = line_ws->pblk;
struct pblk_line *line = line_ws->line;
+ struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
+
+ /* Write errors makes the emeta start address stored in smeta invalid,
+ * so keep a copy of the lba list until we've gc'd the line
+ */
+ if (w_err_gc->has_write_err)
+ pblk_save_lba_list(pblk, line);
pblk_line_close(pblk, line);
- mempool_free(line_ws, pblk->gen_ws_pool);
+ mempool_free(line_ws, &pblk->gen_ws_pool);
}
void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
@@ -1761,7 +1836,7 @@ void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
{
struct pblk_line_ws *line_ws;
- line_ws = mempool_alloc(pblk->gen_ws_pool, gfp_mask);
+ line_ws = mempool_alloc(&pblk->gen_ws_pool, gfp_mask);
line_ws->pblk = pblk;
line_ws->line = line;
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 6851a5c67189..df88f1bdd921 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -129,6 +129,53 @@ out:
kfree(gc_rq_ws);
}
+static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
+ struct pblk_line *line)
+{
+ struct line_emeta *emeta_buf;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line_meta *lm = &pblk->lm;
+ unsigned int lba_list_size = lm->emeta_len[2];
+ __le64 *lba_list;
+ int ret;
+
+ emeta_buf = pblk_malloc(lm->emeta_len[0],
+ l_mg->emeta_alloc_type, GFP_KERNEL);
+ if (!emeta_buf)
+ return NULL;
+
+ ret = pblk_line_read_emeta(pblk, line, emeta_buf);
+ if (ret) {
+ pr_err("pblk: line %d read emeta failed (%d)\n",
+ line->id, ret);
+ pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+ return NULL;
+ }
+
+ /* If this read fails, it means that emeta is corrupted.
+ * For now, leave the line untouched.
+ * TODO: Implement a recovery routine that scans and moves
+ * all sectors on the line.
+ */
+
+ ret = pblk_recov_check_emeta(pblk, emeta_buf);
+ if (ret) {
+ pr_err("pblk: inconsistent emeta (line %d)\n",
+ line->id);
+ pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+ return NULL;
+ }
+
+ lba_list = pblk_malloc(lba_list_size,
+ l_mg->emeta_alloc_type, GFP_KERNEL);
+ if (lba_list)
+ memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size);
+
+ pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+
+ return lba_list;
+}
+
static void pblk_gc_line_prepare_ws(struct work_struct *work)
{
struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
@@ -138,46 +185,26 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_gc *gc = &pblk->gc;
- struct line_emeta *emeta_buf;
struct pblk_line_ws *gc_rq_ws;
struct pblk_gc_rq *gc_rq;
__le64 *lba_list;
unsigned long *invalid_bitmap;
int sec_left, nr_secs, bit;
- int ret;
invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL);
if (!invalid_bitmap)
goto fail_free_ws;
- emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type,
- GFP_KERNEL);
- if (!emeta_buf) {
- pr_err("pblk: cannot use GC emeta\n");
- goto fail_free_bitmap;
- }
-
- ret = pblk_line_read_emeta(pblk, line, emeta_buf);
- if (ret) {
- pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret);
- goto fail_free_emeta;
- }
-
- /* If this read fails, it means that emeta is corrupted. For now, leave
- * the line untouched. TODO: Implement a recovery routine that scans and
- * moves all sectors on the line.
- */
-
- ret = pblk_recov_check_emeta(pblk, emeta_buf);
- if (ret) {
- pr_err("pblk: inconsistent emeta (line %d)\n", line->id);
- goto fail_free_emeta;
- }
-
- lba_list = emeta_to_lbas(pblk, emeta_buf);
- if (!lba_list) {
- pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
- goto fail_free_emeta;
+ if (line->w_err_gc->has_write_err) {
+ lba_list = line->w_err_gc->lba_list;
+ line->w_err_gc->lba_list = NULL;
+ } else {
+ lba_list = get_lba_list_from_emeta(pblk, line);
+ if (!lba_list) {
+ pr_err("pblk: could not interpret emeta (line %d)\n",
+ line->id);
+ goto fail_free_ws;
+ }
}
spin_lock(&line->lock);
@@ -187,14 +214,14 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
if (sec_left < 0) {
pr_err("pblk: corrupted GC line (%d)\n", line->id);
- goto fail_free_emeta;
+ goto fail_free_lba_list;
}
bit = -1;
next_rq:
gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL);
if (!gc_rq)
- goto fail_free_emeta;
+ goto fail_free_lba_list;
nr_secs = 0;
do {
@@ -240,7 +267,7 @@ next_rq:
goto next_rq;
out:
- pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+ pblk_mfree(lba_list, l_mg->emeta_alloc_type);
kfree(line_ws);
kfree(invalid_bitmap);
@@ -251,9 +278,8 @@ out:
fail_free_gc_rq:
kfree(gc_rq);
-fail_free_emeta:
- pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
-fail_free_bitmap:
+fail_free_lba_list:
+ pblk_mfree(lba_list, l_mg->emeta_alloc_type);
kfree(invalid_bitmap);
fail_free_ws:
kfree(line_ws);
@@ -349,12 +375,14 @@ static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk,
static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl)
{
unsigned int nr_blocks_free, nr_blocks_need;
+ unsigned int werr_lines = atomic_read(&rl->werr_lines);
nr_blocks_need = pblk_rl_high_thrs(rl);
nr_blocks_free = pblk_rl_nr_free_blks(rl);
/* This is not critical, no need to take lock here */
- return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free));
+ return ((werr_lines > 0) ||
+ ((gc->gc_active) && (nr_blocks_need > nr_blocks_free)));
}
void pblk_gc_free_full_lines(struct pblk *pblk)
@@ -649,7 +677,7 @@ fail_free_main_kthread:
return ret;
}
-void pblk_gc_exit(struct pblk *pblk)
+void pblk_gc_exit(struct pblk *pblk, bool graceful)
{
struct pblk_gc *gc = &pblk->gc;
@@ -663,10 +691,12 @@ void pblk_gc_exit(struct pblk *pblk)
if (gc->gc_reader_ts)
kthread_stop(gc->gc_reader_ts);
- flush_workqueue(gc->gc_reader_wq);
- destroy_workqueue(gc->gc_reader_wq);
+ if (graceful) {
+ flush_workqueue(gc->gc_reader_wq);
+ flush_workqueue(gc->gc_line_reader_wq);
+ }
- flush_workqueue(gc->gc_line_reader_wq);
+ destroy_workqueue(gc->gc_reader_wq);
destroy_workqueue(gc->gc_line_reader_wq);
if (gc->gc_writer_ts)
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 91a5bc2556a3..ce561f5d48ce 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -20,10 +20,15 @@
#include "pblk.h"
+unsigned int write_buffer_size;
+
+module_param(write_buffer_size, uint, 0644);
+MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer");
+
static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache,
*pblk_w_rq_cache;
static DECLARE_RWSEM(pblk_lock);
-struct bio_set *pblk_bio_set;
+struct bio_set pblk_bio_set;
static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
struct bio *bio)
@@ -127,10 +132,8 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
if (!line) {
/* Configure next line for user data */
line = pblk_line_get_first_data(pblk);
- if (!line) {
- pr_err("pblk: line list corrupted\n");
+ if (!line)
return -EFAULT;
- }
}
return 0;
@@ -141,6 +144,7 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
sector_t i;
struct ppa_addr ppa;
size_t map_size;
+ int ret = 0;
map_size = pblk_trans_map_size(pblk);
pblk->trans_map = vmalloc(map_size);
@@ -152,7 +156,11 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
for (i = 0; i < pblk->rl.nr_secs; i++)
pblk_trans_map_set(pblk, i, ppa);
- return pblk_l2p_recover(pblk, factory_init);
+ ret = pblk_l2p_recover(pblk, factory_init);
+ if (ret)
+ vfree(pblk->trans_map);
+
+ return ret;
}
static void pblk_rwb_free(struct pblk *pblk)
@@ -169,10 +177,15 @@ static int pblk_rwb_init(struct pblk *pblk)
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_rb_entry *entries;
- unsigned long nr_entries;
+ unsigned long nr_entries, buffer_size;
unsigned int power_size, power_seg_sz;
- nr_entries = pblk_rb_calculate_size(pblk->pgs_in_buffer);
+ if (write_buffer_size && (write_buffer_size > pblk->pgs_in_buffer))
+ buffer_size = write_buffer_size;
+ else
+ buffer_size = pblk->pgs_in_buffer;
+
+ nr_entries = pblk_rb_calculate_size(buffer_size);
entries = vzalloc(nr_entries * sizeof(struct pblk_rb_entry));
if (!entries)
@@ -341,7 +354,7 @@ static int pblk_core_init(struct pblk *pblk)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
- int max_write_ppas;
+ int ret, max_write_ppas;
atomic64_set(&pblk->user_wa, 0);
atomic64_set(&pblk->pad_wa, 0);
@@ -375,33 +388,33 @@ static int pblk_core_init(struct pblk *pblk)
goto fail_free_pad_dist;
/* Internal bios can be at most the sectors signaled by the device. */
- pblk->page_bio_pool = mempool_create_page_pool(NVM_MAX_VLBA, 0);
- if (!pblk->page_bio_pool)
+ ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0);
+ if (ret)
goto free_global_caches;
- pblk->gen_ws_pool = mempool_create_slab_pool(PBLK_GEN_WS_POOL_SIZE,
- pblk_ws_cache);
- if (!pblk->gen_ws_pool)
+ ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE,
+ pblk_ws_cache);
+ if (ret)
goto free_page_bio_pool;
- pblk->rec_pool = mempool_create_slab_pool(geo->all_luns,
- pblk_rec_cache);
- if (!pblk->rec_pool)
+ ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns,
+ pblk_rec_cache);
+ if (ret)
goto free_gen_ws_pool;
- pblk->r_rq_pool = mempool_create_slab_pool(geo->all_luns,
- pblk_g_rq_cache);
- if (!pblk->r_rq_pool)
+ ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns,
+ pblk_g_rq_cache);
+ if (ret)
goto free_rec_pool;
- pblk->e_rq_pool = mempool_create_slab_pool(geo->all_luns,
- pblk_g_rq_cache);
- if (!pblk->e_rq_pool)
+ ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns,
+ pblk_g_rq_cache);
+ if (ret)
goto free_r_rq_pool;
- pblk->w_rq_pool = mempool_create_slab_pool(geo->all_luns,
- pblk_w_rq_cache);
- if (!pblk->w_rq_pool)
+ ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns,
+ pblk_w_rq_cache);
+ if (ret)
goto free_e_rq_pool;
pblk->close_wq = alloc_workqueue("pblk-close-wq",
@@ -423,6 +436,7 @@ static int pblk_core_init(struct pblk *pblk)
goto free_r_end_wq;
INIT_LIST_HEAD(&pblk->compl_list);
+ INIT_LIST_HEAD(&pblk->resubmit_list);
return 0;
@@ -433,17 +447,17 @@ free_bb_wq:
free_close_wq:
destroy_workqueue(pblk->close_wq);
free_w_rq_pool:
- mempool_destroy(pblk->w_rq_pool);
+ mempool_exit(&pblk->w_rq_pool);
free_e_rq_pool:
- mempool_destroy(pblk->e_rq_pool);
+ mempool_exit(&pblk->e_rq_pool);
free_r_rq_pool:
- mempool_destroy(pblk->r_rq_pool);
+ mempool_exit(&pblk->r_rq_pool);
free_rec_pool:
- mempool_destroy(pblk->rec_pool);
+ mempool_exit(&pblk->rec_pool);
free_gen_ws_pool:
- mempool_destroy(pblk->gen_ws_pool);
+ mempool_exit(&pblk->gen_ws_pool);
free_page_bio_pool:
- mempool_destroy(pblk->page_bio_pool);
+ mempool_exit(&pblk->page_bio_pool);
free_global_caches:
pblk_free_global_caches(pblk);
fail_free_pad_dist:
@@ -462,12 +476,12 @@ static void pblk_core_free(struct pblk *pblk)
if (pblk->bb_wq)
destroy_workqueue(pblk->bb_wq);
- mempool_destroy(pblk->page_bio_pool);
- mempool_destroy(pblk->gen_ws_pool);
- mempool_destroy(pblk->rec_pool);
- mempool_destroy(pblk->r_rq_pool);
- mempool_destroy(pblk->e_rq_pool);
- mempool_destroy(pblk->w_rq_pool);
+ mempool_exit(&pblk->page_bio_pool);
+ mempool_exit(&pblk->gen_ws_pool);
+ mempool_exit(&pblk->rec_pool);
+ mempool_exit(&pblk->r_rq_pool);
+ mempool_exit(&pblk->e_rq_pool);
+ mempool_exit(&pblk->w_rq_pool);
pblk_free_global_caches(pblk);
kfree(pblk->pad_dist);
@@ -489,11 +503,17 @@ static void pblk_line_mg_free(struct pblk *pblk)
}
}
-static void pblk_line_meta_free(struct pblk_line *line)
+static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg,
+ struct pblk_line *line)
{
+ struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
+
kfree(line->blk_bitmap);
kfree(line->erase_bitmap);
kfree(line->chks);
+
+ pblk_mfree(w_err_gc->lba_list, l_mg->emeta_alloc_type);
+ kfree(w_err_gc);
}
static void pblk_lines_free(struct pblk *pblk)
@@ -506,8 +526,8 @@ static void pblk_lines_free(struct pblk *pblk)
for (i = 0; i < l_mg->nr_lines; i++) {
line = &pblk->lines[i];
- pblk_line_free(pblk, line);
- pblk_line_meta_free(line);
+ pblk_line_free(line);
+ pblk_line_meta_free(l_mg, line);
}
spin_unlock(&l_mg->free_lock);
@@ -748,14 +768,14 @@ static int pblk_setup_line_meta_20(struct pblk *pblk, struct pblk_line *line,
chunk->cnlb = chunk_meta->cnlb;
chunk->wp = chunk_meta->wp;
- if (!(chunk->state & NVM_CHK_ST_OFFLINE))
- continue;
-
if (chunk->type & NVM_CHK_TP_SZ_SPEC) {
WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n");
continue;
}
+ if (!(chunk->state & NVM_CHK_ST_OFFLINE))
+ continue;
+
set_bit(pos, line->blk_bitmap);
nr_bad_chks++;
}
@@ -809,20 +829,28 @@ static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line)
return -ENOMEM;
line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
- if (!line->erase_bitmap) {
- kfree(line->blk_bitmap);
- return -ENOMEM;
- }
+ if (!line->erase_bitmap)
+ goto free_blk_bitmap;
+
line->chks = kmalloc(lm->blk_per_line * sizeof(struct nvm_chk_meta),
GFP_KERNEL);
- if (!line->chks) {
- kfree(line->erase_bitmap);
- kfree(line->blk_bitmap);
- return -ENOMEM;
- }
+ if (!line->chks)
+ goto free_erase_bitmap;
+
+ line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL);
+ if (!line->w_err_gc)
+ goto free_chks;
return 0;
+
+free_chks:
+ kfree(line->chks);
+free_erase_bitmap:
+ kfree(line->erase_bitmap);
+free_blk_bitmap:
+ kfree(line->blk_bitmap);
+ return -ENOMEM;
}
static int pblk_line_mg_init(struct pblk *pblk)
@@ -847,12 +875,14 @@ static int pblk_line_mg_init(struct pblk *pblk)
INIT_LIST_HEAD(&l_mg->gc_mid_list);
INIT_LIST_HEAD(&l_mg->gc_low_list);
INIT_LIST_HEAD(&l_mg->gc_empty_list);
+ INIT_LIST_HEAD(&l_mg->gc_werr_list);
INIT_LIST_HEAD(&l_mg->emeta_list);
- l_mg->gc_lists[0] = &l_mg->gc_high_list;
- l_mg->gc_lists[1] = &l_mg->gc_mid_list;
- l_mg->gc_lists[2] = &l_mg->gc_low_list;
+ l_mg->gc_lists[0] = &l_mg->gc_werr_list;
+ l_mg->gc_lists[1] = &l_mg->gc_high_list;
+ l_mg->gc_lists[2] = &l_mg->gc_mid_list;
+ l_mg->gc_lists[3] = &l_mg->gc_low_list;
spin_lock_init(&l_mg->free_lock);
spin_lock_init(&l_mg->close_lock);
@@ -1047,6 +1077,11 @@ static int pblk_lines_init(struct pblk *pblk)
nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i);
}
+ if (!nr_free_chks) {
+ pr_err("pblk: too many bad blocks prevent for sane instance\n");
+ return -EINTR;
+ }
+
pblk_set_provision(pblk, nr_free_chks);
kfree(chunk_meta);
@@ -1054,7 +1089,7 @@ static int pblk_lines_init(struct pblk *pblk)
fail_free_lines:
while (--i >= 0)
- pblk_line_meta_free(&pblk->lines[i]);
+ pblk_line_meta_free(l_mg, &pblk->lines[i]);
kfree(pblk->lines);
fail_free_chunk_meta:
kfree(chunk_meta);
@@ -1110,23 +1145,25 @@ static void pblk_free(struct pblk *pblk)
kfree(pblk);
}
-static void pblk_tear_down(struct pblk *pblk)
+static void pblk_tear_down(struct pblk *pblk, bool graceful)
{
- pblk_pipeline_stop(pblk);
+ if (graceful)
+ __pblk_pipeline_flush(pblk);
+ __pblk_pipeline_stop(pblk);
pblk_writer_stop(pblk);
pblk_rb_sync_l2p(&pblk->rwb);
pblk_rl_free(&pblk->rl);
- pr_debug("pblk: consistent tear down\n");
+ pr_debug("pblk: consistent tear down (graceful:%d)\n", graceful);
}
-static void pblk_exit(void *private)
+static void pblk_exit(void *private, bool graceful)
{
struct pblk *pblk = private;
down_write(&pblk_lock);
- pblk_gc_exit(pblk);
- pblk_tear_down(pblk);
+ pblk_gc_exit(pblk, graceful);
+ pblk_tear_down(pblk, graceful);
#ifdef CONFIG_NVM_DEBUG
pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
@@ -1175,6 +1212,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
pblk->state = PBLK_STATE_RUNNING;
pblk->gc.gc_enabled = 0;
+ spin_lock_init(&pblk->resubmit_lock);
spin_lock_init(&pblk->trans_lock);
spin_lock_init(&pblk->lock);
@@ -1297,18 +1335,18 @@ static int __init pblk_module_init(void)
{
int ret;
- pblk_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0);
- if (!pblk_bio_set)
- return -ENOMEM;
+ ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
+ if (ret)
+ return ret;
ret = nvm_register_tgt_type(&tt_pblk);
if (ret)
- bioset_free(pblk_bio_set);
+ bioset_exit(&pblk_bio_set);
return ret;
}
static void pblk_module_exit(void)
{
- bioset_free(pblk_bio_set);
+ bioset_exit(&pblk_bio_set);
nvm_unregister_tgt_type(&tt_pblk);
}
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index 20dbaa89c9df..953ca31dda68 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -18,11 +18,11 @@
#include "pblk.h"
-static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
- struct ppa_addr *ppa_list,
- unsigned long *lun_bitmap,
- struct pblk_sec_meta *meta_list,
- unsigned int valid_secs)
+static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
+ struct ppa_addr *ppa_list,
+ unsigned long *lun_bitmap,
+ struct pblk_sec_meta *meta_list,
+ unsigned int valid_secs)
{
struct pblk_line *line = pblk_line_get_data(pblk);
struct pblk_emeta *emeta;
@@ -35,8 +35,14 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
if (pblk_line_is_full(line)) {
struct pblk_line *prev_line = line;
+ /* If we cannot allocate a new line, make sure to store metadata
+ * on current line and then fail
+ */
line = pblk_line_replace_data(pblk);
pblk_line_close_meta(pblk, prev_line);
+
+ if (!line)
+ return -EINTR;
}
emeta = line->emeta;
@@ -74,6 +80,7 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
}
pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap);
+ return 0;
}
void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
@@ -87,8 +94,12 @@ void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
for (i = off; i < rqd->nr_ppas; i += min) {
map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
- pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
- lun_bitmap, &meta_list[i], map_secs);
+ if (pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+ lun_bitmap, &meta_list[i], map_secs)) {
+ bio_put(rqd->bio);
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+ pblk_pipeline_stop(pblk);
+ }
}
}
@@ -108,8 +119,12 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
for (i = 0; i < rqd->nr_ppas; i += min) {
map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
- pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
- lun_bitmap, &meta_list[i], map_secs);
+ if (pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+ lun_bitmap, &meta_list[i], map_secs)) {
+ bio_put(rqd->bio);
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+ pblk_pipeline_stop(pblk);
+ }
erase_lun = pblk_ppa_to_pos(geo, rqd->ppa_list[i]);
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index 52fdd85dbc97..00cd1f20a196 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -142,10 +142,9 @@ static void clean_wctx(struct pblk_w_ctx *w_ctx)
{
int flags;
-try:
flags = READ_ONCE(w_ctx->flags);
- if (!(flags & PBLK_SUBMITTED_ENTRY))
- goto try;
+ WARN_ONCE(!(flags & PBLK_SUBMITTED_ENTRY),
+ "pblk: overwriting unsubmitted data\n");
/* Release flags on context. Protect from writes and reads */
smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY);
@@ -350,7 +349,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
}
static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
- unsigned int pos)
+ unsigned int pos)
{
struct pblk_rb_entry *entry;
unsigned int sync, flush_point;
@@ -420,7 +419,7 @@ void pblk_rb_flush(struct pblk_rb *rb)
if (pblk_rb_flush_point_set(rb, NULL, mem))
return;
- pblk_write_should_kick(pblk);
+ pblk_write_kick(pblk);
}
static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
@@ -504,45 +503,6 @@ int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
}
/*
- * The caller of this function must ensure that the backpointer will not
- * overwrite the entries passed on the list.
- */
-unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
- struct list_head *list,
- unsigned int max)
-{
- struct pblk_rb_entry *entry, *tentry;
- struct page *page;
- unsigned int read = 0;
- int ret;
-
- list_for_each_entry_safe(entry, tentry, list, index) {
- if (read > max) {
- pr_err("pblk: too many entries on list\n");
- goto out;
- }
-
- page = virt_to_page(entry->data);
- if (!page) {
- pr_err("pblk: could not allocate write bio page\n");
- goto out;
- }
-
- ret = bio_add_page(bio, page, rb->seg_size, 0);
- if (ret != rb->seg_size) {
- pr_err("pblk: could not add page to write bio\n");
- goto out;
- }
-
- list_del(&entry->index);
- read++;
- }
-
-out:
- return read;
-}
-
-/*
* Read available entries on rb and add them to the given bio. To avoid a memory
* copy, a page reference to the write buffer is used to be added to the bio.
*
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 9eee10f69df0..18694694e5f0 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -39,10 +39,10 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
}
static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
- sector_t blba, unsigned long *read_bitmap)
+ struct bio *bio, sector_t blba,
+ unsigned long *read_bitmap)
{
struct pblk_sec_meta *meta_list = rqd->meta_list;
- struct bio *bio = rqd->bio;
struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
int nr_secs = rqd->nr_ppas;
bool advanced_bio = false;
@@ -102,32 +102,69 @@ next:
#endif
}
-static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd)
+
+static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
+ sector_t blba)
{
- int err;
+ struct pblk_sec_meta *meta_lba_list = rqd->meta_list;
+ int nr_lbas = rqd->nr_ppas;
+ int i;
- err = pblk_submit_io(pblk, rqd);
- if (err)
- return NVM_IO_ERR;
+ for (i = 0; i < nr_lbas; i++) {
+ u64 lba = le64_to_cpu(meta_lba_list[i].lba);
+
+ if (lba == ADDR_EMPTY)
+ continue;
+
+ if (lba != blba + i) {
+#ifdef CONFIG_NVM_DEBUG
+ struct ppa_addr *p;
- return NVM_IO_OK;
+ p = (nr_lbas == 1) ? &rqd->ppa_list[i] : &rqd->ppa_addr;
+ print_ppa(&pblk->dev->geo, p, "seq", i);
+#endif
+ pr_err("pblk: corrupted read LBA (%llu/%llu)\n",
+ lba, (u64)blba + i);
+ WARN_ON(1);
+ }
+ }
}
-static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd,
- sector_t blba)
+/*
+ * There can be holes in the lba list.
+ */
+static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd,
+ u64 *lba_list, int nr_lbas)
{
- struct pblk_sec_meta *meta_list = rqd->meta_list;
- int nr_lbas = rqd->nr_ppas;
- int i;
+ struct pblk_sec_meta *meta_lba_list = rqd->meta_list;
+ int i, j;
- for (i = 0; i < nr_lbas; i++) {
- u64 lba = le64_to_cpu(meta_list[i].lba);
+ for (i = 0, j = 0; i < nr_lbas; i++) {
+ u64 lba = lba_list[i];
+ u64 meta_lba;
if (lba == ADDR_EMPTY)
continue;
- WARN(lba != blba + i, "pblk: corrupted read LBA\n");
+ meta_lba = le64_to_cpu(meta_lba_list[j].lba);
+
+ if (lba != meta_lba) {
+#ifdef CONFIG_NVM_DEBUG
+ struct ppa_addr *p;
+ int nr_ppas = rqd->nr_ppas;
+
+ p = (nr_ppas == 1) ? &rqd->ppa_list[j] : &rqd->ppa_addr;
+ print_ppa(&pblk->dev->geo, p, "seq", j);
+#endif
+ pr_err("pblk: corrupted read LBA (%llu/%llu)\n",
+ lba, meta_lba);
+ WARN_ON(1);
+ }
+
+ j++;
}
+
+ WARN_ONCE(j != rqd->nr_ppas, "pblk: corrupted random request\n");
}
static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
@@ -152,7 +189,6 @@ static void pblk_end_user_read(struct bio *bio)
WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n");
#endif
bio_endio(bio);
- bio_put(bio);
}
static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
@@ -160,23 +196,18 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
{
struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
- struct bio *bio = rqd->bio;
+ struct bio *int_bio = rqd->bio;
unsigned long start_time = r_ctx->start_time;
generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time);
if (rqd->error)
pblk_log_read_err(pblk, rqd);
-#ifdef CONFIG_NVM_DEBUG
- else
- WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n");
-#endif
- pblk_read_check(pblk, rqd, r_ctx->lba);
+ pblk_read_check_seq(pblk, rqd, r_ctx->lba);
- bio_put(bio);
- if (r_ctx->private)
- pblk_end_user_read((struct bio *)r_ctx->private);
+ if (int_bio)
+ bio_put(int_bio);
if (put_line)
pblk_read_put_rqd_kref(pblk, rqd);
@@ -193,16 +224,19 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
static void pblk_end_io_read(struct nvm_rq *rqd)
{
struct pblk *pblk = rqd->private;
+ struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+ struct bio *bio = (struct bio *)r_ctx->private;
+ pblk_end_user_read(bio);
__pblk_end_io_read(pblk, rqd, true);
}
-static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
- unsigned int bio_init_idx,
- unsigned long *read_bitmap)
+static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
+ struct bio *orig_bio, unsigned int bio_init_idx,
+ unsigned long *read_bitmap)
{
- struct bio *new_bio, *bio = rqd->bio;
struct pblk_sec_meta *meta_list = rqd->meta_list;
+ struct bio *new_bio;
struct bio_vec src_bv, dst_bv;
void *ppa_ptr = NULL;
void *src_p, *dst_p;
@@ -219,11 +253,11 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
new_bio = bio_alloc(GFP_KERNEL, nr_holes);
if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
- goto err;
+ goto fail_add_pages;
if (nr_holes != new_bio->bi_vcnt) {
pr_err("pblk: malformed bio\n");
- goto err;
+ goto fail;
}
for (i = 0; i < nr_secs; i++)
@@ -246,7 +280,7 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
if (ret) {
bio_put(rqd->bio);
pr_err("pblk: sync read IO submission failed\n");
- goto err;
+ goto fail;
}
if (rqd->error) {
@@ -282,7 +316,7 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
meta_list[hole].lba = lba_list_media[i];
src_bv = new_bio->bi_io_vec[i++];
- dst_bv = bio->bi_io_vec[bio_init_idx + hole];
+ dst_bv = orig_bio->bi_io_vec[bio_init_idx + hole];
src_p = kmap_atomic(src_bv.bv_page);
dst_p = kmap_atomic(dst_bv.bv_page);
@@ -294,35 +328,33 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
kunmap_atomic(src_p);
kunmap_atomic(dst_p);
- mempool_free(src_bv.bv_page, pblk->page_bio_pool);
+ mempool_free(src_bv.bv_page, &pblk->page_bio_pool);
hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
} while (hole < nr_secs);
bio_put(new_bio);
- /* Complete the original bio and associated request */
- bio_endio(bio);
- rqd->bio = bio;
+ /* restore original request */
+ rqd->bio = NULL;
rqd->nr_ppas = nr_secs;
__pblk_end_io_read(pblk, rqd, false);
- return NVM_IO_OK;
-
-err:
- pr_err("pblk: failed to perform partial read\n");
+ return NVM_IO_DONE;
+fail:
/* Free allocated pages in new bio */
- pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt);
+ pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
+fail_add_pages:
+ pr_err("pblk: failed to perform partial read\n");
__pblk_end_io_read(pblk, rqd, false);
return NVM_IO_ERR;
}
-static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
+static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
sector_t lba, unsigned long *read_bitmap)
{
struct pblk_sec_meta *meta_list = rqd->meta_list;
- struct bio *bio = rqd->bio;
struct ppa_addr ppa;
pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
@@ -386,14 +418,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
rqd = pblk_alloc_rqd(pblk, PBLK_READ);
rqd->opcode = NVM_OP_PREAD;
- rqd->bio = bio;
rqd->nr_ppas = nr_secs;
+ rqd->bio = NULL; /* cloned bio if needed */
rqd->private = pblk;
rqd->end_io = pblk_end_io_read;
r_ctx = nvm_rq_to_pdu(rqd);
r_ctx->start_time = jiffies;
r_ctx->lba = blba;
+ r_ctx->private = bio; /* original bio */
/* Save the index for this bio's start. This is needed in case
* we need to fill a partial read.
@@ -411,17 +444,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
- pblk_read_ppalist_rq(pblk, rqd, blba, &read_bitmap);
+ pblk_read_ppalist_rq(pblk, rqd, bio, blba, &read_bitmap);
} else {
- pblk_read_rq(pblk, rqd, blba, &read_bitmap);
+ pblk_read_rq(pblk, rqd, bio, blba, &read_bitmap);
}
- bio_get(bio);
if (bitmap_full(&read_bitmap, nr_secs)) {
- bio_endio(bio);
atomic_inc(&pblk->inflight_io);
__pblk_end_io_read(pblk, rqd, false);
- return NVM_IO_OK;
+ return NVM_IO_DONE;
}
/* All sectors are to be read from the device */
@@ -429,20 +460,17 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
struct bio *int_bio = NULL;
/* Clone read bio to deal with read errors internally */
- int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set);
+ int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
if (!int_bio) {
pr_err("pblk: could not clone read bio\n");
goto fail_end_io;
}
rqd->bio = int_bio;
- r_ctx->private = bio;
- ret = pblk_submit_read_io(pblk, rqd);
- if (ret) {
+ if (pblk_submit_io(pblk, rqd)) {
pr_err("pblk: read IO submission failed\n");
- if (int_bio)
- bio_put(int_bio);
+ ret = NVM_IO_ERR;
goto fail_end_io;
}
@@ -452,7 +480,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
/* The read bio request could be partially filled by the write buffer,
* but there are some holes that need to be read from the drive.
*/
- return pblk_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
+ return pblk_partial_read(pblk, rqd, bio, bio_init_idx, &read_bitmap);
fail_rqd_free:
pblk_free_rqd(pblk, rqd, PBLK_READ);
@@ -585,6 +613,8 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
goto err_free_bio;
}
+ pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs);
+
atomic_dec(&pblk->inflight_io);
if (rqd.error) {
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 3e079c2afa6e..598342833d0d 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -16,97 +16,6 @@
#include "pblk.h"
-void pblk_submit_rec(struct work_struct *work)
-{
- struct pblk_rec_ctx *recovery =
- container_of(work, struct pblk_rec_ctx, ws_rec);
- struct pblk *pblk = recovery->pblk;
- struct nvm_rq *rqd = recovery->rqd;
- struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
- struct bio *bio;
- unsigned int nr_rec_secs;
- unsigned int pgs_read;
- int ret;
-
- nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status,
- NVM_MAX_VLBA);
-
- bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
-
- bio->bi_iter.bi_sector = 0;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- rqd->bio = bio;
- rqd->nr_ppas = nr_rec_secs;
-
- pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed,
- nr_rec_secs);
- if (pgs_read != nr_rec_secs) {
- pr_err("pblk: could not read recovery entries\n");
- goto err;
- }
-
- if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) {
- pr_err("pblk: could not setup recovery request\n");
- goto err;
- }
-
-#ifdef CONFIG_NVM_DEBUG
- atomic_long_add(nr_rec_secs, &pblk->recov_writes);
-#endif
-
- ret = pblk_submit_io(pblk, rqd);
- if (ret) {
- pr_err("pblk: I/O submission failed: %d\n", ret);
- goto err;
- }
-
- mempool_free(recovery, pblk->rec_pool);
- return;
-
-err:
- bio_put(bio);
- pblk_free_rqd(pblk, rqd, PBLK_WRITE);
-}
-
-int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
- struct pblk_rec_ctx *recovery, u64 *comp_bits,
- unsigned int comp)
-{
- struct nvm_rq *rec_rqd;
- struct pblk_c_ctx *rec_ctx;
- int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
-
- rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
- rec_ctx = nvm_rq_to_pdu(rec_rqd);
-
- /* Copy completion bitmap, but exclude the first X completed entries */
- bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status,
- (unsigned long int *)comp_bits,
- comp, NVM_MAX_VLBA);
-
- /* Save the context for the entries that need to be re-written and
- * update current context with the completed entries.
- */
- rec_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp);
- if (comp >= c_ctx->nr_valid) {
- rec_ctx->nr_valid = 0;
- rec_ctx->nr_padded = nr_entries - comp;
-
- c_ctx->nr_padded = comp - c_ctx->nr_valid;
- } else {
- rec_ctx->nr_valid = c_ctx->nr_valid - comp;
- rec_ctx->nr_padded = c_ctx->nr_padded;
-
- c_ctx->nr_valid = comp;
- c_ctx->nr_padded = 0;
- }
-
- recovery->rqd = rec_rqd;
- recovery->pblk = pblk;
-
- return 0;
-}
-
int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf)
{
u32 crc;
@@ -865,18 +774,30 @@ static void pblk_recov_wa_counters(struct pblk *pblk,
}
static int pblk_line_was_written(struct pblk_line *line,
- struct pblk_line_meta *lm)
+ struct pblk *pblk)
{
- int i;
- int state_mask = NVM_CHK_ST_OFFLINE | NVM_CHK_ST_FREE;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct nvm_chk_meta *chunk;
+ struct ppa_addr bppa;
+ int smeta_blk;
- for (i = 0; i < lm->blk_per_line; i++) {
- if (!(line->chks[i].state & state_mask))
- return 1;
- }
+ if (line->state == PBLK_LINESTATE_BAD)
+ return 0;
- return 0;
+ smeta_blk = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+ if (smeta_blk >= lm->blk_per_line)
+ return 0;
+
+ bppa = pblk->luns[smeta_blk].bppa;
+ chunk = &line->chks[pblk_ppa_to_pos(geo, bppa)];
+
+ if (chunk->state & NVM_CHK_ST_FREE)
+ return 0;
+
+ return 1;
}
struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
@@ -915,7 +836,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
line->lun_bitmap = ((void *)(smeta_buf)) +
sizeof(struct line_smeta);
- if (!pblk_line_was_written(line, lm))
+ if (!pblk_line_was_written(line, pblk))
continue;
/* Lines that cannot be read are assumed as not written here */
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
index 883a7113b19d..6a0616a6fcaf 100644
--- a/drivers/lightnvm/pblk-rl.c
+++ b/drivers/lightnvm/pblk-rl.c
@@ -73,6 +73,16 @@ void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries)
pblk_rl_kick_u_timer(rl);
}
+void pblk_rl_werr_line_in(struct pblk_rl *rl)
+{
+ atomic_inc(&rl->werr_lines);
+}
+
+void pblk_rl_werr_line_out(struct pblk_rl *rl)
+{
+ atomic_dec(&rl->werr_lines);
+}
+
void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries)
{
atomic_add(nr_entries, &rl->rb_gc_cnt);
@@ -99,11 +109,21 @@ static void __pblk_rl_update_rates(struct pblk_rl *rl,
{
struct pblk *pblk = container_of(rl, struct pblk, rl);
int max = rl->rb_budget;
+ int werr_gc_needed = atomic_read(&rl->werr_lines);
if (free_blocks >= rl->high) {
- rl->rb_user_max = max;
- rl->rb_gc_max = 0;
- rl->rb_state = PBLK_RL_HIGH;
+ if (werr_gc_needed) {
+ /* Allocate a small budget for recovering
+ * lines with write errors
+ */
+ rl->rb_gc_max = 1 << rl->rb_windows_pw;
+ rl->rb_user_max = max - rl->rb_gc_max;
+ rl->rb_state = PBLK_RL_WERR;
+ } else {
+ rl->rb_user_max = max;
+ rl->rb_gc_max = 0;
+ rl->rb_state = PBLK_RL_OFF;
+ }
} else if (free_blocks < rl->high) {
int shift = rl->high_pw - rl->rb_windows_pw;
int user_windows = free_blocks >> shift;
@@ -124,7 +144,7 @@ static void __pblk_rl_update_rates(struct pblk_rl *rl,
rl->rb_state = PBLK_RL_LOW;
}
- if (rl->rb_state == (PBLK_RL_MID | PBLK_RL_LOW))
+ if (rl->rb_state != PBLK_RL_OFF)
pblk_gc_should_start(pblk);
else
pblk_gc_should_stop(pblk);
@@ -221,6 +241,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
atomic_set(&rl->rb_user_cnt, 0);
atomic_set(&rl->rb_gc_cnt, 0);
atomic_set(&rl->rb_space, -1);
+ atomic_set(&rl->werr_lines, 0);
timer_setup(&rl->u_timer, pblk_rl_u_timer, 0);
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
index e61909af23a5..88a0a7c407aa 100644
--- a/drivers/lightnvm/pblk-sysfs.c
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -173,6 +173,8 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
int free_line_cnt = 0, closed_line_cnt = 0, emeta_line_cnt = 0;
int d_line_cnt = 0, l_line_cnt = 0;
int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0;
+ int gc_werr = 0;
+
int bad = 0, cor = 0;
int msecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0;
int map_weight = 0, meta_weight = 0;
@@ -237,6 +239,15 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
gc_empty++;
}
+ list_for_each_entry(line, &l_mg->gc_werr_list, list) {
+ if (line->type == PBLK_LINETYPE_DATA)
+ d_line_cnt++;
+ else if (line->type == PBLK_LINETYPE_LOG)
+ l_line_cnt++;
+ closed_line_cnt++;
+ gc_werr++;
+ }
+
list_for_each_entry(line, &l_mg->bad_list, list)
bad++;
list_for_each_entry(line, &l_mg->corrupt_list, list)
@@ -275,8 +286,8 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
l_mg->nr_lines);
sz += snprintf(page + sz, PAGE_SIZE - sz,
- "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n",
- gc_full, gc_high, gc_mid, gc_low, gc_empty,
+ "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, werr: %d, queue:%d\n",
+ gc_full, gc_high, gc_mid, gc_low, gc_empty, gc_werr,
atomic_read(&pblk->gc.read_inflight_gc));
sz += snprintf(page + sz, PAGE_SIZE - sz,
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 3e6f1ebd743a..f353e52941f5 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -103,68 +103,150 @@ retry:
pblk_rb_sync_end(&pblk->rwb, &flags);
}
-/* When a write fails, we are not sure whether the block has grown bad or a page
- * range is more susceptible to write errors. If a high number of pages fail, we
- * assume that the block is bad and we mark it accordingly. In all cases, we
- * remap and resubmit the failed entries as fast as possible; if a flush is
- * waiting on a completion, the whole stack would stall otherwise.
- */
-static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
+/* Map remaining sectors in chunk, starting from ppa */
+static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa)
{
- void *comp_bits = &rqd->ppa_status;
- struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
- struct pblk_rec_ctx *recovery;
- struct ppa_addr *ppa_list = rqd->ppa_list;
- int nr_ppas = rqd->nr_ppas;
- unsigned int c_entries;
- int bit, ret;
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line *line;
+ struct ppa_addr map_ppa = *ppa;
+ u64 paddr;
+ int done = 0;
- if (unlikely(nr_ppas == 1))
- ppa_list = &rqd->ppa_addr;
+ line = &pblk->lines[pblk_ppa_to_line(*ppa)];
+ spin_lock(&line->lock);
- recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
+ while (!done) {
+ paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa);
- INIT_LIST_HEAD(&recovery->failed);
+ if (!test_and_set_bit(paddr, line->map_bitmap))
+ line->left_msecs--;
- bit = -1;
- while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
- struct pblk_rb_entry *entry;
- struct ppa_addr ppa;
+ if (!test_and_set_bit(paddr, line->invalid_bitmap))
+ le32_add_cpu(line->vsc, -1);
- /* Logic error */
- if (bit > c_ctx->nr_valid) {
- WARN_ONCE(1, "pblk: corrupted write request\n");
- mempool_free(recovery, pblk->rec_pool);
- goto out;
+ if (geo->version == NVM_OCSSD_SPEC_12) {
+ map_ppa.ppa++;
+ if (map_ppa.g.pg == geo->num_pg)
+ done = 1;
+ } else {
+ map_ppa.m.sec++;
+ if (map_ppa.m.sec == geo->clba)
+ done = 1;
}
+ }
- ppa = ppa_list[bit];
- entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
- if (!entry) {
- pr_err("pblk: could not scan entry on write failure\n");
- mempool_free(recovery, pblk->rec_pool);
- goto out;
- }
+ line->w_err_gc->has_write_err = 1;
+ spin_unlock(&line->lock);
+}
- /* The list is filled first and emptied afterwards. No need for
- * protecting it with a lock
+static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
+ unsigned int nr_entries)
+{
+ struct pblk_rb *rb = &pblk->rwb;
+ struct pblk_rb_entry *entry;
+ struct pblk_line *line;
+ struct pblk_w_ctx *w_ctx;
+ struct ppa_addr ppa_l2p;
+ int flags;
+ unsigned int pos, i;
+
+ spin_lock(&pblk->trans_lock);
+ pos = sentry;
+ for (i = 0; i < nr_entries; i++) {
+ entry = &rb->entries[pos];
+ w_ctx = &entry->w_ctx;
+
+ /* Check if the lba has been overwritten */
+ ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba);
+ if (!pblk_ppa_comp(ppa_l2p, entry->cacheline))
+ w_ctx->lba = ADDR_EMPTY;
+
+ /* Mark up the entry as submittable again */
+ flags = READ_ONCE(w_ctx->flags);
+ flags |= PBLK_WRITTEN_DATA;
+ /* Release flags on write context. Protect from writes */
+ smp_store_release(&w_ctx->flags, flags);
+
+ /* Decrese the reference count to the line as we will
+ * re-map these entries
*/
- list_add_tail(&entry->index, &recovery->failed);
+ line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)];
+ kref_put(&line->ref, pblk_line_put);
+
+ pos = (pos + 1) & (rb->nr_entries - 1);
}
+ spin_unlock(&pblk->trans_lock);
+}
- c_entries = find_first_bit(comp_bits, nr_ppas);
- ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
- if (ret) {
- pr_err("pblk: could not recover from write failure\n");
- mempool_free(recovery, pblk->rec_pool);
- goto out;
+static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx)
+{
+ struct pblk_c_ctx *r_ctx;
+
+ r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL);
+ if (!r_ctx)
+ return;
+
+ r_ctx->lun_bitmap = NULL;
+ r_ctx->sentry = c_ctx->sentry;
+ r_ctx->nr_valid = c_ctx->nr_valid;
+ r_ctx->nr_padded = c_ctx->nr_padded;
+
+ spin_lock(&pblk->resubmit_lock);
+ list_add_tail(&r_ctx->list, &pblk->resubmit_list);
+ spin_unlock(&pblk->resubmit_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes);
+#endif
+}
+
+static void pblk_submit_rec(struct work_struct *work)
+{
+ struct pblk_rec_ctx *recovery =
+ container_of(work, struct pblk_rec_ctx, ws_rec);
+ struct pblk *pblk = recovery->pblk;
+ struct nvm_rq *rqd = recovery->rqd;
+ struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+ struct ppa_addr *ppa_list;
+
+ pblk_log_write_err(pblk, rqd);
+
+ if (rqd->nr_ppas == 1)
+ ppa_list = &rqd->ppa_addr;
+ else
+ ppa_list = rqd->ppa_list;
+
+ pblk_map_remaining(pblk, ppa_list);
+ pblk_queue_resubmit(pblk, c_ctx);
+
+ pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);
+ if (c_ctx->nr_padded)
+ pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
+ c_ctx->nr_padded);
+ bio_put(rqd->bio);
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+ mempool_free(recovery, &pblk->rec_pool);
+
+ atomic_dec(&pblk->inflight_io);
+}
+
+
+static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct pblk_rec_ctx *recovery;
+
+ recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC);
+ if (!recovery) {
+ pr_err("pblk: could not allocate recovery work\n");
+ return;
}
+ recovery->pblk = pblk;
+ recovery->rqd = rqd;
+
INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
queue_work(pblk->close_wq, &recovery->ws_rec);
-
-out:
- pblk_complete_write(pblk, rqd, c_ctx);
}
static void pblk_end_io_write(struct nvm_rq *rqd)
@@ -173,8 +255,8 @@ static void pblk_end_io_write(struct nvm_rq *rqd)
struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
if (rqd->error) {
- pblk_log_write_err(pblk, rqd);
- return pblk_end_w_fail(pblk, rqd);
+ pblk_end_w_fail(pblk, rqd);
+ return;
}
#ifdef CONFIG_NVM_DEBUG
else
@@ -198,6 +280,7 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd)
if (rqd->error) {
pblk_log_write_err(pblk, rqd);
pr_err("pblk: metadata I/O failed. Line %d\n", line->id);
+ line->w_err_gc->has_write_err = 1;
}
sync = atomic_add_return(rqd->nr_ppas, &emeta->sync);
@@ -266,31 +349,6 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
return 0;
}
-int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
- struct pblk_c_ctx *c_ctx)
-{
- struct pblk_line_meta *lm = &pblk->lm;
- unsigned long *lun_bitmap;
- int ret;
-
- lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
- if (!lun_bitmap)
- return -ENOMEM;
-
- c_ctx->lun_bitmap = lun_bitmap;
-
- ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas, pblk_end_io_write);
- if (ret)
- return ret;
-
- pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);
-
- rqd->ppa_status = (u64)0;
- rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
-
- return ret;
-}
-
static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
unsigned int secs_to_flush)
{
@@ -339,6 +397,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
l_mg->emeta_alloc_type, GFP_KERNEL);
if (IS_ERR(bio)) {
+ pr_err("pblk: failed to map emeta io");
ret = PTR_ERR(bio);
goto fail_free_rqd;
}
@@ -515,26 +574,54 @@ static int pblk_submit_write(struct pblk *pblk)
unsigned int secs_avail, secs_to_sync, secs_to_com;
unsigned int secs_to_flush;
unsigned long pos;
+ unsigned int resubmit;
- /* If there are no sectors in the cache, flushes (bios without data)
- * will be cleared on the cache threads
- */
- secs_avail = pblk_rb_read_count(&pblk->rwb);
- if (!secs_avail)
- return 1;
-
- secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
- if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
- return 1;
-
- secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
- if (secs_to_sync > pblk->max_write_pgs) {
- pr_err("pblk: bad buffer sync calculation\n");
- return 1;
- }
+ spin_lock(&pblk->resubmit_lock);
+ resubmit = !list_empty(&pblk->resubmit_list);
+ spin_unlock(&pblk->resubmit_lock);
+
+ /* Resubmit failed writes first */
+ if (resubmit) {
+ struct pblk_c_ctx *r_ctx;
+
+ spin_lock(&pblk->resubmit_lock);
+ r_ctx = list_first_entry(&pblk->resubmit_list,
+ struct pblk_c_ctx, list);
+ list_del(&r_ctx->list);
+ spin_unlock(&pblk->resubmit_lock);
+
+ secs_avail = r_ctx->nr_valid;
+ pos = r_ctx->sentry;
+
+ pblk_prepare_resubmit(pblk, pos, secs_avail);
+ secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
+ secs_avail);
- secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
- pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
+ kfree(r_ctx);
+ } else {
+ /* If there are no sectors in the cache,
+ * flushes (bios without data) will be cleared on
+ * the cache threads
+ */
+ secs_avail = pblk_rb_read_count(&pblk->rwb);
+ if (!secs_avail)
+ return 1;
+
+ secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
+ if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
+ return 1;
+
+ secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
+ secs_to_flush);
+ if (secs_to_sync > pblk->max_write_pgs) {
+ pr_err("pblk: bad buffer sync calculation\n");
+ return 1;
+ }
+
+ secs_to_com = (secs_to_sync > secs_avail) ?
+ secs_avail : secs_to_sync;
+ pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
+ }
bio = bio_alloc(GFP_KERNEL, secs_to_sync);
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 9c682acfc5d1..34cc1d64a9d4 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -89,12 +89,14 @@ struct pblk_sec_meta {
/* The number of GC lists and the rate-limiter states go together. This way the
* rate-limiter can dictate how much GC is needed based on resource utilization.
*/
-#define PBLK_GC_NR_LISTS 3
+#define PBLK_GC_NR_LISTS 4
enum {
- PBLK_RL_HIGH = 1,
- PBLK_RL_MID = 2,
- PBLK_RL_LOW = 3,
+ PBLK_RL_OFF = 0,
+ PBLK_RL_WERR = 1,
+ PBLK_RL_HIGH = 2,
+ PBLK_RL_MID = 3,
+ PBLK_RL_LOW = 4
};
#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS)
@@ -128,7 +130,6 @@ struct pblk_pad_rq {
struct pblk_rec_ctx {
struct pblk *pblk;
struct nvm_rq *rqd;
- struct list_head failed;
struct work_struct ws_rec;
};
@@ -279,6 +280,8 @@ struct pblk_rl {
int rb_user_active;
int rb_gc_active;
+ atomic_t werr_lines; /* Number of write error lines that needs gc */
+
struct timer_list u_timer;
unsigned long long nr_secs;
@@ -312,6 +315,7 @@ enum {
PBLK_LINEGC_MID = 23,
PBLK_LINEGC_HIGH = 24,
PBLK_LINEGC_FULL = 25,
+ PBLK_LINEGC_WERR = 26
};
#define PBLK_MAGIC 0x70626c6b /*pblk*/
@@ -413,6 +417,11 @@ struct pblk_smeta {
struct line_smeta *buf; /* smeta buffer in persistent format */
};
+struct pblk_w_err_gc {
+ int has_write_err;
+ __le64 *lba_list;
+};
+
struct pblk_line {
struct pblk *pblk;
unsigned int id; /* Line number corresponds to the
@@ -458,6 +467,8 @@ struct pblk_line {
struct kref ref; /* Write buffer L2P references */
+ struct pblk_w_err_gc *w_err_gc; /* Write error gc recovery metadata */
+
spinlock_t lock; /* Necessary for invalid_bitmap only */
};
@@ -489,6 +500,8 @@ struct pblk_line_mgmt {
struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */
struct list_head gc_low_list; /* Full lines ready to GC, low isc */
+ struct list_head gc_werr_list; /* Write err recovery list */
+
struct list_head gc_full_list; /* Full lines ready to GC, no valid */
struct list_head gc_empty_list; /* Full lines close, all valid */
@@ -664,12 +677,15 @@ struct pblk {
struct list_head compl_list;
- mempool_t *page_bio_pool;
- mempool_t *gen_ws_pool;
- mempool_t *rec_pool;
- mempool_t *r_rq_pool;
- mempool_t *w_rq_pool;
- mempool_t *e_rq_pool;
+ spinlock_t resubmit_lock; /* Resubmit list lock */
+ struct list_head resubmit_list; /* Resubmit list for failed writes*/
+
+ mempool_t page_bio_pool;
+ mempool_t gen_ws_pool;
+ mempool_t rec_pool;
+ mempool_t r_rq_pool;
+ mempool_t w_rq_pool;
+ mempool_t e_rq_pool;
struct workqueue_struct *close_wq;
struct workqueue_struct *bb_wq;
@@ -713,9 +729,6 @@ void pblk_rb_sync_l2p(struct pblk_rb *rb);
unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
unsigned int pos, unsigned int nr_entries,
unsigned int count);
-unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
- struct list_head *list,
- unsigned int max);
int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
struct ppa_addr ppa, int bio_iter, bool advanced_bio);
unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries);
@@ -766,11 +779,13 @@ struct pblk_line *pblk_line_get_data(struct pblk *pblk);
struct pblk_line *pblk_line_get_erase(struct pblk *pblk);
int pblk_line_erase(struct pblk *pblk, struct pblk_line *line);
int pblk_line_is_full(struct pblk_line *line);
-void pblk_line_free(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_free(struct pblk_line *line);
void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line);
void pblk_line_close(struct pblk *pblk, struct pblk_line *line);
void pblk_line_close_ws(struct work_struct *work);
void pblk_pipeline_stop(struct pblk *pblk);
+void __pblk_pipeline_stop(struct pblk *pblk);
+void __pblk_pipeline_flush(struct pblk *pblk);
void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
void (*work)(struct work_struct *), gfp_t gfp_mask,
struct workqueue_struct *wq);
@@ -794,7 +809,6 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas);
void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
unsigned long *lun_bitmap);
-void pblk_end_io_sync(struct nvm_rq *rqd);
int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
int nr_pages);
void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
@@ -837,23 +851,20 @@ void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
int pblk_write_ts(void *data);
void pblk_write_timer_fn(struct timer_list *t);
void pblk_write_should_kick(struct pblk *pblk);
+void pblk_write_kick(struct pblk *pblk);
/*
* pblk read path
*/
-extern struct bio_set *pblk_bio_set;
+extern struct bio_set pblk_bio_set;
int pblk_submit_read(struct pblk *pblk, struct bio *bio);
int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
/*
* pblk recovery
*/
-void pblk_submit_rec(struct work_struct *work);
struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
int pblk_recov_pad(struct pblk *pblk);
int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
-int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
- struct pblk_rec_ctx *recovery, u64 *comp_bits,
- unsigned int comp);
/*
* pblk gc
@@ -864,7 +875,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
#define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */
int pblk_gc_init(struct pblk *pblk);
-void pblk_gc_exit(struct pblk *pblk);
+void pblk_gc_exit(struct pblk *pblk, bool graceful);
void pblk_gc_should_start(struct pblk *pblk);
void pblk_gc_should_stop(struct pblk *pblk);
void pblk_gc_should_kick(struct pblk *pblk);
@@ -894,6 +905,9 @@ void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
bool used);
int pblk_rl_is_limit(struct pblk_rl *rl);
+void pblk_rl_werr_line_in(struct pblk_rl *rl);
+void pblk_rl_werr_line_out(struct pblk_rl *rl);
+
/*
* pblk sysfs
*/
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 3a0cfb237af9..d6bf294f3907 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -269,7 +269,7 @@ struct bcache_device {
atomic_t *stripe_sectors_dirty;
unsigned long *full_dirty_stripes;
- struct bio_set *bio_split;
+ struct bio_set bio_split;
unsigned data_csum:1;
@@ -345,6 +345,7 @@ struct cached_dev {
struct keybuf writeback_keys;
+ struct task_struct *status_update_thread;
/*
* Order the write-half of writeback operations strongly in dispatch
* order. (Maintain LBA order; don't allow reads completing out of
@@ -392,6 +393,7 @@ struct cached_dev {
#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64
atomic_t io_errors;
unsigned error_limit;
+ unsigned offline_seconds;
char backing_dev_name[BDEVNAME_SIZE];
};
@@ -528,9 +530,9 @@ struct cache_set {
struct closure sb_write;
struct semaphore sb_write_mutex;
- mempool_t *search;
- mempool_t *bio_meta;
- struct bio_set *bio_split;
+ mempool_t search;
+ mempool_t bio_meta;
+ struct bio_set bio_split;
/* For the btree cache */
struct shrinker shrink;
@@ -655,7 +657,7 @@ struct cache_set {
* A btree node on disk could have too many bsets for an iterator to fit
* on the stack - have to dynamically allocate them
*/
- mempool_t *fill_iter;
+ mempool_t fill_iter;
struct bset_sort_state sort;
@@ -956,8 +958,6 @@ void bch_prio_write(struct cache *);
void bch_write_bdev_super(struct cached_dev *, struct closure *);
extern struct workqueue_struct *bcache_wq;
-extern const char * const bch_cache_modes[];
-extern const char * const bch_stop_on_failure_modes[];
extern struct mutex bch_register_lock;
extern struct list_head bch_cache_sets;
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 579c696a5fe0..f3403b45bc28 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1118,8 +1118,7 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
void bch_bset_sort_state_free(struct bset_sort_state *state)
{
- if (state->pool)
- mempool_destroy(state->pool);
+ mempool_exit(&state->pool);
}
int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
@@ -1129,11 +1128,7 @@ int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
state->page_order = page_order;
state->crit_factor = int_sqrt(1 << page_order);
- state->pool = mempool_create_page_pool(1, page_order);
- if (!state->pool)
- return -ENOMEM;
-
- return 0;
+ return mempool_init_page_pool(&state->pool, 1, page_order);
}
EXPORT_SYMBOL(bch_bset_sort_state_init);
@@ -1191,7 +1186,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
BUG_ON(order > state->page_order);
- outp = mempool_alloc(state->pool, GFP_NOIO);
+ outp = mempool_alloc(&state->pool, GFP_NOIO);
out = page_address(outp);
used_mempool = true;
order = state->page_order;
@@ -1220,7 +1215,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
}
if (used_mempool)
- mempool_free(virt_to_page(out), state->pool);
+ mempool_free(virt_to_page(out), &state->pool);
else
free_pages((unsigned long) out, order);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 0c24280f3b98..b867f2200495 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -347,7 +347,7 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b,
/* Sorting */
struct bset_sort_state {
- mempool_t *pool;
+ mempool_t pool;
unsigned page_order;
unsigned crit_factor;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 17936b2dc7d6..2a0968c04e21 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -204,7 +204,7 @@ void bch_btree_node_read_done(struct btree *b)
struct bset *i = btree_bset_first(b);
struct btree_iter *iter;
- iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
+ iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
iter->used = 0;
@@ -271,7 +271,7 @@ void bch_btree_node_read_done(struct btree *b)
bch_bset_init_next(&b->keys, write_block(b),
bset_magic(&b->c->sb));
out:
- mempool_free(iter, b->c->fill_iter);
+ mempool_free(iter, &b->c->fill_iter);
return;
err:
set_btree_node_io_error(b);
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 2ddf8515e6a5..9612873afee2 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -17,12 +17,12 @@
void bch_bbio_free(struct bio *bio, struct cache_set *c)
{
struct bbio *b = container_of(bio, struct bbio, bio);
- mempool_free(b, c->bio_meta);
+ mempool_free(b, &c->bio_meta);
}
struct bio *bch_bbio_alloc(struct cache_set *c)
{
- struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
+ struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
struct bio *bio = &b->bio;
bio_init(bio, bio->bi_inline_vecs, bucket_pages(c));
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 8e3e8655ed63..ae67f5fa8047 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -213,7 +213,7 @@ static void bch_data_insert_start(struct closure *cl)
do {
unsigned i;
struct bkey *k;
- struct bio_set *split = op->c->bio_split;
+ struct bio_set *split = &op->c->bio_split;
/* 1 for the device pointer and 1 for the chksum */
if (bch_keylist_realloc(&op->insert_keys,
@@ -548,7 +548,7 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
KEY_OFFSET(k) - bio->bi_iter.bi_sector),
- GFP_NOIO, s->d->bio_split);
+ GFP_NOIO, &s->d->bio_split);
bio_key = &container_of(n, struct bbio, bio)->key;
bch_bkey_copy_single_ptr(bio_key, k, ptr);
@@ -707,7 +707,7 @@ static void search_free(struct closure *cl)
bio_complete(s);
closure_debug_destroy(cl);
- mempool_free(s, s->d->c->search);
+ mempool_free(s, &s->d->c->search);
}
static inline struct search *search_alloc(struct bio *bio,
@@ -715,7 +715,7 @@ static inline struct search *search_alloc(struct bio *bio,
{
struct search *s;
- s = mempool_alloc(d->c->search, GFP_NOIO);
+ s = mempool_alloc(&d->c->search, GFP_NOIO);
closure_init(&s->cl, NULL);
do_bio_hook(s, bio, request_endio);
@@ -864,7 +864,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
s->cache_missed = 1;
if (s->cache_miss || s->iop.bypass) {
- miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+ miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
goto out_submit;
}
@@ -887,14 +887,14 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
s->iop.replace = true;
- miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+ miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
/* btree_search_recurse()'s btree iterator is no good anymore */
ret = miss == bio ? MAP_DONE : -EINTR;
cache_bio = bio_alloc_bioset(GFP_NOWAIT,
DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
- dc->disk.bio_split);
+ &dc->disk.bio_split);
if (!cache_bio)
goto out_submit;
@@ -1008,7 +1008,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
struct bio *flush;
flush = bio_alloc_bioset(GFP_NOIO, 0,
- dc->disk.bio_split);
+ &dc->disk.bio_split);
if (!flush) {
s->iop.status = BLK_STS_RESOURCE;
goto insert_data;
@@ -1021,7 +1021,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
closure_bio_submit(s->iop.c, flush, cl);
}
} else {
- s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
+ s->iop.bio = bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split);
/* I/O request sent to backing device */
bio->bi_end_io = backing_request_endio;
closure_bio_submit(s->iop.c, bio, cl);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 3dea06b41d43..a31e55bcc4e5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -37,24 +37,6 @@ static const char invalid_uuid[] = {
0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
};
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
-const char * const bch_cache_modes[] = {
- "default",
- "writethrough",
- "writeback",
- "writearound",
- "none",
- NULL
-};
-
-/* Default is -1; we skip past it for stop_when_cache_set_failed */
-const char * const bch_stop_on_failure_modes[] = {
- "default",
- "auto",
- "always",
- NULL
-};
-
static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
LIST_HEAD(bch_cache_sets);
@@ -654,6 +636,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct bcache_device *d = b->bd_disk->private_data;
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+
+ if (dc->io_disable)
+ return -EIO;
+
return d->ioctl(d, mode, cmd, arg);
}
@@ -766,8 +753,7 @@ static void bcache_device_free(struct bcache_device *d)
put_disk(d->disk);
}
- if (d->bio_split)
- bioset_free(d->bio_split);
+ bioset_exit(&d->bio_split);
kvfree(d->full_dirty_stripes);
kvfree(d->stripe_sectors_dirty);
@@ -809,9 +795,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
if (idx < 0)
return idx;
- if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
- BIOSET_NEED_BVECS |
- BIOSET_NEED_RESCUER)) ||
+ if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
+ BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
!(d->disk = alloc_disk(BCACHE_MINORS))) {
ida_simple_remove(&bcache_device_idx, idx);
return -ENOMEM;
@@ -864,6 +849,44 @@ static void calc_cached_dev_sectors(struct cache_set *c)
c->cached_dev_sectors = sectors;
}
+#define BACKING_DEV_OFFLINE_TIMEOUT 5
+static int cached_dev_status_update(void *arg)
+{
+ struct cached_dev *dc = arg;
+ struct request_queue *q;
+
+ /*
+ * If this delayed worker is stopping outside, directly quit here.
+ * dc->io_disable might be set via sysfs interface, so check it
+ * here too.
+ */
+ while (!kthread_should_stop() && !dc->io_disable) {
+ q = bdev_get_queue(dc->bdev);
+ if (blk_queue_dying(q))
+ dc->offline_seconds++;
+ else
+ dc->offline_seconds = 0;
+
+ if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
+ pr_err("%s: device offline for %d seconds",
+ dc->backing_dev_name,
+ BACKING_DEV_OFFLINE_TIMEOUT);
+ pr_err("%s: disable I/O request due to backing "
+ "device offline", dc->disk.name);
+ dc->io_disable = true;
+ /* let others know earlier that io_disable is true */
+ smp_mb();
+ bcache_device_stop(&dc->disk);
+ break;
+ }
+ schedule_timeout_interruptible(HZ);
+ }
+
+ wait_for_kthread_stop();
+ return 0;
+}
+
+
void bch_cached_dev_run(struct cached_dev *dc)
{
struct bcache_device *d = &dc->disk;
@@ -906,6 +929,14 @@ void bch_cached_dev_run(struct cached_dev *dc)
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
pr_debug("error creating sysfs link");
+
+ dc->status_update_thread = kthread_run(cached_dev_status_update,
+ dc, "bcache_status_update");
+ if (IS_ERR(dc->status_update_thread)) {
+ pr_warn("failed to create bcache_status_update kthread, "
+ "continue to run without monitoring backing "
+ "device status");
+ }
}
/*
@@ -1139,6 +1170,8 @@ static void cached_dev_free(struct closure *cl)
kthread_stop(dc->writeback_thread);
if (dc->writeback_write_wq)
destroy_workqueue(dc->writeback_write_wq);
+ if (!IS_ERR_OR_NULL(dc->status_update_thread))
+ kthread_stop(dc->status_update_thread);
if (atomic_read(&dc->running))
bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
@@ -1465,14 +1498,10 @@ static void cache_set_free(struct closure *cl)
if (c->moving_gc_wq)
destroy_workqueue(c->moving_gc_wq);
- if (c->bio_split)
- bioset_free(c->bio_split);
- if (c->fill_iter)
- mempool_destroy(c->fill_iter);
- if (c->bio_meta)
- mempool_destroy(c->bio_meta);
- if (c->search)
- mempool_destroy(c->search);
+ bioset_exit(&c->bio_split);
+ mempool_exit(&c->fill_iter);
+ mempool_exit(&c->bio_meta);
+ mempool_exit(&c->search);
kfree(c->devices);
mutex_lock(&bch_register_lock);
@@ -1683,21 +1712,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
INIT_LIST_HEAD(&c->btree_cache_freed);
INIT_LIST_HEAD(&c->data_buckets);
- c->search = mempool_create_slab_pool(32, bch_search_cache);
- if (!c->search)
- goto err;
-
iter_size = (sb->bucket_size / sb->block_size + 1) *
sizeof(struct btree_iter_set);
if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
- !(c->bio_meta = mempool_create_kmalloc_pool(2,
- sizeof(struct bbio) + sizeof(struct bio_vec) *
- bucket_pages(c))) ||
- !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
- !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio),
- BIOSET_NEED_BVECS |
- BIOSET_NEED_RESCUER)) ||
+ mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
+ mempool_init_kmalloc_pool(&c->bio_meta, 2,
+ sizeof(struct bbio) + sizeof(struct bio_vec) *
+ bucket_pages(c)) ||
+ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
+ bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
+ BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
!(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
!(c->moving_gc_wq = alloc_workqueue("bcache_gc",
WQ_MEM_RECLAIM, 0)) ||
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index dfeef583ee50..8ccbc8f3b3af 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -16,6 +16,22 @@
#include <linux/sort.h>
#include <linux/sched/clock.h>
+/* Default is -1; we skip past it for struct cached_dev's cache mode */
+static const char * const bch_cache_modes[] = {
+ "writethrough",
+ "writeback",
+ "writearound",
+ "none",
+ NULL
+};
+
+/* Default is -1; we skip past it for stop_when_cache_set_failed */
+static const char * const bch_stop_on_failure_modes[] = {
+ "auto",
+ "always",
+ NULL
+};
+
static const char * const cache_replacement_policies[] = {
"lru",
"fifo",
@@ -114,6 +130,20 @@ rw_attribute(btree_shrinker_disabled);
rw_attribute(copy_gc_enabled);
rw_attribute(size);
+static ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
+ size_t selected)
+{
+ char *out = buf;
+ size_t i;
+
+ for (i = 0; list[i]; i++)
+ out += snprintf(out, buf + size - out,
+ i == selected ? "[%s] " : "%s ", list[i]);
+
+ out[-1] = '\n';
+ return out - buf;
+}
+
SHOW(__bch_cached_dev)
{
struct cached_dev *dc = container_of(kobj, struct cached_dev,
@@ -124,12 +154,12 @@ SHOW(__bch_cached_dev)
if (attr == &sysfs_cache_mode)
return bch_snprint_string_list(buf, PAGE_SIZE,
- bch_cache_modes + 1,
+ bch_cache_modes,
BDEV_CACHE_MODE(&dc->sb));
if (attr == &sysfs_stop_when_cache_set_failed)
return bch_snprint_string_list(buf, PAGE_SIZE,
- bch_stop_on_failure_modes + 1,
+ bch_stop_on_failure_modes,
dc->stop_when_cache_set_failed);
@@ -253,8 +283,7 @@ STORE(__cached_dev)
bch_cached_dev_run(dc);
if (attr == &sysfs_cache_mode) {
- v = bch_read_string_list(buf, bch_cache_modes + 1);
-
+ v = __sysfs_match_string(bch_cache_modes, -1, buf);
if (v < 0)
return v;
@@ -265,8 +294,7 @@ STORE(__cached_dev)
}
if (attr == &sysfs_stop_when_cache_set_failed) {
- v = bch_read_string_list(buf, bch_stop_on_failure_modes + 1);
-
+ v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf);
if (v < 0)
return v;
@@ -635,6 +663,7 @@ SHOW_LOCKED(bch_cache_set)
STORE(__bch_cache_set)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+ ssize_t v;
if (attr == &sysfs_unregister)
bch_cache_set_unregister(c);
@@ -698,8 +727,7 @@ STORE(__bch_cache_set)
c->congested_write_threshold_us);
if (attr == &sysfs_errors) {
- ssize_t v = bch_read_string_list(buf, error_actions);
-
+ v = __sysfs_match_string(error_actions, -1, buf);
if (v < 0)
return v;
@@ -714,8 +742,7 @@ STORE(__bch_cache_set)
c->error_decay = strtoul_or_return(buf) / 88;
if (attr == &sysfs_io_disable) {
- int v = strtoul_or_return(buf);
-
+ v = strtoul_or_return(buf);
if (v) {
if (test_and_set_bit(CACHE_SET_IO_DISABLE,
&c->flags))
@@ -929,6 +956,7 @@ SHOW_LOCKED(bch_cache)
STORE(__bch_cache)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
+ ssize_t v;
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
@@ -943,8 +971,7 @@ STORE(__bch_cache)
}
if (attr == &sysfs_cache_replacement_policy) {
- ssize_t v = bch_read_string_list(buf, cache_replacement_policies);
-
+ v = __sysfs_match_string(cache_replacement_policies, -1, buf);
if (v < 0)
return v;
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 74febd5230df..fc479b026d6d 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -120,41 +120,6 @@ ssize_t bch_hprint(char *buf, int64_t v)
return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]);
}
-ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
- size_t selected)
-{
- char *out = buf;
- size_t i;
-
- for (i = 0; list[i]; i++)
- out += snprintf(out, buf + size - out,
- i == selected ? "[%s] " : "%s ", list[i]);
-
- out[-1] = '\n';
- return out - buf;
-}
-
-ssize_t bch_read_string_list(const char *buf, const char * const list[])
-{
- size_t i;
- char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
- if (!d)
- return -ENOMEM;
-
- s = strim(d);
-
- for (i = 0; list[i]; i++)
- if (!strcmp(list[i], s))
- break;
-
- kfree(d);
-
- if (!list[i])
- return -EINVAL;
-
- return i;
-}
-
bool bch_is_zero(const char *p, size_t n)
{
size_t i;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 268024529edd..cced87f8eb27 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -365,11 +365,6 @@ ssize_t bch_hprint(char *buf, int64_t v);
bool bch_is_zero(const char *p, size_t n);
int bch_parse_uuid(const char *s, char *uuid);
-ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
- size_t selected);
-
-ssize_t bch_read_string_list(const char *buf, const char * const list[]);
-
struct time_stats {
spinlock_t lock;
/*
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index 874841f0fc83..8e33a3808368 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -19,7 +19,7 @@
struct dm_bio_prison {
spinlock_t lock;
- mempool_t *cell_pool;
+ mempool_t cell_pool;
struct rb_root cells;
};
@@ -34,14 +34,15 @@ static struct kmem_cache *_cell_cache;
struct dm_bio_prison *dm_bio_prison_create(void)
{
struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
+ int ret;
if (!prison)
return NULL;
spin_lock_init(&prison->lock);
- prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
- if (!prison->cell_pool) {
+ ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache);
+ if (ret) {
kfree(prison);
return NULL;
}
@@ -54,21 +55,21 @@ EXPORT_SYMBOL_GPL(dm_bio_prison_create);
void dm_bio_prison_destroy(struct dm_bio_prison *prison)
{
- mempool_destroy(prison->cell_pool);
+ mempool_exit(&prison->cell_pool);
kfree(prison);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_destroy);
struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, gfp_t gfp)
{
- return mempool_alloc(prison->cell_pool, gfp);
+ return mempool_alloc(&prison->cell_pool, gfp);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell);
void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell)
{
- mempool_free(cell, prison->cell_pool);
+ mempool_free(cell, &prison->cell_pool);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
index 8ce3a1a588cf..601b1569206a 100644
--- a/drivers/md/dm-bio-prison-v2.c
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -21,7 +21,7 @@ struct dm_bio_prison_v2 {
struct workqueue_struct *wq;
spinlock_t lock;
- mempool_t *cell_pool;
+ mempool_t cell_pool;
struct rb_root cells;
};
@@ -36,6 +36,7 @@ static struct kmem_cache *_cell_cache;
struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq)
{
struct dm_bio_prison_v2 *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
+ int ret;
if (!prison)
return NULL;
@@ -43,8 +44,8 @@ struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq)
prison->wq = wq;
spin_lock_init(&prison->lock);
- prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
- if (!prison->cell_pool) {
+ ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache);
+ if (ret) {
kfree(prison);
return NULL;
}
@@ -57,21 +58,21 @@ EXPORT_SYMBOL_GPL(dm_bio_prison_create_v2);
void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison)
{
- mempool_destroy(prison->cell_pool);
+ mempool_exit(&prison->cell_pool);
kfree(prison);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_destroy_v2);
struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, gfp_t gfp)
{
- return mempool_alloc(prison->cell_pool, gfp);
+ return mempool_alloc(&prison->cell_pool, gfp);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell_v2);
void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell)
{
- mempool_free(cell, prison->cell_pool);
+ mempool_free(cell, &prison->cell_pool);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell_v2);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index da208638fba4..001c71248246 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -447,9 +447,9 @@ struct cache {
struct work_struct migration_worker;
struct delayed_work waker;
struct dm_bio_prison_v2 *prison;
- struct bio_set *bs;
+ struct bio_set bs;
- mempool_t *migration_pool;
+ mempool_t migration_pool;
struct dm_cache_policy *policy;
unsigned policy_nr_args;
@@ -550,7 +550,7 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache)
{
struct dm_cache_migration *mg;
- mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
+ mg = mempool_alloc(&cache->migration_pool, GFP_NOWAIT);
if (!mg)
return NULL;
@@ -569,7 +569,7 @@ static void free_migration(struct dm_cache_migration *mg)
if (atomic_dec_and_test(&cache->nr_allocated_migrations))
wake_up(&cache->migration_wait);
- mempool_free(mg, cache->migration_pool);
+ mempool_free(mg, &cache->migration_pool);
}
/*----------------------------------------------------------------*/
@@ -924,7 +924,7 @@ static void issue_op(struct bio *bio, void *context)
static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
dm_oblock_t oblock, dm_cblock_t cblock)
{
- struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, cache->bs);
+ struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs);
BUG_ON(!origin_bio);
@@ -2011,7 +2011,7 @@ static void destroy(struct cache *cache)
{
unsigned i;
- mempool_destroy(cache->migration_pool);
+ mempool_exit(&cache->migration_pool);
if (cache->prison)
dm_bio_prison_destroy_v2(cache->prison);
@@ -2047,8 +2047,7 @@ static void destroy(struct cache *cache)
kfree(cache->ctr_args[i]);
kfree(cache->ctr_args);
- if (cache->bs)
- bioset_free(cache->bs);
+ bioset_exit(&cache->bs);
kfree(cache);
}
@@ -2498,8 +2497,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
cache->features = ca->features;
if (writethrough_mode(cache)) {
/* Create bioset for writethrough bios issued to origin */
- cache->bs = bioset_create(BIO_POOL_SIZE, 0, 0);
- if (!cache->bs)
+ r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0);
+ if (r)
goto bad;
}
@@ -2630,9 +2629,9 @@ static int cache_create(struct cache_args *ca, struct cache **result)
goto bad;
}
- cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
- migration_cache);
- if (!cache->migration_pool) {
+ r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE,
+ migration_cache);
+ if (r) {
*error = "Error creating cache's migration mempool";
goto bad;
}
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 3222e21cbbf8..f21c5d21bf1b 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -91,8 +91,8 @@ struct mapped_device {
/*
* io objects are allocated from here.
*/
- struct bio_set *io_bs;
- struct bio_set *bs;
+ struct bio_set io_bs;
+ struct bio_set bs;
/*
* freeze/thaw support require holding onto a super block
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 44ff473dab3e..da02f4d8e4b9 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -143,14 +143,14 @@ struct crypt_config {
* pool for per bio private data, crypto requests,
* encryption requeusts/buffer pages and integrity tags
*/
- mempool_t *req_pool;
- mempool_t *page_pool;
- mempool_t *tag_pool;
+ mempool_t req_pool;
+ mempool_t page_pool;
+ mempool_t tag_pool;
unsigned tag_pool_max_sectors;
struct percpu_counter n_allocated_pages;
- struct bio_set *bs;
+ struct bio_set bs;
struct mutex bio_alloc_lock;
struct workqueue_struct *io_queue;
@@ -1245,7 +1245,7 @@ static void crypt_alloc_req_skcipher(struct crypt_config *cc,
unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
if (!ctx->r.req)
- ctx->r.req = mempool_alloc(cc->req_pool, GFP_NOIO);
+ ctx->r.req = mempool_alloc(&cc->req_pool, GFP_NOIO);
skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]);
@@ -1262,7 +1262,7 @@ static void crypt_alloc_req_aead(struct crypt_config *cc,
struct convert_context *ctx)
{
if (!ctx->r.req_aead)
- ctx->r.req_aead = mempool_alloc(cc->req_pool, GFP_NOIO);
+ ctx->r.req_aead = mempool_alloc(&cc->req_pool, GFP_NOIO);
aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]);
@@ -1290,7 +1290,7 @@ static void crypt_free_req_skcipher(struct crypt_config *cc,
struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
if ((struct skcipher_request *)(io + 1) != req)
- mempool_free(req, cc->req_pool);
+ mempool_free(req, &cc->req_pool);
}
static void crypt_free_req_aead(struct crypt_config *cc,
@@ -1299,7 +1299,7 @@ static void crypt_free_req_aead(struct crypt_config *cc,
struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
if ((struct aead_request *)(io + 1) != req)
- mempool_free(req, cc->req_pool);
+ mempool_free(req, &cc->req_pool);
}
static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_bio)
@@ -1409,7 +1409,7 @@ retry:
if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
mutex_lock(&cc->bio_alloc_lock);
- clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
+ clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, &cc->bs);
if (!clone)
goto out;
@@ -1418,7 +1418,7 @@ retry:
remaining_size = size;
for (i = 0; i < nr_iovecs; i++) {
- page = mempool_alloc(cc->page_pool, gfp_mask);
+ page = mempool_alloc(&cc->page_pool, gfp_mask);
if (!page) {
crypt_free_buffer_pages(cc, clone);
bio_put(clone);
@@ -1453,7 +1453,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
bio_for_each_segment_all(bv, clone, i) {
BUG_ON(!bv->bv_page);
- mempool_free(bv->bv_page, cc->page_pool);
+ mempool_free(bv->bv_page, &cc->page_pool);
}
}
@@ -1492,7 +1492,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
crypt_free_req(cc, io->ctx.r.req, base_bio);
if (unlikely(io->integrity_metadata_from_pool))
- mempool_free(io->integrity_metadata, io->cc->tag_pool);
+ mempool_free(io->integrity_metadata, &io->cc->tag_pool);
else
kfree(io->integrity_metadata);
@@ -1565,7 +1565,7 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
* biovecs we don't need to worry about the block layer
* modifying the biovec array; so leverage bio_clone_fast().
*/
- clone = bio_clone_fast(io->base_bio, gfp, cc->bs);
+ clone = bio_clone_fast(io->base_bio, gfp, &cc->bs);
if (!clone)
return 1;
@@ -2219,15 +2219,13 @@ static void crypt_dtr(struct dm_target *ti)
crypt_free_tfms(cc);
- if (cc->bs)
- bioset_free(cc->bs);
+ bioset_exit(&cc->bs);
- mempool_destroy(cc->page_pool);
- mempool_destroy(cc->req_pool);
- mempool_destroy(cc->tag_pool);
+ mempool_exit(&cc->page_pool);
+ mempool_exit(&cc->req_pool);
+ mempool_exit(&cc->tag_pool);
- if (cc->page_pool)
- WARN_ON(percpu_counter_sum(&cc->n_allocated_pages) != 0);
+ WARN_ON(percpu_counter_sum(&cc->n_allocated_pages) != 0);
percpu_counter_destroy(&cc->n_allocated_pages);
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
@@ -2743,8 +2741,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
iv_size_padding = align_mask;
}
- ret = -ENOMEM;
-
/* ...| IV + padding | original IV | original sec. number | bio tag offset | */
additional_req_size = sizeof(struct dm_crypt_request) +
iv_size_padding + cc->iv_size +
@@ -2752,8 +2748,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
sizeof(uint64_t) +
sizeof(unsigned int);
- cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + additional_req_size);
- if (!cc->req_pool) {
+ ret = mempool_init_kmalloc_pool(&cc->req_pool, MIN_IOS, cc->dmreq_start + additional_req_size);
+ if (ret) {
ti->error = "Cannot allocate crypt request mempool";
goto bad;
}
@@ -2762,14 +2758,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
ARCH_KMALLOC_MINALIGN);
- cc->page_pool = mempool_create(BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc);
- if (!cc->page_pool) {
+ ret = mempool_init(&cc->page_pool, BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc);
+ if (ret) {
ti->error = "Cannot allocate page mempool";
goto bad;
}
- cc->bs = bioset_create(MIN_IOS, 0, BIOSET_NEED_BVECS);
- if (!cc->bs) {
+ ret = bioset_init(&cc->bs, MIN_IOS, 0, BIOSET_NEED_BVECS);
+ if (ret) {
ti->error = "Cannot allocate crypt bioset";
goto bad;
}
@@ -2806,11 +2802,10 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (!cc->tag_pool_max_sectors)
cc->tag_pool_max_sectors = 1;
- cc->tag_pool = mempool_create_kmalloc_pool(MIN_IOS,
+ ret = mempool_init_kmalloc_pool(&cc->tag_pool, MIN_IOS,
cc->tag_pool_max_sectors * cc->on_disk_tag_size);
- if (!cc->tag_pool) {
+ if (ret) {
ti->error = "Cannot allocate integrity tags mempool";
- ret = -ENOMEM;
goto bad;
}
@@ -2903,7 +2898,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
if (bio_sectors(bio) > cc->tag_pool_max_sectors)
dm_accept_partial_bio(bio, cc->tag_pool_max_sectors);
- io->integrity_metadata = mempool_alloc(cc->tag_pool, GFP_NOIO);
+ io->integrity_metadata = mempool_alloc(&cc->tag_pool, GFP_NOIO);
io->integrity_metadata_from_pool = true;
}
}
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 514fb4aec5d1..fc68c7aaef8e 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -142,7 +142,7 @@ struct dm_integrity_c {
unsigned tag_size;
__s8 log2_tag_size;
sector_t start;
- mempool_t *journal_io_mempool;
+ mempool_t journal_io_mempool;
struct dm_io_client *io;
struct dm_bufio_client *bufio;
struct workqueue_struct *metadata_wq;
@@ -1817,7 +1817,7 @@ static void complete_copy_from_journal(unsigned long error, void *context)
struct journal_completion *comp = io->comp;
struct dm_integrity_c *ic = comp->ic;
remove_range(ic, &io->range);
- mempool_free(io, ic->journal_io_mempool);
+ mempool_free(io, &ic->journal_io_mempool);
if (unlikely(error != 0))
dm_integrity_io_error(ic, "copying from journal", -EIO);
complete_journal_op(comp);
@@ -1886,7 +1886,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
}
next_loop = k - 1;
- io = mempool_alloc(ic->journal_io_mempool, GFP_NOIO);
+ io = mempool_alloc(&ic->journal_io_mempool, GFP_NOIO);
io->comp = &comp;
io->range.logical_sector = sec;
io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
@@ -1918,7 +1918,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
if (j == k) {
remove_range_unlocked(ic, &io->range);
spin_unlock_irq(&ic->endio_wait.lock);
- mempool_free(io, ic->journal_io_mempool);
+ mempool_free(io, &ic->journal_io_mempool);
goto skip_io;
}
for (l = j; l < k; l++) {
@@ -2980,9 +2980,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
- ic->journal_io_mempool = mempool_create_slab_pool(JOURNAL_IO_MEMPOOL, journal_io_cache);
- if (!ic->journal_io_mempool) {
- r = -ENOMEM;
+ r = mempool_init_slab_pool(&ic->journal_io_mempool, JOURNAL_IO_MEMPOOL, journal_io_cache);
+ if (r) {
ti->error = "Cannot allocate mempool";
goto bad;
}
@@ -3196,7 +3195,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
destroy_workqueue(ic->writer_wq);
if (ic->bufio)
dm_bufio_client_destroy(ic->bufio);
- mempool_destroy(ic->journal_io_mempool);
+ mempool_exit(&ic->journal_io_mempool);
if (ic->io)
dm_io_client_destroy(ic->io);
if (ic->dev)
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index a8d914d5abbe..53c6ed0eaa1f 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -22,8 +22,8 @@
#define DM_IO_MAX_REGIONS BITS_PER_LONG
struct dm_io_client {
- mempool_t *pool;
- struct bio_set *bios;
+ mempool_t pool;
+ struct bio_set bios;
};
/*
@@ -49,32 +49,33 @@ struct dm_io_client *dm_io_client_create(void)
{
struct dm_io_client *client;
unsigned min_ios = dm_get_reserved_bio_based_ios();
+ int ret;
client = kmalloc(sizeof(*client), GFP_KERNEL);
if (!client)
return ERR_PTR(-ENOMEM);
- client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache);
- if (!client->pool)
+ ret = mempool_init_slab_pool(&client->pool, min_ios, _dm_io_cache);
+ if (ret)
goto bad;
- client->bios = bioset_create(min_ios, 0, BIOSET_NEED_BVECS);
- if (!client->bios)
+ ret = bioset_init(&client->bios, min_ios, 0, BIOSET_NEED_BVECS);
+ if (ret)
goto bad;
return client;
bad:
- mempool_destroy(client->pool);
+ mempool_exit(&client->pool);
kfree(client);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(ret);
}
EXPORT_SYMBOL(dm_io_client_create);
void dm_io_client_destroy(struct dm_io_client *client)
{
- mempool_destroy(client->pool);
- bioset_free(client->bios);
+ mempool_exit(&client->pool);
+ bioset_exit(&client->bios);
kfree(client);
}
EXPORT_SYMBOL(dm_io_client_destroy);
@@ -120,7 +121,7 @@ static void complete_io(struct io *io)
invalidate_kernel_vmap_range(io->vma_invalidate_address,
io->vma_invalidate_size);
- mempool_free(io, io->client->pool);
+ mempool_free(io, &io->client->pool);
fn(error_bits, context);
}
@@ -344,7 +345,7 @@ static void do_region(int op, int op_flags, unsigned region,
dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
}
- bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
+ bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, &io->client->bios);
bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
bio_set_dev(bio, where->bdev);
bio->bi_end_io = endio;
@@ -442,7 +443,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
init_completion(&sio.wait);
- io = mempool_alloc(client->pool, GFP_NOIO);
+ io = mempool_alloc(&client->pool, GFP_NOIO);
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->client = client;
@@ -474,7 +475,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
return -EIO;
}
- io = mempool_alloc(client->pool, GFP_NOIO);
+ io = mempool_alloc(&client->pool, GFP_NOIO);
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->client = client;
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index e6e7c686646d..c89a675a2aac 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -47,7 +47,7 @@ struct dm_kcopyd_client {
wait_queue_head_t destroyq;
atomic_t nr_jobs;
- mempool_t *job_pool;
+ mempool_t job_pool;
struct workqueue_struct *kcopyd_wq;
struct work_struct kcopyd_work;
@@ -479,7 +479,7 @@ static int run_complete_job(struct kcopyd_job *job)
*/
if (job->master_job == job) {
mutex_destroy(&job->lock);
- mempool_free(job, kc->job_pool);
+ mempool_free(job, &kc->job_pool);
}
fn(read_err, write_err, context);
@@ -751,7 +751,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
* Allocate an array of jobs consisting of one master job
* followed by SPLIT_COUNT sub jobs.
*/
- job = mempool_alloc(kc->job_pool, GFP_NOIO);
+ job = mempool_alloc(&kc->job_pool, GFP_NOIO);
mutex_init(&job->lock);
/*
@@ -835,7 +835,7 @@ void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
{
struct kcopyd_job *job;
- job = mempool_alloc(kc->job_pool, GFP_NOIO);
+ job = mempool_alloc(&kc->job_pool, GFP_NOIO);
memset(job, 0, sizeof(struct kcopyd_job));
job->kc = kc;
@@ -879,7 +879,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
*---------------------------------------------------------------*/
struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
{
- int r = -ENOMEM;
+ int r;
struct dm_kcopyd_client *kc;
kc = kmalloc(sizeof(*kc), GFP_KERNEL);
@@ -892,14 +892,16 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro
INIT_LIST_HEAD(&kc->pages_jobs);
kc->throttle = throttle;
- kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
- if (!kc->job_pool)
+ r = mempool_init_slab_pool(&kc->job_pool, MIN_JOBS, _job_cache);
+ if (r)
goto bad_slab;
INIT_WORK(&kc->kcopyd_work, do_work);
kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
- if (!kc->kcopyd_wq)
+ if (!kc->kcopyd_wq) {
+ r = -ENOMEM;
goto bad_workqueue;
+ }
kc->pages = NULL;
kc->nr_reserved_pages = kc->nr_free_pages = 0;
@@ -923,7 +925,7 @@ bad_io_client:
bad_client_pages:
destroy_workqueue(kc->kcopyd_wq);
bad_workqueue:
- mempool_destroy(kc->job_pool);
+ mempool_exit(&kc->job_pool);
bad_slab:
kfree(kc);
@@ -942,7 +944,7 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
destroy_workqueue(kc->kcopyd_wq);
dm_io_client_destroy(kc->io_client);
client_free_pages(kc);
- mempool_destroy(kc->job_pool);
+ mempool_exit(&kc->job_pool);
kfree(kc);
}
EXPORT_SYMBOL(dm_kcopyd_client_destroy);
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 53b7b06d0aa8..52090bee17c2 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -76,7 +76,7 @@ struct log_c {
*/
uint32_t integrated_flush;
- mempool_t *flush_entry_pool;
+ mempool_t flush_entry_pool;
};
static struct kmem_cache *_flush_entry_cache;
@@ -249,11 +249,10 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
goto out;
}
- lc->flush_entry_pool = mempool_create_slab_pool(FLUSH_ENTRY_POOL_SIZE,
- _flush_entry_cache);
- if (!lc->flush_entry_pool) {
+ r = mempool_init_slab_pool(&lc->flush_entry_pool, FLUSH_ENTRY_POOL_SIZE,
+ _flush_entry_cache);
+ if (r) {
DMERR("Failed to create flush_entry_pool");
- r = -ENOMEM;
goto out;
}
@@ -313,7 +312,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
out:
kfree(devices_rdata);
if (r) {
- mempool_destroy(lc->flush_entry_pool);
+ mempool_exit(&lc->flush_entry_pool);
kfree(lc);
kfree(ctr_str);
} else {
@@ -342,7 +341,7 @@ static void userspace_dtr(struct dm_dirty_log *log)
if (lc->log_dev)
dm_put_device(lc->ti, lc->log_dev);
- mempool_destroy(lc->flush_entry_pool);
+ mempool_exit(&lc->flush_entry_pool);
kfree(lc->usr_argv_str);
kfree(lc);
@@ -570,7 +569,7 @@ static int userspace_flush(struct dm_dirty_log *log)
int mark_list_is_empty;
int clear_list_is_empty;
struct dm_dirty_log_flush_entry *fe, *tmp_fe;
- mempool_t *flush_entry_pool = lc->flush_entry_pool;
+ mempool_t *flush_entry_pool = &lc->flush_entry_pool;
spin_lock_irqsave(&lc->flush_lock, flags);
list_splice_init(&lc->mark_list, &mark_list);
@@ -653,7 +652,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
struct dm_dirty_log_flush_entry *fe;
/* Wait for an allocation, but _never_ fail */
- fe = mempool_alloc(lc->flush_entry_pool, GFP_NOIO);
+ fe = mempool_alloc(&lc->flush_entry_pool, GFP_NOIO);
BUG_ON(!fe);
spin_lock_irqsave(&lc->flush_lock, flags);
@@ -687,7 +686,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
* to cause the region to be resync'ed when the
* device is activated next time.
*/
- fe = mempool_alloc(lc->flush_entry_pool, GFP_ATOMIC);
+ fe = mempool_alloc(&lc->flush_entry_pool, GFP_ATOMIC);
if (!fe) {
DMERR("Failed to allocate memory to clear region.");
return;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 203a0419d2b0..d94ba6f72ff5 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -520,7 +520,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
bdev = pgpath->path.dev->bdev;
q = bdev_get_queue(bdev);
- clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC);
+ clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE,
+ BLK_MQ_REQ_NOWAIT);
if (IS_ERR(clone)) {
/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
if (blk_queue_dying(q)) {
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 85c32b22a420..43149eb49375 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -63,7 +63,7 @@ struct dm_region_hash {
/* hash table */
rwlock_t hash_lock;
- mempool_t *region_pool;
+ mempool_t region_pool;
unsigned mask;
unsigned nr_buckets;
unsigned prime;
@@ -169,6 +169,7 @@ struct dm_region_hash *dm_region_hash_create(
struct dm_region_hash *rh;
unsigned nr_buckets, max_buckets;
size_t i;
+ int ret;
/*
* Calculate a suitable number of buckets for our hash
@@ -220,9 +221,9 @@ struct dm_region_hash *dm_region_hash_create(
INIT_LIST_HEAD(&rh->failed_recovered_regions);
rh->flush_failure = 0;
- rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
- sizeof(struct dm_region));
- if (!rh->region_pool) {
+ ret = mempool_init_kmalloc_pool(&rh->region_pool, MIN_REGIONS,
+ sizeof(struct dm_region));
+ if (ret) {
vfree(rh->buckets);
kfree(rh);
rh = ERR_PTR(-ENOMEM);
@@ -242,14 +243,14 @@ void dm_region_hash_destroy(struct dm_region_hash *rh)
list_for_each_entry_safe(reg, nreg, rh->buckets + h,
hash_list) {
BUG_ON(atomic_read(&reg->pending));
- mempool_free(reg, rh->region_pool);
+ mempool_free(reg, &rh->region_pool);
}
}
if (rh->log)
dm_dirty_log_destroy(rh->log);
- mempool_destroy(rh->region_pool);
+ mempool_exit(&rh->region_pool);
vfree(rh->buckets);
kfree(rh);
}
@@ -287,7 +288,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
{
struct dm_region *reg, *nreg;
- nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
+ nreg = mempool_alloc(&rh->region_pool, GFP_ATOMIC);
if (unlikely(!nreg))
nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
@@ -303,7 +304,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
reg = __rh_lookup(rh, region);
if (reg)
/* We lost the race. */
- mempool_free(nreg, rh->region_pool);
+ mempool_free(nreg, &rh->region_pool);
else {
__rh_insert(rh, nreg);
if (nreg->state == DM_RH_CLEAN) {
@@ -481,17 +482,17 @@ void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
list_for_each_entry_safe(reg, next, &recovered, list) {
rh->log->type->clear_region(rh->log, reg->key);
complete_resync_work(reg, 1);
- mempool_free(reg, rh->region_pool);
+ mempool_free(reg, &rh->region_pool);
}
list_for_each_entry_safe(reg, next, &failed_recovered, list) {
complete_resync_work(reg, errors_handled ? 0 : 1);
- mempool_free(reg, rh->region_pool);
+ mempool_free(reg, &rh->region_pool);
}
list_for_each_entry_safe(reg, next, &clean, list) {
rh->log->type->clear_region(rh->log, reg->key);
- mempool_free(reg, rh->region_pool);
+ mempool_free(reg, &rh->region_pool);
}
rh->log->type->flush(rh->log);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index bf0b840645cc..6e547b8dd298 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -406,7 +406,7 @@ static blk_status_t dm_dispatch_clone_request(struct request *clone, struct requ
if (blk_queue_io_stat(clone->q))
clone->rq_flags |= RQF_IO_STAT;
- clone->start_time = jiffies;
+ clone->start_time_ns = ktime_get_ns();
r = blk_insert_cloned_request(clone->q, clone);
if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE)
/* must complete clone in terms of original request */
@@ -433,7 +433,7 @@ static int setup_clone(struct request *clone, struct request *rq,
{
int r;
- r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
+ r = blk_rq_prep_clone(clone, rq, &tio->md->bs, gfp_mask,
dm_rq_bio_constructor, tio);
if (r)
return r;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 216035be5661..b11ddc55f297 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -87,7 +87,7 @@ struct dm_snapshot {
*/
struct list_head out_of_order_list;
- mempool_t *pending_pool;
+ mempool_t pending_pool;
struct dm_exception_table pending;
struct dm_exception_table complete;
@@ -682,7 +682,7 @@ static void free_completed_exception(struct dm_exception *e)
static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
{
- struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool,
+ struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool,
GFP_NOIO);
atomic_inc(&s->pending_exceptions_count);
@@ -695,7 +695,7 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
{
struct dm_snapshot *s = pe->snap;
- mempool_free(pe, s->pending_pool);
+ mempool_free(pe, &s->pending_pool);
smp_mb__before_atomic();
atomic_dec(&s->pending_exceptions_count);
}
@@ -1196,10 +1196,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_kcopyd;
}
- s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
- if (!s->pending_pool) {
+ r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache);
+ if (r) {
ti->error = "Could not allocate mempool for pending exceptions";
- r = -ENOMEM;
goto bad_pending_pool;
}
@@ -1259,7 +1258,7 @@ bad_read_metadata:
unregister_snapshot(s);
bad_load_and_register:
- mempool_destroy(s->pending_pool);
+ mempool_exit(&s->pending_pool);
bad_pending_pool:
dm_kcopyd_client_destroy(s->kcopyd_client);
@@ -1355,7 +1354,7 @@ static void snapshot_dtr(struct dm_target *ti)
while (atomic_read(&s->pending_exceptions_count))
msleep(1);
/*
- * Ensure instructions in mempool_destroy aren't reordered
+ * Ensure instructions in mempool_exit aren't reordered
* before atomic_read.
*/
smp_mb();
@@ -1367,7 +1366,7 @@ static void snapshot_dtr(struct dm_target *ti)
__free_exceptions(s);
- mempool_destroy(s->pending_pool);
+ mempool_exit(&s->pending_pool);
dm_exception_store_destroy(s->store);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index b11107497d2e..6c923824ec91 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -260,7 +260,7 @@ struct pool {
struct dm_deferred_set *all_io_ds;
struct dm_thin_new_mapping *next_mapping;
- mempool_t *mapping_pool;
+ mempool_t mapping_pool;
process_bio_fn process_bio;
process_bio_fn process_discard;
@@ -917,7 +917,7 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
cell_error(m->tc->pool, m->cell);
list_del(&m->list);
- mempool_free(m, m->tc->pool->mapping_pool);
+ mempool_free(m, &m->tc->pool->mapping_pool);
}
static void process_prepared_mapping(struct dm_thin_new_mapping *m)
@@ -961,7 +961,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
out:
list_del(&m->list);
- mempool_free(m, pool->mapping_pool);
+ mempool_free(m, &pool->mapping_pool);
}
/*----------------------------------------------------------------*/
@@ -971,7 +971,7 @@ static void free_discard_mapping(struct dm_thin_new_mapping *m)
struct thin_c *tc = m->tc;
if (m->cell)
cell_defer_no_holder(tc, m->cell);
- mempool_free(m, tc->pool->mapping_pool);
+ mempool_free(m, &tc->pool->mapping_pool);
}
static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
@@ -999,7 +999,7 @@ static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
bio_endio(m->bio);
cell_defer_no_holder(tc, m->cell);
- mempool_free(m, tc->pool->mapping_pool);
+ mempool_free(m, &tc->pool->mapping_pool);
}
/*----------------------------------------------------------------*/
@@ -1092,7 +1092,7 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
metadata_operation_failed(pool, "dm_thin_remove_range", r);
bio_io_error(m->bio);
cell_defer_no_holder(tc, m->cell);
- mempool_free(m, pool->mapping_pool);
+ mempool_free(m, &pool->mapping_pool);
return;
}
@@ -1105,7 +1105,7 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
bio_io_error(m->bio);
cell_defer_no_holder(tc, m->cell);
- mempool_free(m, pool->mapping_pool);
+ mempool_free(m, &pool->mapping_pool);
return;
}
@@ -1150,7 +1150,7 @@ static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
bio_endio(m->bio);
cell_defer_no_holder(tc, m->cell);
- mempool_free(m, pool->mapping_pool);
+ mempool_free(m, &pool->mapping_pool);
}
static void process_prepared(struct pool *pool, struct list_head *head,
@@ -1196,7 +1196,7 @@ static int ensure_next_mapping(struct pool *pool)
if (pool->next_mapping)
return 0;
- pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
+ pool->next_mapping = mempool_alloc(&pool->mapping_pool, GFP_ATOMIC);
return pool->next_mapping ? 0 : -ENOMEM;
}
@@ -2835,8 +2835,8 @@ static void __pool_destroy(struct pool *pool)
destroy_workqueue(pool->wq);
if (pool->next_mapping)
- mempool_free(pool->next_mapping, pool->mapping_pool);
- mempool_destroy(pool->mapping_pool);
+ mempool_free(pool->next_mapping, &pool->mapping_pool);
+ mempool_exit(&pool->mapping_pool);
dm_deferred_set_destroy(pool->shared_read_ds);
dm_deferred_set_destroy(pool->all_io_ds);
kfree(pool);
@@ -2931,11 +2931,11 @@ static struct pool *pool_create(struct mapped_device *pool_md,
}
pool->next_mapping = NULL;
- pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
- _new_mapping_cache);
- if (!pool->mapping_pool) {
+ r = mempool_init_slab_pool(&pool->mapping_pool, MAPPING_POOL_SIZE,
+ _new_mapping_cache);
+ if (r) {
*error = "Error creating pool's mapping mempool";
- err_p = ERR_PTR(-ENOMEM);
+ err_p = ERR_PTR(r);
goto bad_mapping_pool;
}
@@ -2955,7 +2955,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
return pool;
bad_sort_array:
- mempool_destroy(pool->mapping_pool);
+ mempool_exit(&pool->mapping_pool);
bad_mapping_pool:
dm_deferred_set_destroy(pool->all_io_ds);
bad_all_io_ds:
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index e13f90832b6b..86405869f1af 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -309,13 +309,13 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
unsigned n;
if (!fio->rs)
- fio->rs = mempool_alloc(v->fec->rs_pool, GFP_NOIO);
+ fio->rs = mempool_alloc(&v->fec->rs_pool, GFP_NOIO);
fec_for_each_prealloc_buffer(n) {
if (fio->bufs[n])
continue;
- fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOWAIT);
+ fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOWAIT);
if (unlikely(!fio->bufs[n])) {
DMERR("failed to allocate FEC buffer");
return -ENOMEM;
@@ -327,7 +327,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
if (fio->bufs[n])
continue;
- fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOWAIT);
+ fio->bufs[n] = mempool_alloc(&v->fec->extra_pool, GFP_NOWAIT);
/* we can manage with even one buffer if necessary */
if (unlikely(!fio->bufs[n]))
break;
@@ -335,7 +335,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
fio->nbufs = n;
if (!fio->output)
- fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO);
+ fio->output = mempool_alloc(&v->fec->output_pool, GFP_NOIO);
return 0;
}
@@ -493,15 +493,15 @@ void verity_fec_finish_io(struct dm_verity_io *io)
if (!verity_fec_is_enabled(io->v))
return;
- mempool_free(fio->rs, f->rs_pool);
+ mempool_free(fio->rs, &f->rs_pool);
fec_for_each_prealloc_buffer(n)
- mempool_free(fio->bufs[n], f->prealloc_pool);
+ mempool_free(fio->bufs[n], &f->prealloc_pool);
fec_for_each_extra_buffer(fio, n)
- mempool_free(fio->bufs[n], f->extra_pool);
+ mempool_free(fio->bufs[n], &f->extra_pool);
- mempool_free(fio->output, f->output_pool);
+ mempool_free(fio->output, &f->output_pool);
}
/*
@@ -549,9 +549,9 @@ void verity_fec_dtr(struct dm_verity *v)
if (!verity_fec_is_enabled(v))
goto out;
- mempool_destroy(f->rs_pool);
- mempool_destroy(f->prealloc_pool);
- mempool_destroy(f->extra_pool);
+ mempool_exit(&f->rs_pool);
+ mempool_exit(&f->prealloc_pool);
+ mempool_exit(&f->extra_pool);
kmem_cache_destroy(f->cache);
if (f->data_bufio)
@@ -675,6 +675,7 @@ int verity_fec_ctr(struct dm_verity *v)
struct dm_verity_fec *f = v->fec;
struct dm_target *ti = v->ti;
u64 hash_blocks;
+ int ret;
if (!verity_fec_is_enabled(v)) {
verity_fec_dtr(v);
@@ -770,11 +771,11 @@ int verity_fec_ctr(struct dm_verity *v)
}
/* Preallocate an rs_control structure for each worker thread */
- f->rs_pool = mempool_create(num_online_cpus(), fec_rs_alloc,
- fec_rs_free, (void *) v);
- if (!f->rs_pool) {
+ ret = mempool_init(&f->rs_pool, num_online_cpus(), fec_rs_alloc,
+ fec_rs_free, (void *) v);
+ if (ret) {
ti->error = "Cannot allocate RS pool";
- return -ENOMEM;
+ return ret;
}
f->cache = kmem_cache_create("dm_verity_fec_buffers",
@@ -786,26 +787,26 @@ int verity_fec_ctr(struct dm_verity *v)
}
/* Preallocate DM_VERITY_FEC_BUF_PREALLOC buffers for each thread */
- f->prealloc_pool = mempool_create_slab_pool(num_online_cpus() *
- DM_VERITY_FEC_BUF_PREALLOC,
- f->cache);
- if (!f->prealloc_pool) {
+ ret = mempool_init_slab_pool(&f->prealloc_pool, num_online_cpus() *
+ DM_VERITY_FEC_BUF_PREALLOC,
+ f->cache);
+ if (ret) {
ti->error = "Cannot allocate FEC buffer prealloc pool";
- return -ENOMEM;
+ return ret;
}
- f->extra_pool = mempool_create_slab_pool(0, f->cache);
- if (!f->extra_pool) {
+ ret = mempool_init_slab_pool(&f->extra_pool, 0, f->cache);
+ if (ret) {
ti->error = "Cannot allocate FEC buffer extra pool";
- return -ENOMEM;
+ return ret;
}
/* Preallocate an output buffer for each thread */
- f->output_pool = mempool_create_kmalloc_pool(num_online_cpus(),
- 1 << v->data_dev_block_bits);
- if (!f->output_pool) {
+ ret = mempool_init_kmalloc_pool(&f->output_pool, num_online_cpus(),
+ 1 << v->data_dev_block_bits);
+ if (ret) {
ti->error = "Cannot allocate FEC output pool";
- return -ENOMEM;
+ return ret;
}
/* Reserve space for our per-bio data */
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index bb31ce87a933..6ad803b2b36c 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -46,10 +46,10 @@ struct dm_verity_fec {
sector_t hash_blocks; /* blocks covered after v->hash_start */
unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */
unsigned char rsn; /* N of RS(M, N) */
- mempool_t *rs_pool; /* mempool for fio->rs */
- mempool_t *prealloc_pool; /* mempool for preallocated buffers */
- mempool_t *extra_pool; /* mempool for extra buffers */
- mempool_t *output_pool; /* mempool for output */
+ mempool_t rs_pool; /* mempool for fio->rs */
+ mempool_t prealloc_pool; /* mempool for preallocated buffers */
+ mempool_t extra_pool; /* mempool for extra buffers */
+ mempool_t output_pool; /* mempool for output */
struct kmem_cache *cache; /* cache for buffers */
};
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index e73b0776683c..30602d15ad9a 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -57,7 +57,7 @@ struct dmz_target {
struct workqueue_struct *chunk_wq;
/* For cloned BIOs to zones */
- struct bio_set *bio_set;
+ struct bio_set bio_set;
/* For flush */
spinlock_t flush_lock;
@@ -121,7 +121,7 @@ static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone,
}
/* Partial BIO: we need to clone the BIO */
- clone = bio_clone_fast(bio, GFP_NOIO, dmz->bio_set);
+ clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set);
if (!clone)
return -ENOMEM;
@@ -779,10 +779,9 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;
/* Zone BIO */
- dmz->bio_set = bioset_create(DMZ_MIN_BIOS, 0, 0);
- if (!dmz->bio_set) {
+ ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
+ if (ret) {
ti->error = "Create BIO set failed";
- ret = -ENOMEM;
goto err_meta;
}
@@ -828,7 +827,7 @@ err_cwq:
destroy_workqueue(dmz->chunk_wq);
err_bio:
mutex_destroy(&dmz->chunk_lock);
- bioset_free(dmz->bio_set);
+ bioset_exit(&dmz->bio_set);
err_meta:
dmz_dtr_metadata(dmz->metadata);
err_dev:
@@ -858,7 +857,7 @@ static void dmz_dtr(struct dm_target *ti)
dmz_dtr_metadata(dmz->metadata);
- bioset_free(dmz->bio_set);
+ bioset_exit(&dmz->bio_set);
dmz_put_zoned_device(ti);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0a7b0107ca78..98dff36b89a3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -148,8 +148,8 @@ static int dm_numa_node = DM_NUMA_NODE;
* For mempools pre-allocation at the table loading time.
*/
struct dm_md_mempools {
- struct bio_set *bs;
- struct bio_set *io_bs;
+ struct bio_set bs;
+ struct bio_set io_bs;
};
struct table_device {
@@ -537,7 +537,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
struct dm_target_io *tio;
struct bio *clone;
- clone = bio_alloc_bioset(GFP_NOIO, 0, md->io_bs);
+ clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
if (!clone)
return NULL;
@@ -572,7 +572,7 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *t
/* the dm_target_io embedded in ci->io is available */
tio = &ci->io->tio;
} else {
- struct bio *clone = bio_alloc_bioset(gfp_mask, 0, ci->io->md->bs);
+ struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
if (!clone)
return NULL;
@@ -1583,7 +1583,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
* won't be affected by this reassignment.
*/
struct bio *b = bio_clone_bioset(bio, GFP_NOIO,
- md->queue->bio_split);
+ &md->queue->bio_split);
ci.io->orig_bio = b;
bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9);
bio_chain(b, bio);
@@ -1785,10 +1785,8 @@ static void cleanup_mapped_device(struct mapped_device *md)
destroy_workqueue(md->wq);
if (md->kworker_task)
kthread_stop(md->kworker_task);
- if (md->bs)
- bioset_free(md->bs);
- if (md->io_bs)
- bioset_free(md->io_bs);
+ bioset_exit(&md->bs);
+ bioset_exit(&md->io_bs);
if (md->dax_dev) {
kill_dax(md->dax_dev);
@@ -1965,16 +1963,10 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
* If so, reload bioset because front_pad may have changed
* because a different table was loaded.
*/
- if (md->bs) {
- bioset_free(md->bs);
- md->bs = NULL;
- }
- if (md->io_bs) {
- bioset_free(md->io_bs);
- md->io_bs = NULL;
- }
+ bioset_exit(&md->bs);
+ bioset_exit(&md->io_bs);
- } else if (md->bs) {
+ } else if (bioset_initialized(&md->bs)) {
/*
* There's no need to reload with request-based dm
* because the size of front_pad doesn't change.
@@ -1986,12 +1978,14 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
goto out;
}
- BUG_ON(!p || md->bs || md->io_bs);
+ BUG_ON(!p ||
+ bioset_initialized(&md->bs) ||
+ bioset_initialized(&md->io_bs));
md->bs = p->bs;
- p->bs = NULL;
+ memset(&p->bs, 0, sizeof(p->bs));
md->io_bs = p->io_bs;
- p->io_bs = NULL;
+ memset(&p->io_bs, 0, sizeof(p->io_bs));
out:
/* mempool bind completed, no longer need any mempools in the table */
dm_table_free_md_mempools(t);
@@ -2905,6 +2899,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
unsigned int pool_size = 0;
unsigned int front_pad, io_front_pad;
+ int ret;
if (!pools)
return NULL;
@@ -2916,10 +2911,10 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
- pools->io_bs = bioset_create(pool_size, io_front_pad, 0);
- if (!pools->io_bs)
+ ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
+ if (ret)
goto out;
- if (integrity && bioset_integrity_create(pools->io_bs, pool_size))
+ if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
goto out;
break;
case DM_TYPE_REQUEST_BASED:
@@ -2932,11 +2927,11 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
BUG();
}
- pools->bs = bioset_create(pool_size, front_pad, 0);
- if (!pools->bs)
+ ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
+ if (ret)
goto out;
- if (integrity && bioset_integrity_create(pools->bs, pool_size))
+ if (integrity && bioset_integrity_create(&pools->bs, pool_size))
goto out;
return pools;
@@ -2952,10 +2947,8 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
if (!pools)
return;
- if (pools->bs)
- bioset_free(pools->bs);
- if (pools->io_bs)
- bioset_free(pools->io_bs);
+ bioset_exit(&pools->bs);
+ bioset_exit(&pools->io_bs);
kfree(pools);
}
diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
index 38264b38420f..c2fdf899de14 100644
--- a/drivers/md/md-faulty.c
+++ b/drivers/md/md-faulty.c
@@ -214,7 +214,7 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
}
}
if (failit) {
- struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ struct bio *b = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
bio_set_dev(b, conf->rdev->bdev);
b->bi_private = bio;
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 4964323d936b..d45c697c0ebe 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -269,7 +269,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
if (unlikely(bio_end_sector(bio) > end_sector)) {
/* This bio crosses a device boundary, so we have to split it */
struct bio *split = bio_split(bio, end_sector - bio_sector,
- GFP_NOIO, mddev->bio_set);
+ GFP_NOIO, &mddev->bio_set);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 0a7e99d62c69..f71fcdb9b39c 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -80,7 +80,7 @@ static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status)
bio->bi_status = status;
bio_endio(bio);
- mempool_free(mp_bh, conf->pool);
+ mempool_free(mp_bh, &conf->pool);
}
static void multipath_end_request(struct bio *bio)
@@ -117,7 +117,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
return true;
}
- mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
+ mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
mp_bh->master_bio = bio;
mp_bh->mddev = mddev;
@@ -125,7 +125,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
mp_bh->path = multipath_map(conf);
if (mp_bh->path < 0) {
bio_io_error(bio);
- mempool_free(mp_bh, conf->pool);
+ mempool_free(mp_bh, &conf->pool);
return true;
}
multipath = conf->multipaths + mp_bh->path;
@@ -378,6 +378,7 @@ static int multipath_run (struct mddev *mddev)
struct multipath_info *disk;
struct md_rdev *rdev;
int working_disks;
+ int ret;
if (md_check_no_bitmap(mddev))
return -EINVAL;
@@ -431,9 +432,9 @@ static int multipath_run (struct mddev *mddev)
}
mddev->degraded = conf->raid_disks - working_disks;
- conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
- sizeof(struct multipath_bh));
- if (conf->pool == NULL)
+ ret = mempool_init_kmalloc_pool(&conf->pool, NR_RESERVED_BUFS,
+ sizeof(struct multipath_bh));
+ if (ret)
goto out_free_conf;
mddev->thread = md_register_thread(multipathd, mddev,
@@ -455,7 +456,7 @@ static int multipath_run (struct mddev *mddev)
return 0;
out_free_conf:
- mempool_destroy(conf->pool);
+ mempool_exit(&conf->pool);
kfree(conf->multipaths);
kfree(conf);
mddev->private = NULL;
@@ -467,7 +468,7 @@ static void multipath_free(struct mddev *mddev, void *priv)
{
struct mpconf *conf = priv;
- mempool_destroy(conf->pool);
+ mempool_exit(&conf->pool);
kfree(conf->multipaths);
kfree(conf);
}
diff --git a/drivers/md/md-multipath.h b/drivers/md/md-multipath.h
index 0adb941f485a..b3099e5fc4d7 100644
--- a/drivers/md/md-multipath.h
+++ b/drivers/md/md-multipath.h
@@ -13,7 +13,7 @@ struct mpconf {
spinlock_t device_lock;
struct list_head retry_list;
- mempool_t *pool;
+ mempool_t pool;
};
/*
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c208c01f63a5..fc692b7128bb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -193,10 +193,10 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
{
struct bio *b;
- if (!mddev || !mddev->bio_set)
+ if (!mddev || !bioset_initialized(&mddev->bio_set))
return bio_alloc(gfp_mask, nr_iovecs);
- b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
+ b = bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
if (!b)
return NULL;
return b;
@@ -205,10 +205,10 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev);
static struct bio *md_bio_alloc_sync(struct mddev *mddev)
{
- if (!mddev || !mddev->sync_set)
+ if (!mddev || !bioset_initialized(&mddev->sync_set))
return bio_alloc(GFP_NOIO, 1);
- return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set);
+ return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
}
/*
@@ -510,7 +510,10 @@ static void mddev_delayed_delete(struct work_struct *ws);
static void mddev_put(struct mddev *mddev)
{
- struct bio_set *bs = NULL, *sync_bs = NULL;
+ struct bio_set bs, sync_bs;
+
+ memset(&bs, 0, sizeof(bs));
+ memset(&sync_bs, 0, sizeof(sync_bs));
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
@@ -521,8 +524,8 @@ static void mddev_put(struct mddev *mddev)
list_del_init(&mddev->all_mddevs);
bs = mddev->bio_set;
sync_bs = mddev->sync_set;
- mddev->bio_set = NULL;
- mddev->sync_set = NULL;
+ memset(&mddev->bio_set, 0, sizeof(mddev->bio_set));
+ memset(&mddev->sync_set, 0, sizeof(mddev->sync_set));
if (mddev->gendisk) {
/* We did a probe so need to clean up. Call
* queue_work inside the spinlock so that
@@ -535,10 +538,8 @@ static void mddev_put(struct mddev *mddev)
kfree(mddev);
}
spin_unlock(&all_mddevs_lock);
- if (bs)
- bioset_free(bs);
- if (sync_bs)
- bioset_free(sync_bs);
+ bioset_exit(&bs);
+ bioset_exit(&sync_bs);
}
static void md_safemode_timeout(struct timer_list *t);
@@ -2123,7 +2124,7 @@ int md_integrity_register(struct mddev *mddev)
bdev_get_integrity(reference->bdev));
pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
- if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
+ if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
pr_err("md: failed to create integrity pool for %s\n",
mdname(mddev));
return -EINVAL;
@@ -5497,17 +5498,15 @@ int md_run(struct mddev *mddev)
sysfs_notify_dirent_safe(rdev->sysfs_state);
}
- if (mddev->bio_set == NULL) {
- mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (!mddev->bio_set)
- return -ENOMEM;
+ if (!bioset_initialized(&mddev->bio_set)) {
+ err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (err)
+ return err;
}
- if (mddev->sync_set == NULL) {
- mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (!mddev->sync_set) {
- err = -ENOMEM;
+ if (!bioset_initialized(&mddev->sync_set)) {
+ err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (err)
goto abort;
- }
}
spin_lock(&pers_lock);
@@ -5668,14 +5667,8 @@ int md_run(struct mddev *mddev)
return 0;
abort:
- if (mddev->bio_set) {
- bioset_free(mddev->bio_set);
- mddev->bio_set = NULL;
- }
- if (mddev->sync_set) {
- bioset_free(mddev->sync_set);
- mddev->sync_set = NULL;
- }
+ bioset_exit(&mddev->bio_set);
+ bioset_exit(&mddev->sync_set);
return err;
}
@@ -5888,14 +5881,8 @@ void md_stop(struct mddev *mddev)
* This is called from dm-raid
*/
__md_stop(mddev);
- if (mddev->bio_set) {
- bioset_free(mddev->bio_set);
- mddev->bio_set = NULL;
- }
- if (mddev->sync_set) {
- bioset_free(mddev->sync_set);
- mddev->sync_set = NULL;
- }
+ bioset_exit(&mddev->bio_set);
+ bioset_exit(&mddev->sync_set);
}
EXPORT_SYMBOL_GPL(md_stop);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fbc925cce810..3507cab22cb6 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -452,8 +452,8 @@ struct mddev {
struct attribute_group *to_remove;
- struct bio_set *bio_set;
- struct bio_set *sync_set; /* for sync operations like
+ struct bio_set bio_set;
+ struct bio_set sync_set; /* for sync operations like
* metadata and bitmap writes
*/
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 584c10347267..65ae47a02218 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -479,7 +479,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
if (bio_end_sector(bio) > zone->zone_end) {
struct bio *split = bio_split(bio,
zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
- mddev->bio_set);
+ &mddev->bio_set);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
@@ -582,7 +582,8 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
sector = bio_sector;
if (sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, sectors, GFP_NOIO, mddev->bio_set);
+ struct bio *split = bio_split(bio, sectors, GFP_NOIO,
+ &mddev->bio_set);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e9e3308cb0a7..bad28520719b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -221,7 +221,7 @@ static void free_r1bio(struct r1bio *r1_bio)
struct r1conf *conf = r1_bio->mddev->private;
put_all_bios(conf, r1_bio);
- mempool_free(r1_bio, conf->r1bio_pool);
+ mempool_free(r1_bio, &conf->r1bio_pool);
}
static void put_buf(struct r1bio *r1_bio)
@@ -236,7 +236,7 @@ static void put_buf(struct r1bio *r1_bio)
rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
}
- mempool_free(r1_bio, conf->r1buf_pool);
+ mempool_free(r1_bio, &conf->r1buf_pool);
lower_barrier(conf, sect);
}
@@ -1178,7 +1178,7 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio)
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
- r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+ r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO);
/* Ensure no bio records IO_BLOCKED */
memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0]));
init_r1bio(r1_bio, mddev, bio);
@@ -1268,7 +1268,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
- gfp, conf->bio_split);
+ gfp, &conf->bio_split);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
@@ -1278,7 +1278,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
r1_bio->read_disk = rdisk;
- read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
+ read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
r1_bio->bios[rdisk] = read_bio;
@@ -1439,7 +1439,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
- GFP_NOIO, conf->bio_split);
+ GFP_NOIO, &conf->bio_split);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
@@ -1479,9 +1479,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (r1_bio->behind_master_bio)
mbio = bio_clone_fast(r1_bio->behind_master_bio,
- GFP_NOIO, mddev->bio_set);
+ GFP_NOIO, &mddev->bio_set);
else
- mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (r1_bio->behind_master_bio) {
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
@@ -1657,8 +1657,7 @@ static void close_sync(struct r1conf *conf)
_allow_barrier(conf, idx);
}
- mempool_destroy(conf->r1buf_pool);
- conf->r1buf_pool = NULL;
+ mempool_exit(&conf->r1buf_pool);
}
static int raid1_spare_active(struct mddev *mddev)
@@ -2348,10 +2347,10 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
wbio = bio_clone_fast(r1_bio->behind_master_bio,
GFP_NOIO,
- mddev->bio_set);
+ &mddev->bio_set);
} else {
wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
- mddev->bio_set);
+ &mddev->bio_set);
}
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
@@ -2564,17 +2563,15 @@ static int init_resync(struct r1conf *conf)
int buffs;
buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
- BUG_ON(conf->r1buf_pool);
- conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
- conf->poolinfo);
- if (!conf->r1buf_pool)
- return -ENOMEM;
- return 0;
+ BUG_ON(mempool_initialized(&conf->r1buf_pool));
+
+ return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc,
+ r1buf_pool_free, conf->poolinfo);
}
static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
{
- struct r1bio *r1bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
+ struct r1bio *r1bio = mempool_alloc(&conf->r1buf_pool, GFP_NOIO);
struct resync_pages *rps;
struct bio *bio;
int i;
@@ -2617,7 +2614,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
int idx = sector_to_idx(sector_nr);
int page_idx = 0;
- if (!conf->r1buf_pool)
+ if (!mempool_initialized(&conf->r1buf_pool))
if (init_resync(conf))
return 0;
@@ -2953,14 +2950,13 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf->poolinfo)
goto abort;
conf->poolinfo->raid_disks = mddev->raid_disks * 2;
- conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
- r1bio_pool_free,
- conf->poolinfo);
- if (!conf->r1bio_pool)
+ err = mempool_init(&conf->r1bio_pool, NR_RAID1_BIOS, r1bio_pool_alloc,
+ r1bio_pool_free, conf->poolinfo);
+ if (err)
goto abort;
- conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
- if (!conf->bio_split)
+ err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
+ if (err)
goto abort;
conf->poolinfo->mddev = mddev;
@@ -3033,7 +3029,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
abort:
if (conf) {
- mempool_destroy(conf->r1bio_pool);
+ mempool_exit(&conf->r1bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
@@ -3041,8 +3037,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
kfree(conf->nr_waiting);
kfree(conf->nr_queued);
kfree(conf->barrier);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
+ bioset_exit(&conf->bio_split);
kfree(conf);
}
return ERR_PTR(err);
@@ -3144,7 +3139,7 @@ static void raid1_free(struct mddev *mddev, void *priv)
{
struct r1conf *conf = priv;
- mempool_destroy(conf->r1bio_pool);
+ mempool_exit(&conf->r1bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
@@ -3152,8 +3147,7 @@ static void raid1_free(struct mddev *mddev, void *priv)
kfree(conf->nr_waiting);
kfree(conf->nr_queued);
kfree(conf->barrier);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
+ bioset_exit(&conf->bio_split);
kfree(conf);
}
@@ -3199,13 +3193,17 @@ static int raid1_reshape(struct mddev *mddev)
* At the same time, we "pack" the devices so that all the missing
* devices have the higher raid_disk numbers.
*/
- mempool_t *newpool, *oldpool;
+ mempool_t newpool, oldpool;
struct pool_info *newpoolinfo;
struct raid1_info *newmirrors;
struct r1conf *conf = mddev->private;
int cnt, raid_disks;
unsigned long flags;
int d, d2;
+ int ret;
+
+ memset(&newpool, 0, sizeof(newpool));
+ memset(&oldpool, 0, sizeof(oldpool));
/* Cannot change chunk_size, layout, or level */
if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@@ -3237,17 +3235,17 @@ static int raid1_reshape(struct mddev *mddev)
newpoolinfo->mddev = mddev;
newpoolinfo->raid_disks = raid_disks * 2;
- newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
- r1bio_pool_free, newpoolinfo);
- if (!newpool) {
+ ret = mempool_init(&newpool, NR_RAID1_BIOS, r1bio_pool_alloc,
+ r1bio_pool_free, newpoolinfo);
+ if (ret) {
kfree(newpoolinfo);
- return -ENOMEM;
+ return ret;
}
newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
GFP_KERNEL);
if (!newmirrors) {
kfree(newpoolinfo);
- mempool_destroy(newpool);
+ mempool_exit(&newpool);
return -ENOMEM;
}
@@ -3287,7 +3285,7 @@ static int raid1_reshape(struct mddev *mddev)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
- mempool_destroy(oldpool);
+ mempool_exit(&oldpool);
return 0;
}
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index eb84bc68e2fd..e7ccad898736 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -118,10 +118,10 @@ struct r1conf {
* mempools - it changes when the array grows or shrinks
*/
struct pool_info *poolinfo;
- mempool_t *r1bio_pool;
- mempool_t *r1buf_pool;
+ mempool_t r1bio_pool;
+ mempool_t r1buf_pool;
- struct bio_set *bio_split;
+ struct bio_set bio_split;
/* temporary buffer to synchronous IO when attempting to repair
* a read error.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3c60774c8430..37d4b236b81b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -291,14 +291,14 @@ static void free_r10bio(struct r10bio *r10_bio)
struct r10conf *conf = r10_bio->mddev->private;
put_all_bios(conf, r10_bio);
- mempool_free(r10_bio, conf->r10bio_pool);
+ mempool_free(r10_bio, &conf->r10bio_pool);
}
static void put_buf(struct r10bio *r10_bio)
{
struct r10conf *conf = r10_bio->mddev->private;
- mempool_free(r10_bio, conf->r10buf_pool);
+ mempool_free(r10_bio, &conf->r10buf_pool);
lower_barrier(conf);
}
@@ -1204,7 +1204,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
(unsigned long long)r10_bio->sector);
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
- gfp, conf->bio_split);
+ gfp, &conf->bio_split);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
@@ -1213,7 +1213,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
slot = r10_bio->read_slot;
- read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
+ read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;
@@ -1261,7 +1261,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
} else
rdev = conf->mirrors[devnum].rdev;
- mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (replacement)
r10_bio->devs[n_copy].repl_bio = mbio;
else
@@ -1509,7 +1509,7 @@ retry_write:
if (r10_bio->sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, r10_bio->sectors,
- GFP_NOIO, conf->bio_split);
+ GFP_NOIO, &conf->bio_split);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
@@ -1533,7 +1533,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
- r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+ r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
r10_bio->master_bio = bio;
r10_bio->sectors = sectors;
@@ -1732,8 +1732,7 @@ static void close_sync(struct r10conf *conf)
wait_barrier(conf);
allow_barrier(conf);
- mempool_destroy(conf->r10buf_pool);
- conf->r10buf_pool = NULL;
+ mempool_exit(&conf->r10buf_pool);
}
static int raid10_spare_active(struct mddev *mddev)
@@ -2583,7 +2582,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
if (sectors > sect_to_write)
sectors = sect_to_write;
/* Write at 'sector' for 'sectors' */
- wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ wbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
wbio->bi_iter.bi_sector = wsector +
@@ -2816,25 +2815,25 @@ static void raid10d(struct md_thread *thread)
static int init_resync(struct r10conf *conf)
{
- int buffs;
- int i;
+ int ret, buffs, i;
buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
- BUG_ON(conf->r10buf_pool);
+ BUG_ON(mempool_initialized(&conf->r10buf_pool));
conf->have_replacement = 0;
for (i = 0; i < conf->geo.raid_disks; i++)
if (conf->mirrors[i].replacement)
conf->have_replacement = 1;
- conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
- if (!conf->r10buf_pool)
- return -ENOMEM;
+ ret = mempool_init(&conf->r10buf_pool, buffs,
+ r10buf_pool_alloc, r10buf_pool_free, conf);
+ if (ret)
+ return ret;
conf->next_resync = 0;
return 0;
}
static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
{
- struct r10bio *r10bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+ struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO);
struct rsync_pages *rp;
struct bio *bio;
int nalloc;
@@ -2945,7 +2944,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t chunk_mask = conf->geo.chunk_mask;
int page_idx = 0;
- if (!conf->r10buf_pool)
+ if (!mempool_initialized(&conf->r10buf_pool))
if (init_resync(conf))
return 0;
@@ -3699,13 +3698,13 @@ static struct r10conf *setup_conf(struct mddev *mddev)
conf->geo = geo;
conf->copies = copies;
- conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
- r10bio_pool_free, conf);
- if (!conf->r10bio_pool)
+ err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc,
+ r10bio_pool_free, conf);
+ if (err)
goto out;
- conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
- if (!conf->bio_split)
+ err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
+ if (err)
goto out;
calc_sectors(conf, mddev->dev_sectors);
@@ -3733,6 +3732,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
init_waitqueue_head(&conf->wait_barrier);
atomic_set(&conf->nr_pending, 0);
+ err = -ENOMEM;
conf->thread = md_register_thread(raid10d, mddev, "raid10");
if (!conf->thread)
goto out;
@@ -3742,11 +3742,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
out:
if (conf) {
- mempool_destroy(conf->r10bio_pool);
+ mempool_exit(&conf->r10bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
+ bioset_exit(&conf->bio_split);
kfree(conf);
}
return ERR_PTR(err);
@@ -3953,7 +3952,7 @@ static int raid10_run(struct mddev *mddev)
out_free_conf:
md_unregister_thread(&mddev->thread);
- mempool_destroy(conf->r10bio_pool);
+ mempool_exit(&conf->r10bio_pool);
safe_put_page(conf->tmppage);
kfree(conf->mirrors);
kfree(conf);
@@ -3966,13 +3965,12 @@ static void raid10_free(struct mddev *mddev, void *priv)
{
struct r10conf *conf = priv;
- mempool_destroy(conf->r10bio_pool);
+ mempool_exit(&conf->r10bio_pool);
safe_put_page(conf->tmppage);
kfree(conf->mirrors);
kfree(conf->mirrors_old);
kfree(conf->mirrors_new);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
+ bioset_exit(&conf->bio_split);
kfree(conf);
}
@@ -4543,7 +4541,7 @@ read_more:
* on all the target devices.
*/
// FIXME
- mempool_free(r10_bio, conf->r10buf_pool);
+ mempool_free(r10_bio, &conf->r10buf_pool);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return sectors_done;
}
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index e2e8840de9bf..d3eaaf3eb1bc 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -93,10 +93,10 @@ struct r10conf {
*/
wait_queue_head_t wait_barrier;
- mempool_t *r10bio_pool;
- mempool_t *r10buf_pool;
+ mempool_t r10bio_pool;
+ mempool_t r10buf_pool;
struct page *tmppage;
- struct bio_set *bio_split;
+ struct bio_set bio_split;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 3c65f52b68f5..2b775abf377b 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -125,9 +125,9 @@ struct r5l_log {
struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */
struct kmem_cache *io_kc;
- mempool_t *io_pool;
- struct bio_set *bs;
- mempool_t *meta_pool;
+ mempool_t io_pool;
+ struct bio_set bs;
+ mempool_t meta_pool;
struct md_thread *reclaim_thread;
unsigned long reclaim_target; /* number of space that need to be
@@ -579,7 +579,7 @@ static void r5l_log_endio(struct bio *bio)
md_error(log->rdev->mddev, log->rdev);
bio_put(bio);
- mempool_free(io->meta_page, log->meta_pool);
+ mempool_free(io->meta_page, &log->meta_pool);
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
@@ -748,7 +748,7 @@ static void r5l_submit_current_io(struct r5l_log *log)
static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
- struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
+ struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &log->bs);
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio_set_dev(bio, log->rdev->bdev);
@@ -780,7 +780,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
struct r5l_io_unit *io;
struct r5l_meta_block *block;
- io = mempool_alloc(log->io_pool, GFP_ATOMIC);
+ io = mempool_alloc(&log->io_pool, GFP_ATOMIC);
if (!io)
return NULL;
memset(io, 0, sizeof(*io));
@@ -791,7 +791,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
bio_list_init(&io->flush_barriers);
io->state = IO_UNIT_RUNNING;
- io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
+ io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO);
block = page_address(io->meta_page);
clear_page(block);
block->magic = cpu_to_le32(R5LOG_MAGIC);
@@ -1223,7 +1223,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
log->next_checkpoint = io->log_start;
list_del(&io->log_sibling);
- mempool_free(io, log->io_pool);
+ mempool_free(io, &log->io_pool);
r5l_run_no_mem_stripe(log);
found = true;
@@ -1647,7 +1647,7 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
{
struct page *page;
- ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, log->bs);
+ ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, &log->bs);
if (!ctx->ra_bio)
return -ENOMEM;
@@ -3066,6 +3066,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
struct request_queue *q = bdev_get_queue(rdev->bdev);
struct r5l_log *log;
char b[BDEVNAME_SIZE];
+ int ret;
pr_debug("md/raid:%s: using device %s as journal\n",
mdname(conf->mddev), bdevname(rdev->bdev, b));
@@ -3111,16 +3112,16 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
if (!log->io_kc)
goto io_kc;
- log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
- if (!log->io_pool)
+ ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc);
+ if (ret)
goto io_pool;
- log->bs = bioset_create(R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (!log->bs)
+ ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (ret)
goto io_bs;
- log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
- if (!log->meta_pool)
+ ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0);
+ if (ret)
goto out_mempool;
spin_lock_init(&log->tree_lock);
@@ -3155,11 +3156,11 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
rcu_assign_pointer(conf->log, NULL);
md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
- mempool_destroy(log->meta_pool);
+ mempool_exit(&log->meta_pool);
out_mempool:
- bioset_free(log->bs);
+ bioset_exit(&log->bs);
io_bs:
- mempool_destroy(log->io_pool);
+ mempool_exit(&log->io_pool);
io_pool:
kmem_cache_destroy(log->io_kc);
io_kc:
@@ -3178,9 +3179,9 @@ void r5l_exit_log(struct r5conf *conf)
wake_up(&conf->mddev->sb_wait);
flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
- mempool_destroy(log->meta_pool);
- bioset_free(log->bs);
- mempool_destroy(log->io_pool);
+ mempool_exit(&log->meta_pool);
+ bioset_exit(&log->bs);
+ mempool_exit(&log->io_pool);
kmem_cache_destroy(log->io_kc);
kfree(log);
}
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 42890a08375b..3a7c36326589 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -105,9 +105,9 @@ struct ppl_conf {
atomic64_t seq; /* current log write sequence number */
struct kmem_cache *io_kc;
- mempool_t *io_pool;
- struct bio_set *bs;
- struct bio_set *flush_bs;
+ mempool_t io_pool;
+ struct bio_set bs;
+ struct bio_set flush_bs;
/* used only for recovery */
int recovered_entries;
@@ -244,7 +244,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
struct ppl_header *pplhdr;
struct page *header_page;
- io = mempool_alloc(ppl_conf->io_pool, GFP_NOWAIT);
+ io = mempool_alloc(&ppl_conf->io_pool, GFP_NOWAIT);
if (!io)
return NULL;
@@ -503,7 +503,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
struct bio *prev = bio;
bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
- ppl_conf->bs);
+ &ppl_conf->bs);
bio->bi_opf = prev->bi_opf;
bio_copy_dev(bio, prev);
bio->bi_iter.bi_sector = bio_end_sector(prev);
@@ -570,7 +570,7 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io)
list_del(&io->log_sibling);
spin_unlock(&log->io_list_lock);
- mempool_free(io, ppl_conf->io_pool);
+ mempool_free(io, &ppl_conf->io_pool);
spin_lock(&ppl_conf->no_mem_stripes_lock);
if (!list_empty(&ppl_conf->no_mem_stripes)) {
@@ -642,7 +642,7 @@ static void ppl_do_flush(struct ppl_io_unit *io)
struct bio *bio;
char b[BDEVNAME_SIZE];
- bio = bio_alloc_bioset(GFP_NOIO, 0, ppl_conf->flush_bs);
+ bio = bio_alloc_bioset(GFP_NOIO, 0, &ppl_conf->flush_bs);
bio_set_dev(bio, bdev);
bio->bi_private = io;
bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
@@ -1246,11 +1246,9 @@ static void __ppl_exit_log(struct ppl_conf *ppl_conf)
kfree(ppl_conf->child_logs);
- if (ppl_conf->bs)
- bioset_free(ppl_conf->bs);
- if (ppl_conf->flush_bs)
- bioset_free(ppl_conf->flush_bs);
- mempool_destroy(ppl_conf->io_pool);
+ bioset_exit(&ppl_conf->bs);
+ bioset_exit(&ppl_conf->flush_bs);
+ mempool_exit(&ppl_conf->io_pool);
kmem_cache_destroy(ppl_conf->io_kc);
kfree(ppl_conf);
@@ -1387,24 +1385,18 @@ int ppl_init_log(struct r5conf *conf)
goto err;
}
- ppl_conf->io_pool = mempool_create(conf->raid_disks, ppl_io_pool_alloc,
- ppl_io_pool_free, ppl_conf->io_kc);
- if (!ppl_conf->io_pool) {
- ret = -ENOMEM;
+ ret = mempool_init(&ppl_conf->io_pool, conf->raid_disks, ppl_io_pool_alloc,
+ ppl_io_pool_free, ppl_conf->io_kc);
+ if (ret)
goto err;
- }
- ppl_conf->bs = bioset_create(conf->raid_disks, 0, BIOSET_NEED_BVECS);
- if (!ppl_conf->bs) {
- ret = -ENOMEM;
+ ret = bioset_init(&ppl_conf->bs, conf->raid_disks, 0, BIOSET_NEED_BVECS);
+ if (ret)
goto err;
- }
- ppl_conf->flush_bs = bioset_create(conf->raid_disks, 0, 0);
- if (!ppl_conf->flush_bs) {
- ret = -ENOMEM;
+ ret = bioset_init(&ppl_conf->flush_bs, conf->raid_disks, 0, 0);
+ if (ret)
goto err;
- }
ppl_conf->count = conf->raid_disks;
ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index be117d0a65a8..a2e64989b01f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5192,7 +5192,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
/*
* use bio_clone_fast to make a copy of the bio
*/
- align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set);
+ align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set);
if (!align_bi)
return 0;
/*
@@ -5277,7 +5277,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
if (sectors < bio_sectors(raid_bio)) {
struct r5conf *conf = mddev->private;
- split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
+ split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
bio_chain(split, raid_bio);
generic_make_request(raid_bio);
raid_bio = split;
@@ -6773,8 +6773,7 @@ static void free_conf(struct r5conf *conf)
if (conf->disks[i].extra_page)
put_page(conf->disks[i].extra_page);
kfree(conf->disks);
- if (conf->bio_split)
- bioset_free(conf->bio_split);
+ bioset_exit(&conf->bio_split);
kfree(conf->stripe_hashtbl);
kfree(conf->pending_data);
kfree(conf);
@@ -6853,6 +6852,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
int i;
int group_cnt, worker_cnt_per_group;
struct r5worker_group *new_group;
+ int ret;
if (mddev->new_level != 5
&& mddev->new_level != 4
@@ -6950,8 +6950,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
}
- conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
- if (!conf->bio_split)
+ ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
+ if (ret)
goto abort;
conf->mddev = mddev;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 3f8da26032ac..72e75ba6abf0 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -669,7 +669,7 @@ struct r5conf {
int pool_size; /* number of disks in stripeheads in pool */
spinlock_t device_lock;
struct disk_info *disks;
- struct bio_set *bio_split;
+ struct bio_set bio_split;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 57b13dfbd21e..a15181fa45f7 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -2094,14 +2094,9 @@ static const struct block_device_operations msb_bdops = {
static int msb_init_disk(struct memstick_dev *card)
{
struct msb_data *msb = memstick_get_drvdata(card);
- struct memstick_host *host = card->host;
int rc;
- u64 limit = BLK_BOUNCE_HIGH;
unsigned long capacity;
- if (host->dev.dma_mask && *(host->dev.dma_mask))
- limit = *(host->dev.dma_mask);
-
mutex_lock(&msb_disk_lock);
msb->disk_id = idr_alloc(&msb_disk_idr, card, 0, 256, GFP_KERNEL);
mutex_unlock(&msb_disk_lock);
@@ -2123,7 +2118,6 @@ static int msb_init_disk(struct memstick_dev *card)
msb->queue->queuedata = card;
- blk_queue_bounce_limit(msb->queue, limit);
blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES);
blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS);
blk_queue_max_segment_size(msb->queue,
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 8897962781bb..5ee932631fae 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -1170,17 +1170,12 @@ static int mspro_block_init_card(struct memstick_dev *card)
static int mspro_block_init_disk(struct memstick_dev *card)
{
struct mspro_block_data *msb = memstick_get_drvdata(card);
- struct memstick_host *host = card->host;
struct mspro_devinfo *dev_info = NULL;
struct mspro_sys_info *sys_info = NULL;
struct mspro_sys_attr *s_attr = NULL;
int rc, disk_id;
- u64 limit = BLK_BOUNCE_HIGH;
unsigned long capacity;
- if (host->dev.dma_mask && *(host->dev.dma_mask))
- limit = *(host->dev.dma_mask);
-
for (rc = 0; msb->attr_group.attrs[rc]; ++rc) {
s_attr = mspro_from_sysfs_attr(msb->attr_group.attrs[rc]);
@@ -1219,7 +1214,6 @@ static int mspro_block_init_disk(struct memstick_dev *card)
msb->queue->queuedata = card;
- blk_queue_bounce_limit(msb->queue, limit);
blk_queue_max_hw_sectors(msb->queue, MSPRO_BLOCK_MAX_PAGES);
blk_queue_max_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS);
blk_queue_max_segment_size(msb->queue,
diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c
index 86503f60468f..19a5aa70ecda 100644
--- a/drivers/message/fusion/mptsas.c
+++ b/drivers/message/fusion/mptsas.c
@@ -1929,7 +1929,7 @@ static enum blk_eh_timer_return mptsas_eh_timed_out(struct scsi_cmnd *sc)
MPT_SCSI_HOST *hd;
MPT_ADAPTER *ioc;
VirtDevice *vdevice;
- enum blk_eh_timer_return rc = BLK_EH_NOT_HANDLED;
+ enum blk_eh_timer_return rc = BLK_EH_DONE;
hd = shost_priv(sc->device->host);
if (hd == NULL) {
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 38a7586b00cc..d89e17829527 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -244,7 +244,7 @@ static ssize_t power_ro_lock_store(struct device *dev,
mq = &md->queue;
/* Dispatch locking to the block layer */
- req = blk_get_request(mq->queue, REQ_OP_DRV_OUT, __GFP_RECLAIM);
+ req = blk_get_request(mq->queue, REQ_OP_DRV_OUT, 0);
if (IS_ERR(req)) {
count = PTR_ERR(req);
goto out_put;
@@ -650,8 +650,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md,
*/
mq = &md->queue;
req = blk_get_request(mq->queue,
- idata->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN,
- __GFP_RECLAIM);
+ idata->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto cmd_done;
@@ -721,8 +720,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md,
*/
mq = &md->queue;
req = blk_get_request(mq->queue,
- idata[0]->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN,
- __GFP_RECLAIM);
+ idata[0]->ic.write_flag ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto cmd_err;
@@ -2750,7 +2748,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val)
int ret;
/* Ask the block layer about the card status */
- req = blk_get_request(mq->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+ req = blk_get_request(mq->queue, REQ_OP_DRV_IN, 0);
if (IS_ERR(req))
return PTR_ERR(req);
req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS;
@@ -2786,7 +2784,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp)
return -ENOMEM;
/* Ask the block layer for the EXT CSD */
- req = blk_get_request(mq->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+ req = blk_get_request(mq->queue, REQ_OP_DRV_IN, 0);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out_free;
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 56e9a803db21..648eb6743ed5 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -111,8 +111,9 @@ static enum blk_eh_timer_return mmc_cqe_timed_out(struct request *req)
__mmc_cqe_recovery_notifier(mq);
return BLK_EH_RESET_TIMER;
}
- /* No timeout */
- return BLK_EH_HANDLED;
+ /* No timeout (XXX: huh? comment doesn't make much sense) */
+ blk_mq_complete_request(req);
+ return BLK_EH_DONE;
default:
/* Timeout is handled by mmc core */
return BLK_EH_RESET_TIMER;
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 16ae4ae8e8f9..29c0bfd74e8a 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -82,7 +82,6 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr,
block = blk_rq_pos(req) << 9 >> tr->blkshift;
nsect = blk_rq_cur_bytes(req) >> tr->blkshift;
- buf = bio_data(req->bio);
if (req_op(req) == REQ_OP_FLUSH) {
if (tr->flush(dev))
@@ -100,9 +99,14 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr,
return BLK_STS_IOERR;
return BLK_STS_OK;
case REQ_OP_READ:
- for (; nsect > 0; nsect--, block++, buf += tr->blksize)
- if (tr->readsect(dev, block, buf))
+ buf = kmap(bio_page(req->bio)) + bio_offset(req->bio);
+ for (; nsect > 0; nsect--, block++, buf += tr->blksize) {
+ if (tr->readsect(dev, block, buf)) {
+ kunmap(bio_page(req->bio));
return BLK_STS_IOERR;
+ }
+ }
+ kunmap(bio_page(req->bio));
rq_flush_dcache_pages(req);
return BLK_STS_OK;
case REQ_OP_WRITE:
@@ -110,9 +114,14 @@ static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr,
return BLK_STS_IOERR;
rq_flush_dcache_pages(req);
- for (; nsect > 0; nsect--, block++, buf += tr->blksize)
- if (tr->writesect(dev, block, buf))
+ buf = kmap(bio_page(req->bio)) + bio_offset(req->bio);
+ for (; nsect > 0; nsect--, block++, buf += tr->blksize) {
+ if (tr->writesect(dev, block, buf)) {
+ kunmap(bio_page(req->bio));
return BLK_STS_IOERR;
+ }
+ }
+ kunmap(bio_page(req->bio));
return BLK_STS_OK;
default:
return BLK_STS_IOERR;
@@ -418,7 +427,6 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
new->rq->queuedata = new;
blk_queue_logical_block_size(new->rq, tr->blksize);
- blk_queue_bounce_limit(new->rq, BLK_BOUNCE_HIGH);
blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b9ca782fe82d..04a20da76786 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -101,6 +101,15 @@ static void nvme_ns_remove(struct nvme_ns *ns);
static int nvme_revalidate_disk(struct gendisk *disk);
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
+static void nvme_queue_scan(struct nvme_ctrl *ctrl)
+{
+ /*
+ * Only new queue scan work when admin and IO queues are both alive
+ */
+ if (ctrl->state == NVME_CTRL_LIVE)
+ queue_work(nvme_wq, &ctrl->scan_work);
+}
+
int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
@@ -244,9 +253,6 @@ EXPORT_SYMBOL_GPL(nvme_complete_rq);
void nvme_cancel_request(struct request *req, void *data, bool reserved)
{
- if (!blk_mq_request_started(req))
- return;
-
dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
"Cancelling I/O %d", req->tag);
@@ -1033,6 +1039,21 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
}
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
+#define NVME_AEN_SUPPORTED \
+ (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT)
+
+static void nvme_enable_aen(struct nvme_ctrl *ctrl)
+{
+ u32 result;
+ int status;
+
+ status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT,
+ ctrl->oaes & NVME_AEN_SUPPORTED, NULL, 0, &result);
+ if (status)
+ dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
+ ctrl->oaes & NVME_AEN_SUPPORTED);
+}
+
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
struct nvme_user_io io;
@@ -1351,13 +1372,19 @@ static void nvme_set_chunk_size(struct nvme_ns *ns)
blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
}
-static void nvme_config_discard(struct nvme_ctrl *ctrl,
- unsigned stream_alignment, struct request_queue *queue)
+static void nvme_config_discard(struct nvme_ns *ns)
{
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ struct request_queue *queue = ns->queue;
u32 size = queue_logical_block_size(queue);
- if (stream_alignment)
- size *= stream_alignment;
+ if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) {
+ blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue);
+ return;
+ }
+
+ if (ctrl->nr_streams && ns->sws && ns->sgs)
+ size *= ns->sws * ns->sgs;
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
@@ -1365,9 +1392,12 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl,
queue->limits.discard_alignment = 0;
queue->limits.discard_granularity = size;
+ /* If discard is already enabled, don't reset queue limits */
+ if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue))
+ return;
+
blk_queue_max_discard_sectors(queue, UINT_MAX);
blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
- blk_queue_flag_set(QUEUE_FLAG_DISCARD, queue);
if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
@@ -1411,10 +1441,6 @@ static void nvme_update_disk_info(struct gendisk *disk,
{
sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
unsigned short bs = 1 << ns->lba_shift;
- unsigned stream_alignment = 0;
-
- if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
- stream_alignment = ns->sws * ns->sgs;
blk_mq_freeze_queue(disk->queue);
blk_integrity_unregister(disk);
@@ -1428,10 +1454,9 @@ static void nvme_update_disk_info(struct gendisk *disk,
nvme_init_integrity(disk, ns->ms, ns->pi_type);
if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk))
capacity = 0;
- set_capacity(disk, capacity);
- if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
- nvme_config_discard(ns->ctrl, stream_alignment, disk->queue);
+ set_capacity(disk, capacity);
+ nvme_config_discard(ns);
blk_mq_unfreeze_queue(disk->queue);
}
@@ -1577,7 +1602,7 @@ static int nvme_pr_reserve(struct block_device *bdev, u64 key,
static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
enum pr_type type, bool abort)
{
- u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1;
+ u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
}
@@ -1589,7 +1614,7 @@ static int nvme_pr_clear(struct block_device *bdev, u64 key)
static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
- u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0;
+ u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
}
@@ -2183,7 +2208,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
* Verify that the subsystem actually supports multiple
* controllers, else bail out.
*/
- if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
+ if (!ctrl->opts->discovery_nqn &&
+ nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
dev_err(ctrl->device,
"ignoring ctrl due to duplicate subnqn (%s).\n",
found->subnqn);
@@ -2314,7 +2340,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
ret = nvme_get_effects_log(ctrl);
if (ret < 0)
- return ret;
+ goto out_free;
}
if (!ctrl->identified) {
@@ -2345,6 +2371,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->oacs = le16_to_cpu(id->oacs);
ctrl->oncs = le16_to_cpup(&id->oncs);
+ ctrl->oaes = le32_to_cpu(id->oaes);
atomic_set(&ctrl->abort_limit, id->acl + 1);
ctrl->vwc = id->vwc;
ctrl->cntlid = le16_to_cpup(&id->cntlid);
@@ -3170,6 +3197,42 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
nvme_remove_invalid_namespaces(ctrl, nn);
}
+static bool nvme_scan_changed_ns_log(struct nvme_ctrl *ctrl)
+{
+ size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
+ __le32 *log;
+ int error, i;
+ bool ret = false;
+
+ log = kzalloc(log_size, GFP_KERNEL);
+ if (!log)
+ return false;
+
+ error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size);
+ if (error) {
+ dev_warn(ctrl->device,
+ "reading changed ns log failed: %d\n", error);
+ goto out_free_log;
+ }
+
+ if (log[0] == cpu_to_le32(0xffffffff))
+ goto out_free_log;
+
+ for (i = 0; i < NVME_MAX_CHANGED_NAMESPACES; i++) {
+ u32 nsid = le32_to_cpu(log[i]);
+
+ if (nsid == 0)
+ break;
+ dev_info(ctrl->device, "rescanning namespace %d.\n", nsid);
+ nvme_validate_ns(ctrl, nsid);
+ }
+ ret = true;
+
+out_free_log:
+ kfree(log);
+ return ret;
+}
+
static void nvme_scan_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl =
@@ -3182,6 +3245,12 @@ static void nvme_scan_work(struct work_struct *work)
WARN_ON_ONCE(!ctrl->tagset);
+ if (test_and_clear_bit(EVENT_NS_CHANGED, &ctrl->events)) {
+ if (nvme_scan_changed_ns_log(ctrl))
+ goto out_sort_namespaces;
+ dev_info(ctrl->device, "rescanning namespaces.\n");
+ }
+
if (nvme_identify_ctrl(ctrl, &id))
return;
@@ -3189,25 +3258,16 @@ static void nvme_scan_work(struct work_struct *work)
if (ctrl->vs >= NVME_VS(1, 1, 0) &&
!(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
if (!nvme_scan_ns_list(ctrl, nn))
- goto done;
+ goto out_free_id;
}
nvme_scan_ns_sequential(ctrl, nn);
- done:
+out_free_id:
+ kfree(id);
+out_sort_namespaces:
down_write(&ctrl->namespaces_rwsem);
list_sort(NULL, &ctrl->namespaces, ns_cmp);
up_write(&ctrl->namespaces_rwsem);
- kfree(id);
-}
-
-void nvme_queue_scan(struct nvme_ctrl *ctrl)
-{
- /*
- * Only new queue scan work when admin and IO queues are both alive
- */
- if (ctrl->state == NVME_CTRL_LIVE)
- queue_work(nvme_wq, &ctrl->scan_work);
}
-EXPORT_SYMBOL_GPL(nvme_queue_scan);
/*
* This function iterates the namespace list unlocked to allow recovery from
@@ -3322,8 +3382,23 @@ static void nvme_fw_act_work(struct work_struct *work)
nvme_get_fw_slot_info(ctrl);
}
+static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
+{
+ switch ((result & 0xff00) >> 8) {
+ case NVME_AER_NOTICE_NS_CHANGED:
+ set_bit(EVENT_NS_CHANGED, &ctrl->events);
+ nvme_queue_scan(ctrl);
+ break;
+ case NVME_AER_NOTICE_FW_ACT_STARTING:
+ queue_work(nvme_wq, &ctrl->fw_act_work);
+ break;
+ default:
+ dev_warn(ctrl->device, "async event result %08x\n", result);
+ }
+}
+
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
- union nvme_result *res)
+ volatile union nvme_result *res)
{
u32 result = le32_to_cpu(res->u32);
@@ -3331,6 +3406,9 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
return;
switch (result & 0x7) {
+ case NVME_AER_NOTICE:
+ nvme_handle_aen_notice(ctrl, result);
+ break;
case NVME_AER_ERROR:
case NVME_AER_SMART:
case NVME_AER_CSS:
@@ -3340,18 +3418,6 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
default:
break;
}
-
- switch (result & 0xff07) {
- case NVME_AER_NOTICE_NS_CHANGED:
- dev_info(ctrl->device, "rescanning\n");
- nvme_queue_scan(ctrl);
- break;
- case NVME_AER_NOTICE_FW_ACT_STARTING:
- queue_work(nvme_wq, &ctrl->fw_act_work);
- break;
- default:
- dev_warn(ctrl->device, "async event result %08x\n", result);
- }
queue_work(nvme_wq, &ctrl->async_event_work);
}
EXPORT_SYMBOL_GPL(nvme_complete_async_event);
@@ -3374,6 +3440,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
if (ctrl->queue_count > 1) {
nvme_queue_scan(ctrl);
+ nvme_enable_aen(ctrl);
queue_work(nvme_wq, &ctrl->async_event_work);
nvme_start_queues(ctrl);
}
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 7ae732a77fe8..5f5f7067c41d 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -57,7 +57,7 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn)
goto out_unlock;
kref_init(&host->ref);
- memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE);
+ strlcpy(host->nqn, hostnqn, NVMF_NQN_SIZE);
list_add_tail(&host->list, &nvmf_hosts);
out_unlock:
@@ -545,71 +545,54 @@ blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq,
return BLK_STS_OK;
switch (ctrl->state) {
- case NVME_CTRL_DELETING:
- goto reject_io;
-
case NVME_CTRL_NEW:
case NVME_CTRL_CONNECTING:
+ case NVME_CTRL_DELETING:
+ /*
+ * This is the case of starting a new or deleting an association
+ * but connectivity was lost before it was fully created or torn
+ * down. We need to error the commands used to initialize the
+ * controller so the reconnect can go into a retry attempt. The
+ * commands should all be marked REQ_FAILFAST_DRIVER, which will
+ * hit the reject path below. Anything else will be queued while
+ * the state settles.
+ */
if (!is_connected)
- /*
- * This is the case of starting a new
- * association but connectivity was lost
- * before it was fully created. We need to
- * error the commands used to initialize the
- * controller so the reconnect can go into a
- * retry attempt. The commands should all be
- * marked REQ_FAILFAST_DRIVER, which will hit
- * the reject path below. Anything else will
- * be queued while the state settles.
- */
- goto reject_or_queue_io;
-
- if ((queue_live &&
- !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) ||
- (!queue_live && blk_rq_is_passthrough(rq) &&
- cmd->common.opcode == nvme_fabrics_command &&
- cmd->fabrics.fctype == nvme_fabrics_type_connect))
- /*
- * If queue is live, allow only commands that
- * are internally generated pass through. These
- * are commands on the admin queue to initialize
- * the controller. This will reject any ioctl
- * admin cmds received while initializing.
- *
- * If the queue is not live, allow only a
- * connect command. This will reject any ioctl
- * admin cmd as well as initialization commands
- * if the controller reverted the queue to non-live.
- */
+ break;
+
+ /*
+ * If queue is live, allow only commands that are internally
+ * generated pass through. These are commands on the admin
+ * queue to initialize the controller. This will reject any
+ * ioctl admin cmds received while initializing.
+ */
+ if (queue_live && !(nvme_req(rq)->flags & NVME_REQ_USERCMD))
return BLK_STS_OK;
/*
- * fall-thru to the reject_or_queue_io clause
+ * If the queue is not live, allow only a connect command. This
+ * will reject any ioctl admin cmd as well as initialization
+ * commands if the controller reverted the queue to non-live.
*/
+ if (!queue_live && blk_rq_is_passthrough(rq) &&
+ cmd->common.opcode == nvme_fabrics_command &&
+ cmd->fabrics.fctype == nvme_fabrics_type_connect)
+ return BLK_STS_OK;
break;
-
- /* these cases fall-thru
- * case NVME_CTRL_LIVE:
- * case NVME_CTRL_RESETTING:
- */
default:
break;
}
-reject_or_queue_io:
/*
- * Any other new io is something we're not in a state to send
- * to the device. Default action is to busy it and retry it
- * after the controller state is recovered. However, anything
- * marked for failfast or nvme multipath is immediately failed.
- * Note: commands used to initialize the controller will be
- * marked for failfast.
+ * Any other new io is something we're not in a state to send to the
+ * device. Default action is to busy it and retry it after the
+ * controller state is recovered. However, anything marked for failfast
+ * or nvme multipath is immediately failed. Note: commands used to
+ * initialize the controller will be marked for failfast.
* Note: nvme cli/ioctl commands are marked for failfast.
*/
if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
return BLK_STS_RESOURCE;
-
-reject_io:
nvme_req(rq)->status = NVME_SC_ABORT_REQ;
return BLK_STS_IOERR;
}
@@ -689,10 +672,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
opts->discovery_nqn =
!(strcmp(opts->subsysnqn,
NVME_DISC_SUBSYS_NAME));
- if (opts->discovery_nqn) {
- opts->kato = 0;
- opts->nr_io_queues = 0;
- }
break;
case NVMF_OPT_TRADDR:
p = match_strdup(args);
@@ -851,6 +830,11 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
}
+ if (opts->discovery_nqn) {
+ opts->kato = 0;
+ opts->nr_io_queues = 0;
+ opts->duplicate_connect = true;
+ }
if (ctrl_loss_tmo < 0)
opts->max_reconnects = -1;
else
@@ -983,16 +967,6 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
goto out_module_put;
}
- if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) {
- dev_warn(ctrl->device,
- "controller returned incorrect NQN: \"%s\".\n",
- ctrl->subsys->subnqn);
- module_put(ops->module);
- up_read(&nvmf_transports_rwsem);
- nvme_delete_ctrl_sync(ctrl);
- return ERR_PTR(-EINVAL);
- }
-
module_put(ops->module);
up_read(&nvmf_transports_rwsem);
return ctrl;
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index ef46c915b7b5..0cf0460a5c92 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -139,7 +139,9 @@ static inline bool
nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl,
struct nvmf_ctrl_options *opts)
{
- if (strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) ||
+ if (ctrl->state == NVME_CTRL_DELETING ||
+ ctrl->state == NVME_CTRL_DEAD ||
+ strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) ||
strcmp(opts->host->nqn, ctrl->opts->host->nqn) ||
memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t)))
return false;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 6cb26bcf6ec0..0bad65803271 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1686,16 +1686,6 @@ done:
goto check_error;
}
- /*
- * Force failures of commands if we're killing the controller
- * or have an error on a command used to create an new association
- */
- if (status &&
- (blk_queue_dying(rq->q) ||
- ctrl->ctrl.state == NVME_CTRL_NEW ||
- ctrl->ctrl.state == NVME_CTRL_CONNECTING))
- status |= cpu_to_le16(NVME_SC_DNR << 1);
-
__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
nvme_end_request(rq, status, result);
@@ -2403,9 +2393,6 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
- if (!blk_mq_request_started(req))
- return;
-
__nvme_fc_abort_op(ctrl, op);
}
@@ -3284,6 +3271,8 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
}
spin_unlock_irqrestore(&nvme_fc_lock, flags);
+ pr_warn("%s: %s - %s combination not found\n",
+ __func__, opts->traddr, opts->host_traddr);
return ERR_PTR(-ENOENT);
}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 17d2f7cf3fed..de24fe77c80b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -22,6 +22,7 @@
#include <linux/lightnvm.h>
#include <linux/sed-opal.h>
#include <linux/fault-inject.h>
+#include <linux/rcupdate.h>
extern unsigned int nvme_io_timeout;
#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ)
@@ -180,6 +181,7 @@ struct nvme_ctrl {
u16 kas;
u8 npss;
u8 apsta;
+ u32 oaes;
u32 aen_result;
unsigned int shutdown_timeout;
unsigned int kato;
@@ -192,6 +194,8 @@ struct nvme_ctrl {
struct delayed_work ka_work;
struct nvme_command ka_cmd;
struct work_struct fw_act_work;
+#define EVENT_NS_CHANGED (1 << 0)
+ unsigned long events;
/* Power saving configuration */
u64 ps_max_latency_us;
@@ -398,14 +402,13 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl);
void nvme_put_ctrl(struct nvme_ctrl *ctrl);
int nvme_init_identify(struct nvme_ctrl *ctrl);
-void nvme_queue_scan(struct nvme_ctrl *ctrl);
void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
bool send);
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
- union nvme_result *res);
+ volatile union nvme_result *res);
void nvme_stop_queues(struct nvme_ctrl *ctrl);
void nvme_start_queues(struct nvme_ctrl *ctrl);
@@ -454,7 +457,7 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
struct nvme_ns_head *head = ns->head;
- if (head && ns == srcu_dereference(head->current_path, &head->srcu))
+ if (head && ns == rcu_access_pointer(head->current_path))
rcu_assign_pointer(head->current_path, NULL);
}
struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 17a0190bd88f..e526437bacbf 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -13,6 +13,7 @@
*/
#include <linux/aer.h>
+#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
@@ -68,7 +69,6 @@ MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
struct nvme_dev;
struct nvme_queue;
-static void nvme_process_cq(struct nvme_queue *nvmeq);
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
/*
@@ -147,9 +147,10 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
struct nvme_queue {
struct device *q_dmadev;
struct nvme_dev *dev;
- spinlock_t q_lock;
+ spinlock_t sq_lock;
struct nvme_command *sq_cmds;
struct nvme_command __iomem *sq_cmds_io;
+ spinlock_t cq_lock ____cacheline_aligned_in_smp;
volatile struct nvme_completion *cqes;
struct blk_mq_tags **tags;
dma_addr_t sq_dma_addr;
@@ -159,9 +160,9 @@ struct nvme_queue {
s16 cq_vector;
u16 sq_tail;
u16 cq_head;
+ u16 last_cq_head;
u16 qid;
u8 cq_phase;
- u8 cqe_seen;
u32 *dbbuf_sq_db;
u32 *dbbuf_cq_db;
u32 *dbbuf_sq_ei;
@@ -420,28 +421,25 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
}
/**
- * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
+ * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
* @nvmeq: The queue to use
* @cmd: The command to send
- *
- * Safe to use from interrupt context
*/
-static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
- struct nvme_command *cmd)
+static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
- u16 tail = nvmeq->sq_tail;
-
+ spin_lock(&nvmeq->sq_lock);
if (nvmeq->sq_cmds_io)
- memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
+ memcpy_toio(&nvmeq->sq_cmds_io[nvmeq->sq_tail], cmd,
+ sizeof(*cmd));
else
- memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+ memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
- if (++tail == nvmeq->q_depth)
- tail = 0;
- if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db,
- nvmeq->dbbuf_sq_ei))
- writel(tail, nvmeq->q_db);
- nvmeq->sq_tail = tail;
+ if (++nvmeq->sq_tail == nvmeq->q_depth)
+ nvmeq->sq_tail = 0;
+ if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
+ nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
+ writel(nvmeq->sq_tail, nvmeq->q_db);
+ spin_unlock(&nvmeq->sq_lock);
}
static void **nvme_pci_iod_list(struct request *req)
@@ -872,6 +870,13 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
struct nvme_command cmnd;
blk_status_t ret;
+ /*
+ * We should not need to do this, but we're still using this to
+ * ensure we can drain requests on a dying queue.
+ */
+ if (unlikely(nvmeq->cq_vector < 0))
+ return BLK_STS_IOERR;
+
ret = nvme_setup_cmd(ns, req, &cmnd);
if (ret)
return ret;
@@ -887,16 +892,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
}
blk_mq_start_request(req);
-
- spin_lock_irq(&nvmeq->q_lock);
- if (unlikely(nvmeq->cq_vector < 0)) {
- ret = BLK_STS_IOERR;
- spin_unlock_irq(&nvmeq->q_lock);
- goto out_cleanup_iod;
- }
- __nvme_submit_cmd(nvmeq, &cmnd);
- nvme_process_cq(nvmeq);
- spin_unlock_irq(&nvmeq->q_lock);
+ nvme_submit_cmd(nvmeq, &cmnd);
return BLK_STS_OK;
out_cleanup_iod:
nvme_free_iod(dev, req);
@@ -914,10 +910,10 @@ static void nvme_pci_complete_rq(struct request *req)
}
/* We read the CQE phase first to check if the rest of the entry is valid */
-static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head,
- u16 phase)
+static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
{
- return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase;
+ return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
+ nvmeq->cq_phase;
}
static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
@@ -931,9 +927,9 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
}
}
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
- struct nvme_completion *cqe)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
+ volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
struct request *req;
if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
@@ -956,83 +952,87 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
return;
}
- nvmeq->cqe_seen = 1;
req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
nvme_end_request(req, cqe->status, cqe->result);
}
-static inline bool nvme_read_cqe(struct nvme_queue *nvmeq,
- struct nvme_completion *cqe)
+static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end)
{
- if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
- *cqe = nvmeq->cqes[nvmeq->cq_head];
+ while (start != end) {
+ nvme_handle_cqe(nvmeq, start);
+ if (++start == nvmeq->q_depth)
+ start = 0;
+ }
+}
- if (++nvmeq->cq_head == nvmeq->q_depth) {
- nvmeq->cq_head = 0;
- nvmeq->cq_phase = !nvmeq->cq_phase;
- }
- return true;
+static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
+{
+ if (++nvmeq->cq_head == nvmeq->q_depth) {
+ nvmeq->cq_head = 0;
+ nvmeq->cq_phase = !nvmeq->cq_phase;
}
- return false;
}
-static void nvme_process_cq(struct nvme_queue *nvmeq)
+static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
+ u16 *end, int tag)
{
- struct nvme_completion cqe;
- int consumed = 0;
+ bool found = false;
- while (nvme_read_cqe(nvmeq, &cqe)) {
- nvme_handle_cqe(nvmeq, &cqe);
- consumed++;
+ *start = nvmeq->cq_head;
+ while (!found && nvme_cqe_pending(nvmeq)) {
+ if (nvmeq->cqes[nvmeq->cq_head].command_id == tag)
+ found = true;
+ nvme_update_cq_head(nvmeq);
}
+ *end = nvmeq->cq_head;
- if (consumed)
+ if (*start != *end)
nvme_ring_cq_doorbell(nvmeq);
+ return found;
}
static irqreturn_t nvme_irq(int irq, void *data)
{
- irqreturn_t result;
struct nvme_queue *nvmeq = data;
- spin_lock(&nvmeq->q_lock);
- nvme_process_cq(nvmeq);
- result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
- nvmeq->cqe_seen = 0;
- spin_unlock(&nvmeq->q_lock);
- return result;
+ irqreturn_t ret = IRQ_NONE;
+ u16 start, end;
+
+ spin_lock(&nvmeq->cq_lock);
+ if (nvmeq->cq_head != nvmeq->last_cq_head)
+ ret = IRQ_HANDLED;
+ nvme_process_cq(nvmeq, &start, &end, -1);
+ nvmeq->last_cq_head = nvmeq->cq_head;
+ spin_unlock(&nvmeq->cq_lock);
+
+ if (start != end) {
+ nvme_complete_cqes(nvmeq, start, end);
+ return IRQ_HANDLED;
+ }
+
+ return ret;
}
static irqreturn_t nvme_irq_check(int irq, void *data)
{
struct nvme_queue *nvmeq = data;
- if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
+ if (nvme_cqe_pending(nvmeq))
return IRQ_WAKE_THREAD;
return IRQ_NONE;
}
static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
{
- struct nvme_completion cqe;
- int found = 0, consumed = 0;
+ u16 start, end;
+ bool found;
- if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
+ if (!nvme_cqe_pending(nvmeq))
return 0;
- spin_lock_irq(&nvmeq->q_lock);
- while (nvme_read_cqe(nvmeq, &cqe)) {
- nvme_handle_cqe(nvmeq, &cqe);
- consumed++;
-
- if (tag == cqe.command_id) {
- found = 1;
- break;
- }
- }
-
- if (consumed)
- nvme_ring_cq_doorbell(nvmeq);
- spin_unlock_irq(&nvmeq->q_lock);
+ spin_lock_irq(&nvmeq->cq_lock);
+ found = nvme_process_cq(nvmeq, &start, &end, tag);
+ spin_unlock_irq(&nvmeq->cq_lock);
+ nvme_complete_cqes(nvmeq, start, end);
return found;
}
@@ -1052,10 +1052,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
memset(&c, 0, sizeof(c));
c.common.opcode = nvme_admin_async_event;
c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
-
- spin_lock_irq(&nvmeq->q_lock);
- __nvme_submit_cmd(nvmeq, &c);
- spin_unlock_irq(&nvmeq->q_lock);
+ nvme_submit_cmd(nvmeq, &c);
}
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
@@ -1070,7 +1067,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
}
static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
- struct nvme_queue *nvmeq)
+ struct nvme_queue *nvmeq, s16 vector)
{
struct nvme_command c;
int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
@@ -1085,7 +1082,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
c.create_cq.cqid = cpu_to_le16(qid);
c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
c.create_cq.cq_flags = cpu_to_le16(flags);
- c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
+ c.create_cq.irq_vector = cpu_to_le16(vector);
return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}
@@ -1208,7 +1205,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
nvme_warn_reset(dev, csts);
nvme_dev_disable(dev, false);
nvme_reset_ctrl(&dev->ctrl);
- return BLK_EH_HANDLED;
+ return BLK_EH_DONE;
}
/*
@@ -1218,24 +1215,24 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
dev_warn(dev->ctrl.device,
"I/O %d QID %d timeout, completion polled\n",
req->tag, nvmeq->qid);
- return BLK_EH_HANDLED;
+ return BLK_EH_DONE;
}
/*
* Shutdown immediately if controller times out while starting. The
* reset work will see the pci device disabled when it gets the forced
* cancellation error. All outstanding requests are completed on
- * shutdown, so we return BLK_EH_HANDLED.
+ * shutdown, so we return BLK_EH_DONE.
*/
switch (dev->ctrl.state) {
case NVME_CTRL_CONNECTING:
case NVME_CTRL_RESETTING:
- dev_warn(dev->ctrl.device,
+ dev_warn_ratelimited(dev->ctrl.device,
"I/O %d QID %d timeout, disable controller\n",
req->tag, nvmeq->qid);
nvme_dev_disable(dev, false);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
- return BLK_EH_HANDLED;
+ return BLK_EH_DONE;
default:
break;
}
@@ -1252,12 +1249,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
nvme_dev_disable(dev, false);
nvme_reset_ctrl(&dev->ctrl);
- /*
- * Mark the request as handled, since the inline shutdown
- * forces all outstanding requests to complete.
- */
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
- return BLK_EH_HANDLED;
+ return BLK_EH_DONE;
}
if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
@@ -1321,15 +1314,21 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
int vector;
- spin_lock_irq(&nvmeq->q_lock);
+ spin_lock_irq(&nvmeq->cq_lock);
if (nvmeq->cq_vector == -1) {
- spin_unlock_irq(&nvmeq->q_lock);
+ spin_unlock_irq(&nvmeq->cq_lock);
return 1;
}
vector = nvmeq->cq_vector;
nvmeq->dev->online_queues--;
nvmeq->cq_vector = -1;
- spin_unlock_irq(&nvmeq->q_lock);
+ spin_unlock_irq(&nvmeq->cq_lock);
+
+ /*
+ * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without
+ * having to grab the lock.
+ */
+ mb();
if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
@@ -1342,15 +1341,18 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
struct nvme_queue *nvmeq = &dev->queues[0];
+ u16 start, end;
if (shutdown)
nvme_shutdown_ctrl(&dev->ctrl);
else
nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
- spin_lock_irq(&nvmeq->q_lock);
- nvme_process_cq(nvmeq);
- spin_unlock_irq(&nvmeq->q_lock);
+ spin_lock_irq(&nvmeq->cq_lock);
+ nvme_process_cq(nvmeq, &start, &end, -1);
+ spin_unlock_irq(&nvmeq->cq_lock);
+
+ nvme_complete_cqes(nvmeq, start, end);
}
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
@@ -1408,7 +1410,8 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
nvmeq->q_dmadev = dev->dev;
nvmeq->dev = dev;
- spin_lock_init(&nvmeq->q_lock);
+ spin_lock_init(&nvmeq->sq_lock);
+ spin_lock_init(&nvmeq->cq_lock);
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
@@ -1444,7 +1447,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
struct nvme_dev *dev = nvmeq->dev;
- spin_lock_irq(&nvmeq->q_lock);
+ spin_lock_irq(&nvmeq->cq_lock);
nvmeq->sq_tail = 0;
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
@@ -1452,13 +1455,14 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
nvme_dbbuf_init(dev, nvmeq, qid);
dev->online_queues++;
- spin_unlock_irq(&nvmeq->q_lock);
+ spin_unlock_irq(&nvmeq->cq_lock);
}
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
struct nvme_dev *dev = nvmeq->dev;
int result;
+ s16 vector;
if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth),
@@ -1471,15 +1475,21 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
* A queue's vector matches the queue identifier unless the controller
* has only one vector available.
*/
- nvmeq->cq_vector = dev->num_vecs == 1 ? 0 : qid;
- result = adapter_alloc_cq(dev, qid, nvmeq);
+ vector = dev->num_vecs == 1 ? 0 : qid;
+ result = adapter_alloc_cq(dev, qid, nvmeq, vector);
if (result < 0)
- goto release_vector;
+ goto out;
result = adapter_alloc_sq(dev, qid, nvmeq);
if (result < 0)
goto release_cq;
+ /*
+ * Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will
+ * invoke free_irq for it and cause a 'Trying to free already-free IRQ
+ * xxx' warning if the create CQ/SQ command times out.
+ */
+ nvmeq->cq_vector = vector;
nvme_init_queue(nvmeq, qid);
result = queue_request_irq(nvmeq);
if (result < 0)
@@ -1487,13 +1497,13 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
return result;
- release_sq:
+release_sq:
+ nvmeq->cq_vector = -1;
dev->online_queues--;
adapter_delete_sq(dev, qid);
- release_cq:
+release_cq:
adapter_delete_cq(dev, qid);
- release_vector:
- nvmeq->cq_vector = -1;
+out:
return result;
}
@@ -1997,19 +2007,22 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error)
static void nvme_del_cq_end(struct request *req, blk_status_t error)
{
struct nvme_queue *nvmeq = req->end_io_data;
+ u16 start, end;
if (!error) {
unsigned long flags;
/*
- * We might be called with the AQ q_lock held
- * and the I/O queue q_lock should always
+ * We might be called with the AQ cq_lock held
+ * and the I/O queue cq_lock should always
* nest inside the AQ one.
*/
- spin_lock_irqsave_nested(&nvmeq->q_lock, flags,
+ spin_lock_irqsave_nested(&nvmeq->cq_lock, flags,
SINGLE_DEPTH_NESTING);
- nvme_process_cq(nvmeq);
- spin_unlock_irqrestore(&nvmeq->q_lock, flags);
+ nvme_process_cq(nvmeq, &start, &end, -1);
+ spin_unlock_irqrestore(&nvmeq->cq_lock, flags);
+
+ nvme_complete_cqes(nvmeq, start, end);
}
nvme_del_queue_end(req, error);
@@ -2497,6 +2510,15 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
return 0;
}
+static void nvme_async_probe(void *data, async_cookie_t cookie)
+{
+ struct nvme_dev *dev = data;
+
+ nvme_reset_ctrl_sync(&dev->ctrl);
+ flush_work(&dev->ctrl.scan_work);
+ nvme_put_ctrl(&dev->ctrl);
+}
+
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
int node, result = -ENOMEM;
@@ -2541,7 +2563,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
- nvme_reset_ctrl(&dev->ctrl);
+ nvme_get_ctrl(&dev->ctrl);
+ async_schedule(nvme_async_probe, dev);
return 0;
@@ -2685,6 +2708,9 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
static void nvme_error_resume(struct pci_dev *pdev)
{
+ struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+ flush_work(&dev->ctrl.reset_work);
pci_cleanup_aer_uncorrect_error_status(pdev);
}
@@ -2714,6 +2740,8 @@ static const struct pci_device_id nvme_id_table[] = {
NVME_QUIRK_MEDIUM_PRIO_SQ },
{ PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
+ { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */
+ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
{ PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */
.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
{ PCI_DEVICE(0x1c58, 0x0023), /* WDC SN200 adapter */
@@ -2728,6 +2756,8 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_LIGHTNVM, },
{ PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */
.driver_data = NVME_QUIRK_LIGHTNVM, },
+ { PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */
+ .driver_data = NVME_QUIRK_LIGHTNVM, },
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 1eb4438a8763..7b3f08410430 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -778,7 +778,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
if (error) {
dev_err(ctrl->ctrl.device,
"prop_get NVME_REG_CAP failed\n");
- goto out_cleanup_queue;
+ goto out_stop_queue;
}
ctrl->ctrl.sqsize =
@@ -786,23 +786,25 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
if (error)
- goto out_cleanup_queue;
+ goto out_stop_queue;
ctrl->ctrl.max_hw_sectors =
(ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
error = nvme_init_identify(&ctrl->ctrl);
if (error)
- goto out_cleanup_queue;
+ goto out_stop_queue;
error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev,
&ctrl->async_event_sqe, sizeof(struct nvme_command),
DMA_TO_DEVICE);
if (error)
- goto out_cleanup_queue;
+ goto out_stop_queue;
return 0;
+out_stop_queue:
+ nvme_rdma_stop_queue(&ctrl->queues[0]);
out_cleanup_queue:
if (new)
blk_cleanup_queue(ctrl->ctrl.admin_q);
@@ -1598,7 +1600,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
/* fail with DNR on cmd timeout */
nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
- return BLK_EH_HANDLED;
+ return BLK_EH_DONE;
}
static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index ea91fccd1bc0..01390f0e1671 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -148,8 +148,8 @@ TRACE_EVENT(nvme_complete_rq,
__entry->flags = nvme_req(req)->flags;
__entry->status = nvme_req(req)->status;
),
- TP_printk("cmdid=%u, qid=%d, res=%llu, retries=%u, flags=0x%x, status=%u",
- __entry->cid, __entry->qid, __entry->result,
+ TP_printk("qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u",
+ __entry->qid, __entry->cid, __entry->result,
__entry->retries, __entry->flags, __entry->status)
);
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 488250189c99..8118c93391c6 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -6,8 +6,8 @@ obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o
obj-$(CONFIG_NVME_TARGET_FC) += nvmet-fc.o
obj-$(CONFIG_NVME_TARGET_FCLOOP) += nvme-fcloop.o
-nvmet-y += core.o configfs.o admin-cmd.o io-cmd.o fabrics-cmd.o \
- discovery.o
+nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \
+ discovery.o io-cmd-file.o io-cmd-bdev.o
nvme-loop-y += loop.o
nvmet-rdma-y += rdma.o
nvmet-fc-y += fc.o
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 5e0e9fcc0d4d..ead8fbe6922e 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -32,6 +32,11 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd)
return len;
}
+static void nvmet_execute_get_log_page_noop(struct nvmet_req *req)
+{
+ nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->data_len));
+}
+
static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
struct nvme_smart_log *slog)
{
@@ -45,6 +50,10 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
return NVME_SC_INVALID_NS;
}
+ /* we don't have the right data for file backed ns */
+ if (!ns->bdev)
+ goto out;
+
host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]);
data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]);
host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]);
@@ -54,6 +63,7 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
put_unaligned_le64(data_units_read, &slog->data_units_read[0]);
put_unaligned_le64(host_writes, &slog->host_writes[0]);
put_unaligned_le64(data_units_written, &slog->data_units_written[0]);
+out:
nvmet_put_namespace(ns);
return NVME_SC_SUCCESS;
@@ -71,6 +81,9 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
rcu_read_lock();
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
+ /* we don't have the right data for file backed ns */
+ if (!ns->bdev)
+ continue;
host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]);
data_units_read +=
part_stat_read(ns->bdev->bd_part, sectors[READ]);
@@ -89,74 +102,50 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
return NVME_SC_SUCCESS;
}
-static u16 nvmet_get_smart_log(struct nvmet_req *req,
- struct nvme_smart_log *slog)
+static void nvmet_execute_get_log_page_smart(struct nvmet_req *req)
{
- u16 status;
+ struct nvme_smart_log *log;
+ u16 status = NVME_SC_INTERNAL;
+
+ if (req->data_len != sizeof(*log))
+ goto out;
+
+ log = kzalloc(sizeof(*log), GFP_KERNEL);
+ if (!log)
+ goto out;
- WARN_ON(req == NULL || slog == NULL);
if (req->cmd->get_log_page.nsid == cpu_to_le32(NVME_NSID_ALL))
- status = nvmet_get_smart_log_all(req, slog);
+ status = nvmet_get_smart_log_all(req, log);
else
- status = nvmet_get_smart_log_nsid(req, slog);
- return status;
+ status = nvmet_get_smart_log_nsid(req, log);
+ if (status)
+ goto out;
+
+ status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
+out:
+ nvmet_req_complete(req, status);
}
-static void nvmet_execute_get_log_page(struct nvmet_req *req)
+static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req)
{
- struct nvme_smart_log *smart_log;
- size_t data_len = nvmet_get_log_page_len(req->cmd);
- void *buf;
- u16 status = 0;
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ u16 status = NVME_SC_INTERNAL;
+ size_t len;
- buf = kzalloc(data_len, GFP_KERNEL);
- if (!buf) {
- status = NVME_SC_INTERNAL;
+ if (req->data_len != NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32))
goto out;
- }
- switch (req->cmd->get_log_page.lid) {
- case NVME_LOG_ERROR:
- /*
- * We currently never set the More bit in the status field,
- * so all error log entries are invalid and can be zeroed out.
- * This is called a minum viable implementation (TM) of this
- * mandatory log page.
- */
- break;
- case NVME_LOG_SMART:
- /*
- * XXX: fill out actual smart log
- *
- * We might have a hard time coming up with useful values for
- * many of the fields, and even when we have useful data
- * available (e.g. units or commands read/written) those aren't
- * persistent over power loss.
- */
- if (data_len != sizeof(*smart_log)) {
- status = NVME_SC_INTERNAL;
- goto err;
- }
- smart_log = buf;
- status = nvmet_get_smart_log(req, smart_log);
- if (status)
- goto err;
- break;
- case NVME_LOG_FW_SLOT:
- /*
- * We only support a single firmware slot which always is
- * active, so we can zero out the whole firmware slot log and
- * still claim to fully implement this mandatory log page.
- */
- break;
- default:
- BUG();
- }
-
- status = nvmet_copy_to_sgl(req, 0, buf, data_len);
-
-err:
- kfree(buf);
+ mutex_lock(&ctrl->lock);
+ if (ctrl->nr_changed_ns == U32_MAX)
+ len = sizeof(__le32);
+ else
+ len = ctrl->nr_changed_ns * sizeof(__le32);
+ status = nvmet_copy_to_sgl(req, 0, ctrl->changed_ns_list, len);
+ if (!status)
+ status = nvmet_zero_sgl(req, len, req->data_len - len);
+ ctrl->nr_changed_ns = 0;
+ clear_bit(NVME_AEN_CFG_NS_ATTR, &ctrl->aen_masked);
+ mutex_unlock(&ctrl->lock);
out:
nvmet_req_complete(req, status);
}
@@ -201,7 +190,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->ver = cpu_to_le32(ctrl->subsys->ver);
/* XXX: figure out what to do about RTD3R/RTD3 */
- id->oaes = cpu_to_le32(1 << 8);
+ id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL);
id->ctratt = cpu_to_le32(1 << 0);
id->oacs = 0;
@@ -447,6 +436,16 @@ static void nvmet_execute_set_features(struct nvmet_req *req)
req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000);
nvmet_set_result(req, req->sq->ctrl->kato);
break;
+ case NVME_FEAT_ASYNC_EVENT:
+ val32 = le32_to_cpu(req->cmd->common.cdw10[1]);
+ if (val32 & ~NVMET_AEN_CFG_ALL) {
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ break;
+ }
+
+ WRITE_ONCE(req->sq->ctrl->aen_enabled, val32);
+ nvmet_set_result(req, val32);
+ break;
case NVME_FEAT_HOST_ID:
status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
break;
@@ -485,9 +484,10 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
break;
case NVME_FEAT_WRITE_ATOMIC:
break;
+#endif
case NVME_FEAT_ASYNC_EVENT:
+ nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled));
break;
-#endif
case NVME_FEAT_VOLATILE_WC:
nvmet_set_result(req, 1);
break;
@@ -548,8 +548,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
struct nvme_command *cmd = req->cmd;
u16 ret;
- req->ns = NULL;
-
ret = nvmet_check_ctrl_status(req, cmd);
if (unlikely(ret))
return ret;
@@ -560,9 +558,28 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
switch (cmd->get_log_page.lid) {
case NVME_LOG_ERROR:
+ /*
+ * We currently never set the More bit in the status
+ * field, so all error log entries are invalid and can
+ * be zeroed out. This is called a minum viable
+ * implementation (TM) of this mandatory log page.
+ */
+ req->execute = nvmet_execute_get_log_page_noop;
+ return 0;
case NVME_LOG_SMART:
+ req->execute = nvmet_execute_get_log_page_smart;
+ return 0;
case NVME_LOG_FW_SLOT:
- req->execute = nvmet_execute_get_log_page;
+ /*
+ * We only support a single firmware slot which always
+ * is active, so we can zero out the whole firmware slot
+ * log and still claim to fully implement this mandatory
+ * log page.
+ */
+ req->execute = nvmet_execute_get_log_page_noop;
+ return 0;
+ case NVME_LOG_CHANGED_NS:
+ req->execute = nvmet_execute_get_log_changed_ns;
return 0;
}
break;
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index e95424f172fd..a03da764ecae 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -57,6 +57,13 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
return 0;
}
+u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
+{
+ if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len)
+ return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
+ return 0;
+}
+
static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
{
struct nvmet_ns *ns;
@@ -137,6 +144,51 @@ static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
schedule_work(&ctrl->async_event_work);
}
+static bool nvmet_aen_disabled(struct nvmet_ctrl *ctrl, u32 aen)
+{
+ if (!(READ_ONCE(ctrl->aen_enabled) & aen))
+ return true;
+ return test_and_set_bit(aen, &ctrl->aen_masked);
+}
+
+static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
+{
+ u32 i;
+
+ mutex_lock(&ctrl->lock);
+ if (ctrl->nr_changed_ns > NVME_MAX_CHANGED_NAMESPACES)
+ goto out_unlock;
+
+ for (i = 0; i < ctrl->nr_changed_ns; i++) {
+ if (ctrl->changed_ns_list[i] == nsid)
+ goto out_unlock;
+ }
+
+ if (ctrl->nr_changed_ns == NVME_MAX_CHANGED_NAMESPACES) {
+ ctrl->changed_ns_list[0] = cpu_to_le32(0xffffffff);
+ ctrl->nr_changed_ns = U32_MAX;
+ goto out_unlock;
+ }
+
+ ctrl->changed_ns_list[ctrl->nr_changed_ns++] = nsid;
+out_unlock:
+ mutex_unlock(&ctrl->lock);
+}
+
+static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
+{
+ struct nvmet_ctrl *ctrl;
+
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
+ if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_NS_ATTR))
+ continue;
+ nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+ NVME_AER_NOTICE_NS_CHANGED,
+ NVME_LOG_CHANGED_NS);
+ }
+}
+
int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
{
int ret = 0;
@@ -271,33 +323,31 @@ void nvmet_put_namespace(struct nvmet_ns *ns)
percpu_ref_put(&ns->ref);
}
+static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
+{
+ nvmet_bdev_ns_disable(ns);
+ nvmet_file_ns_disable(ns);
+}
+
int nvmet_ns_enable(struct nvmet_ns *ns)
{
struct nvmet_subsys *subsys = ns->subsys;
- struct nvmet_ctrl *ctrl;
int ret = 0;
mutex_lock(&subsys->lock);
if (ns->enabled)
goto out_unlock;
- ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE,
- NULL);
- if (IS_ERR(ns->bdev)) {
- pr_err("failed to open block device %s: (%ld)\n",
- ns->device_path, PTR_ERR(ns->bdev));
- ret = PTR_ERR(ns->bdev);
- ns->bdev = NULL;
+ ret = nvmet_bdev_ns_enable(ns);
+ if (ret)
+ ret = nvmet_file_ns_enable(ns);
+ if (ret)
goto out_unlock;
- }
-
- ns->size = i_size_read(ns->bdev->bd_inode);
- ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));
ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
0, GFP_KERNEL);
if (ret)
- goto out_blkdev_put;
+ goto out_dev_put;
if (ns->nsid > subsys->max_nsid)
subsys->max_nsid = ns->nsid;
@@ -320,24 +370,20 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
list_add_tail_rcu(&ns->dev_link, &old->dev_link);
}
- list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
- nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);
-
+ nvmet_ns_changed(subsys, ns->nsid);
ns->enabled = true;
ret = 0;
out_unlock:
mutex_unlock(&subsys->lock);
return ret;
-out_blkdev_put:
- blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ);
- ns->bdev = NULL;
+out_dev_put:
+ nvmet_ns_dev_disable(ns);
goto out_unlock;
}
void nvmet_ns_disable(struct nvmet_ns *ns)
{
struct nvmet_subsys *subsys = ns->subsys;
- struct nvmet_ctrl *ctrl;
mutex_lock(&subsys->lock);
if (!ns->enabled)
@@ -363,11 +409,8 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
percpu_ref_exit(&ns->ref);
mutex_lock(&subsys->lock);
- list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
- nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);
-
- if (ns->bdev)
- blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ);
+ nvmet_ns_changed(subsys, ns->nsid);
+ nvmet_ns_dev_disable(ns);
out_unlock:
mutex_unlock(&subsys->lock);
}
@@ -499,6 +542,25 @@ int nvmet_sq_init(struct nvmet_sq *sq)
}
EXPORT_SYMBOL_GPL(nvmet_sq_init);
+static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
+{
+ struct nvme_command *cmd = req->cmd;
+ u16 ret;
+
+ ret = nvmet_check_ctrl_status(req, cmd);
+ if (unlikely(ret))
+ return ret;
+
+ req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
+ if (unlikely(!req->ns))
+ return NVME_SC_INVALID_NS | NVME_SC_DNR;
+
+ if (req->ns->file)
+ return nvmet_file_parse_io_cmd(req);
+ else
+ return nvmet_bdev_parse_io_cmd(req);
+}
+
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
{
@@ -710,15 +772,14 @@ out:
u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
{
if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
- pr_err("got io cmd %d while CC.EN == 0 on qid = %d\n",
+ pr_err("got cmd %d while CC.EN == 0 on qid = %d\n",
cmd->common.opcode, req->sq->qid);
return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
}
if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
- pr_err("got io cmd %d while CSTS.RDY == 0 on qid = %d\n",
+ pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n",
cmd->common.opcode, req->sq->qid);
- req->ns = NULL;
return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
}
return 0;
@@ -809,12 +870,18 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
kref_init(&ctrl->ref);
ctrl->subsys = subsys;
+ WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL);
+
+ ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES,
+ sizeof(__le32), GFP_KERNEL);
+ if (!ctrl->changed_ns_list)
+ goto out_free_ctrl;
ctrl->cqs = kcalloc(subsys->max_qid + 1,
sizeof(struct nvmet_cq *),
GFP_KERNEL);
if (!ctrl->cqs)
- goto out_free_ctrl;
+ goto out_free_changed_ns_list;
ctrl->sqs = kcalloc(subsys->max_qid + 1,
sizeof(struct nvmet_sq *),
@@ -872,6 +939,8 @@ out_free_sqs:
kfree(ctrl->sqs);
out_free_cqs:
kfree(ctrl->cqs);
+out_free_changed_ns_list:
+ kfree(ctrl->changed_ns_list);
out_free_ctrl:
kfree(ctrl);
out_put_subsystem:
@@ -898,6 +967,7 @@ static void nvmet_ctrl_free(struct kref *ref)
kfree(ctrl->sqs);
kfree(ctrl->cqs);
+ kfree(ctrl->changed_ns_list);
kfree(ctrl);
nvmet_subsys_put(subsys);
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 231e04e0a496..08656b849bd6 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -187,8 +187,6 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
- req->ns = NULL;
-
if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
pr_err("got cmd %d while not ready\n",
cmd->common.opcode);
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 19e9e42ae943..d84ae004cb85 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -77,8 +77,6 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
- req->ns = NULL;
-
switch (cmd->fabrics.fctype) {
case nvme_fabrics_type_property_set:
req->data_len = 0;
@@ -242,8 +240,6 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
- req->ns = NULL;
-
if (cmd->common.opcode != nvme_fabrics_command) {
pr_err("invalid command 0x%x on unconnected queue.\n",
cmd->fabrics.opcode);
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 33ee8d3145f8..408279cb6f2c 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -31,7 +31,7 @@
/* *************************** Data Structures/Defines ****************** */
-#define NVMET_LS_CTX_COUNT 4
+#define NVMET_LS_CTX_COUNT 256
/* for this implementation, assume small single frame rqst/rsp */
#define NVME_FC_MAX_LS_BUFFER_SIZE 2048
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd-bdev.c
index cd2344179673..e0b0f7df70c2 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -16,6 +16,34 @@
#include <linux/module.h>
#include "nvmet.h"
+int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
+{
+ int ret;
+
+ ns->bdev = blkdev_get_by_path(ns->device_path,
+ FMODE_READ | FMODE_WRITE, NULL);
+ if (IS_ERR(ns->bdev)) {
+ ret = PTR_ERR(ns->bdev);
+ if (ret != -ENOTBLK) {
+ pr_err("failed to open block device %s: (%ld)\n",
+ ns->device_path, PTR_ERR(ns->bdev));
+ }
+ ns->bdev = NULL;
+ return ret;
+ }
+ ns->size = i_size_read(ns->bdev->bd_inode);
+ ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));
+ return 0;
+}
+
+void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
+{
+ if (ns->bdev) {
+ blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ);
+ ns->bdev = NULL;
+ }
+}
+
static void nvmet_bio_done(struct bio *bio)
{
struct nvmet_req *req = bio->bi_private;
@@ -23,20 +51,14 @@ static void nvmet_bio_done(struct bio *bio)
nvmet_req_complete(req,
bio->bi_status ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
- if (bio != &req->inline_bio)
+ if (bio != &req->b.inline_bio)
bio_put(bio);
}
-static inline u32 nvmet_rw_len(struct nvmet_req *req)
-{
- return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) <<
- req->ns->blksize_shift;
-}
-
-static void nvmet_execute_rw(struct nvmet_req *req)
+static void nvmet_bdev_execute_rw(struct nvmet_req *req)
{
int sg_cnt = req->sg_cnt;
- struct bio *bio = &req->inline_bio;
+ struct bio *bio = &req->b.inline_bio;
struct scatterlist *sg;
sector_t sector;
blk_qc_t cookie;
@@ -89,9 +111,9 @@ static void nvmet_execute_rw(struct nvmet_req *req)
blk_poll(bdev_get_queue(req->ns->bdev), cookie);
}
-static void nvmet_execute_flush(struct nvmet_req *req)
+static void nvmet_bdev_execute_flush(struct nvmet_req *req)
{
- struct bio *bio = &req->inline_bio;
+ struct bio *bio = &req->b.inline_bio;
bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
bio_set_dev(bio, req->ns->bdev);
@@ -102,7 +124,7 @@ static void nvmet_execute_flush(struct nvmet_req *req)
submit_bio(bio);
}
-static u16 nvmet_discard_range(struct nvmet_ns *ns,
+static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns,
struct nvme_dsm_range *range, struct bio **bio)
{
int ret;
@@ -116,7 +138,7 @@ static u16 nvmet_discard_range(struct nvmet_ns *ns,
return 0;
}
-static void nvmet_execute_discard(struct nvmet_req *req)
+static void nvmet_bdev_execute_discard(struct nvmet_req *req)
{
struct nvme_dsm_range range;
struct bio *bio = NULL;
@@ -129,7 +151,7 @@ static void nvmet_execute_discard(struct nvmet_req *req)
if (status)
break;
- status = nvmet_discard_range(req->ns, &range, &bio);
+ status = nvmet_bdev_discard_range(req->ns, &range, &bio);
if (status)
break;
}
@@ -148,11 +170,11 @@ static void nvmet_execute_discard(struct nvmet_req *req)
}
}
-static void nvmet_execute_dsm(struct nvmet_req *req)
+static void nvmet_bdev_execute_dsm(struct nvmet_req *req)
{
switch (le32_to_cpu(req->cmd->dsm.attributes)) {
case NVME_DSMGMT_AD:
- nvmet_execute_discard(req);
+ nvmet_bdev_execute_discard(req);
return;
case NVME_DSMGMT_IDR:
case NVME_DSMGMT_IDW:
@@ -163,7 +185,7 @@ static void nvmet_execute_dsm(struct nvmet_req *req)
}
}
-static void nvmet_execute_write_zeroes(struct nvmet_req *req)
+static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req)
{
struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes;
struct bio *bio = NULL;
@@ -189,38 +211,27 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req)
}
}
-u16 nvmet_parse_io_cmd(struct nvmet_req *req)
+u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
- u16 ret;
-
- ret = nvmet_check_ctrl_status(req, cmd);
- if (unlikely(ret)) {
- req->ns = NULL;
- return ret;
- }
-
- req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
- if (unlikely(!req->ns))
- return NVME_SC_INVALID_NS | NVME_SC_DNR;
switch (cmd->common.opcode) {
case nvme_cmd_read:
case nvme_cmd_write:
- req->execute = nvmet_execute_rw;
+ req->execute = nvmet_bdev_execute_rw;
req->data_len = nvmet_rw_len(req);
return 0;
case nvme_cmd_flush:
- req->execute = nvmet_execute_flush;
+ req->execute = nvmet_bdev_execute_flush;
req->data_len = 0;
return 0;
case nvme_cmd_dsm:
- req->execute = nvmet_execute_dsm;
+ req->execute = nvmet_bdev_execute_dsm;
req->data_len = (le32_to_cpu(cmd->dsm.nr) + 1) *
sizeof(struct nvme_dsm_range);
return 0;
case nvme_cmd_write_zeroes:
- req->execute = nvmet_execute_write_zeroes;
+ req->execute = nvmet_bdev_execute_write_zeroes;
return 0;
default:
pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
new file mode 100644
index 000000000000..8c42b3a8c420
--- /dev/null
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe Over Fabrics Target File I/O commands implementation.
+ * Copyright (c) 2017-2018 Western Digital Corporation or its
+ * affiliates.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/uio.h>
+#include <linux/falloc.h>
+#include <linux/file.h>
+#include "nvmet.h"
+
+#define NVMET_MAX_MPOOL_BVEC 16
+#define NVMET_MIN_MPOOL_OBJ 16
+
+void nvmet_file_ns_disable(struct nvmet_ns *ns)
+{
+ if (ns->file) {
+ mempool_destroy(ns->bvec_pool);
+ ns->bvec_pool = NULL;
+ kmem_cache_destroy(ns->bvec_cache);
+ ns->bvec_cache = NULL;
+ fput(ns->file);
+ ns->file = NULL;
+ }
+}
+
+int nvmet_file_ns_enable(struct nvmet_ns *ns)
+{
+ int ret;
+ struct kstat stat;
+
+ ns->file = filp_open(ns->device_path,
+ O_RDWR | O_LARGEFILE | O_DIRECT, 0);
+ if (IS_ERR(ns->file)) {
+ pr_err("failed to open file %s: (%ld)\n",
+ ns->device_path, PTR_ERR(ns->file));
+ return PTR_ERR(ns->file);
+ }
+
+ ret = vfs_getattr(&ns->file->f_path,
+ &stat, STATX_SIZE, AT_STATX_FORCE_SYNC);
+ if (ret)
+ goto err;
+
+ ns->size = stat.size;
+ ns->blksize_shift = file_inode(ns->file)->i_blkbits;
+
+ ns->bvec_cache = kmem_cache_create("nvmet-bvec",
+ NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!ns->bvec_cache) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ns->bvec_pool = mempool_create(NVMET_MIN_MPOOL_OBJ, mempool_alloc_slab,
+ mempool_free_slab, ns->bvec_cache);
+
+ if (!ns->bvec_pool) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ return ret;
+err:
+ ns->size = 0;
+ ns->blksize_shift = 0;
+ nvmet_file_ns_disable(ns);
+ return ret;
+}
+
+static void nvmet_file_init_bvec(struct bio_vec *bv, struct sg_page_iter *iter)
+{
+ bv->bv_page = sg_page_iter_page(iter);
+ bv->bv_offset = iter->sg->offset;
+ bv->bv_len = PAGE_SIZE - iter->sg->offset;
+}
+
+static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
+ unsigned long nr_segs, size_t count)
+{
+ struct kiocb *iocb = &req->f.iocb;
+ ssize_t (*call_iter)(struct kiocb *iocb, struct iov_iter *iter);
+ struct iov_iter iter;
+ int ki_flags = 0, rw;
+ ssize_t ret;
+
+ if (req->cmd->rw.opcode == nvme_cmd_write) {
+ if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
+ ki_flags = IOCB_DSYNC;
+ call_iter = req->ns->file->f_op->write_iter;
+ rw = WRITE;
+ } else {
+ call_iter = req->ns->file->f_op->read_iter;
+ rw = READ;
+ }
+
+ iov_iter_bvec(&iter, ITER_BVEC | rw, req->f.bvec, nr_segs, count);
+
+ iocb->ki_pos = pos;
+ iocb->ki_filp = req->ns->file;
+ iocb->ki_flags = IOCB_DIRECT | ki_flags;
+
+ ret = call_iter(iocb, &iter);
+
+ if (ret != -EIOCBQUEUED && iocb->ki_complete)
+ iocb->ki_complete(iocb, ret, 0);
+
+ return ret;
+}
+
+static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2)
+{
+ struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb);
+
+ if (req->f.bvec != req->inline_bvec) {
+ if (likely(req->f.mpool_alloc == false))
+ kfree(req->f.bvec);
+ else
+ mempool_free(req->f.bvec, req->ns->bvec_pool);
+ }
+
+ nvmet_req_complete(req, ret != req->data_len ?
+ NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+}
+
+static void nvmet_file_execute_rw(struct nvmet_req *req)
+{
+ ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE);
+ struct sg_page_iter sg_pg_iter;
+ unsigned long bv_cnt = 0;
+ bool is_sync = false;
+ size_t len = 0, total_len = 0;
+ ssize_t ret = 0;
+ loff_t pos;
+
+ if (!req->sg_cnt || !nr_bvec) {
+ nvmet_req_complete(req, 0);
+ return;
+ }
+
+ if (nr_bvec > NVMET_MAX_INLINE_BIOVEC)
+ req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
+ GFP_KERNEL);
+ else
+ req->f.bvec = req->inline_bvec;
+
+ req->f.mpool_alloc = false;
+ if (unlikely(!req->f.bvec)) {
+ /* fallback under memory pressure */
+ req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL);
+ req->f.mpool_alloc = true;
+ if (nr_bvec > NVMET_MAX_MPOOL_BVEC)
+ is_sync = true;
+ }
+
+ pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
+
+ memset(&req->f.iocb, 0, sizeof(struct kiocb));
+ for_each_sg_page(req->sg, &sg_pg_iter, req->sg_cnt, 0) {
+ nvmet_file_init_bvec(&req->f.bvec[bv_cnt], &sg_pg_iter);
+ len += req->f.bvec[bv_cnt].bv_len;
+ total_len += req->f.bvec[bv_cnt].bv_len;
+ bv_cnt++;
+
+ WARN_ON_ONCE((nr_bvec - 1) < 0);
+
+ if (unlikely(is_sync) &&
+ (nr_bvec - 1 == 0 || bv_cnt == NVMET_MAX_MPOOL_BVEC)) {
+ ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len);
+ if (ret < 0)
+ goto out;
+ pos += len;
+ bv_cnt = 0;
+ len = 0;
+ }
+ nr_bvec--;
+ }
+
+ if (WARN_ON_ONCE(total_len != req->data_len))
+ ret = -EIO;
+out:
+ if (unlikely(is_sync || ret)) {
+ nvmet_file_io_done(&req->f.iocb, ret < 0 ? ret : total_len, 0);
+ return;
+ }
+ req->f.iocb.ki_complete = nvmet_file_io_done;
+ nvmet_file_submit_bvec(req, pos, bv_cnt, total_len);
+}
+
+static void nvmet_file_flush_work(struct work_struct *w)
+{
+ struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
+ int ret;
+
+ ret = vfs_fsync(req->ns->file, 1);
+
+ nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+}
+
+static void nvmet_file_execute_flush(struct nvmet_req *req)
+{
+ INIT_WORK(&req->f.work, nvmet_file_flush_work);
+ schedule_work(&req->f.work);
+}
+
+static void nvmet_file_execute_discard(struct nvmet_req *req)
+{
+ int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+ struct nvme_dsm_range range;
+ loff_t offset;
+ loff_t len;
+ int i, ret;
+
+ for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) {
+ if (nvmet_copy_from_sgl(req, i * sizeof(range), &range,
+ sizeof(range)))
+ break;
+ offset = le64_to_cpu(range.slba) << req->ns->blksize_shift;
+ len = le32_to_cpu(range.nlb) << req->ns->blksize_shift;
+ ret = vfs_fallocate(req->ns->file, mode, offset, len);
+ if (ret)
+ break;
+ }
+
+ nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+}
+
+static void nvmet_file_dsm_work(struct work_struct *w)
+{
+ struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
+
+ switch (le32_to_cpu(req->cmd->dsm.attributes)) {
+ case NVME_DSMGMT_AD:
+ nvmet_file_execute_discard(req);
+ return;
+ case NVME_DSMGMT_IDR:
+ case NVME_DSMGMT_IDW:
+ default:
+ /* Not supported yet */
+ nvmet_req_complete(req, 0);
+ return;
+ }
+}
+
+static void nvmet_file_execute_dsm(struct nvmet_req *req)
+{
+ INIT_WORK(&req->f.work, nvmet_file_dsm_work);
+ schedule_work(&req->f.work);
+}
+
+static void nvmet_file_write_zeroes_work(struct work_struct *w)
+{
+ struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
+ struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes;
+ int mode = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE;
+ loff_t offset;
+ loff_t len;
+ int ret;
+
+ offset = le64_to_cpu(write_zeroes->slba) << req->ns->blksize_shift;
+ len = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
+ req->ns->blksize_shift);
+
+ ret = vfs_fallocate(req->ns->file, mode, offset, len);
+ nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+}
+
+static void nvmet_file_execute_write_zeroes(struct nvmet_req *req)
+{
+ INIT_WORK(&req->f.work, nvmet_file_write_zeroes_work);
+ schedule_work(&req->f.work);
+}
+
+u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
+{
+ struct nvme_command *cmd = req->cmd;
+
+ switch (cmd->common.opcode) {
+ case nvme_cmd_read:
+ case nvme_cmd_write:
+ req->execute = nvmet_file_execute_rw;
+ req->data_len = nvmet_rw_len(req);
+ return 0;
+ case nvme_cmd_flush:
+ req->execute = nvmet_file_execute_flush;
+ req->data_len = 0;
+ return 0;
+ case nvme_cmd_dsm:
+ req->execute = nvmet_file_execute_dsm;
+ req->data_len = (le32_to_cpu(cmd->dsm.nr) + 1) *
+ sizeof(struct nvme_dsm_range);
+ return 0;
+ case nvme_cmd_write_zeroes:
+ req->execute = nvmet_file_execute_write_zeroes;
+ req->data_len = 0;
+ return 0;
+ default:
+ pr_err("unhandled cmd for file ns %d on qid %d\n",
+ cmd->common.opcode, req->sq->qid);
+ return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+ }
+}
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 27a8561c0cb9..1304ec3a7ede 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -45,6 +45,7 @@ struct nvme_loop_ctrl {
struct nvme_ctrl ctrl;
struct nvmet_ctrl *target_ctrl;
+ struct nvmet_port *port;
};
static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl)
@@ -63,7 +64,8 @@ struct nvme_loop_queue {
unsigned long flags;
};
-static struct nvmet_port *nvmet_loop_port;
+static LIST_HEAD(nvme_loop_ports);
+static DEFINE_MUTEX(nvme_loop_ports_mutex);
static LIST_HEAD(nvme_loop_ctrl_list);
static DEFINE_MUTEX(nvme_loop_ctrl_mutex);
@@ -146,7 +148,7 @@ nvme_loop_timeout(struct request *rq, bool reserved)
/* fail with DNR on admin cmd timeout */
nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
- return BLK_EH_HANDLED;
+ return BLK_EH_DONE;
}
static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -169,12 +171,12 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(req);
iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
- iod->req.port = nvmet_loop_port;
+ iod->req.port = queue->ctrl->port;
if (!nvmet_req_init(&iod->req, &queue->nvme_cq,
&queue->nvme_sq, &nvme_loop_ops))
return BLK_STS_OK;
- if (blk_rq_payload_bytes(req)) {
+ if (blk_rq_nr_phys_segments(req)) {
iod->sg_table.sgl = iod->first_sgl;
if (sg_alloc_table_chained(&iod->sg_table,
blk_rq_nr_phys_segments(req),
@@ -517,6 +519,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = {
.free_ctrl = nvme_loop_free_ctrl,
.submit_async_event = nvme_loop_submit_async_event,
.delete_ctrl = nvme_loop_delete_ctrl_host,
+ .get_address = nvmf_get_address,
};
static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
@@ -565,6 +568,23 @@ out_destroy_queues:
return ret;
}
+static struct nvmet_port *nvme_loop_find_port(struct nvme_ctrl *ctrl)
+{
+ struct nvmet_port *p, *found = NULL;
+
+ mutex_lock(&nvme_loop_ports_mutex);
+ list_for_each_entry(p, &nvme_loop_ports, entry) {
+ /* if no transport address is specified use the first port */
+ if ((ctrl->opts->mask & NVMF_OPT_TRADDR) &&
+ strcmp(ctrl->opts->traddr, p->disc_addr.traddr))
+ continue;
+ found = p;
+ break;
+ }
+ mutex_unlock(&nvme_loop_ports_mutex);
+ return found;
+}
+
static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
struct nvmf_ctrl_options *opts)
{
@@ -589,6 +609,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
ctrl->ctrl.sqsize = opts->queue_size - 1;
ctrl->ctrl.kato = opts->kato;
+ ctrl->port = nvme_loop_find_port(&ctrl->ctrl);
ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues),
GFP_KERNEL);
@@ -646,27 +667,17 @@ out_put_ctrl:
static int nvme_loop_add_port(struct nvmet_port *port)
{
- /*
- * XXX: disalow adding more than one port so
- * there is no connection rejections when a
- * a subsystem is assigned to a port for which
- * loop doesn't have a pointer.
- * This scenario would be possible if we allowed
- * more than one port to be added and a subsystem
- * was assigned to a port other than nvmet_loop_port.
- */
-
- if (nvmet_loop_port)
- return -EPERM;
-
- nvmet_loop_port = port;
+ mutex_lock(&nvme_loop_ports_mutex);
+ list_add_tail(&port->entry, &nvme_loop_ports);
+ mutex_unlock(&nvme_loop_ports_mutex);
return 0;
}
static void nvme_loop_remove_port(struct nvmet_port *port)
{
- if (port == nvmet_loop_port)
- nvmet_loop_port = NULL;
+ mutex_lock(&nvme_loop_ports_mutex);
+ list_del_init(&port->entry);
+ mutex_unlock(&nvme_loop_ports_mutex);
}
static const struct nvmet_fabrics_ops nvme_loop_ops = {
@@ -682,6 +693,7 @@ static struct nvmf_transport_ops nvme_loop_transport = {
.name = "loop",
.module = THIS_MODULE,
.create_ctrl = nvme_loop_create_ctrl,
+ .allowed_opts = NVMF_OPT_TRADDR,
};
static int __init nvme_loop_init_module(void)
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 15fd84ab21f8..480dfe10fad9 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -30,6 +30,21 @@
#define NVMET_ASYNC_EVENTS 4
#define NVMET_ERROR_LOG_SLOTS 128
+
+/*
+ * Supported optional AENs:
+ */
+#define NVMET_AEN_CFG_OPTIONAL \
+ NVME_AEN_CFG_NS_ATTR
+
+/*
+ * Plus mandatory SMART AENs (we'll never send them, but allow enabling them):
+ */
+#define NVMET_AEN_CFG_ALL \
+ (NVME_SMART_CRIT_SPARE | NVME_SMART_CRIT_TEMPERATURE | \
+ NVME_SMART_CRIT_RELIABILITY | NVME_SMART_CRIT_MEDIA | \
+ NVME_SMART_CRIT_VOLATILE_MEMORY | NVMET_AEN_CFG_OPTIONAL)
+
/* Helper Macros when NVMe error is NVME_SC_CONNECT_INVALID_PARAM
* The 16 bit shift is to set IATTR bit to 1, which means offending
* offset starts in the data section of connect()
@@ -43,6 +58,7 @@ struct nvmet_ns {
struct list_head dev_link;
struct percpu_ref ref;
struct block_device *bdev;
+ struct file *file;
u32 nsid;
u32 blksize_shift;
loff_t size;
@@ -57,6 +73,8 @@ struct nvmet_ns {
struct config_group group;
struct completion disable_done;
+ mempool_t *bvec_pool;
+ struct kmem_cache *bvec_cache;
};
static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
@@ -82,7 +100,7 @@ struct nvmet_sq {
/**
* struct nvmet_port - Common structure to keep port
* information for the target.
- * @entry: List head for holding a list of these elements.
+ * @entry: Entry into referrals or transport list.
* @disc_addr: Address information is stored in a format defined
* for a discovery log page entry.
* @group: ConfigFS group for this element's folder.
@@ -120,6 +138,8 @@ struct nvmet_ctrl {
u16 cntlid;
u32 kato;
+ u32 aen_enabled;
+ unsigned long aen_masked;
struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS];
unsigned int nr_async_event_cmds;
struct list_head async_events;
@@ -132,6 +152,9 @@ struct nvmet_ctrl {
const struct nvmet_fabrics_ops *ops;
+ __le32 *changed_ns_list;
+ u32 nr_changed_ns;
+
char subsysnqn[NVMF_NQN_FIELD_LEN];
char hostnqn[NVMF_NQN_FIELD_LEN];
};
@@ -222,8 +245,18 @@ struct nvmet_req {
struct nvmet_cq *cq;
struct nvmet_ns *ns;
struct scatterlist *sg;
- struct bio inline_bio;
struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC];
+ union {
+ struct {
+ struct bio inline_bio;
+ } b;
+ struct {
+ bool mpool_alloc;
+ struct kiocb iocb;
+ struct bio_vec *bvec;
+ struct work_struct work;
+ } f;
+ };
int sg_cnt;
/* data length as parsed from the command: */
size_t data_len;
@@ -263,7 +296,8 @@ struct nvmet_async_event {
};
u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
-u16 nvmet_parse_io_cmd(struct nvmet_req *req);
+u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req);
+u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);
u16 nvmet_parse_admin_cmd(struct nvmet_req *req);
u16 nvmet_parse_discovery_cmd(struct nvmet_req *req);
u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req);
@@ -316,6 +350,7 @@ u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
size_t len);
u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf,
size_t len);
+u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len);
u32 nvmet_get_log_page_len(struct nvme_command *cmd);
@@ -338,4 +373,14 @@ extern struct rw_semaphore nvmet_config_sem;
bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
const char *hostnqn);
+int nvmet_bdev_ns_enable(struct nvmet_ns *ns);
+int nvmet_file_ns_enable(struct nvmet_ns *ns);
+void nvmet_bdev_ns_disable(struct nvmet_ns *ns);
+void nvmet_file_ns_disable(struct nvmet_ns *ns);
+
+static inline u32 nvmet_rw_len(struct nvmet_req *req)
+{
+ return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) <<
+ req->ns->blksize_shift;
+}
#endif /* _NVMET_H */
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 02c03e418c27..cb9b685118da 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -3054,7 +3054,7 @@ out:
*
* Return values:
* BLK_EH_RESET_TIMER if the request should be left running
- * BLK_EH_NOT_HANDLED if the request is handled or terminated
+ * BLK_EH_DONE if the request is handled or terminated
* by the driver.
*/
enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved)
@@ -3067,7 +3067,7 @@ enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved)
cqr = *((struct dasd_ccw_req **) blk_mq_rq_to_pdu(req));
if (!cqr)
- return BLK_EH_NOT_HANDLED;
+ return BLK_EH_DONE;
spin_lock_irqsave(&cqr->dq->lock, flags);
device = cqr->startdev ? cqr->startdev : block->base;
@@ -3126,7 +3126,7 @@ enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved)
spin_unlock(&block->queue_lock);
spin_unlock_irqrestore(&cqr->dq->lock, flags);
- return rc ? BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED;
+ return rc ? BLK_EH_RESET_TIMER : BLK_EH_DONE;
}
static int dasd_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
diff --git a/drivers/sbus/char/Kconfig b/drivers/sbus/char/Kconfig
index bf3c5f735614..89edd13fd572 100644
--- a/drivers/sbus/char/Kconfig
+++ b/drivers/sbus/char/Kconfig
@@ -28,13 +28,6 @@ config TADPOLE_TS102_UCTRL
events, and can also notice the attachment/detachment of external
monitors and mice.
-config SUN_JSFLASH
- tristate "JavaStation OS Flash SIMM"
- depends on SPARC32
- help
- If you say Y here, you will be able to boot from your JavaStation's
- Flash memory.
-
config BBC_I2C
tristate "UltraSPARC-III bootbus i2c controller driver"
depends on PCI && SPARC64
diff --git a/drivers/sbus/char/Makefile b/drivers/sbus/char/Makefile
index 8c48ed96683f..44347c918f6b 100644
--- a/drivers/sbus/char/Makefile
+++ b/drivers/sbus/char/Makefile
@@ -15,6 +15,5 @@ obj-$(CONFIG_DISPLAY7SEG) += display7seg.o
obj-$(CONFIG_OBP_FLASH) += flash.o
obj-$(CONFIG_SUN_OPENPROMIO) += openprom.o
obj-$(CONFIG_TADPOLE_TS102_UCTRL) += uctrl.o
-obj-$(CONFIG_SUN_JSFLASH) += jsflash.o
obj-$(CONFIG_BBC_I2C) += bbc.o
obj-$(CONFIG_ORACLE_DAX) += oradax.o
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
deleted file mode 100644
index 14f377ac1280..000000000000
--- a/drivers/sbus/char/jsflash.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/*
- * drivers/sbus/char/jsflash.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds (drivers/char/mem.c)
- * Copyright (C) 1997 Eddie C. Dost (drivers/sbus/char/flash.c)
- * Copyright (C) 1997-2000 Pavel Machek <pavel@ucw.cz> (drivers/block/nbd.c)
- * Copyright (C) 1999-2000 Pete Zaitcev
- *
- * This driver is used to program OS into a Flash SIMM on
- * Krups and Espresso platforms.
- *
- * TODO: do not allow erase/programming if file systems are mounted.
- * TODO: Erase/program both banks of a 8MB SIMM.
- *
- * It is anticipated that programming an OS Flash will be a routine
- * procedure. In the same time it is exceedingly dangerous because
- * a user can program its OBP flash with OS image and effectively
- * kill the machine.
- *
- * This driver uses an interface different from Eddie's flash.c
- * as a silly safeguard.
- *
- * XXX The flash.c manipulates page caching characteristics in a certain
- * dubious way; also it assumes that remap_pfn_range() can remap
- * PCI bus locations, which may be false. ioremap() must be used
- * instead. We should discuss this.
- */
-
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/miscdevice.h>
-#include <linux/fcntl.h>
-#include <linux/poll.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/genhd.h>
-#include <linux/blkdev.h>
-#include <linux/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/io.h>
-#include <asm/pcic.h>
-#include <asm/oplib.h>
-
-#include <asm/jsflash.h> /* ioctl arguments. <linux/> ?? */
-#define JSFIDSZ (sizeof(struct jsflash_ident_arg))
-#define JSFPRGSZ (sizeof(struct jsflash_program_arg))
-
-/*
- * Our device numbers have no business in system headers.
- * The only thing a user knows is the device name /dev/jsflash.
- *
- * Block devices are laid out like this:
- * minor+0 - Bootstrap, for 8MB SIMM 0x20400000[0x800000]
- * minor+1 - Filesystem to mount, normally 0x20400400[0x7ffc00]
- * minor+2 - Whole flash area for any case... 0x20000000[0x01000000]
- * Total 3 minors per flash device.
- *
- * It is easier to have static size vectors, so we define
- * a total minor range JSF_MAX, which must cover all minors.
- */
-/* character device */
-#define JSF_MINOR 178 /* 178 is registered with hpa */
-/* block device */
-#define JSF_MAX 3 /* 3 minors wasted total so far. */
-#define JSF_NPART 3 /* 3 minors per flash device */
-#define JSF_PART_BITS 2 /* 2 bits of minors to cover JSF_NPART */
-#define JSF_PART_MASK 0x3 /* 2 bits mask */
-
-static DEFINE_MUTEX(jsf_mutex);
-
-/*
- * Access functions.
- * We could ioremap(), but it's easier this way.
- */
-static unsigned int jsf_inl(unsigned long addr)
-{
- unsigned long retval;
-
- __asm__ __volatile__("lda [%1] %2, %0\n\t" :
- "=r" (retval) :
- "r" (addr), "i" (ASI_M_BYPASS));
- return retval;
-}
-
-static void jsf_outl(unsigned long addr, __u32 data)
-{
-
- __asm__ __volatile__("sta %0, [%1] %2\n\t" : :
- "r" (data), "r" (addr), "i" (ASI_M_BYPASS) :
- "memory");
-}
-
-/*
- * soft carrier
- */
-
-struct jsfd_part {
- unsigned long dbase;
- unsigned long dsize;
-};
-
-struct jsflash {
- unsigned long base;
- unsigned long size;
- unsigned long busy; /* In use? */
- struct jsflash_ident_arg id;
- /* int mbase; */ /* Minor base, typically zero */
- struct jsfd_part dv[JSF_NPART];
-};
-
-/*
- * We do not map normal memory or obio as a safety precaution.
- * But offsets are real, for ease of userland programming.
- */
-#define JSF_BASE_TOP 0x30000000
-#define JSF_BASE_ALL 0x20000000
-
-#define JSF_BASE_JK 0x20400000
-
-/*
- */
-static struct gendisk *jsfd_disk[JSF_MAX];
-
-/*
- * Let's pretend we may have several of these...
- */
-static struct jsflash jsf0;
-
-/*
- * Wait for AMD to finish its embedded algorithm.
- * We use the Toggle bit DQ6 (0x40) because it does not
- * depend on the data value as /DATA bit DQ7 does.
- *
- * XXX Do we need any timeout here? So far it never hanged, beware broken hw.
- */
-static void jsf_wait(unsigned long p) {
- unsigned int x1, x2;
-
- for (;;) {
- x1 = jsf_inl(p);
- x2 = jsf_inl(p);
- if ((x1 & 0x40404040) == (x2 & 0x40404040)) return;
- }
-}
-
-/*
- * Programming will only work if Flash is clean,
- * we leave it to the programmer application.
- *
- * AMD must be programmed one byte at a time;
- * thus, Simple Tech SIMM must be written 4 bytes at a time.
- *
- * Write waits for the chip to become ready after the write
- * was finished. This is done so that application would read
- * consistent data after the write is done.
- */
-static void jsf_write4(unsigned long fa, u32 data) {
-
- jsf_outl(fa, 0xAAAAAAAA); /* Unlock 1 Write 1 */
- jsf_outl(fa, 0x55555555); /* Unlock 1 Write 2 */
- jsf_outl(fa, 0xA0A0A0A0); /* Byte Program */
- jsf_outl(fa, data);
-
- jsf_wait(fa);
-}
-
-/*
- */
-static void jsfd_read(char *buf, unsigned long p, size_t togo) {
- union byte4 {
- char s[4];
- unsigned int n;
- } b;
-
- while (togo >= 4) {
- togo -= 4;
- b.n = jsf_inl(p);
- memcpy(buf, b.s, 4);
- p += 4;
- buf += 4;
- }
-}
-
-static int jsfd_queue;
-
-static struct request *jsfd_next_request(void)
-{
- struct request_queue *q;
- struct request *rq;
- int old_pos = jsfd_queue;
-
- do {
- q = jsfd_disk[jsfd_queue]->queue;
- if (++jsfd_queue == JSF_MAX)
- jsfd_queue = 0;
- if (q) {
- rq = blk_fetch_request(q);
- if (rq)
- return rq;
- }
- } while (jsfd_queue != old_pos);
-
- return NULL;
-}
-
-static void jsfd_request(void)
-{
- struct request *req;
-
- req = jsfd_next_request();
- while (req) {
- struct jsfd_part *jdp = req->rq_disk->private_data;
- unsigned long offset = blk_rq_pos(req) << 9;
- size_t len = blk_rq_cur_bytes(req);
- blk_status_t err = BLK_STS_IOERR;
-
- if ((offset + len) > jdp->dsize)
- goto end;
-
- if (rq_data_dir(req) != READ) {
- printk(KERN_ERR "jsfd: write\n");
- goto end;
- }
-
- if ((jdp->dbase & 0xff000000) != 0x20000000) {
- printk(KERN_ERR "jsfd: bad base %x\n", (int)jdp->dbase);
- goto end;
- }
-
- jsfd_read(bio_data(req->bio), jdp->dbase + offset, len);
- err = BLK_STS_OK;
- end:
- if (!__blk_end_request_cur(req, err))
- req = jsfd_next_request();
- }
-}
-
-static void jsfd_do_request(struct request_queue *q)
-{
- jsfd_request();
-}
-
-/*
- * The memory devices use the full 32/64 bits of the offset, and so we cannot
- * check against negative addresses: they are ok. The return value is weird,
- * though, in that case (0).
- *
- * also note that seeking relative to the "end of file" isn't supported:
- * it has no meaning, so it returns -EINVAL.
- */
-static loff_t jsf_lseek(struct file * file, loff_t offset, int orig)
-{
- loff_t ret;
-
- mutex_lock(&jsf_mutex);
- switch (orig) {
- case 0:
- file->f_pos = offset;
- ret = file->f_pos;
- break;
- case 1:
- file->f_pos += offset;
- ret = file->f_pos;
- break;
- default:
- ret = -EINVAL;
- }
- mutex_unlock(&jsf_mutex);
- return ret;
-}
-
-/*
- * OS SIMM Cannot be read in other size but a 32bits word.
- */
-static ssize_t jsf_read(struct file * file, char __user * buf,
- size_t togo, loff_t *ppos)
-{
- unsigned long p = *ppos;
- char __user *tmp = buf;
-
- union byte4 {
- char s[4];
- unsigned int n;
- } b;
-
- if (p < JSF_BASE_ALL || p >= JSF_BASE_TOP) {
- return 0;
- }
-
- if ((p + togo) < p /* wrap */
- || (p + togo) >= JSF_BASE_TOP) {
- togo = JSF_BASE_TOP - p;
- }
-
- if (p < JSF_BASE_ALL && togo != 0) {
-#if 0 /* __bzero XXX */
- size_t x = JSF_BASE_ALL - p;
- if (x > togo) x = togo;
- clear_user(tmp, x);
- tmp += x;
- p += x;
- togo -= x;
-#else
- /*
- * Implementation of clear_user() calls __bzero
- * without regard to modversions,
- * so we cannot build a module.
- */
- return 0;
-#endif
- }
-
- while (togo >= 4) {
- togo -= 4;
- b.n = jsf_inl(p);
- if (copy_to_user(tmp, b.s, 4))
- return -EFAULT;
- tmp += 4;
- p += 4;
- }
-
- /*
- * XXX Small togo may remain if 1 byte is ordered.
- * It would be nice if we did a word size read and unpacked it.
- */
-
- *ppos = p;
- return tmp-buf;
-}
-
-static ssize_t jsf_write(struct file * file, const char __user * buf,
- size_t count, loff_t *ppos)
-{
- return -ENOSPC;
-}
-
-/*
- */
-static int jsf_ioctl_erase(unsigned long arg)
-{
- unsigned long p;
-
- /* p = jsf0.base; hits wrong bank */
- p = 0x20400000;
-
- jsf_outl(p, 0xAAAAAAAA); /* Unlock 1 Write 1 */
- jsf_outl(p, 0x55555555); /* Unlock 1 Write 2 */
- jsf_outl(p, 0x80808080); /* Erase setup */
- jsf_outl(p, 0xAAAAAAAA); /* Unlock 2 Write 1 */
- jsf_outl(p, 0x55555555); /* Unlock 2 Write 2 */
- jsf_outl(p, 0x10101010); /* Chip erase */
-
-#if 0
- /*
- * This code is ok, except that counter based timeout
- * has no place in this world. Let's just drop timeouts...
- */
- {
- int i;
- __u32 x;
- for (i = 0; i < 1000000; i++) {
- x = jsf_inl(p);
- if ((x & 0x80808080) == 0x80808080) break;
- }
- if ((x & 0x80808080) != 0x80808080) {
- printk("jsf0: erase timeout with 0x%08x\n", x);
- } else {
- printk("jsf0: erase done with 0x%08x\n", x);
- }
- }
-#else
- jsf_wait(p);
-#endif
-
- return 0;
-}
-
-/*
- * Program a block of flash.
- * Very simple because we can do it byte by byte anyway.
- */
-static int jsf_ioctl_program(void __user *arg)
-{
- struct jsflash_program_arg abuf;
- char __user *uptr;
- unsigned long p;
- unsigned int togo;
- union {
- unsigned int n;
- char s[4];
- } b;
-
- if (copy_from_user(&abuf, arg, JSFPRGSZ))
- return -EFAULT;
- p = abuf.off;
- togo = abuf.size;
- if ((togo & 3) || (p & 3)) return -EINVAL;
-
- uptr = (char __user *) (unsigned long) abuf.data;
- while (togo != 0) {
- togo -= 4;
- if (copy_from_user(&b.s[0], uptr, 4))
- return -EFAULT;
- jsf_write4(p, b.n);
- p += 4;
- uptr += 4;
- }
-
- return 0;
-}
-
-static long jsf_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
-{
- mutex_lock(&jsf_mutex);
- int error = -ENOTTY;
- void __user *argp = (void __user *)arg;
-
- if (!capable(CAP_SYS_ADMIN)) {
- mutex_unlock(&jsf_mutex);
- return -EPERM;
- }
- switch (cmd) {
- case JSFLASH_IDENT:
- if (copy_to_user(argp, &jsf0.id, JSFIDSZ)) {
- mutex_unlock(&jsf_mutex);
- return -EFAULT;
- }
- break;
- case JSFLASH_ERASE:
- error = jsf_ioctl_erase(arg);
- break;
- case JSFLASH_PROGRAM:
- error = jsf_ioctl_program(argp);
- break;
- }
-
- mutex_unlock(&jsf_mutex);
- return error;
-}
-
-static int jsf_mmap(struct file * file, struct vm_area_struct * vma)
-{
- return -ENXIO;
-}
-
-static int jsf_open(struct inode * inode, struct file * filp)
-{
- mutex_lock(&jsf_mutex);
- if (jsf0.base == 0) {
- mutex_unlock(&jsf_mutex);
- return -ENXIO;
- }
- if (test_and_set_bit(0, (void *)&jsf0.busy) != 0) {
- mutex_unlock(&jsf_mutex);
- return -EBUSY;
- }
-
- mutex_unlock(&jsf_mutex);
- return 0; /* XXX What security? */
-}
-
-static int jsf_release(struct inode *inode, struct file *file)
-{
- jsf0.busy = 0;
- return 0;
-}
-
-static const struct file_operations jsf_fops = {
- .owner = THIS_MODULE,
- .llseek = jsf_lseek,
- .read = jsf_read,
- .write = jsf_write,
- .unlocked_ioctl = jsf_ioctl,
- .mmap = jsf_mmap,
- .open = jsf_open,
- .release = jsf_release,
-};
-
-static struct miscdevice jsf_dev = { JSF_MINOR, "jsflash", &jsf_fops };
-
-static const struct block_device_operations jsfd_fops = {
- .owner = THIS_MODULE,
-};
-
-static int jsflash_init(void)
-{
- int rc;
- struct jsflash *jsf;
- phandle node;
- char banner[128];
- struct linux_prom_registers reg0;
-
- node = prom_getchild(prom_root_node);
- node = prom_searchsiblings(node, "flash-memory");
- if (node != 0 && (s32)node != -1) {
- if (prom_getproperty(node, "reg",
- (char *)&reg0, sizeof(reg0)) == -1) {
- printk("jsflash: no \"reg\" property\n");
- return -ENXIO;
- }
- if (reg0.which_io != 0) {
- printk("jsflash: bus number nonzero: 0x%x:%x\n",
- reg0.which_io, reg0.phys_addr);
- return -ENXIO;
- }
- /*
- * Flash may be somewhere else, for instance on Ebus.
- * So, don't do the following check for IIep flash space.
- */
-#if 0
- if ((reg0.phys_addr >> 24) != 0x20) {
- printk("jsflash: suspicious address: 0x%x:%x\n",
- reg0.which_io, reg0.phys_addr);
- return -ENXIO;
- }
-#endif
- if ((int)reg0.reg_size <= 0) {
- printk("jsflash: bad size 0x%x\n", (int)reg0.reg_size);
- return -ENXIO;
- }
- } else {
- /* XXX Remove this code once PROLL ID12 got widespread */
- printk("jsflash: no /flash-memory node, use PROLL >= 12\n");
- prom_getproperty(prom_root_node, "banner-name", banner, 128);
- if (strcmp (banner, "JavaStation-NC") != 0 &&
- strcmp (banner, "JavaStation-E") != 0) {
- return -ENXIO;
- }
- reg0.which_io = 0;
- reg0.phys_addr = 0x20400000;
- reg0.reg_size = 0x00800000;
- }
-
- /* Let us be really paranoid for modifications to probing code. */
- if (sparc_cpu_model != sun4m) {
- /* We must be on sun4m because we use MMU Bypass ASI. */
- return -ENXIO;
- }
-
- if (jsf0.base == 0) {
- jsf = &jsf0;
-
- jsf->base = reg0.phys_addr;
- jsf->size = reg0.reg_size;
-
- /* XXX Redo the userland interface. */
- jsf->id.off = JSF_BASE_ALL;
- jsf->id.size = 0x01000000; /* 16M - all segments */
- strcpy(jsf->id.name, "Krups_all");
-
- jsf->dv[0].dbase = jsf->base;
- jsf->dv[0].dsize = jsf->size;
- jsf->dv[1].dbase = jsf->base + 1024;
- jsf->dv[1].dsize = jsf->size - 1024;
- jsf->dv[2].dbase = JSF_BASE_ALL;
- jsf->dv[2].dsize = 0x01000000;
-
- printk("Espresso Flash @0x%lx [%d MB]\n", jsf->base,
- (int) (jsf->size / (1024*1024)));
- }
-
- if ((rc = misc_register(&jsf_dev)) != 0) {
- printk(KERN_ERR "jsf: unable to get misc minor %d\n",
- JSF_MINOR);
- jsf0.base = 0;
- return rc;
- }
-
- return 0;
-}
-
-static int jsfd_init(void)
-{
- static DEFINE_SPINLOCK(lock);
- struct jsflash *jsf;
- struct jsfd_part *jdp;
- int err;
- int i;
-
- if (jsf0.base == 0)
- return -ENXIO;
-
- err = -ENOMEM;
- for (i = 0; i < JSF_MAX; i++) {
- struct gendisk *disk = alloc_disk(1);
- if (!disk)
- goto out;
- disk->queue = blk_init_queue(jsfd_do_request, &lock);
- if (!disk->queue) {
- put_disk(disk);
- goto out;
- }
- blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH);
- jsfd_disk[i] = disk;
- }
-
- if (register_blkdev(JSFD_MAJOR, "jsfd")) {
- err = -EIO;
- goto out;
- }
-
- for (i = 0; i < JSF_MAX; i++) {
- struct gendisk *disk = jsfd_disk[i];
- if ((i & JSF_PART_MASK) >= JSF_NPART) continue;
- jsf = &jsf0; /* actually, &jsfv[i >> JSF_PART_BITS] */
- jdp = &jsf->dv[i&JSF_PART_MASK];
-
- disk->major = JSFD_MAJOR;
- disk->first_minor = i;
- sprintf(disk->disk_name, "jsfd%d", i);
- disk->fops = &jsfd_fops;
- set_capacity(disk, jdp->dsize >> 9);
- disk->private_data = jdp;
- add_disk(disk);
- set_disk_ro(disk, 1);
- }
- return 0;
-out:
- while (i--)
- put_disk(jsfd_disk[i]);
- return err;
-}
-
-MODULE_LICENSE("GPL");
-
-static int __init jsflash_init_module(void) {
- int rc;
-
- if ((rc = jsflash_init()) == 0) {
- jsfd_init();
- return 0;
- }
- return rc;
-}
-
-static void __exit jsflash_cleanup_module(void)
-{
- int i;
-
- for (i = 0; i < JSF_MAX; i++) {
- if ((i & JSF_PART_MASK) >= JSF_NPART) continue;
- del_gendisk(jsfd_disk[i]);
- blk_cleanup_queue(jsfd_disk[i]->queue);
- put_disk(jsfd_disk[i]);
- }
- if (jsf0.busy)
- printk("jsf0: cleaning busy unit\n");
- jsf0.base = 0;
- jsf0.busy = 0;
-
- misc_deregister(&jsf_dev);
- unregister_blkdev(JSFD_MAJOR, "jsfd");
-}
-
-module_init(jsflash_init_module);
-module_exit(jsflash_cleanup_module);
diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c
index c35f05c4c6bb..85604795d8ee 100644
--- a/drivers/scsi/gdth.c
+++ b/drivers/scsi/gdth.c
@@ -3882,7 +3882,7 @@ static enum blk_eh_timer_return gdth_timed_out(struct scsi_cmnd *scp)
struct gdth_cmndinfo *cmndinfo = gdth_cmnd_priv(scp);
u8 b, t;
unsigned long flags;
- enum blk_eh_timer_return retval = BLK_EH_NOT_HANDLED;
+ enum blk_eh_timer_return retval = BLK_EH_DONE;
TRACE(("%s() cmd 0x%x\n", scp->cmnd[0], __func__));
b = scp->device->channel;
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 15a2fef51e38..71bdc0b52cf9 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -1963,7 +1963,7 @@ static int iscsi_has_ping_timed_out(struct iscsi_conn *conn)
enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
{
- enum blk_eh_timer_return rc = BLK_EH_NOT_HANDLED;
+ enum blk_eh_timer_return rc = BLK_EH_DONE;
struct iscsi_task *task = NULL, *running_task;
struct iscsi_cls_session *cls_session;
struct iscsi_session *session;
@@ -1982,7 +1982,7 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
* Raced with completion. Blk layer has taken ownership
* so let timeout code complete it now.
*/
- rc = BLK_EH_HANDLED;
+ rc = BLK_EH_DONE;
goto done;
}
@@ -1997,7 +1997,7 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc)
if (unlikely(system_state != SYSTEM_RUNNING)) {
sc->result = DID_NO_CONNECT << 16;
ISCSI_DBG_EH(session, "sc on shutdown, handled\n");
- rc = BLK_EH_HANDLED;
+ rc = BLK_EH_DONE;
goto done;
}
/*
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index b89c6e6c0589..ce656c466ca9 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -2772,7 +2772,7 @@ blk_eh_timer_return megasas_reset_timer(struct scsi_cmnd *scmd)
if (time_after(jiffies, scmd->jiffies_at_alloc +
(scmd_timeout * 2) * HZ)) {
- return BLK_EH_NOT_HANDLED;
+ return BLK_EH_DONE;
}
instance = (struct megasas_instance *)scmd->device->host->hostdata;
diff --git a/drivers/scsi/mvumi.c b/drivers/scsi/mvumi.c
index fe97401ad192..afd27165cd93 100644
--- a/drivers/scsi/mvumi.c
+++ b/drivers/scsi/mvumi.c
@@ -2155,7 +2155,7 @@ static enum blk_eh_timer_return mvumi_timed_out(struct scsi_cmnd *scmd)
mvumi_return_cmd(mhba, cmd);
spin_unlock_irqrestore(mhba->shost->host_lock, flags);
- return BLK_EH_NOT_HANDLED;
+ return BLK_EH_DONE;
}
static int
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index e18877177f1b..5a33e1ad9881 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -99,7 +99,7 @@ static int _osd_get_print_system_info(struct osd_dev *od,
int nelem = ARRAY_SIZE(get_attrs), a = 0;
int ret;
- or = osd_start_request(od, GFP_KERNEL);
+ or = osd_start_request(od);
if (!or)
return -ENOMEM;
@@ -409,16 +409,15 @@ static void _osd_request_free(struct osd_request *or)
kfree(or);
}
-struct osd_request *osd_start_request(struct osd_dev *dev, gfp_t gfp)
+struct osd_request *osd_start_request(struct osd_dev *dev)
{
struct osd_request *or;
- or = _osd_request_alloc(gfp);
+ or = _osd_request_alloc(GFP_KERNEL);
if (!or)
return NULL;
or->osd_dev = dev;
- or->alloc_flags = gfp;
or->timeout = dev->def_timeout;
or->retries = OSD_REQ_RETRIES;
@@ -546,7 +545,7 @@ static int _osd_realloc_seg(struct osd_request *or,
if (seg->alloc_size >= max_bytes)
return 0;
- buff = krealloc(seg->buff, max_bytes, or->alloc_flags);
+ buff = krealloc(seg->buff, max_bytes, GFP_KERNEL);
if (!buff) {
OSD_ERR("Failed to Realloc %d-bytes was-%d\n", max_bytes,
seg->alloc_size);
@@ -728,7 +727,7 @@ static int _osd_req_list_objects(struct osd_request *or,
_osd_req_encode_olist(or, list);
WARN_ON(or->in.bio);
- bio = bio_map_kern(q, list, len, or->alloc_flags);
+ bio = bio_map_kern(q, list, len, GFP_KERNEL);
if (IS_ERR(bio)) {
OSD_ERR("!!! Failed to allocate list_objects BIO\n");
return PTR_ERR(bio);
@@ -1190,14 +1189,14 @@ static int _req_append_segment(struct osd_request *or,
pad_buff = io->pad_buff;
ret = blk_rq_map_kern(io->req->q, io->req, pad_buff, padding,
- or->alloc_flags);
+ GFP_KERNEL);
if (ret)
return ret;
io->total_bytes += padding;
}
ret = blk_rq_map_kern(io->req->q, io->req, seg->buff, seg->total_bytes,
- or->alloc_flags);
+ GFP_KERNEL);
if (ret)
return ret;
@@ -1564,14 +1563,14 @@ static int _osd_req_finalize_data_integrity(struct osd_request *or,
* osd_finalize_request and helpers
*/
static struct request *_make_request(struct request_queue *q, bool has_write,
- struct _osd_io_info *oii, gfp_t flags)
+ struct _osd_io_info *oii)
{
struct request *req;
struct bio *bio = oii->bio;
int ret;
req = blk_get_request(q, has_write ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
- flags);
+ 0);
if (IS_ERR(req))
return req;
@@ -1589,13 +1588,12 @@ static struct request *_make_request(struct request_queue *q, bool has_write,
static int _init_blk_request(struct osd_request *or,
bool has_in, bool has_out)
{
- gfp_t flags = or->alloc_flags;
struct scsi_device *scsi_device = or->osd_dev->scsi_device;
struct request_queue *q = scsi_device->request_queue;
struct request *req;
int ret;
- req = _make_request(q, has_out, has_out ? &or->out : &or->in, flags);
+ req = _make_request(q, has_out, has_out ? &or->out : &or->in);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
goto out;
@@ -1611,7 +1609,7 @@ static int _init_blk_request(struct osd_request *or,
or->out.req = req;
if (has_in) {
/* allocate bidi request */
- req = _make_request(q, false, &or->in, flags);
+ req = _make_request(q, false, &or->in);
if (IS_ERR(req)) {
OSD_DEBUG("blk_get_request for bidi failed\n");
ret = PTR_ERR(req);
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index 20ec1c01dbd5..2bbe797f8c3d 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -368,7 +368,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd,
int write = (data_direction == DMA_TO_DEVICE);
req = blk_get_request(SRpnt->stp->device->request_queue,
- write ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, GFP_KERNEL);
+ write ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
if (IS_ERR(req))
return DRIVER_ERROR << 24;
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 94c14ce94da2..0e13349dce57 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -1848,7 +1848,7 @@ static enum blk_eh_timer_return qla4xxx_eh_cmd_timed_out(struct scsi_cmnd *sc)
struct iscsi_cls_session *session;
struct iscsi_session *sess;
unsigned long flags;
- enum blk_eh_timer_return ret = BLK_EH_NOT_HANDLED;
+ enum blk_eh_timer_return ret = BLK_EH_DONE;
session = starget_to_session(scsi_target(sc->device));
sess = session->dd_data;
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 946039117bf4..9c02ba2e7ef3 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -282,7 +282,7 @@ void scsi_eh_scmd_add(struct scsi_cmnd *scmd)
enum blk_eh_timer_return scsi_times_out(struct request *req)
{
struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
- enum blk_eh_timer_return rtn = BLK_EH_NOT_HANDLED;
+ enum blk_eh_timer_return rtn = BLK_EH_DONE;
struct Scsi_Host *host = scmd->device->host;
trace_scsi_dispatch_cmd_timeout(scmd);
@@ -294,7 +294,7 @@ enum blk_eh_timer_return scsi_times_out(struct request *req)
if (host->hostt->eh_timed_out)
rtn = host->hostt->eh_timed_out(scmd);
- if (rtn == BLK_EH_NOT_HANDLED) {
+ if (rtn == BLK_EH_DONE) {
if (scsi_abort_command(scmd) != SUCCESS) {
set_host_byte(scmd, DID_TIME_OUT);
scsi_eh_scmd_add(scmd);
@@ -1933,11 +1933,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
struct request *req;
struct scsi_request *rq;
- /*
- * blk_get_request with GFP_KERNEL (__GFP_RECLAIM) sleeps until a
- * request becomes available
- */
- req = blk_get_request(sdev->request_queue, REQ_OP_SCSI_IN, GFP_KERNEL);
+ req = blk_get_request(sdev->request_queue, REQ_OP_SCSI_IN, 0);
if (IS_ERR(req))
return;
rq = scsi_req(req);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e9b4f279d29c..f125fd71c0f2 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -265,7 +265,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
struct scsi_request *rq;
int ret = DRIVER_ERROR << 24;
- req = blk_get_request_flags(sdev->request_queue,
+ req = blk_get_request(sdev->request_queue,
data_direction == DMA_TO_DEVICE ?
REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, BLK_MQ_REQ_PREEMPT);
if (IS_ERR(req))
@@ -273,7 +273,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
rq = scsi_req(req);
if (bufflen && blk_rq_map_kern(sdev->request_queue, req,
- buffer, bufflen, __GFP_RECLAIM))
+ buffer, bufflen, GFP_NOIO))
goto out;
rq->cmd_len = COMMAND_SIZE(cmd[0]);
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index be3be0f9cb2d..1da3d71e9f61 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -2087,7 +2087,7 @@ fc_eh_timed_out(struct scsi_cmnd *scmd)
if (rport->port_state == FC_PORTSTATE_BLOCKED)
return BLK_EH_RESET_TIMER;
- return BLK_EH_NOT_HANDLED;
+ return BLK_EH_DONE;
}
EXPORT_SYMBOL(fc_eh_timed_out);
@@ -3591,10 +3591,9 @@ fc_bsg_job_timeout(struct request *req)
}
/* the blk_end_sync_io() doesn't check the error */
- if (!inflight)
- return BLK_EH_NOT_HANDLED;
- else
- return BLK_EH_HANDLED;
+ if (inflight)
+ blk_mq_complete_request(req);
+ return BLK_EH_DONE;
}
/**
@@ -3781,8 +3780,7 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
snprintf(bsg_name, sizeof(bsg_name),
"fc_host%d", shost->host_no);
- q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size,
- NULL);
+ q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size);
if (IS_ERR(q)) {
dev_err(dev,
"fc_host%d: bsg interface failed to initialize - setup queue\n",
@@ -3827,8 +3825,8 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
if (!i->f->bsg_request)
return -ENOTSUPP;
- q = bsg_setup_queue(dev, NULL, fc_bsg_dispatch, i->f->dd_bsg_size,
- NULL);
+ q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch,
+ i->f->dd_bsg_size);
if (IS_ERR(q)) {
dev_err(dev, "failed to setup bsg queue\n");
return PTR_ERR(q);
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 65f6c94f2e9b..6fd2fe210fc3 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1542,7 +1542,7 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct iscsi_cls_host *ihost)
return -ENOTSUPP;
snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no);
- q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0, NULL);
+ q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0);
if (IS_ERR(q)) {
shost_printk(KERN_ERR, shost, "bsg interface failed to "
"initialize - no request queue\n");
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 08acbabfae07..e2953b416746 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -187,16 +187,6 @@ static int sas_smp_dispatch(struct bsg_job *job)
return 0;
}
-static void sas_host_release(struct device *dev)
-{
- struct Scsi_Host *shost = dev_to_shost(dev);
- struct sas_host_attrs *sas_host = to_sas_host_attrs(shost);
- struct request_queue *q = sas_host->q;
-
- if (q)
- blk_cleanup_queue(q);
-}
-
static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
{
struct request_queue *q;
@@ -208,7 +198,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
if (rphy) {
q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev),
- sas_smp_dispatch, 0, NULL);
+ sas_smp_dispatch, 0);
if (IS_ERR(q))
return PTR_ERR(q);
rphy->q = q;
@@ -217,7 +207,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
snprintf(name, sizeof(name), "sas_host%d", shost->host_no);
q = bsg_setup_queue(&shost->shost_gendev, name,
- sas_smp_dispatch, 0, sas_host_release);
+ sas_smp_dispatch, 0);
if (IS_ERR(q))
return PTR_ERR(q);
to_sas_host_attrs(shost)->q = q;
@@ -260,8 +250,11 @@ static int sas_host_remove(struct transport_container *tc, struct device *dev,
struct Scsi_Host *shost = dev_to_shost(dev);
struct request_queue *q = to_sas_host_attrs(shost)->q;
- if (q)
+ if (q) {
bsg_unregister_queue(q);
+ blk_cleanup_queue(q);
+ }
+
return 0;
}
diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c
index 456ce9f19569..4e46fdb2d7c9 100644
--- a/drivers/scsi/scsi_transport_srp.c
+++ b/drivers/scsi/scsi_transport_srp.c
@@ -604,7 +604,7 @@ EXPORT_SYMBOL(srp_reconnect_rport);
*
* If a timeout occurs while an rport is in the blocked state, ask the SCSI
* EH to continue waiting (BLK_EH_RESET_TIMER). Otherwise let the SCSI core
- * handle the timeout (BLK_EH_NOT_HANDLED).
+ * handle the timeout (BLK_EH_DONE).
*
* Note: This function is called from soft-IRQ context and with the request
* queue lock held.
@@ -620,7 +620,7 @@ enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd)
return rport && rport->fast_io_fail_tmo < 0 &&
rport->dev_loss_tmo < 0 &&
i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ?
- BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED;
+ BLK_EH_RESET_TIMER : BLK_EH_DONE;
}
EXPORT_SYMBOL(srp_timed_out);
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 5c40d809830f..b6f174df9c8c 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1715,7 +1715,7 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
* does not sleep except under memory pressure.
*/
rq = blk_get_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ?
- REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, GFP_KERNEL);
+ REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
if (IS_ERR(rq)) {
kfree(long_cmdp);
return PTR_ERR(rq);
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 6c399480783d..a427ce9497be 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -545,7 +545,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
req = blk_get_request(SRpnt->stp->device->request_queue,
data_direction == DMA_TO_DEVICE ?
- REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, GFP_KERNEL);
+ REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
if (IS_ERR(req))
return DRIVER_ERROR << 24;
rq = scsi_req(req);
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 00e79057f870..d0a1674915a1 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -6497,12 +6497,12 @@ static enum blk_eh_timer_return ufshcd_eh_timed_out(struct scsi_cmnd *scmd)
bool found = false;
if (!scmd || !scmd->device || !scmd->device->host)
- return BLK_EH_NOT_HANDLED;
+ return BLK_EH_DONE;
host = scmd->device->host;
hba = shost_priv(host);
if (!hba)
- return BLK_EH_NOT_HANDLED;
+ return BLK_EH_DONE;
spin_lock_irqsave(host->host_lock, flags);
@@ -6520,7 +6520,7 @@ static enum blk_eh_timer_return ufshcd_eh_timed_out(struct scsi_cmnd *scmd)
* SCSI command was not actually dispatched to UFS driver, otherwise
* let SCSI layer handle the error as usual.
*/
- return found ? BLK_EH_NOT_HANDLED : BLK_EH_RESET_TIMER;
+ return found ? BLK_EH_DONE : BLK_EH_RESET_TIMER;
}
static const struct attribute_group *ufshcd_driver_groups[] = {
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 60429011292a..ce1321a5cb7b 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -94,8 +94,8 @@ static int iblock_configure_device(struct se_device *dev)
return -EINVAL;
}
- ib_dev->ibd_bio_set = bioset_create(IBLOCK_BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (!ib_dev->ibd_bio_set) {
+ ret = bioset_init(&ib_dev->ibd_bio_set, IBLOCK_BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (ret) {
pr_err("IBLOCK: Unable to create bioset\n");
goto out;
}
@@ -141,7 +141,7 @@ static int iblock_configure_device(struct se_device *dev)
bi = bdev_get_integrity(bd);
if (bi) {
- struct bio_set *bs = ib_dev->ibd_bio_set;
+ struct bio_set *bs = &ib_dev->ibd_bio_set;
if (!strcmp(bi->profile->name, "T10-DIF-TYPE3-IP") ||
!strcmp(bi->profile->name, "T10-DIF-TYPE1-IP")) {
@@ -164,7 +164,7 @@ static int iblock_configure_device(struct se_device *dev)
goto out_blkdev_put;
}
pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n",
- bs->bio_integrity_pool);
+ &bs->bio_integrity_pool);
}
dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type;
}
@@ -174,8 +174,7 @@ static int iblock_configure_device(struct se_device *dev)
out_blkdev_put:
blkdev_put(ib_dev->ibd_bd, FMODE_WRITE|FMODE_READ|FMODE_EXCL);
out_free_bioset:
- bioset_free(ib_dev->ibd_bio_set);
- ib_dev->ibd_bio_set = NULL;
+ bioset_exit(&ib_dev->ibd_bio_set);
out:
return ret;
}
@@ -199,8 +198,7 @@ static void iblock_destroy_device(struct se_device *dev)
if (ib_dev->ibd_bd != NULL)
blkdev_put(ib_dev->ibd_bd, FMODE_WRITE|FMODE_READ|FMODE_EXCL);
- if (ib_dev->ibd_bio_set != NULL)
- bioset_free(ib_dev->ibd_bio_set);
+ bioset_exit(&ib_dev->ibd_bio_set);
}
static unsigned long long iblock_emulate_read_cap_with_block_size(
@@ -332,7 +330,7 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num, int op,
if (sg_num > BIO_MAX_PAGES)
sg_num = BIO_MAX_PAGES;
- bio = bio_alloc_bioset(GFP_NOIO, sg_num, ib_dev->ibd_bio_set);
+ bio = bio_alloc_bioset(GFP_NOIO, sg_num, &ib_dev->ibd_bio_set);
if (!bio) {
pr_err("Unable to allocate memory for bio\n");
return NULL;
diff --git a/drivers/target/target_core_iblock.h b/drivers/target/target_core_iblock.h
index b4aeb2584ad4..9cc3843404d4 100644
--- a/drivers/target/target_core_iblock.h
+++ b/drivers/target/target_core_iblock.h
@@ -22,7 +22,7 @@ struct iblock_dev {
struct se_device dev;
unsigned char ibd_udev_path[SE_UDEV_PATH_LEN];
u32 ibd_flags;
- struct bio_set *ibd_bio_set;
+ struct bio_set ibd_bio_set;
struct block_device *ibd_bd;
bool ibd_readonly;
} ____cacheline_aligned;
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 6cb933ecc084..668934ea74cb 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -986,8 +986,7 @@ pscsi_execute_cmd(struct se_cmd *cmd)
req = blk_get_request(pdv->pdv_sd->request_queue,
cmd->data_direction == DMA_TO_DEVICE ?
- REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
- GFP_KERNEL);
+ REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
if (IS_ERR(req)) {
pr_err("PSCSI: blk_get_request() failed\n");
ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7ec920e27065..bef6934b6189 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -272,7 +272,7 @@ struct blkdev_dio {
struct bio bio;
};
-static struct bio_set *blkdev_dio_pool __read_mostly;
+static struct bio_set blkdev_dio_pool;
static void blkdev_bio_end_io(struct bio *bio)
{
@@ -334,7 +334,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool);
+ bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
bio_get(bio); /* extra ref for the completion handler */
dio = container_of(bio, struct blkdev_dio, bio);
@@ -432,10 +432,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
static __init int blkdev_init(void)
{
- blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
- if (!blkdev_dio_pool)
- return -ENOMEM;
- return 0;
+ return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
}
module_init(blkdev_init);
@@ -1322,27 +1319,30 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
* check_disk_size_change - checks for disk size change and adjusts bdev size.
* @disk: struct gendisk to check
* @bdev: struct bdev to adjust.
+ * @verbose: if %true log a message about a size change if there is any
*
* This routine checks to see if the bdev size does not match the disk size
* and adjusts it if it differs. When shrinking the bdev size, its all caches
* are freed.
*/
-void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
+void check_disk_size_change(struct gendisk *disk, struct block_device *bdev,
+ bool verbose)
{
loff_t disk_size, bdev_size;
disk_size = (loff_t)get_capacity(disk) << 9;
bdev_size = i_size_read(bdev->bd_inode);
if (disk_size != bdev_size) {
- printk(KERN_INFO
- "%s: detected capacity change from %lld to %lld\n",
- disk->disk_name, bdev_size, disk_size);
+ if (verbose) {
+ printk(KERN_INFO
+ "%s: detected capacity change from %lld to %lld\n",
+ disk->disk_name, bdev_size, disk_size);
+ }
i_size_write(bdev->bd_inode, disk_size);
if (bdev_size > disk_size)
flush_disk(bdev, false);
}
}
-EXPORT_SYMBOL(check_disk_size_change);
/**
* revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
@@ -1364,7 +1364,7 @@ int revalidate_disk(struct gendisk *disk)
return ret;
mutex_lock(&bdev->bd_mutex);
- check_disk_size_change(disk, bdev);
+ check_disk_size_change(disk, bdev, ret == 0);
bdev->bd_invalidated = 0;
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e99b329002cf..56d32bb462f9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -26,7 +26,7 @@
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
-static struct bio_set *btrfs_bioset;
+static struct bio_set btrfs_bioset;
static inline bool extent_state_in_tree(const struct extent_state *state)
{
@@ -162,20 +162,18 @@ int __init extent_io_init(void)
if (!extent_buffer_cache)
goto free_state_cache;
- btrfs_bioset = bioset_create(BIO_POOL_SIZE,
- offsetof(struct btrfs_io_bio, bio),
- BIOSET_NEED_BVECS);
- if (!btrfs_bioset)
+ if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_io_bio, bio),
+ BIOSET_NEED_BVECS))
goto free_buffer_cache;
- if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
+ if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
goto free_bioset;
return 0;
free_bioset:
- bioset_free(btrfs_bioset);
- btrfs_bioset = NULL;
+ bioset_exit(&btrfs_bioset);
free_buffer_cache:
kmem_cache_destroy(extent_buffer_cache);
@@ -198,8 +196,7 @@ void __cold extent_io_exit(void)
rcu_barrier();
kmem_cache_destroy(extent_state_cache);
kmem_cache_destroy(extent_buffer_cache);
- if (btrfs_bioset)
- bioset_free(btrfs_bioset);
+ bioset_exit(&btrfs_bioset);
}
void extent_io_tree_init(struct extent_io_tree *tree,
@@ -2679,7 +2676,7 @@ struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte)
{
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset);
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = first_byte >> 9;
btrfs_io_bio_init(btrfs_io_bio(bio));
@@ -2692,7 +2689,7 @@ struct bio *btrfs_bio_clone(struct bio *bio)
struct bio *new;
/* Bio allocation backed by a bioset does not fail */
- new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset);
+ new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
btrfs_bio = btrfs_io_bio(new);
btrfs_io_bio_init(btrfs_bio);
btrfs_bio->iter = bio->bi_iter;
@@ -2704,7 +2701,7 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
struct bio *bio;
/* Bio allocation backed by a bioset does not fail */
- bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset);
+ bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
btrfs_io_bio_init(btrfs_io_bio(bio));
return bio;
}
@@ -2715,7 +2712,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
struct btrfs_io_bio *btrfs_bio;
/* this will never fail when it's backed by a bioset */
- bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset);
+ bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
btrfs_bio = btrfs_io_bio(bio);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 874607bb6e02..093fb54cd316 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -432,8 +432,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
struct bio *bio;
/*
- * bio_alloc() is guaranteed to return a bio when called with
- * __GFP_RECLAIM and we request a valid number of vectors.
+ * bio_alloc() is guaranteed to return a bio when allowed to sleep and
+ * we request a valid number of vectors.
*/
bio = bio_alloc(GFP_KERNEL, nr_vecs);
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 3c6a9c156b7a..ddbf87246898 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -790,7 +790,7 @@ int ore_create(struct ore_io_state *ios)
for (i = 0; i < ios->oc->numdevs; i++) {
struct osd_request *or;
- or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
+ or = osd_start_request(_ios_od(ios, i));
if (unlikely(!or)) {
ORE_ERR("%s: osd_start_request failed\n", __func__);
ret = -ENOMEM;
@@ -815,7 +815,7 @@ int ore_remove(struct ore_io_state *ios)
for (i = 0; i < ios->oc->numdevs; i++) {
struct osd_request *or;
- or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
+ or = osd_start_request(_ios_od(ios, i));
if (unlikely(!or)) {
ORE_ERR("%s: osd_start_request failed\n", __func__);
ret = -ENOMEM;
@@ -847,7 +847,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
struct osd_request *or;
- or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL);
+ or = osd_start_request(_ios_od(ios, dev));
if (unlikely(!or)) {
ORE_ERR("%s: osd_start_request failed\n", __func__);
ret = -ENOMEM;
@@ -966,7 +966,7 @@ int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
return 0; /* Just an empty slot */
first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
- or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL);
+ or = osd_start_request(_ios_od(ios, first_dev));
if (unlikely(!or)) {
ORE_ERR("%s: osd_start_request failed\n", __func__);
return -ENOMEM;
@@ -1060,7 +1060,7 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
struct osd_request *or;
- or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL);
+ or = osd_start_request(_ios_od(ios, cur_comp));
if (unlikely(!or)) {
ORE_ERR("%s: osd_start_request failed\n", __func__);
return -ENOMEM;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 179cd5c2f52a..719a3152da80 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -229,7 +229,7 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
u64 offset, void *p, unsigned length)
{
- struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+ struct osd_request *or = osd_start_request(od);
/* struct osd_sense_info osi = {.key = 0};*/
int ret;
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 70b8bf781fce..a43dfedd69ec 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -227,7 +227,7 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
if (!buf)
return -ENOMEM;
- rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
+ rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
if (IS_ERR(rq)) {
error = -ENOMEM;
goto out_free_buf;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 0ab824f574ed..102463543db3 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -594,7 +594,7 @@ xfs_alloc_ioend(
struct xfs_ioend *ioend;
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
xfs_init_bio_from_bh(bio, bh);
ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 69346d460dfa..694c85b03813 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,7 +18,7 @@
#ifndef __XFS_AOPS_H__
#define __XFS_AOPS_H__
-extern struct bio_set *xfs_ioend_bioset;
+extern struct bio_set xfs_ioend_bioset;
/*
* Types of I/O for bmap clustering and I/O completion tracking.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d71424052917..f643d76db516 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -63,7 +63,7 @@
#include <linux/parser.h>
static const struct super_operations xfs_super_operations;
-struct bio_set *xfs_ioend_bioset;
+struct bio_set xfs_ioend_bioset;
static struct kset *xfs_kset; /* top-level xfs sysfs dir */
#ifdef DEBUG
@@ -1845,10 +1845,9 @@ MODULE_ALIAS_FS("xfs");
STATIC int __init
xfs_init_zones(void)
{
- xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
+ if (bioset_init(&xfs_ioend_bioset, 4 * MAX_BUF_PER_PAGE,
offsetof(struct xfs_ioend, io_inline_bio),
- BIOSET_NEED_BVECS);
- if (!xfs_ioend_bioset)
+ BIOSET_NEED_BVECS))
goto out;
xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
@@ -1997,7 +1996,7 @@ xfs_init_zones(void)
out_destroy_log_ticket_zone:
kmem_zone_destroy(xfs_log_ticket_zone);
out_free_ioend_bioset:
- bioset_free(xfs_ioend_bioset);
+ bioset_exit(&xfs_ioend_bioset);
out:
return -ENOMEM;
}
@@ -2029,7 +2028,7 @@ xfs_destroy_zones(void)
kmem_zone_destroy(xfs_btree_cur_zone);
kmem_zone_destroy(xfs_bmap_free_item_zone);
kmem_zone_destroy(xfs_log_ticket_zone);
- bioset_free(xfs_ioend_bioset);
+ bioset_exit(&xfs_ioend_bioset);
}
STATIC int __init
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ce547a25e8ae..810a8bee8f85 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -67,8 +67,12 @@
#define bio_multiple_segments(bio) \
((bio)->bi_iter.bi_size != bio_iovec(bio).bv_len)
-#define bio_sectors(bio) ((bio)->bi_iter.bi_size >> 9)
-#define bio_end_sector(bio) ((bio)->bi_iter.bi_sector + bio_sectors((bio)))
+
+#define bvec_iter_sectors(iter) ((iter).bi_size >> 9)
+#define bvec_iter_end_sector(iter) ((iter).bi_sector + bvec_iter_sectors((iter)))
+
+#define bio_sectors(bio) bvec_iter_sectors((bio)->bi_iter)
+#define bio_end_sector(bio) bvec_iter_end_sector((bio)->bi_iter)
/*
* Return the data direction, READ or WRITE.
@@ -406,13 +410,13 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors,
return bio_split(bio, sectors, gfp, bs);
}
-extern struct bio_set *bioset_create(unsigned int, unsigned int, int flags);
enum {
BIOSET_NEED_BVECS = BIT(0),
BIOSET_NEED_RESCUER = BIT(1),
};
-extern void bioset_free(struct bio_set *);
-extern mempool_t *biovec_create_pool(int pool_entries);
+extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags);
+extern void bioset_exit(struct bio_set *);
+extern int biovec_init_pool(mempool_t *pool, int pool_entries);
extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *);
extern void bio_put(struct bio *);
@@ -421,11 +425,11 @@ extern void __bio_clone_fast(struct bio *, struct bio *);
extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
-extern struct bio_set *fs_bio_set;
+extern struct bio_set fs_bio_set;
static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
{
- return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
+ return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set);
}
static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
@@ -499,7 +503,10 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
}
#endif
+extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
+ struct bio *src, struct bvec_iter *src_iter);
extern void bio_copy_data(struct bio *dst, struct bio *src);
+extern void bio_list_copy_data(struct bio *dst, struct bio *src);
extern void bio_free_pages(struct bio *bio);
extern struct bio *bio_copy_user_iov(struct request_queue *,
@@ -507,7 +514,13 @@ extern struct bio *bio_copy_user_iov(struct request_queue *,
struct iov_iter *,
gfp_t);
extern int bio_uncopy_user(struct bio *);
-void zero_fill_bio(struct bio *bio);
+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter);
+
+static inline void zero_fill_bio(struct bio *bio)
+{
+ zero_fill_bio_iter(bio, bio->bi_iter);
+}
+
extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
extern unsigned int bvec_nr_vecs(unsigned short idx);
@@ -722,11 +735,11 @@ struct bio_set {
struct kmem_cache *bio_slab;
unsigned int front_pad;
- mempool_t *bio_pool;
- mempool_t *bvec_pool;
+ mempool_t bio_pool;
+ mempool_t bvec_pool;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
- mempool_t *bio_integrity_pool;
- mempool_t *bvec_integrity_pool;
+ mempool_t bio_integrity_pool;
+ mempool_t bvec_integrity_pool;
#endif
/*
@@ -745,6 +758,11 @@ struct biovec_slab {
struct kmem_cache *slab;
};
+static inline bool bioset_initialized(struct bio_set *bs)
+{
+ return bs->bio_slab != NULL;
+}
+
/*
* a small number of entries is fine, not going to be performance critical.
* basically we just need to survive
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ebc34a5686dc..fb355173f3c7 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -259,7 +259,8 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
-
+bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
+ struct bio *bio);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 17b18b91ebac..3c4f390aea4b 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -8,6 +8,7 @@
#include <linux/types.h>
#include <linux/bvec.h>
+#include <linux/ktime.h>
struct bio_set;
struct bio;
@@ -90,10 +91,52 @@ static inline bool blk_path_error(blk_status_t error)
return true;
}
-struct blk_issue_stat {
- u64 stat;
+/*
+ * From most significant bit:
+ * 1 bit: reserved for other usage, see below
+ * 12 bits: original size of bio
+ * 51 bits: issue time of bio
+ */
+#define BIO_ISSUE_RES_BITS 1
+#define BIO_ISSUE_SIZE_BITS 12
+#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS)
+#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
+#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1)
+#define BIO_ISSUE_SIZE_MASK \
+ (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT)
+#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1))
+
+/* Reserved bit for blk-throtl */
+#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)
+
+struct bio_issue {
+ u64 value;
};
+static inline u64 __bio_issue_time(u64 time)
+{
+ return time & BIO_ISSUE_TIME_MASK;
+}
+
+static inline u64 bio_issue_time(struct bio_issue *issue)
+{
+ return __bio_issue_time(issue->value);
+}
+
+static inline sector_t bio_issue_size(struct bio_issue *issue)
+{
+ return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
+}
+
+static inline void bio_issue_init(struct bio_issue *issue,
+ sector_t size)
+{
+ size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;
+ issue->value = ((issue->value & BIO_ISSUE_RES_MASK) |
+ (ktime_get_ns() & BIO_ISSUE_TIME_MASK) |
+ ((u64)size << BIO_ISSUE_SIZE_SHIFT));
+}
+
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
* stacking drivers)
@@ -138,7 +181,7 @@ struct bio {
struct cgroup_subsys_state *bi_css;
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
void *bi_cg_private;
- struct blk_issue_stat bi_issue_stat;
+ struct bio_issue bi_issue;
#endif
#endif
union {
@@ -186,6 +229,8 @@ struct bio {
* throttling rules. Don't do it again. */
#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion
* of this bio. */
+#define BIO_QUEUE_ENTERED 11 /* can use blk_queue_enter_live() */
+
/* See BVEC_POOL_OFFSET below before adding new flags */
/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5c4eee043191..bca3a92eb55f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -125,16 +125,23 @@ typedef __u32 __bitwise req_flags_t;
#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18))
/* The per-zone write lock is held for this request */
#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
-/* timeout is expired */
-#define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20))
/* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 21))
+#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20))
/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
/*
+ * Request state for blk-mq.
+ */
+enum mq_rq_state {
+ MQ_RQ_IDLE = 0,
+ MQ_RQ_IN_FLIGHT = 1,
+ MQ_RQ_COMPLETE = 2,
+};
+
+/*
* Try to put the fields that are referenced together in the same cacheline.
*
* If you modify this structure, make sure to update blk_rq_init() and
@@ -205,9 +212,20 @@ struct request {
struct gendisk *rq_disk;
struct hd_struct *part;
- unsigned long start_time;
- struct blk_issue_stat issue_stat;
- /* Number of scatter-gather DMA addr+len pairs after
+ /* Time that I/O was submitted to the kernel. */
+ u64 start_time_ns;
+ /* Time that I/O was submitted to the device. */
+ u64 io_start_time_ns;
+
+#ifdef CONFIG_BLK_WBT
+ unsigned short wbt_flags;
+#endif
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ unsigned short throtl_size;
+#endif
+
+ /*
+ * Number of scatter-gather DMA addr+len pairs after
* physical address coalescing is performed.
*/
unsigned short nr_phys_segments;
@@ -219,32 +237,14 @@ struct request {
unsigned short write_hint;
unsigned short ioprio;
- unsigned int timeout;
-
void *special; /* opaque pointer available for LLD use */
unsigned int extra_len; /* length of alignment and padding */
- /*
- * On blk-mq, the lower bits of ->gstate (generation number and
- * state) carry the MQ_RQ_* state value and the upper bits the
- * generation number which is monotonically incremented and used to
- * distinguish the reuse instances.
- *
- * ->gstate_seq allows updates to ->gstate and other fields
- * (currently ->deadline) during request start to be read
- * atomically from the timeout path, so that it can operate on a
- * coherent set of information.
- */
- seqcount_t gstate_seq;
- u64 gstate;
+ enum mq_rq_state state;
+ refcount_t ref;
- /*
- * ->aborted_gstate is used by the timeout to claim a specific
- * recycle instance of this request. See blk_mq_timeout_work().
- */
- struct u64_stats_sync aborted_gstate_sync;
- u64 aborted_gstate;
+ unsigned int timeout;
/* access through blk_rq_set_deadline, blk_rq_deadline */
unsigned long __deadline;
@@ -267,8 +267,6 @@ struct request {
#ifdef CONFIG_BLK_CGROUP
struct request_list *rl; /* rl this rq is alloced from */
- unsigned long long start_time_ns;
- unsigned long long io_start_time_ns; /* when passed to hardware */
#endif
};
@@ -328,9 +326,8 @@ typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t);
typedef void (exit_rq_fn)(struct request_queue *, struct request *);
enum blk_eh_timer_return {
- BLK_EH_NOT_HANDLED,
- BLK_EH_HANDLED,
- BLK_EH_RESET_TIMER,
+ BLK_EH_DONE, /* drivers has completed the command */
+ BLK_EH_RESET_TIMER, /* reset timer and try again */
};
typedef enum blk_eh_timer_return (rq_timed_out_fn)(struct request *);
@@ -655,7 +652,7 @@ struct request_queue {
struct blk_mq_tag_set *tag_set;
struct list_head tag_set_list;
- struct bio_set *bio_split;
+ struct bio_set bio_split;
#ifdef CONFIG_BLK_DEBUG_FS
struct dentry *debugfs_dir;
@@ -967,11 +964,8 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq);
extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
extern void blk_put_request(struct request *);
extern void __blk_put_request(struct request_queue *, struct request *);
-extern struct request *blk_get_request_flags(struct request_queue *,
- unsigned int op,
- blk_mq_req_flags_t flags);
extern struct request *blk_get_request(struct request_queue *, unsigned int op,
- gfp_t gfp_mask);
+ blk_mq_req_flags_t flags);
extern void blk_requeue_request(struct request_queue *, struct request *);
extern int blk_lld_busy(struct request_queue *q);
extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
@@ -1788,48 +1782,6 @@ int kblockd_schedule_work(struct work_struct *work);
int kblockd_schedule_work_on(int cpu, struct work_struct *work);
int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
-#ifdef CONFIG_BLK_CGROUP
-/*
- * This should not be using sched_clock(). A real patch is in progress
- * to fix this up, until that is in place we need to disable preemption
- * around sched_clock() in this function and set_io_start_time_ns().
- */
-static inline void set_start_time_ns(struct request *req)
-{
- preempt_disable();
- req->start_time_ns = sched_clock();
- preempt_enable();
-}
-
-static inline void set_io_start_time_ns(struct request *req)
-{
- preempt_disable();
- req->io_start_time_ns = sched_clock();
- preempt_enable();
-}
-
-static inline uint64_t rq_start_time_ns(struct request *req)
-{
- return req->start_time_ns;
-}
-
-static inline uint64_t rq_io_start_time_ns(struct request *req)
-{
- return req->io_start_time_ns;
-}
-#else
-static inline void set_start_time_ns(struct request *req) {}
-static inline void set_io_start_time_ns(struct request *req) {}
-static inline uint64_t rq_start_time_ns(struct request *req)
-{
- return 0;
-}
-static inline uint64_t rq_io_start_time_ns(struct request *req)
-{
- return 0;
-}
-#endif
-
#define MODULE_ALIAS_BLOCKDEV(major,minor) \
MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index 28a7ccc55c89..6aeaf6472665 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -72,8 +72,7 @@ struct bsg_job {
void bsg_job_done(struct bsg_job *job, int result,
unsigned int reply_payload_rcv_len);
struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
- bsg_job_fn *job_fn, int dd_job_size,
- void (*release)(struct device *));
+ bsg_job_fn *job_fn, int dd_job_size);
void bsg_job_put(struct bsg_job *job);
int __must_check bsg_job_get(struct bsg_job *job);
diff --git a/include/linux/bsg.h b/include/linux/bsg.h
index 0c7dd9ceb139..dac37b6e00ec 100644
--- a/include/linux/bsg.h
+++ b/include/linux/bsg.h
@@ -17,17 +17,13 @@ struct bsg_ops {
struct bsg_class_device {
struct device *class_dev;
- struct device *parent;
int minor;
struct request_queue *queue;
- struct kref ref;
const struct bsg_ops *ops;
- void (*release)(struct device *);
};
int bsg_register_queue(struct request_queue *q, struct device *parent,
- const char *name, const struct bsg_ops *ops,
- void (*release)(struct device *));
+ const char *name, const struct bsg_ops *ops);
int bsg_scsi_register_queue(struct request_queue *q, struct device *parent);
void bsg_unregister_queue(struct request_queue *q);
#else
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 6d9e230dffd2..a02deea30185 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -218,8 +218,6 @@ extern void elv_unregister(struct elevator_type *);
extern ssize_t elv_iosched_show(struct request_queue *, char *);
extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t);
-extern int elevator_init(struct request_queue *, char *);
-extern void elevator_exit(struct request_queue *, struct elevator_queue *);
extern bool elv_bio_merge_ok(struct request *, struct bio *);
extern struct elevator_queue *elevator_alloc(struct request_queue *,
struct elevator_type *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 760d8da1b6c7..d8d4831af9ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2570,7 +2570,7 @@ extern bool is_bad_inode(struct inode *);
#ifdef CONFIG_BLOCK
extern void check_disk_size_change(struct gendisk *disk,
- struct block_device *bdev);
+ struct block_device *bdev, bool verbose);
extern int revalidate_disk(struct gendisk *);
extern int check_disk_change(struct block_device *);
extern int __invalidate_device(struct block_device *, bool);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 1795fecdea17..1c113134c98f 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1133,7 +1133,6 @@ extern int ata_sas_port_start(struct ata_port *ap);
extern void ata_sas_port_stop(struct ata_port *ap);
extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *);
extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap);
-extern enum blk_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
extern int sata_scr_valid(struct ata_link *link);
extern int sata_scr_read(struct ata_link *link, int reg, u32 *val);
extern int sata_scr_write(struct ata_link *link, int reg, u32 val);
@@ -1359,7 +1358,6 @@ extern struct device_attribute *ata_common_sdev_attrs[];
.proc_name = drv_name, \
.slave_configure = ata_scsi_slave_config, \
.slave_destroy = ata_scsi_slave_destroy, \
- .eh_timed_out = ata_scsi_timed_out, \
.bios_param = ata_std_bios_param, \
.unlock_native_capacity = ata_scsi_unlock_native_capacity, \
.sdev_attrs = ata_common_sdev_attrs
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 6e0859b9d4d2..e9e0d1c7eaf5 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -489,7 +489,7 @@ typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
typedef sector_t (nvm_tgt_capacity_fn)(void *);
typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *,
int flags);
-typedef void (nvm_tgt_exit_fn)(void *);
+typedef void (nvm_tgt_exit_fn)(void *, bool);
typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *);
typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *);
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index b51f5c430c26..0c964ac107c2 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -25,6 +25,18 @@ typedef struct mempool_s {
wait_queue_head_t wait;
} mempool_t;
+static inline bool mempool_initialized(mempool_t *pool)
+{
+ return pool->elements != NULL;
+}
+
+void mempool_exit(mempool_t *pool);
+int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data,
+ gfp_t gfp_mask, int node_id);
+int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data);
+
extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data);
extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
@@ -43,6 +55,14 @@ extern void mempool_free(void *element, mempool_t *pool);
*/
void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
void mempool_free_slab(void *element, void *pool_data);
+
+static inline int
+mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc)
+{
+ return mempool_init(pool, min_nr, mempool_alloc_slab,
+ mempool_free_slab, (void *) kc);
+}
+
static inline mempool_t *
mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
{
@@ -56,6 +76,13 @@ mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
*/
void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data);
void mempool_kfree(void *element, void *pool_data);
+
+static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+ return mempool_init(pool, min_nr, mempool_kmalloc,
+ mempool_kfree, (void *) size);
+}
+
static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
{
return mempool_create(min_nr, mempool_kmalloc, mempool_kfree,
@@ -68,6 +95,13 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
*/
void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data);
void mempool_free_pages(void *element, void *pool_data);
+
+static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order)
+{
+ return mempool_init(pool, min_nr, mempool_alloc_pages,
+ mempool_free_pages, (void *)(long)order);
+}
+
static inline mempool_t *mempool_create_page_pool(int min_nr, int order)
{
return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages,
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 4112e2bd747f..2950ce957656 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -436,10 +436,19 @@ enum {
enum {
NVME_AER_ERROR = 0,
NVME_AER_SMART = 1,
+ NVME_AER_NOTICE = 2,
NVME_AER_CSS = 6,
NVME_AER_VS = 7,
- NVME_AER_NOTICE_NS_CHANGED = 0x0002,
- NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102,
+};
+
+enum {
+ NVME_AER_NOTICE_NS_CHANGED = 0x00,
+ NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
+};
+
+enum {
+ NVME_AEN_CFG_NS_ATTR = 1 << 8,
+ NVME_AEN_CFG_FW_ACT = 1 << 9,
};
struct nvme_lba_range_type {
@@ -747,6 +756,7 @@ enum {
NVME_LOG_ERROR = 0x01,
NVME_LOG_SMART = 0x02,
NVME_LOG_FW_SLOT = 0x03,
+ NVME_LOG_CHANGED_NS = 0x04,
NVME_LOG_CMD_EFFECTS = 0x05,
NVME_LOG_DISC = 0x70,
NVME_LOG_RESERVATION = 0x80,
@@ -755,6 +765,8 @@ enum {
NVME_FWACT_ACTV = (2 << 3),
};
+#define NVME_MAX_CHANGED_NAMESPACES 1024
+
struct nvme_identify {
__u8 opcode;
__u8 flags;
diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h
index 93d142ad1528..174601554b06 100644
--- a/include/linux/pktcdvd.h
+++ b/include/linux/pktcdvd.h
@@ -186,7 +186,7 @@ struct pktcdvd_device
sector_t current_sector; /* Keep track of where the elevator is */
atomic_t scan_queue; /* Set to non-zero when pkt_handle_queue */
/* needs to be run. */
- mempool_t *rb_pool; /* mempool for pkt_rb_node allocations */
+ mempool_t rb_pool; /* mempool for pkt_rb_node allocations */
struct packet_iosched iosched;
struct gendisk *disk;
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 841585f6e5f2..e6539536dea9 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -127,6 +127,12 @@ struct sbitmap_queue {
* @round_robin: Allocate bits in strict round-robin order.
*/
bool round_robin;
+
+ /**
+ * @min_shallow_depth: The minimum shallow depth which may be passed to
+ * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
+ */
+ unsigned int min_shallow_depth;
};
/**
@@ -390,6 +396,9 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq);
* @shallow_depth: The maximum number of bits to allocate from a single word.
* See sbitmap_get_shallow().
*
+ * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after
+ * initializing @sbq.
+ *
* Return: Non-negative allocated bit number if successful, -1 otherwise.
*/
int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
@@ -424,6 +433,9 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
* @shallow_depth: The maximum number of bits to allocate from a single word.
* See sbitmap_get_shallow().
*
+ * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after
+ * initializing @sbq.
+ *
* Return: Non-negative allocated bit number if successful, -1 otherwise.
*/
static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
@@ -439,6 +451,23 @@ static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
}
/**
+ * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the
+ * minimum shallow depth that will be used.
+ * @sbq: Bitmap queue in question.
+ * @min_shallow_depth: The minimum shallow depth that will be passed to
+ * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
+ *
+ * sbitmap_queue_clear() batches wakeups as an optimization. The batch size
+ * depends on the depth of the bitmap. Since the shallow allocation functions
+ * effectively operate with a different depth, the shallow depth must be taken
+ * into account when calculating the batch size. This function must be called
+ * with the minimum shallow depth that will be used. Failure to do so can result
+ * in missed wakeups.
+ */
+void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
+ unsigned int min_shallow_depth);
+
+/**
* sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
* &struct sbitmap_queue.
* @sbq: Bitmap to free from.
@@ -484,6 +513,13 @@ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq,
void sbitmap_queue_wake_all(struct sbitmap_queue *sbq);
/**
+ * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue
+ * on a &struct sbitmap_queue.
+ * @sbq: Bitmap queue to wake up.
+ */
+void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
+
+/**
* sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct
* seq_file.
* @sbq: Bitmap queue to show.
diff --git a/include/scsi/osd_initiator.h b/include/scsi/osd_initiator.h
index a29d3086eb56..86a569d008b2 100644
--- a/include/scsi/osd_initiator.h
+++ b/include/scsi/osd_initiator.h
@@ -148,7 +148,6 @@ struct osd_request {
u8 *pad_buff;
} out, in;
- gfp_t alloc_flags;
unsigned timeout;
unsigned retries;
unsigned sense_len;
@@ -202,14 +201,11 @@ static inline bool osd_req_is_ver1(struct osd_request *or)
*
* @osd_dev: OSD device that holds the scsi-device and default values
* that the request is associated with.
- * @gfp: The allocation flags to use for request allocation, and all
- * subsequent allocations. This will be stored at
- * osd_request->alloc_flags, can be changed by user later
*
* Allocate osd_request and initialize all members to the
* default/initial state.
*/
-struct osd_request *osd_start_request(struct osd_dev *od, gfp_t gfp);
+struct osd_request *osd_start_request(struct osd_dev *od);
enum osd_req_options {
OSD_REQ_FUA = 0x08, /* Force Unit Access */
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 12f454cb6f61..53b485fe9b67 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -307,7 +307,7 @@ struct scsi_host_template {
* EH_HANDLED: I fixed the error, please complete the command
* EH_RESET_TIMER: I need more time, reset the timer and
* begin counting again
- * EH_NOT_HANDLED Begin normal error recovery
+ * EH_DONE: Begin normal error recovery
*
* Status: OPTIONAL
*/
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11b4282c2d20..1efcb5b0c3ed 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -269,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
struct bio *bio;
int error = 0;
- bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
+ bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
bio_set_dev(bio, hib_resume_bdev);
bio_set_op_attrs(bio, op, op_flags);
@@ -376,7 +376,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
return -ENOSPC;
if (hb) {
- src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN |
+ src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY);
if (src) {
copy_page(src, buf);
@@ -384,7 +384,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
ret = hib_wait_io(hb); /* Free pages */
if (ret)
return ret;
- src = (void *)__get_free_page(__GFP_RECLAIM |
+ src = (void *)__get_free_page(GFP_NOIO |
__GFP_NOWARN |
__GFP_NORETRY);
if (src) {
@@ -691,7 +691,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
nr_threads = num_online_cpus() - 1;
nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
- page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
+ page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH);
if (!page) {
pr_err("Failed to allocate LZO page\n");
ret = -ENOMEM;
@@ -989,7 +989,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
last = tmp;
tmp->map = (struct swap_map_page *)
- __get_free_page(__GFP_RECLAIM | __GFP_HIGH);
+ __get_free_page(GFP_NOIO | __GFP_HIGH);
if (!tmp->map) {
release_swap_reader(handle);
return -ENOMEM;
@@ -1261,8 +1261,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
for (i = 0; i < read_pages; i++) {
page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
- __GFP_RECLAIM | __GFP_HIGH :
- __GFP_RECLAIM | __GFP_NOWARN |
+ GFP_NOIO | __GFP_HIGH :
+ GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY);
if (!page[i]) {
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index e6a9c06ec70c..6fdc6267f4a8 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -270,18 +270,33 @@ void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m)
}
EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);
-static unsigned int sbq_calc_wake_batch(unsigned int depth)
+static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int depth)
{
unsigned int wake_batch;
+ unsigned int shallow_depth;
/*
* For each batch, we wake up one queue. We need to make sure that our
- * batch size is small enough that the full depth of the bitmap is
- * enough to wake up all of the queues.
+ * batch size is small enough that the full depth of the bitmap,
+ * potentially limited by a shallow depth, is enough to wake up all of
+ * the queues.
+ *
+ * Each full word of the bitmap has bits_per_word bits, and there might
+ * be a partial word. There are depth / bits_per_word full words and
+ * depth % bits_per_word bits left over. In bitwise arithmetic:
+ *
+ * bits_per_word = 1 << shift
+ * depth / bits_per_word = depth >> shift
+ * depth % bits_per_word = depth & ((1 << shift) - 1)
+ *
+ * Each word can be limited to sbq->min_shallow_depth bits.
*/
- wake_batch = SBQ_WAKE_BATCH;
- if (wake_batch > depth / SBQ_WAIT_QUEUES)
- wake_batch = max(1U, depth / SBQ_WAIT_QUEUES);
+ shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth);
+ depth = ((depth >> sbq->sb.shift) * shallow_depth +
+ min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth));
+ wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1,
+ SBQ_WAKE_BATCH);
return wake_batch;
}
@@ -307,7 +322,8 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
*per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth;
}
- sbq->wake_batch = sbq_calc_wake_batch(depth);
+ sbq->min_shallow_depth = UINT_MAX;
+ sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
atomic_set(&sbq->wake_index, 0);
sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
@@ -327,21 +343,28 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
-void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
+static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int depth)
{
- unsigned int wake_batch = sbq_calc_wake_batch(depth);
+ unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
int i;
if (sbq->wake_batch != wake_batch) {
WRITE_ONCE(sbq->wake_batch, wake_batch);
/*
- * Pairs with the memory barrier in sbq_wake_up() to ensure that
- * the batch size is updated before the wait counts.
+ * Pairs with the memory barrier in sbitmap_queue_wake_up()
+ * to ensure that the batch size is updated before the wait
+ * counts.
*/
smp_mb__before_atomic();
for (i = 0; i < SBQ_WAIT_QUEUES; i++)
atomic_set(&sbq->ws[i].wait_cnt, 1);
}
+}
+
+void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
+{
+ sbitmap_queue_update_wake_batch(sbq, depth);
sbitmap_resize(&sbq->sb, depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
@@ -380,6 +403,8 @@ int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
unsigned int hint, depth;
int nr;
+ WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth);
+
hint = this_cpu_read(*sbq->alloc_hint);
depth = READ_ONCE(sbq->sb.depth);
if (unlikely(hint >= depth)) {
@@ -403,6 +428,14 @@ int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
}
EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow);
+void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
+ unsigned int min_shallow_depth)
+{
+ sbq->min_shallow_depth = min_shallow_depth;
+ sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth);
+
static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
{
int i, wake_index;
@@ -425,52 +458,67 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
return NULL;
}
-static void sbq_wake_up(struct sbitmap_queue *sbq)
+static bool __sbq_wake_up(struct sbitmap_queue *sbq)
{
struct sbq_wait_state *ws;
unsigned int wake_batch;
int wait_cnt;
- /*
- * Pairs with the memory barrier in set_current_state() to ensure the
- * proper ordering of clear_bit()/waitqueue_active() in the waker and
- * test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
- * waiter. See the comment on waitqueue_active(). This is __after_atomic
- * because we just did clear_bit_unlock() in the caller.
- */
- smp_mb__after_atomic();
-
ws = sbq_wake_ptr(sbq);
if (!ws)
- return;
+ return false;
wait_cnt = atomic_dec_return(&ws->wait_cnt);
if (wait_cnt <= 0) {
+ int ret;
+
wake_batch = READ_ONCE(sbq->wake_batch);
+
/*
* Pairs with the memory barrier in sbitmap_queue_resize() to
* ensure that we see the batch size update before the wait
* count is reset.
*/
smp_mb__before_atomic();
+
/*
- * If there are concurrent callers to sbq_wake_up(), the last
- * one to decrement the wait count below zero will bump it back
- * up. If there is a concurrent resize, the count reset will
- * either cause the cmpxchg to fail or overwrite after the
- * cmpxchg.
+ * For concurrent callers of this, the one that failed the
+ * atomic_cmpxhcg() race should call this function again
+ * to wakeup a new batch on a different 'ws'.
*/
- atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch);
- sbq_index_atomic_inc(&sbq->wake_index);
- wake_up_nr(&ws->wait, wake_batch);
+ ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch);
+ if (ret == wait_cnt) {
+ sbq_index_atomic_inc(&sbq->wake_index);
+ wake_up_nr(&ws->wait, wake_batch);
+ return false;
+ }
+
+ return true;
}
+
+ return false;
+}
+
+void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
+{
+ while (__sbq_wake_up(sbq))
+ ;
}
+EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
unsigned int cpu)
{
sbitmap_clear_bit_unlock(&sbq->sb, nr);
- sbq_wake_up(sbq);
+ /*
+ * Pairs with the memory barrier in set_current_state() to ensure the
+ * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
+ * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
+ * waiter. See the comment on waitqueue_active().
+ */
+ smp_mb__after_atomic();
+ sbitmap_queue_wake_up(sbq);
+
if (likely(!sbq->round_robin && nr < sbq->sb.depth))
*per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
}
@@ -482,7 +530,7 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
/*
* Pairs with the memory barrier in set_current_state() like in
- * sbq_wake_up().
+ * sbitmap_queue_wake_up().
*/
smp_mb();
wake_index = atomic_read(&sbq->wake_index);
@@ -528,5 +576,6 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
seq_puts(m, "}\n");
seq_printf(m, "round_robin=%d\n", sbq->round_robin);
+ seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_show);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7441bd93b732..8fe3ebd6ac00 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -412,6 +412,7 @@ static void wb_exit(struct bdi_writeback *wb)
* protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
+static struct workqueue_struct *cgwb_release_wq;
/**
* wb_congested_get_create - get or create a wb_congested
@@ -522,7 +523,7 @@ static void cgwb_release(struct percpu_ref *refcnt)
{
struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
refcnt);
- schedule_work(&wb->release_work);
+ queue_work(cgwb_release_wq, &wb->release_work);
}
static void cgwb_kill(struct bdi_writeback *wb)
@@ -784,6 +785,21 @@ static void cgwb_bdi_register(struct backing_dev_info *bdi)
spin_unlock_irq(&cgwb_lock);
}
+static int __init cgwb_init(void)
+{
+ /*
+ * There can be many concurrent release work items overwhelming
+ * system_wq. Put them in a separate wq and limit concurrency.
+ * There's no point in executing many of these in parallel.
+ */
+ cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
+ if (!cgwb_release_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+subsys_initcall(cgwb_init);
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
diff --git a/mm/mempool.c b/mm/mempool.c
index 5c9dce34719b..b54f2c20e5e0 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -138,6 +138,28 @@ static void *remove_element(mempool_t *pool, gfp_t flags)
}
/**
+ * mempool_exit - exit a mempool initialized with mempool_init()
+ * @pool: pointer to the memory pool which was initialized with
+ * mempool_init().
+ *
+ * Free all reserved elements in @pool and @pool itself. This function
+ * only sleeps if the free_fn() function sleeps.
+ *
+ * May be called on a zeroed but uninitialized mempool (i.e. allocated with
+ * kzalloc()).
+ */
+void mempool_exit(mempool_t *pool)
+{
+ while (pool->curr_nr) {
+ void *element = remove_element(pool, GFP_KERNEL);
+ pool->free(element, pool->pool_data);
+ }
+ kfree(pool->elements);
+ pool->elements = NULL;
+}
+EXPORT_SYMBOL(mempool_exit);
+
+/**
* mempool_destroy - deallocate a memory pool
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
@@ -150,15 +172,65 @@ void mempool_destroy(mempool_t *pool)
if (unlikely(!pool))
return;
- while (pool->curr_nr) {
- void *element = remove_element(pool, GFP_KERNEL);
- pool->free(element, pool->pool_data);
- }
- kfree(pool->elements);
+ mempool_exit(pool);
kfree(pool);
}
EXPORT_SYMBOL(mempool_destroy);
+int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data,
+ gfp_t gfp_mask, int node_id)
+{
+ spin_lock_init(&pool->lock);
+ pool->min_nr = min_nr;
+ pool->pool_data = pool_data;
+ pool->alloc = alloc_fn;
+ pool->free = free_fn;
+ init_waitqueue_head(&pool->wait);
+
+ pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
+ gfp_mask, node_id);
+ if (!pool->elements)
+ return -ENOMEM;
+
+ /*
+ * First pre-allocate the guaranteed number of buffers.
+ */
+ while (pool->curr_nr < pool->min_nr) {
+ void *element;
+
+ element = pool->alloc(gfp_mask, pool->pool_data);
+ if (unlikely(!element)) {
+ mempool_exit(pool);
+ return -ENOMEM;
+ }
+ add_element(pool, element);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(mempool_init_node);
+
+/**
+ * mempool_init - initialize a memory pool
+ * @min_nr: the minimum number of elements guaranteed to be
+ * allocated for this pool.
+ * @alloc_fn: user-defined element-allocation function.
+ * @free_fn: user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ *
+ * Like mempool_create(), but initializes the pool in (i.e. embedded in another
+ * structure).
+ */
+int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data)
+{
+ return mempool_init_node(pool, min_nr, alloc_fn, free_fn,
+ pool_data, GFP_KERNEL, NUMA_NO_NODE);
+
+}
+EXPORT_SYMBOL(mempool_init);
+
/**
* mempool_create - create a memory pool
* @min_nr: the minimum number of elements guaranteed to be
@@ -186,35 +258,17 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
gfp_t gfp_mask, int node_id)
{
mempool_t *pool;
+
pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
if (!pool)
return NULL;
- pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
- gfp_mask, node_id);
- if (!pool->elements) {
+
+ if (mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data,
+ gfp_mask, node_id)) {
kfree(pool);
return NULL;
}
- spin_lock_init(&pool->lock);
- pool->min_nr = min_nr;
- pool->pool_data = pool_data;
- init_waitqueue_head(&pool->wait);
- pool->alloc = alloc_fn;
- pool->free = free_fn;
- /*
- * First pre-allocate the guaranteed number of buffers.
- */
- while (pool->curr_nr < pool->min_nr) {
- void *element;
-
- element = pool->alloc(gfp_mask, pool->pool_data);
- if (unlikely(!element)) {
- mempool_destroy(pool);
- return NULL;
- }
- add_element(pool, element);
- }
return pool;
}
EXPORT_SYMBOL(mempool_create_node);