From 986994a27587efd8ce4c595cab89b570f7475359 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Thu, 26 Jan 2017 17:17:28 +0200
Subject: nvme: Use CNS as 8-bit field and avoid endianness conversion

This patch defines CNS field as 8-bit field and avoids cpu_to/from_le
conversions.
Also initialize nvme_command cns value explicitly to NVME_ID_CNS_NS
for readability (don't rely on the fact that NVME_ID_CNS_NS = 0).

Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/nvme.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 0b676a02cf3e..5b32521456d6 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -644,7 +644,9 @@ struct nvme_identify {
 	__le32			nsid;
 	__u64			rsvd2[2];
 	union nvme_data_ptr	dptr;
-	__le32			cns;
+	__u8			cns;
+	__u8			rsvd3;
+	__le16			ctrlid;
 	__u32			rsvd11[5];
 };
 
-- 
cgit v1.2.3


From c5552fde102fcc3f2cf9e502b8ac90e3500d8fdf Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Tue, 7 Feb 2017 10:08:45 -0800
Subject: nvme: Enable autonomous power state transitions

NVMe devices can advertise multiple power states.  These states can
be either "operational" (the device is fully functional but possibly
slow) or "non-operational" (the device is asleep until woken up).
Some devices can automatically enter a non-operational state when
idle for a specified amount of time and then automatically wake back
up when needed.

The hardware configuration is a table.  For each state, an entry in
the table indicates the next deeper non-operational state, if any,
to autonomously transition to and the idle time required before
transitioning.

This patch teaches the driver to program APST so that each successive
non-operational state will be entered after an idle time equal to 100%
of the total latency (entry plus exit) associated with that state.
The maximum acceptable latency is controlled using dev_pm_qos
(e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational
states with total latency greater than this value will not be used.
As a special case, setting the latency tolerance to 0 will disable
APST entirely.  On hardware without APST support, the sysfs file will
not be exposed.

The latency tolerance for newly-probed devices is set by the module
parameter nvme_core.default_ps_max_latency_us.

In theory, the device can expose "default" APST table, but this
doesn't seem to function correctly on my device (Samsung 950), nor
does it seem particularly useful.  There is also an optional
mechanism by which a configuration can be "saved" so it will be
automatically loaded on reset.  This can be configured from
userspace, but it doesn't seem useful to support in the driver.

On my laptop, enabling APST seems to save nearly 1W.

The hardware tables can be decoded in userspace with nvme-cli.
'nvme id-ctrl /dev/nvmeN' will show the power state table and
'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST
configuration.

This feature is quirked off on a known-buggy Samsung device.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h |  11 ++++
 include/linux/nvme.h     |   6 ++
 3 files changed, 171 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 38efe4f752a1..849e85847ad4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -26,6 +26,7 @@
 #include <linux/ptrace.h>
 #include <linux/nvme_ioctl.h>
 #include <linux/t10-pi.h>
+#include <linux/pm_qos.h>
 #include <scsi/sg.h>
 #include <asm/unaligned.h>
 
@@ -56,6 +57,11 @@ EXPORT_SYMBOL_GPL(nvme_max_retries);
 static int nvme_char_major;
 module_param(nvme_char_major, int, 0);
 
+static unsigned long default_ps_max_latency_us = 25000;
+module_param(default_ps_max_latency_us, ulong, 0644);
+MODULE_PARM_DESC(default_ps_max_latency_us,
+		 "max power saving latency for new devices; use PM QOS to change per device");
+
 static LIST_HEAD(nvme_ctrl_list);
 static DEFINE_SPINLOCK(dev_list_lock);
 
@@ -1252,6 +1258,122 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 	blk_queue_write_cache(q, vwc, vwc);
 }
 
+static void nvme_configure_apst(struct nvme_ctrl *ctrl)
+{
+	/*
+	 * APST (Autonomous Power State Transition) lets us program a
+	 * table of power state transitions that the controller will
+	 * perform automatically.  We configure it with a simple
+	 * heuristic: we are willing to spend at most 2% of the time
+	 * transitioning between power states.  Therefore, when running
+	 * in any given state, we will enter the next lower-power
+	 * non-operational state after waiting 100 * (enlat + exlat)
+	 * microseconds, as long as that state's total latency is under
+	 * the requested maximum latency.
+	 *
+	 * We will not autonomously enter any non-operational state for
+	 * which the total latency exceeds ps_max_latency_us.  Users
+	 * can set ps_max_latency_us to zero to turn off APST.
+	 */
+
+	unsigned apste;
+	struct nvme_feat_auto_pst *table;
+	int ret;
+
+	/*
+	 * If APST isn't supported or if we haven't been initialized yet,
+	 * then don't do anything.
+	 */
+	if (!ctrl->apsta)
+		return;
+
+	if (ctrl->npss > 31) {
+		dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
+		return;
+	}
+
+	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return;
+
+	if (ctrl->ps_max_latency_us == 0) {
+		/* Turn off APST. */
+		apste = 0;
+	} else {
+		__le64 target = cpu_to_le64(0);
+		int state;
+
+		/*
+		 * Walk through all states from lowest- to highest-power.
+		 * According to the spec, lower-numbered states use more
+		 * power.  NPSS, despite the name, is the index of the
+		 * lowest-power state, not the number of states.
+		 */
+		for (state = (int)ctrl->npss; state >= 0; state--) {
+			u64 total_latency_us, transition_ms;
+
+			if (target)
+				table->entries[state] = target;
+
+			/*
+			 * Is this state a useful non-operational state for
+			 * higher-power states to autonomously transition to?
+			 */
+			if (!(ctrl->psd[state].flags &
+			      NVME_PS_FLAGS_NON_OP_STATE))
+				continue;
+
+			total_latency_us =
+				(u64)le32_to_cpu(ctrl->psd[state].entry_lat) +
+				+ le32_to_cpu(ctrl->psd[state].exit_lat);
+			if (total_latency_us > ctrl->ps_max_latency_us)
+				continue;
+
+			/*
+			 * This state is good.  Use it as the APST idle
+			 * target for higher power states.
+			 */
+			transition_ms = total_latency_us + 19;
+			do_div(transition_ms, 20);
+			if (transition_ms > (1 << 24) - 1)
+				transition_ms = (1 << 24) - 1;
+
+			target = cpu_to_le64((state << 3) |
+					     (transition_ms << 8));
+		}
+
+		apste = 1;
+	}
+
+	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
+				table, sizeof(*table), NULL);
+	if (ret)
+		dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
+
+	kfree(table);
+}
+
+static void nvme_set_latency_tolerance(struct device *dev, s32 val)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	u64 latency;
+
+	switch (val) {
+	case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
+	case PM_QOS_LATENCY_ANY:
+		latency = U64_MAX;
+		break;
+
+	default:
+		latency = val;
+	}
+
+	if (ctrl->ps_max_latency_us != latency) {
+		ctrl->ps_max_latency_us = latency;
+		nvme_configure_apst(ctrl);
+	}
+}
+
 struct nvme_core_quirk_entry {
 	/*
 	 * NVMe model and firmware strings are padded with spaces.  For
@@ -1265,6 +1387,16 @@ struct nvme_core_quirk_entry {
 };
 
 static const struct nvme_core_quirk_entry core_quirks[] = {
+	/*
+	 * Seen on a Samsung "SM951 NVMe SAMSUNG 256GB": using APST causes
+	 * the controller to go out to lunch.  It dies when the watchdog
+	 * timer reads CSTS and gets 0xffffffff.
+	 */
+	{
+		.vid = 0x144d,
+		.fr = "BXW75D0Q",
+		.quirks = NVME_QUIRK_NO_APST,
+	},
 };
 
 /* match is null-terminated but idstr is space-padded. */
@@ -1307,6 +1439,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	u64 cap;
 	int ret, page_shift;
 	u32 max_hw_sectors;
+	u8 prev_apsta;
 
 	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
 	if (ret) {
@@ -1368,6 +1501,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	ctrl->sgls = le32_to_cpu(id->sgls);
 	ctrl->kas = le16_to_cpu(id->kas);
 
+	ctrl->npss = id->npss;
+	prev_apsta = ctrl->apsta;
+	ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta;
+	memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
+
 	if (ctrl->ops->is_fabrics) {
 		ctrl->icdoff = le16_to_cpu(id->icdoff);
 		ctrl->ioccsz = le32_to_cpu(id->ioccsz);
@@ -1392,7 +1530,15 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 
 	kfree(id);
 
+	if (ctrl->apsta && !prev_apsta)
+		dev_pm_qos_expose_latency_tolerance(ctrl->device);
+	else if (!ctrl->apsta && prev_apsta)
+		dev_pm_qos_hide_latency_tolerance(ctrl->device);
+
+	nvme_configure_apst(ctrl);
+
 	ctrl->identified = true;
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_identify);
@@ -2154,6 +2300,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	list_add_tail(&ctrl->node, &nvme_ctrl_list);
 	spin_unlock(&dev_list_lock);
 
+	/*
+	 * Initialize latency tolerance controls.  The sysfs files won't
+	 * be visible to userspace unless the device actually supports APST.
+	 */
+	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
+	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
+		min(default_ps_max_latency_us, (unsigned long)S32_MAX));
+
 	return 0;
 out_release_instance:
 	nvme_release_instance(ctrl);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 42ede67bfbc6..a3da1e90b99d 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -78,6 +78,11 @@ enum nvme_quirks {
 	 * readiness, which is done by reading the NVME_CSTS_RDY bit.
 	 */
 	NVME_QUIRK_DELAY_BEFORE_CHK_RDY		= (1 << 3),
+
+	/*
+	 * APST should not be used.
+	 */
+	NVME_QUIRK_NO_APST			= (1 << 4),
 };
 
 /*
@@ -148,13 +153,19 @@ struct nvme_ctrl {
 	u32 vs;
 	u32 sgls;
 	u16 kas;
+	u8 npss;
+	u8 apsta;
 	unsigned int kato;
 	bool subsystem;
 	unsigned long quirks;
+	struct nvme_id_power_state psd[32];
 	struct work_struct scan_work;
 	struct work_struct async_event_work;
 	struct delayed_work ka_work;
 
+	/* Power saving configuration */
+	u64 ps_max_latency_us;
+
 	/* Fabrics only */
 	u16 sqsize;
 	u32 ioccsz;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 5b32521456d6..c43d435d4225 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -579,6 +579,12 @@ struct nvme_write_zeroes_cmd {
 	__le16			appmask;
 };
 
+/* Features */
+
+struct nvme_feat_auto_pst {
+	__le64 entries[32];
+};
+
 /* Admin commands */
 
 enum nvme_admin_opcode {
-- 
cgit v1.2.3


From 3ee80c3d15b61e61611903033df414a0590cfde9 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy
Date: Mon, 20 Feb 2017 13:44:28 +0200
Subject: nvme-rdma: move nvme cm status helper to .h file

This will enable the usage for nvme rdma target.
Also move from a lookup array to a switch statement.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/rdma.c  | 22 ----------------------
 include/linux/nvme-rdma.h | 24 ++++++++++++++++++++++++
 2 files changed, 24 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index aedac6e28c6f..ee789b38bae5 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -42,28 +42,6 @@
 
 #define NVME_RDMA_MAX_INLINE_SEGMENTS	1
 
-static const char *const nvme_rdma_cm_status_strs[] = {
-	[NVME_RDMA_CM_INVALID_LEN]	= "invalid length",
-	[NVME_RDMA_CM_INVALID_RECFMT]	= "invalid record format",
-	[NVME_RDMA_CM_INVALID_QID]	= "invalid queue ID",
-	[NVME_RDMA_CM_INVALID_HSQSIZE]	= "invalid host SQ size",
-	[NVME_RDMA_CM_INVALID_HRQSIZE]	= "invalid host RQ size",
-	[NVME_RDMA_CM_NO_RSC]		= "resource not found",
-	[NVME_RDMA_CM_INVALID_IRD]	= "invalid IRD",
-	[NVME_RDMA_CM_INVALID_ORD]	= "Invalid ORD",
-};
-
-static const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
-{
-	size_t index = status;
-
-	if (index < ARRAY_SIZE(nvme_rdma_cm_status_strs) &&
-	    nvme_rdma_cm_status_strs[index])
-		return nvme_rdma_cm_status_strs[index];
-	else
-		return "unrecognized reason";
-};
-
 /*
  * We handle AEN commands ourselves and don't even let the
  * block layer know about them.
diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
index bf240a3cbf99..a72fd04aa5e1 100644
--- a/include/linux/nvme-rdma.h
+++ b/include/linux/nvme-rdma.h
@@ -29,6 +29,30 @@ enum nvme_rdma_cm_status {
 	NVME_RDMA_CM_INVALID_ORD	= 0x08,
 };
 
+static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
+{
+	switch (status) {
+	case NVME_RDMA_CM_INVALID_LEN:
+		return "invalid length";
+	case NVME_RDMA_CM_INVALID_RECFMT:
+		return "invalid record format";
+	case NVME_RDMA_CM_INVALID_QID:
+		return "invalid queue ID";
+	case NVME_RDMA_CM_INVALID_HSQSIZE:
+		return "invalid host SQ size";
+	case NVME_RDMA_CM_INVALID_HRQSIZE:
+		return "invalid host RQ size";
+	case NVME_RDMA_CM_NO_RSC:
+		return "resource not found";
+	case NVME_RDMA_CM_INVALID_IRD:
+		return "invalid IRD";
+	case NVME_RDMA_CM_INVALID_ORD:
+		return "Invalid ORD";
+	default:
+		return "unrecognized reason";
+	}
+}
+
 /**
  * struct nvme_rdma_cm_req - rdma connect request
  *
-- 
cgit v1.2.3


From 7d6d15789d69856f1c5405e106a773c87277eb8c Mon Sep 17 00:00:00 2001
From: Scott Bauer
Date: Wed, 22 Feb 2017 10:15:06 -0700
Subject: block/sed-opal: Introduce free_opal_dev to free the structure and
 clean up state

Before we free the opal structure we need to clean up any saved
locking ranges that the user had told us to unlock from a suspend.

Signed-off-by: Scott Bauer <scott.bauer@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/sed-opal.c         | 30 ++++++++++++++++++++++++++++++
 include/linux/sed-opal.h |  5 +++++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 893557554cc5..020bf3e28e38 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1987,6 +1987,28 @@ static int check_opal_support(struct opal_dev *dev)
 	return ret;
 }
 
+static void clean_opal_dev(struct opal_dev *dev)
+{
+
+	struct opal_suspend_data *suspend, *next;
+
+	mutex_lock(&dev->dev_lock);
+	list_for_each_entry_safe(suspend, next, &dev->unlk_lst, node) {
+		list_del(&suspend->node);
+		kfree(suspend);
+	}
+	mutex_unlock(&dev->dev_lock);
+}
+
+void free_opal_dev(struct opal_dev *dev)
+{
+	if (!dev)
+		return;
+	clean_opal_dev(dev);
+	kfree(dev);
+}
+EXPORT_SYMBOL(free_opal_dev);
+
 struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
 {
 	struct opal_dev *dev;
@@ -2141,6 +2163,14 @@ static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal)
 	setup_opal_dev(dev, revert_steps);
 	ret = next(dev);
 	mutex_unlock(&dev->dev_lock);
+
+	/*
+	 * If we successfully reverted lets clean
+	 * any saved locking ranges.
+	 */
+	if (!ret)
+		clean_opal_dev(dev);
+
 	return ret;
 }
 
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index deee23d012e7..04b124fca51e 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -27,6 +27,7 @@ typedef int (sec_send_recv)(void *data, u16 spsp, u8 secp, void *buffer,
 		size_t len, bool send);
 
 #ifdef CONFIG_BLK_SED_OPAL
+void free_opal_dev(struct opal_dev *dev);
 bool opal_unlock_from_suspend(struct opal_dev *dev);
 struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv);
 int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr);
@@ -51,6 +52,10 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 	return false;
 }
 #else
+static inline void free_opal_dev(struct opal_dev *dev)
+{
+}
+
 static inline bool is_sed_ioctl(unsigned int cmd)
 {
 	return false;
-- 
cgit v1.2.3


From da55f2cc78418dee88400aafbbaed19d7ac8188e Mon Sep 17 00:00:00 2001
From: Omar Sandoval
Date: Wed, 22 Feb 2017 10:58:29 -0800
Subject: blk-mq: use sbq wait queues instead of restart for driver tags

Commit 50e1dab86aa2 ("blk-mq-sched: fix starvation for multiple hardware
queues and shared tags") fixed one starvation issue for shared tags.
However, we can still get into a situation where we fail to allocate a
tag because all tags are allocated but we don't have any pending
requests on any hardware queue.

One solution for this would be to restart all queues that share a tag
map, but that really sucks. Ideally, we could just block and wait for a
tag, but that isn't always possible from blk_mq_dispatch_rq_list().

However, we can still use the struct sbitmap_queue wait queues with a
custom callback instead of blocking. This has a few benefits:

1. It avoids iterating over all hardware queues when completing an I/O,
   which the current restart code has to do.
2. It benefits from the existing rolling wakeup code.
3. It avoids punting to another thread just to have it block.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 64 +++++++++++++++++++++++++++++++++++++++++++-------
 include/linux/blk-mq.h |  2 ++
 2 files changed, 57 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b29e7dc7b309..9e6b064e5339 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -904,6 +904,44 @@ static bool reorder_tags_to_front(struct list_head *list)
 	return first != NULL;
 }
 
+static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
+				void *key)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
+
+	list_del(&wait->task_list);
+	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
+	blk_mq_run_hw_queue(hctx, true);
+	return 1;
+}
+
+static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
+{
+	struct sbq_wait_state *ws;
+
+	/*
+	 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
+	 * The thread which wins the race to grab this bit adds the hardware
+	 * queue to the wait queue.
+	 */
+	if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
+	    test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
+		return false;
+
+	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
+	ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
+
+	/*
+	 * As soon as this returns, it's no longer safe to fiddle with
+	 * hctx->dispatch_wait, since a completion can wake up the wait queue
+	 * and unlock the bit.
+	 */
+	add_wait_queue(&ws->wait, &hctx->dispatch_wait);
+	return true;
+}
+
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
 	struct request_queue *q = hctx->queue;
@@ -931,15 +969,22 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 				continue;
 
 			/*
-			 * We failed getting a driver tag. Mark the queue(s)
-			 * as needing a restart. Retry getting a tag again,
-			 * in case the needed IO completed right before we
-			 * marked the queue as needing a restart.
+			 * The initial allocation attempt failed, so we need to
+			 * rerun the hardware queue when a tag is freed.
 			 */
-			blk_mq_sched_mark_restart(hctx);
-			if (!blk_mq_get_driver_tag(rq, &hctx, false))
+			if (blk_mq_dispatch_wait_add(hctx)) {
+				/*
+				 * It's possible that a tag was freed in the
+				 * window between the allocation failure and
+				 * adding the hardware queue to the wait queue.
+				 */
+				if (!blk_mq_get_driver_tag(rq, &hctx, false))
+					break;
+			} else {
 				break;
+			}
 		}
+
 		list_del_init(&rq->queuelist);
 
 		bd.rq = rq;
@@ -995,10 +1040,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 		 *
 		 * blk_mq_run_hw_queue() already checks the STOPPED bit
 		 *
-		 * If RESTART is set, then let completion restart the queue
-		 * instead of potentially looping here.
+		 * If RESTART or TAG_WAITING is set, then let completion restart
+		 * the queue instead of potentially looping here.
 		 */
-		if (!blk_mq_sched_needs_restart(hctx))
+		if (!blk_mq_sched_needs_restart(hctx) &&
+		    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
 			blk_mq_run_hw_queue(hctx, true);
 	}
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8e4df3d6c8cd..001d30d727c5 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -33,6 +33,7 @@ struct blk_mq_hw_ctx {
 	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
 
+	wait_queue_t		dispatch_wait;
 	atomic_t		wait_index;
 
 	struct blk_mq_tags	*tags;
@@ -160,6 +161,7 @@ enum {
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
+	BLK_MQ_S_TAG_WAITING	= 3,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
 
-- 
cgit v1.2.3