From 986994a27587efd8ce4c595cab89b570f7475359 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 26 Jan 2017 17:17:28 +0200 Subject: nvme: Use CNS as 8-bit field and avoid endianness conversion This patch defines CNS field as 8-bit field and avoids cpu_to/from_le conversions. Also initialize nvme_command cns value explicitly to NVME_ID_CNS_NS for readability (don't rely on the fact that NVME_ID_CNS_NS = 0). Reviewed-by: Max Gurtovoy Signed-off-by: Parav Pandit Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/nvme.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 0b676a02cf3e..5b32521456d6 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -644,7 +644,9 @@ struct nvme_identify { __le32 nsid; __u64 rsvd2[2]; union nvme_data_ptr dptr; - __le32 cns; + __u8 cns; + __u8 rsvd3; + __le16 ctrlid; __u32 rsvd11[5]; }; -- cgit v1.2.3 From c5552fde102fcc3f2cf9e502b8ac90e3500d8fdf Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 7 Feb 2017 10:08:45 -0800 Subject: nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 11 ++++ include/linux/nvme.h | 6 ++ 3 files changed, 171 insertions(+) (limited to 'include/linux') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 38efe4f752a1..849e85847ad4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -56,6 +57,11 @@ EXPORT_SYMBOL_GPL(nvme_max_retries); static int nvme_char_major; module_param(nvme_char_major, int, 0); +static unsigned long default_ps_max_latency_us = 25000; +module_param(default_ps_max_latency_us, ulong, 0644); +MODULE_PARM_DESC(default_ps_max_latency_us, + "max power saving latency for new devices; use PM QOS to change per device"); + static LIST_HEAD(nvme_ctrl_list); static DEFINE_SPINLOCK(dev_list_lock); @@ -1252,6 +1258,122 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, blk_queue_write_cache(q, vwc, vwc); } +static void nvme_configure_apst(struct nvme_ctrl *ctrl) +{ + /* + * APST (Autonomous Power State Transition) lets us program a + * table of power state transitions that the controller will + * perform automatically. We configure it with a simple + * heuristic: we are willing to spend at most 2% of the time + * transitioning between power states. Therefore, when running + * in any given state, we will enter the next lower-power + * non-operational state after waiting 100 * (enlat + exlat) + * microseconds, as long as that state's total latency is under + * the requested maximum latency. + * + * We will not autonomously enter any non-operational state for + * which the total latency exceeds ps_max_latency_us. Users + * can set ps_max_latency_us to zero to turn off APST. + */ + + unsigned apste; + struct nvme_feat_auto_pst *table; + int ret; + + /* + * If APST isn't supported or if we haven't been initialized yet, + * then don't do anything. + */ + if (!ctrl->apsta) + return; + + if (ctrl->npss > 31) { + dev_warn(ctrl->device, "NPSS is invalid; not using APST\n"); + return; + } + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return; + + if (ctrl->ps_max_latency_us == 0) { + /* Turn off APST. */ + apste = 0; + } else { + __le64 target = cpu_to_le64(0); + int state; + + /* + * Walk through all states from lowest- to highest-power. + * According to the spec, lower-numbered states use more + * power. NPSS, despite the name, is the index of the + * lowest-power state, not the number of states. + */ + for (state = (int)ctrl->npss; state >= 0; state--) { + u64 total_latency_us, transition_ms; + + if (target) + table->entries[state] = target; + + /* + * Is this state a useful non-operational state for + * higher-power states to autonomously transition to? + */ + if (!(ctrl->psd[state].flags & + NVME_PS_FLAGS_NON_OP_STATE)) + continue; + + total_latency_us = + (u64)le32_to_cpu(ctrl->psd[state].entry_lat) + + + le32_to_cpu(ctrl->psd[state].exit_lat); + if (total_latency_us > ctrl->ps_max_latency_us) + continue; + + /* + * This state is good. Use it as the APST idle + * target for higher power states. + */ + transition_ms = total_latency_us + 19; + do_div(transition_ms, 20); + if (transition_ms > (1 << 24) - 1) + transition_ms = (1 << 24) - 1; + + target = cpu_to_le64((state << 3) | + (transition_ms << 8)); + } + + apste = 1; + } + + ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste, + table, sizeof(*table), NULL); + if (ret) + dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret); + + kfree(table); +} + +static void nvme_set_latency_tolerance(struct device *dev, s32 val) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + u64 latency; + + switch (val) { + case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT: + case PM_QOS_LATENCY_ANY: + latency = U64_MAX; + break; + + default: + latency = val; + } + + if (ctrl->ps_max_latency_us != latency) { + ctrl->ps_max_latency_us = latency; + nvme_configure_apst(ctrl); + } +} + struct nvme_core_quirk_entry { /* * NVMe model and firmware strings are padded with spaces. For @@ -1265,6 +1387,16 @@ struct nvme_core_quirk_entry { }; static const struct nvme_core_quirk_entry core_quirks[] = { + /* + * Seen on a Samsung "SM951 NVMe SAMSUNG 256GB": using APST causes + * the controller to go out to lunch. It dies when the watchdog + * timer reads CSTS and gets 0xffffffff. + */ + { + .vid = 0x144d, + .fr = "BXW75D0Q", + .quirks = NVME_QUIRK_NO_APST, + }, }; /* match is null-terminated but idstr is space-padded. */ @@ -1307,6 +1439,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) u64 cap; int ret, page_shift; u32 max_hw_sectors; + u8 prev_apsta; ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); if (ret) { @@ -1368,6 +1501,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->sgls = le32_to_cpu(id->sgls); ctrl->kas = le16_to_cpu(id->kas); + ctrl->npss = id->npss; + prev_apsta = ctrl->apsta; + ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta; + memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); + if (ctrl->ops->is_fabrics) { ctrl->icdoff = le16_to_cpu(id->icdoff); ctrl->ioccsz = le32_to_cpu(id->ioccsz); @@ -1392,7 +1530,15 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) kfree(id); + if (ctrl->apsta && !prev_apsta) + dev_pm_qos_expose_latency_tolerance(ctrl->device); + else if (!ctrl->apsta && prev_apsta) + dev_pm_qos_hide_latency_tolerance(ctrl->device); + + nvme_configure_apst(ctrl); + ctrl->identified = true; + return ret; } EXPORT_SYMBOL_GPL(nvme_init_identify); @@ -2154,6 +2300,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, list_add_tail(&ctrl->node, &nvme_ctrl_list); spin_unlock(&dev_list_lock); + /* + * Initialize latency tolerance controls. The sysfs files won't + * be visible to userspace unless the device actually supports APST. + */ + ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance; + dev_pm_qos_update_user_latency_tolerance(ctrl->device, + min(default_ps_max_latency_us, (unsigned long)S32_MAX)); + return 0; out_release_instance: nvme_release_instance(ctrl); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 42ede67bfbc6..a3da1e90b99d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -78,6 +78,11 @@ enum nvme_quirks { * readiness, which is done by reading the NVME_CSTS_RDY bit. */ NVME_QUIRK_DELAY_BEFORE_CHK_RDY = (1 << 3), + + /* + * APST should not be used. + */ + NVME_QUIRK_NO_APST = (1 << 4), }; /* @@ -148,13 +153,19 @@ struct nvme_ctrl { u32 vs; u32 sgls; u16 kas; + u8 npss; + u8 apsta; unsigned int kato; bool subsystem; unsigned long quirks; + struct nvme_id_power_state psd[32]; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; + /* Power saving configuration */ + u64 ps_max_latency_us; + /* Fabrics only */ u16 sqsize; u32 ioccsz; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5b32521456d6..c43d435d4225 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -579,6 +579,12 @@ struct nvme_write_zeroes_cmd { __le16 appmask; }; +/* Features */ + +struct nvme_feat_auto_pst { + __le64 entries[32]; +}; + /* Admin commands */ enum nvme_admin_opcode { -- cgit v1.2.3 From 3ee80c3d15b61e61611903033df414a0590cfde9 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 20 Feb 2017 13:44:28 +0200 Subject: nvme-rdma: move nvme cm status helper to .h file This will enable the usage for nvme rdma target. Also move from a lookup array to a switch statement. Signed-off-by: Max Gurtovoy Reviewed-by: Parav Pandit Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/rdma.c | 22 ---------------------- include/linux/nvme-rdma.h | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index aedac6e28c6f..ee789b38bae5 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -42,28 +42,6 @@ #define NVME_RDMA_MAX_INLINE_SEGMENTS 1 -static const char *const nvme_rdma_cm_status_strs[] = { - [NVME_RDMA_CM_INVALID_LEN] = "invalid length", - [NVME_RDMA_CM_INVALID_RECFMT] = "invalid record format", - [NVME_RDMA_CM_INVALID_QID] = "invalid queue ID", - [NVME_RDMA_CM_INVALID_HSQSIZE] = "invalid host SQ size", - [NVME_RDMA_CM_INVALID_HRQSIZE] = "invalid host RQ size", - [NVME_RDMA_CM_NO_RSC] = "resource not found", - [NVME_RDMA_CM_INVALID_IRD] = "invalid IRD", - [NVME_RDMA_CM_INVALID_ORD] = "Invalid ORD", -}; - -static const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status) -{ - size_t index = status; - - if (index < ARRAY_SIZE(nvme_rdma_cm_status_strs) && - nvme_rdma_cm_status_strs[index]) - return nvme_rdma_cm_status_strs[index]; - else - return "unrecognized reason"; -}; - /* * We handle AEN commands ourselves and don't even let the * block layer know about them. diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index bf240a3cbf99..a72fd04aa5e1 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -29,6 +29,30 @@ enum nvme_rdma_cm_status { NVME_RDMA_CM_INVALID_ORD = 0x08, }; +static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status) +{ + switch (status) { + case NVME_RDMA_CM_INVALID_LEN: + return "invalid length"; + case NVME_RDMA_CM_INVALID_RECFMT: + return "invalid record format"; + case NVME_RDMA_CM_INVALID_QID: + return "invalid queue ID"; + case NVME_RDMA_CM_INVALID_HSQSIZE: + return "invalid host SQ size"; + case NVME_RDMA_CM_INVALID_HRQSIZE: + return "invalid host RQ size"; + case NVME_RDMA_CM_NO_RSC: + return "resource not found"; + case NVME_RDMA_CM_INVALID_IRD: + return "invalid IRD"; + case NVME_RDMA_CM_INVALID_ORD: + return "Invalid ORD"; + default: + return "unrecognized reason"; + } +} + /** * struct nvme_rdma_cm_req - rdma connect request * -- cgit v1.2.3 From 7d6d15789d69856f1c5405e106a773c87277eb8c Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Wed, 22 Feb 2017 10:15:06 -0700 Subject: block/sed-opal: Introduce free_opal_dev to free the structure and clean up state Before we free the opal structure we need to clean up any saved locking ranges that the user had told us to unlock from a suspend. Signed-off-by: Scott Bauer Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/sed-opal.c | 30 ++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 5 +++++ 2 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/block/sed-opal.c b/block/sed-opal.c index 893557554cc5..020bf3e28e38 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1987,6 +1987,28 @@ static int check_opal_support(struct opal_dev *dev) return ret; } +static void clean_opal_dev(struct opal_dev *dev) +{ + + struct opal_suspend_data *suspend, *next; + + mutex_lock(&dev->dev_lock); + list_for_each_entry_safe(suspend, next, &dev->unlk_lst, node) { + list_del(&suspend->node); + kfree(suspend); + } + mutex_unlock(&dev->dev_lock); +} + +void free_opal_dev(struct opal_dev *dev) +{ + if (!dev) + return; + clean_opal_dev(dev); + kfree(dev); +} +EXPORT_SYMBOL(free_opal_dev); + struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv) { struct opal_dev *dev; @@ -2141,6 +2163,14 @@ static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal) setup_opal_dev(dev, revert_steps); ret = next(dev); mutex_unlock(&dev->dev_lock); + + /* + * If we successfully reverted lets clean + * any saved locking ranges. + */ + if (!ret) + clean_opal_dev(dev); + return ret; } diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index deee23d012e7..04b124fca51e 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -27,6 +27,7 @@ typedef int (sec_send_recv)(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send); #ifdef CONFIG_BLK_SED_OPAL +void free_opal_dev(struct opal_dev *dev); bool opal_unlock_from_suspend(struct opal_dev *dev); struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv); int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr); @@ -51,6 +52,10 @@ static inline bool is_sed_ioctl(unsigned int cmd) return false; } #else +static inline void free_opal_dev(struct opal_dev *dev) +{ +} + static inline bool is_sed_ioctl(unsigned int cmd) { return false; -- cgit v1.2.3 From da55f2cc78418dee88400aafbbaed19d7ac8188e Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 22 Feb 2017 10:58:29 -0800 Subject: blk-mq: use sbq wait queues instead of restart for driver tags Commit 50e1dab86aa2 ("blk-mq-sched: fix starvation for multiple hardware queues and shared tags") fixed one starvation issue for shared tags. However, we can still get into a situation where we fail to allocate a tag because all tags are allocated but we don't have any pending requests on any hardware queue. One solution for this would be to restart all queues that share a tag map, but that really sucks. Ideally, we could just block and wait for a tag, but that isn't always possible from blk_mq_dispatch_rq_list(). However, we can still use the struct sbitmap_queue wait queues with a custom callback instead of blocking. This has a few benefits: 1. It avoids iterating over all hardware queues when completing an I/O, which the current restart code has to do. 2. It benefits from the existing rolling wakeup code. 3. It avoids punting to another thread just to have it block. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 64 +++++++++++++++++++++++++++++++++++++++++++------- include/linux/blk-mq.h | 2 ++ 2 files changed, 57 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index b29e7dc7b309..9e6b064e5339 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -904,6 +904,44 @@ static bool reorder_tags_to_front(struct list_head *list) return first != NULL; } +static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags, + void *key) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); + + list_del(&wait->task_list); + clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); + blk_mq_run_hw_queue(hctx, true); + return 1; +} + +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) +{ + struct sbq_wait_state *ws; + + /* + * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. + * The thread which wins the race to grab this bit adds the hardware + * queue to the wait queue. + */ + if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || + test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) + return false; + + init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); + ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); + + /* + * As soon as this returns, it's no longer safe to fiddle with + * hctx->dispatch_wait, since a completion can wake up the wait queue + * and unlock the bit. + */ + add_wait_queue(&ws->wait, &hctx->dispatch_wait); + return true; +} + bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct request_queue *q = hctx->queue; @@ -931,15 +969,22 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) continue; /* - * We failed getting a driver tag. Mark the queue(s) - * as needing a restart. Retry getting a tag again, - * in case the needed IO completed right before we - * marked the queue as needing a restart. + * The initial allocation attempt failed, so we need to + * rerun the hardware queue when a tag is freed. */ - blk_mq_sched_mark_restart(hctx); - if (!blk_mq_get_driver_tag(rq, &hctx, false)) + if (blk_mq_dispatch_wait_add(hctx)) { + /* + * It's possible that a tag was freed in the + * window between the allocation failure and + * adding the hardware queue to the wait queue. + */ + if (!blk_mq_get_driver_tag(rq, &hctx, false)) + break; + } else { break; + } } + list_del_init(&rq->queuelist); bd.rq = rq; @@ -995,10 +1040,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) * * blk_mq_run_hw_queue() already checks the STOPPED bit * - * If RESTART is set, then let completion restart the queue - * instead of potentially looping here. + * If RESTART or TAG_WAITING is set, then let completion restart + * the queue instead of potentially looping here. */ - if (!blk_mq_sched_needs_restart(hctx)) + if (!blk_mq_sched_needs_restart(hctx) && + !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) blk_mq_run_hw_queue(hctx, true); } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 8e4df3d6c8cd..001d30d727c5 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -33,6 +33,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; + wait_queue_t dispatch_wait; atomic_t wait_index; struct blk_mq_tags *tags; @@ -160,6 +161,7 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, + BLK_MQ_S_TAG_WAITING = 3, BLK_MQ_MAX_DEPTH = 10240, -- cgit v1.2.3