Diffstat (limited to 'drivers')
-rw-r--r--  drivers/acpi/property.c          |   3
-rw-r--r--  drivers/nvme/host/core.c         | 294
-rw-r--r--  drivers/nvme/host/fabrics.c      |   2
-rw-r--r--  drivers/nvme/host/fabrics.h      |   3
-rw-r--r--  drivers/nvme/host/fc.c           |   6
-rw-r--r--  drivers/nvme/host/hwmon.c        |   3
-rw-r--r--  drivers/nvme/host/multipath.c    |  35
-rw-r--r--  drivers/nvme/host/nvme.h         |  36
-rw-r--r--  drivers/nvme/host/pci.c          | 133
-rw-r--r--  drivers/nvme/host/rdma.c         |  99
-rw-r--r--  drivers/nvme/host/tcp.c          |  27
-rw-r--r--  drivers/nvme/target/Kconfig      |  12
-rw-r--r--  drivers/nvme/target/Makefile     |   1
-rw-r--r--  drivers/nvme/target/admin-cmd.c  |  24
-rw-r--r--  drivers/nvme/target/configfs.c   | 103
-rw-r--r--  drivers/nvme/target/core.c       |  77
-rw-r--r--  drivers/nvme/target/fc.c         |  30
-rw-r--r--  drivers/nvme/target/loop.c       |  13
-rw-r--r--  drivers/nvme/target/nvmet.h      |  55
-rw-r--r--  drivers/nvme/target/passthru.c   | 544
-rw-r--r--  drivers/nvme/target/rdma.c       |  14
21 files changed, 1204 insertions(+), 310 deletions(-)
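Most of the host-side churn below centralizes the Commands Supported and Effects handling so that the ioctl path and the new target passthru path share nvme_passthru_start()/nvme_passthru_end() through nvme_execute_passthru_rq(). A minimal userspace sketch of that decision logic follows; the effect-bit values mirror the enum in include/linux/nvme.h, while describe_effects() and its messages are purely illustrative, not kernel code:

#include <stdint.h>
#include <stdio.h>

/* Effect bits from the NVMe Commands Supported and Effects log page;
 * values mirror the enum in include/linux/nvme.h. */
#define NVME_CMD_EFFECTS_CSUPP    (1u << 0)  /* command supported */
#define NVME_CMD_EFFECTS_LBCC     (1u << 1)  /* logical block content change */
#define NVME_CMD_EFFECTS_NCC      (1u << 2)  /* namespace capability change */
#define NVME_CMD_EFFECTS_NIC      (1u << 3)  /* namespace inventory change */
#define NVME_CMD_EFFECTS_CCC      (1u << 4)  /* controller capability change */
#define NVME_CMD_EFFECTS_CSE_MASK (3u << 16) /* command submission/execution */

static void describe_effects(uint32_t effects)
{
	/* nvme_passthru_start() freezes I/O to all namespaces for these bits */
	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK))
		puts("freeze I/O across all namespaces before executing");
	/* nvme_passthru_end() follow-up actions */
	if (effects & NVME_CMD_EFFECTS_LBCC)
		puts("revalidate disks before unfreezing (LBA format may change)");
	if (effects & NVME_CMD_EFFECTS_CCC)
		puts("re-run controller identify");
	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
		puts("queue a namespace rescan and wait for it to finish");
}

int main(void)
{
	/* Format NVM: CSUPP | LBCC | CSE, per nvme_known_admin_effects() */
	describe_effects(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
			 NVME_CMD_EFFECTS_CSE_MASK);
	return 0;
}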
diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index e601c4511a8b..c2e2ae774a19 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -45,6 +45,9 @@ static const guid_t prp_guids[] = { /* Thunderbolt GUID for WAKE_SUPPORTED: 6c501103-c189-4296-ba72-9bf5a26ebe5d */ GUID_INIT(0x6c501103, 0xc189, 0x4296, 0xba, 0x72, 0x9b, 0xf5, 0xa2, 0x6e, 0xbe, 0x5d), + /* Storage device needs D3 GUID: 5025030f-842f-4ab4-a561-99a5189762d0 */ + GUID_INIT(0x5025030f, 0x842f, 0x4ab4, + 0xa5, 0x61, 0x99, 0xa5, 0x18, 0x97, 0x62, 0xd0), }; /* ACPI _DSD data subnodes GUID: dbb8e3e6-5886-4ba6-8795-1319f52a966b */ diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3d00ea4e7146..05aa568a60af 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -366,6 +366,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, break; } break; + case NVME_CTRL_DELETING_NOIO: + switch (old_state) { + case NVME_CTRL_DELETING: + case NVME_CTRL_DEAD: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; case NVME_CTRL_DEAD: switch (old_state) { case NVME_CTRL_DELETING: @@ -403,6 +413,7 @@ static bool nvme_state_terminal(struct nvme_ctrl *ctrl) case NVME_CTRL_CONNECTING: return false; case NVME_CTRL_DELETING: + case NVME_CTRL_DELETING_NOIO: case NVME_CTRL_DEAD: return true; default: @@ -454,10 +465,11 @@ static void nvme_free_ns(struct kref *kref) kfree(ns); } -static void nvme_put_ns(struct nvme_ns *ns) +void nvme_put_ns(struct nvme_ns *ns) { kref_put(&ns->kref, nvme_free_ns); } +EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); static inline void nvme_clear_nvme_request(struct request *req) { @@ -593,6 +605,14 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; } +static void nvme_setup_passthrough(struct request *req, + struct nvme_command *cmd) +{ + memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); + /* passthru commands should let the driver set the SGL flags */ + cmd->common.flags &= ~NVME_CMD_SGL_ALL; +} + static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { @@ -758,7 +778,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, switch (req_op(req)) { case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: - memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); + nvme_setup_passthrough(req, cmd); break; case REQ_OP_FLUSH: nvme_setup_flush(ns, cmd); @@ -909,6 +929,120 @@ out: return ERR_PTR(ret); } +static u32 nvme_known_admin_effects(u8 opcode) +{ + switch (opcode) { + case nvme_admin_format_nvm: + return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_CSE_MASK; + case nvme_admin_sanitize_nvm: + return NVME_CMD_EFFECTS_CSE_MASK; + default: + break; + } + return 0; +} + +u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) +{ + u32 effects = 0; + + if (ns) { + if (ns->head->effects) + effects = le32_to_cpu(ns->head->effects->iocs[opcode]); + if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) + dev_warn(ctrl->device, + "IO command:%02x has unhandled effects:%08x\n", + opcode, effects); + return 0; + } + + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->acs[opcode]); + effects |= nvme_known_admin_effects(opcode); + + return effects; +} +EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU); + +static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode) +{ + u32 effects = nvme_command_effects(ctrl, ns, opcode); + + /* + * For simplicity, IO to 
all namespaces is quiesced even if the command + * effects say only one namespace is affected. + */ + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + mutex_lock(&ctrl->scan_lock); + mutex_lock(&ctrl->subsys->lock); + nvme_mpath_start_freeze(ctrl->subsys); + nvme_mpath_wait_freeze(ctrl->subsys); + nvme_start_freeze(ctrl); + nvme_wait_freeze(ctrl); + } + return effects; +} + +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + if (_nvme_revalidate_disk(ns->disk)) + nvme_set_queue_dying(ns); + else if (blk_queue_is_zoned(ns->disk->queue)) { + /* + * IO commands are required to fully revalidate a zoned + * device. Force the command effects to trigger rescan + * work so report zones can run in a context with + * unfrozen IO queues. + */ + *effects |= NVME_CMD_EFFECTS_NCC; + } + up_read(&ctrl->namespaces_rwsem); +} + +static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) +{ + /* + * Revalidate LBA changes prior to unfreezing. This is necessary to + * prevent memory corruption if a logical block size was changed by + * this command. + */ + if (effects & NVME_CMD_EFFECTS_LBCC) + nvme_update_formats(ctrl, &effects); + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + nvme_unfreeze(ctrl); + nvme_mpath_unfreeze(ctrl->subsys); + mutex_unlock(&ctrl->subsys->lock); + nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); + mutex_unlock(&ctrl->scan_lock); + } + if (effects & NVME_CMD_EFFECTS_CCC) + nvme_init_identify(ctrl); + if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { + nvme_queue_scan(ctrl); + flush_work(&ctrl->scan_work); + } +} + +void nvme_execute_passthru_rq(struct request *rq) +{ + struct nvme_command *cmd = nvme_req(rq)->cmd; + struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; + struct nvme_ns *ns = rq->q->queuedata; + struct gendisk *disk = ns ? 
ns->disk : NULL; + u32 effects; + + effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); + blk_execute_rq(rq->q, disk, rq, 0); + nvme_passthru_end(ctrl, effects); +} +EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); + static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, @@ -947,7 +1081,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, } } - blk_execute_rq(req->q, disk, req, 0); + nvme_execute_passthru_rq(req); if (nvme_req(req)->flags & NVME_REQ_CANCELLED) ret = -EINTR; else @@ -1375,105 +1509,12 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) metadata, meta_len, lower_32_bits(io.slba), NULL, 0); } -static u32 nvme_known_admin_effects(u8 opcode) -{ - switch (opcode) { - case nvme_admin_format_nvm: - return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | - NVME_CMD_EFFECTS_CSE_MASK; - case nvme_admin_sanitize_nvm: - return NVME_CMD_EFFECTS_CSE_MASK; - default: - break; - } - return 0; -} - -static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 opcode) -{ - u32 effects = 0; - - if (ns) { - if (ns->head->effects) - effects = le32_to_cpu(ns->head->effects->iocs[opcode]); - if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) - dev_warn(ctrl->device, - "IO command:%02x has unhandled effects:%08x\n", - opcode, effects); - return 0; - } - - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->acs[opcode]); - effects |= nvme_known_admin_effects(opcode); - - /* - * For simplicity, IO to all namespaces is quiesced even if the command - * effects say only one namespace is affected. - */ - if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { - mutex_lock(&ctrl->scan_lock); - mutex_lock(&ctrl->subsys->lock); - nvme_mpath_start_freeze(ctrl->subsys); - nvme_mpath_wait_freeze(ctrl->subsys); - nvme_start_freeze(ctrl); - nvme_wait_freeze(ctrl); - } - return effects; -} - -static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - if (_nvme_revalidate_disk(ns->disk)) - nvme_set_queue_dying(ns); - else if (blk_queue_is_zoned(ns->disk->queue)) { - /* - * IO commands are required to fully revalidate a zoned - * device. Force the command effects to trigger rescan - * work so report zones can run in a context with - * unfrozen IO queues. - */ - *effects |= NVME_CMD_EFFECTS_NCC; - } - up_read(&ctrl->namespaces_rwsem); -} - -static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) -{ - /* - * Revalidate LBA changes prior to unfreezing. This is necessary to - * prevent memory corruption if a logical block size was changed by - * this command. 
- */ - if (effects & NVME_CMD_EFFECTS_LBCC) - nvme_update_formats(ctrl, &effects); - if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { - nvme_unfreeze(ctrl); - nvme_mpath_unfreeze(ctrl->subsys); - mutex_unlock(&ctrl->subsys->lock); - nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); - mutex_unlock(&ctrl->scan_lock); - } - if (effects & NVME_CMD_EFFECTS_CCC) - nvme_init_identify(ctrl); - if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { - nvme_queue_scan(ctrl); - flush_work(&ctrl->scan_work); - } -} - static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd __user *ucmd) { struct nvme_passthru_cmd cmd; struct nvme_command c; unsigned timeout = 0; - u32 effects; u64 result; int status; @@ -1500,12 +1541,10 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); - effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &result, timeout); - nvme_passthru_end(ctrl, effects); if (status >= 0) { if (put_user(result, &ucmd->result)) @@ -1521,7 +1560,6 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd64 cmd; struct nvme_command c; unsigned timeout = 0; - u32 effects; int status; if (!capable(CAP_SYS_ADMIN)) @@ -1547,12 +1585,10 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); - effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &cmd.result, timeout); - nvme_passthru_end(ctrl, effects); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) @@ -2345,12 +2381,7 @@ EXPORT_SYMBOL_GPL(nvme_disable_ctrl); int nvme_enable_ctrl(struct nvme_ctrl *ctrl) { - /* - * Default to a 4K page size, with the intention to update this - * path in the future to accomodate architectures with differing - * kernel and IO page sizes. 
- */ - unsigned dev_page_min, page_shift = 12; + unsigned dev_page_min; int ret; ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); @@ -2360,20 +2391,18 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) } dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; - if (page_shift < dev_page_min) { + if (NVME_CTRL_PAGE_SHIFT < dev_page_min) { dev_err(ctrl->device, "Minimum device page size %u too large for host (%u)\n", - 1 << dev_page_min, 1 << page_shift); + 1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT); return -ENODEV; } - ctrl->page_size = 1 << page_shift; - if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI) ctrl->ctrl_config = NVME_CC_CSS_CSI; else ctrl->ctrl_config = NVME_CC_CSS_NVM; - ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; + ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; ctrl->ctrl_config |= NVME_CC_ENABLE; @@ -2423,13 +2452,13 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, if (ctrl->max_hw_sectors) { u32 max_segments = - (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; + (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; max_segments = min_not_zero(max_segments, ctrl->max_segments); blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); } - blk_queue_virt_boundary(q, ctrl->page_size - 1); + blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); blk_queue_dma_alignment(q, 7); if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) vwc = true; @@ -3483,6 +3512,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, [NVME_CTRL_RESETTING] = "resetting", [NVME_CTRL_CONNECTING] = "connecting", [NVME_CTRL_DELETING] = "deleting", + [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)", [NVME_CTRL_DEAD] = "dead", }; @@ -3586,8 +3616,8 @@ static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, int err; err = kstrtou32(buf, 10, &v); - if (err || v > UINT_MAX) - return -EINVAL; + if (err) + return err; ctrl->opts->reconnect_delay = v; return count; @@ -3798,7 +3828,7 @@ static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) return nsa->head->ns_id - nsb->head->ns_id; } -static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) +struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns, *ret = NULL; @@ -3816,6 +3846,7 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) up_read(&ctrl->namespaces_rwsem); return ret; } +EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { @@ -4119,6 +4150,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) if (ctrl->state == NVME_CTRL_DEAD) nvme_kill_queues(ctrl); + /* this is a no-op when called from the controller reset handler */ + nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); + down_write(&ctrl->namespaces_rwsem); list_splice_init(&ctrl->namespaces, &ns_list); up_write(&ctrl->namespaces_rwsem); @@ -4313,8 +4347,7 @@ EXPORT_SYMBOL_GPL(nvme_stop_ctrl); void nvme_start_ctrl(struct nvme_ctrl *ctrl) { - if (ctrl->kato) - nvme_start_keep_alive(ctrl); + nvme_start_keep_alive(ctrl); nvme_enable_aen(ctrl); @@ -4559,6 +4592,29 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_sync_queues); +struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path) +{ + struct nvme_ctrl *ctrl; + struct file *f; + + f = filp_open(path, O_RDWR, 0); + if 
(IS_ERR(f)) + return ERR_CAST(f); + + if (f->f_op != &nvme_dev_fops) { + ctrl = ERR_PTR(-EINVAL); + goto out_close; + } + + ctrl = f->private_data; + nvme_get_ctrl(ctrl); + +out_close: + filp_close(f, NULL); + return ctrl; +} +EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU); + /* * Check we didn't inadvertently grow the command structure sizes: */ diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 2a6c8190eeb7..4ec4829d6233 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -547,7 +547,7 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, struct request *rq) { - if (ctrl->state != NVME_CTRL_DELETING && + if (ctrl->state != NVME_CTRL_DELETING_NOIO && ctrl->state != NVME_CTRL_DEAD && !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a0ec40ab62ee..a9c1e3b4585e 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -182,7 +182,8 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, bool queue_live) { - if (likely(ctrl->state == NVME_CTRL_LIVE)) + if (likely(ctrl->state == NVME_CTRL_LIVE || + ctrl->state == NVME_CTRL_DELETING)) return true; return __nvmf_check_ready(ctrl, rq, queue_live); } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 6aa30bb5a762..eae43bb444e0 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -826,6 +826,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) break; case NVME_CTRL_DELETING: + case NVME_CTRL_DELETING_NOIO: default: /* no action to take - let it delete */ break; @@ -3001,8 +3002,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_disconnect_admin_queue; - ctrl->ctrl.max_hw_sectors = - (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); + ctrl->ctrl.max_segments = ctrl->lport->ops->max_sgl_segments; + ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments << + (ilog2(SZ_4K) - 9); blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c index 23ba8bf678ae..412a6c97c0d8 100644 --- a/drivers/nvme/host/hwmon.c +++ b/drivers/nvme/host/hwmon.c @@ -241,7 +241,8 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl) err = nvme_hwmon_get_smart_log(data); if (err) { - dev_warn(dev, "Failed to read smart log (error %d)\n", err); + dev_warn(ctrl->device, + "Failed to read smart log (error %d)\n", err); devm_kfree(dev, data); return; } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 74bad4e3d377..3ded54d2c9c6 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -167,9 +167,18 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) static bool nvme_path_is_disabled(struct nvme_ns *ns) { - return ns->ctrl->state != NVME_CTRL_LIVE || - test_bit(NVME_NS_ANA_PENDING, &ns->flags) || - test_bit(NVME_NS_REMOVING, &ns->flags); + /* + * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should + * still be able to complete assuming that the controller is connected. + * Otherwise it will fail immediately and return to the requeue list. 
+ */ + if (ns->ctrl->state != NVME_CTRL_LIVE && + ns->ctrl->state != NVME_CTRL_DELETING) + return true; + if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) || + test_bit(NVME_NS_REMOVING, &ns->flags)) + return true; + return false; } static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) @@ -246,6 +255,12 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, fallback = ns; } + /* No optimized path found, re-check the current path */ + if (!nvme_path_is_disabled(old) && + old->ana_state == NVME_ANA_OPTIMIZED) { + found = old; + goto out; + } if (!fallback) return NULL; found = fallback; @@ -266,10 +281,13 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) struct nvme_ns *ns; ns = srcu_dereference(head->current_path[node], &head->srcu); - if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns) - ns = nvme_round_robin_path(head, node, ns); - if (unlikely(!ns || !nvme_path_is_optimized(ns))) - ns = __nvme_find_path(head, node); + if (unlikely(!ns)) + return __nvme_find_path(head, node); + + if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR) + return nvme_round_robin_path(head, node, ns); + if (unlikely(!nvme_path_is_optimized(ns))) + return __nvme_find_path(head, node); return ns; } @@ -563,6 +581,9 @@ static void nvme_ana_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); + if (ctrl->state != NVME_CTRL_LIVE) + return; + nvme_read_ana_log(ctrl); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 13ca90bcd352..c5c1bac797aa 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -37,6 +37,14 @@ extern unsigned int admin_timeout; #define NVME_INLINE_METADATA_SG_CNT 1 #endif +/* + * Default to a 4K page size, with the intention to update this + * path in the future to accommodate architectures with differing + * kernel and IO page sizes. + */ +#define NVME_CTRL_PAGE_SHIFT 12 +#define NVME_CTRL_PAGE_SIZE (1 << NVME_CTRL_PAGE_SHIFT) + extern struct workqueue_struct *nvme_wq; extern struct workqueue_struct *nvme_reset_wq; extern struct workqueue_struct *nvme_delete_wq; @@ -173,12 +181,32 @@ static inline u16 nvme_req_qid(struct request *req) */ #define NVME_QUIRK_DELAY_AMOUNT 2300 +/* + * enum nvme_ctrl_state: Controller state + * + * @NVME_CTRL_NEW: New controller just allocated, initial state + * @NVME_CTRL_LIVE: Controller is connected and I/O capable + * @NVME_CTRL_RESETTING: Controller is resetting (or scheduled reset) + * @NVME_CTRL_CONNECTING: Controller is disconnected, now connecting the + * transport + * @NVME_CTRL_DELETING: Controller is deleting (or scheduled deletion) + * @NVME_CTRL_DELETING_NOIO: Controller is deleting and I/O is not + * disabled/failed immediately. This state comes + * after all async event processing took place and + * before ns removal and the controller deletion + * progress + * @NVME_CTRL_DEAD: Controller is non-present/unresponsive during + * shutdown or removal. 
In this case we forcibly + * kill all inflight I/O as they have no chance to + * complete + */ enum nvme_ctrl_state { NVME_CTRL_NEW, NVME_CTRL_LIVE, NVME_CTRL_RESETTING, NVME_CTRL_CONNECTING, NVME_CTRL_DELETING, + NVME_CTRL_DELETING_NOIO, NVME_CTRL_DEAD, }; @@ -234,7 +262,6 @@ struct nvme_ctrl { u32 queue_count; u64 cap; - u32 page_size; u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; @@ -763,4 +790,11 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl); static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { } #endif +u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode); +void nvme_execute_passthru_rq(struct request *rq); +struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path); +struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); +void nvme_put_ns(struct nvme_ns *ns); + #endif /* _NVME_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 45e94f016ec2..61e612d52d61 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -4,6 +4,7 @@ * Copyright (c) 2011-2014, Intel Corporation. */ +#include <linux/acpi.h> #include <linux/aer.h> #include <linux/async.h> #include <linux/blkdev.h> @@ -94,6 +95,10 @@ static unsigned int poll_queues; module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644); MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); +static bool noacpi; +module_param(noacpi, bool, 0444); +MODULE_PARM_DESC(noacpi, "disable acpi bios quirks"); + struct nvme_dev; struct nvme_queue; @@ -346,10 +351,10 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, * as it only leads to a small amount of wasted memory for the lifetime of * the I/O. */ -static int nvme_npages(unsigned size, struct nvme_dev *dev) +static int nvme_pci_npages_prp(void) { - unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, - dev->ctrl.page_size); + unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE, + NVME_CTRL_PAGE_SIZE); return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } @@ -357,22 +362,18 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev) * Calculates the number of pages needed for the SGL segments. For example a 4k * page can accommodate 256 SGL descriptors. 
*/ -static int nvme_pci_npages_sgl(unsigned int num_seg) +static int nvme_pci_npages_sgl(void) { - return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); + return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc), + PAGE_SIZE); } -static size_t nvme_pci_iod_alloc_size(struct nvme_dev *dev, - unsigned int size, unsigned int nseg, bool use_sgl) +static size_t nvme_pci_iod_alloc_size(void) { - size_t alloc_size; - - if (use_sgl) - alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg); - else - alloc_size = sizeof(__le64 *) * nvme_npages(size, dev); + size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); - return alloc_size + sizeof(struct scatterlist) * nseg; + return sizeof(__le64 *) * npages + + sizeof(struct scatterlist) * NVME_MAX_SEGS; } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -515,7 +516,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; + const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; dma_addr_t dma_addr = iod->first_dma, next_dma_addr; int i; @@ -582,34 +583,33 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, struct scatterlist *sg = iod->sg; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); - u32 page_size = dev->ctrl.page_size; - int offset = dma_addr & (page_size - 1); + int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); __le64 *prp_list; void **list = nvme_pci_iod_list(req); dma_addr_t prp_dma; int nprps, i; - length -= (page_size - offset); + length -= (NVME_CTRL_PAGE_SIZE - offset); if (length <= 0) { iod->first_dma = 0; goto done; } - dma_len -= (page_size - offset); + dma_len -= (NVME_CTRL_PAGE_SIZE - offset); if (dma_len) { - dma_addr += (page_size - offset); + dma_addr += (NVME_CTRL_PAGE_SIZE - offset); } else { sg = sg_next(sg); dma_addr = sg_dma_address(sg); dma_len = sg_dma_len(sg); } - if (length <= page_size) { + if (length <= NVME_CTRL_PAGE_SIZE) { iod->first_dma = dma_addr; goto done; } - nprps = DIV_ROUND_UP(length, page_size); + nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; iod->npages = 0; @@ -628,7 +628,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, iod->first_dma = prp_dma; i = 0; for (;;) { - if (i == page_size >> 3) { + if (i == NVME_CTRL_PAGE_SIZE >> 3) { __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) @@ -639,9 +639,9 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, i = 1; } prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= page_size; - dma_addr += page_size; - length -= page_size; + dma_len -= NVME_CTRL_PAGE_SIZE; + dma_addr += NVME_CTRL_PAGE_SIZE; + length -= NVME_CTRL_PAGE_SIZE; if (length <= 0) break; if (dma_len > 0) @@ -751,8 +751,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, struct bio_vec *bv) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1); - unsigned int first_prp_len = dev->ctrl.page_size - offset; + unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1); + unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset; iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); if (dma_mapping_error(dev->dev, iod->first_dma)) @@ -794,7 +794,7 @@ static blk_status_t 
nvme_map_data(struct nvme_dev *dev, struct request *req, struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { - if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2) + if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); @@ -1396,12 +1396,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, { int q_depth = dev->q_depth; unsigned q_size_aligned = roundup(q_depth * entry_size, - dev->ctrl.page_size); + NVME_CTRL_PAGE_SIZE); if (q_size_aligned * nr_io_queues > dev->cmb_size) { u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); - mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); + mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE); q_depth = div_u64(mem_per_q, entry_size); /* @@ -1816,6 +1816,7 @@ static inline void nvme_release_cmb(struct nvme_dev *dev) static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) { + u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT; u64 dma_addr = dev->host_mem_descs_dma; struct nvme_command c; int ret; @@ -1824,8 +1825,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) c.features.opcode = nvme_admin_set_features; c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); c.features.dword11 = cpu_to_le32(bits); - c.features.dword12 = cpu_to_le32(dev->host_mem_size >> - ilog2(dev->ctrl.page_size)); + c.features.dword12 = cpu_to_le32(host_mem_size); c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr)); c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr)); c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs); @@ -1845,7 +1845,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev) for (i = 0; i < dev->nr_host_mem_descs; i++) { struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; - size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size; + size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE; dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i], le64_to_cpu(desc->addr), @@ -1897,7 +1897,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, break; descs[i].addr = cpu_to_le64(dma_addr); - descs[i].size = cpu_to_le32(len / dev->ctrl.page_size); + descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE); i++; } @@ -1913,7 +1913,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, out_free_bufs: while (--i >= 0) { - size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size; + size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE; dma_free_attrs(dev->dev, size, bufs[i], le64_to_cpu(descs[i].addr), @@ -2759,6 +2759,54 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return 0; } +#ifdef CONFIG_ACPI +static bool nvme_acpi_storage_d3(struct pci_dev *dev) +{ + struct acpi_device *adev; + struct pci_dev *root; + acpi_handle handle; + acpi_status status; + u8 val; + + /* + * Look for _DSD property specifying that the storage device on the port + * must use D3 to support deep platform power savings during + * suspend-to-idle. + */ + root = pcie_find_root_port(dev); + if (!root) + return false; + + adev = ACPI_COMPANION(&root->dev); + if (!adev) + return false; + + /* + * The property is defined in the PXSX device for South complex ports + * and in the PEGP device for North complex ports. 
+ */ + status = acpi_get_handle(adev->handle, "PXSX", &handle); + if (ACPI_FAILURE(status)) { + status = acpi_get_handle(adev->handle, "PEGP", &handle); + if (ACPI_FAILURE(status)) + return false; + } + + if (acpi_bus_get_device(handle, &adev)) + return false; + + if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable", + &val)) + return false; + return val == 1; +} +#else +static inline bool nvme_acpi_storage_d3(struct pci_dev *dev) +{ + return false; +} +#endif /* CONFIG_ACPI */ + static void nvme_async_probe(void *data, async_cookie_t cookie) { struct nvme_dev *dev = data; @@ -2808,12 +2856,21 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) quirks |= check_vendor_combination_bug(pdev); + if (!noacpi && nvme_acpi_storage_d3(pdev)) { + /* + * Some systems use a bios work around to ask for D3 on + * platforms that support kernel managed suspend. + */ + dev_info(&pdev->dev, + "platform quirk: setting simple suspend\n"); + quirks |= NVME_QUIRK_SIMPLE_SUSPEND; + } + /* * Double check that our mempool alloc size will cover the biggest * command we support. */ - alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ, - NVME_MAX_SEGS, true); + alloc_size = nvme_pci_iod_alloc_size(); WARN_ON_ONCE(alloc_size > PAGE_SIZE); dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index e881f879ac63..44c76ffbb264 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -96,6 +96,7 @@ struct nvme_rdma_queue { int cm_error; struct completion cm_done; bool pi_support; + int cq_size; }; struct nvme_rdma_ctrl { @@ -275,6 +276,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) init_attr.recv_cq = queue->ib_cq; if (queue->pi_support) init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; + init_attr.qp_context = queue; ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr); @@ -409,6 +411,14 @@ out_err: return NULL; } +static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue) +{ + if (nvme_rdma_poll_queue(queue)) + ib_free_cq(queue->ib_cq); + else + ib_cq_pool_put(queue->ib_cq, queue->cq_size); +} + static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) { struct nvme_rdma_device *dev; @@ -430,7 +440,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) * the destruction of the QP shouldn't use rdma_cm API. */ ib_destroy_qp(queue->qp); - ib_free_cq(queue->ib_cq); + nvme_rdma_free_cq(queue); nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, sizeof(struct nvme_completion), DMA_FROM_DEVICE); @@ -450,13 +460,42 @@ static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support) return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1); } +static int nvme_rdma_create_cq(struct ib_device *ibdev, + struct nvme_rdma_queue *queue) +{ + int ret, comp_vector, idx = nvme_rdma_queue_idx(queue); + enum ib_poll_context poll_ctx; + + /* + * Spread I/O queues completion vectors according their queue index. + * Admin queues can always go on completion vector 0. + */ + comp_vector = (idx == 0 ? 
idx : idx - 1) % ibdev->num_comp_vectors; + + /* Polling queues need direct cq polling context */ + if (nvme_rdma_poll_queue(queue)) { + poll_ctx = IB_POLL_DIRECT; + queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size, + comp_vector, poll_ctx); + } else { + poll_ctx = IB_POLL_SOFTIRQ; + queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size, + comp_vector, poll_ctx); + } + + if (IS_ERR(queue->ib_cq)) { + ret = PTR_ERR(queue->ib_cq); + return ret; + } + + return 0; +} + static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) { struct ib_device *ibdev; const int send_wr_factor = 3; /* MR, SEND, INV */ const int cq_factor = send_wr_factor + 1; /* + RECV */ - int comp_vector, idx = nvme_rdma_queue_idx(queue); - enum ib_poll_context poll_ctx; int ret, pages_per_mr; queue->device = nvme_rdma_find_get_device(queue->cm_id); @@ -467,26 +506,12 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) } ibdev = queue->device->dev; - /* - * Spread I/O queues completion vectors according their queue index. - * Admin queues can always go on completion vector 0. - */ - comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors; - - /* Polling queues need direct cq polling context */ - if (nvme_rdma_poll_queue(queue)) - poll_ctx = IB_POLL_DIRECT; - else - poll_ctx = IB_POLL_SOFTIRQ; - /* +1 for ib_stop_cq */ - queue->ib_cq = ib_alloc_cq(ibdev, queue, - cq_factor * queue->queue_size + 1, - comp_vector, poll_ctx); - if (IS_ERR(queue->ib_cq)) { - ret = PTR_ERR(queue->ib_cq); + queue->cq_size = cq_factor * queue->queue_size + 1; + + ret = nvme_rdma_create_cq(ibdev, queue); + if (ret) goto out_put_dev; - } ret = nvme_rdma_create_qp(queue, send_wr_factor); if (ret) @@ -512,7 +537,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) if (ret) { dev_err(queue->ctrl->ctrl.device, "failed to initialize MR pool sized %d for QID %d\n", - queue->queue_size, idx); + queue->queue_size, nvme_rdma_queue_idx(queue)); goto out_destroy_ring; } @@ -523,7 +548,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) if (ret) { dev_err(queue->ctrl->ctrl.device, "failed to initialize PI MR pool sized %d for QID %d\n", - queue->queue_size, idx); + queue->queue_size, nvme_rdma_queue_idx(queue)); goto out_destroy_mr_pool; } } @@ -540,7 +565,7 @@ out_destroy_ring: out_destroy_qp: rdma_destroy_qp(queue->cm_id); out_destroy_ib_cq: - ib_free_cq(queue->ib_cq); + nvme_rdma_free_cq(queue); out_put_dev: nvme_rdma_dev_put(queue->device); return ret; @@ -942,15 +967,20 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) ret = PTR_ERR(ctrl->ctrl.connect_q); goto out_free_tag_set; } - } else { - blk_mq_update_nr_hw_queues(&ctrl->tag_set, - ctrl->ctrl.queue_count - 1); } ret = nvme_rdma_start_io_queues(ctrl); if (ret) goto out_cleanup_connect_q; + if (!new) { + nvme_start_queues(&ctrl->ctrl); + nvme_wait_freeze(&ctrl->ctrl); + blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset, + ctrl->ctrl.queue_count - 1); + nvme_unfreeze(&ctrl->ctrl); + } + return 0; out_cleanup_connect_q: @@ -983,6 +1013,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, bool remove) { if (ctrl->ctrl.queue_count > 1) { + nvme_start_freeze(&ctrl->ctrl); nvme_stop_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); if (ctrl->ctrl.tagset) { @@ -1077,11 +1108,12 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); if (!changed) { /* - * state change failure is ok if we're in 
DELETING state, + * state change failure is ok if we started ctrl delete, * unless we're during creation of a new controller to * avoid races with teardown flow. */ - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && + ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; @@ -1134,8 +1166,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && + ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); return; } @@ -1163,7 +1196,7 @@ static void nvme_rdma_end_request(struct nvme_rdma_request *req) static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, const char *op) { - struct nvme_rdma_queue *queue = cq->cq_context; + struct nvme_rdma_queue *queue = wc->qp->qp_context; struct nvme_rdma_ctrl *ctrl = queue->ctrl; if (ctrl->ctrl.state == NVME_CTRL_LIVE) @@ -1706,7 +1739,7 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvme_rdma_qe *qe = container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); - struct nvme_rdma_queue *queue = cq->cq_context; + struct nvme_rdma_queue *queue = wc->qp->qp_context; struct ib_device *ibdev = queue->device->dev; struct nvme_completion *cqe = qe->data; const size_t len = sizeof(struct nvme_completion); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index b2e73e19ef01..378c049e0a5e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1771,15 +1771,20 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) ret = PTR_ERR(ctrl->connect_q); goto out_free_tag_set; } - } else { - blk_mq_update_nr_hw_queues(ctrl->tagset, - ctrl->queue_count - 1); } ret = nvme_tcp_start_io_queues(ctrl); if (ret) goto out_cleanup_connect_q; + if (!new) { + nvme_start_queues(ctrl); + nvme_wait_freeze(ctrl); + blk_mq_update_nr_hw_queues(ctrl->tagset, + ctrl->queue_count - 1); + nvme_unfreeze(ctrl); + } + return 0; out_cleanup_connect_q: @@ -1884,6 +1889,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; + nvme_start_freeze(ctrl); nvme_stop_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); if (ctrl->tagset) { @@ -1950,11 +1956,12 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { /* - * state change failure is ok if we're in DELETING state, + * state change failure is ok if we started ctrl delete, * unless we're during creation of a new controller to * avoid races with teardown flow. 
*/ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; @@ -2010,8 +2017,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->admin_q); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); return; } @@ -2046,8 +2054,9 @@ static void nvme_reset_ctrl_work(struct work_struct *work) nvme_tcp_teardown_ctrl(ctrl, false); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); return; } diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 4474952d64c6..8056955e652c 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -16,6 +16,18 @@ config NVME_TARGET To configure the NVMe target you probably want to use the nvmetcli tool from http://git.infradead.org/users/hch/nvmetcli.git. +config NVME_TARGET_PASSTHRU + bool "NVMe Target Passthrough support" + depends on NVME_TARGET + depends on NVME_CORE=y || NVME_CORE=NVME_TARGET + help + This enables target side NVMe passthru controller support for the + NVMe Over Fabrics protocol. It allows for hosts to manage and + directly access an actual NVMe controller residing on the target + side, including executing Vendor Unique Commands. + + If unsure, say N.
+ config NVME_TARGET_LOOP tristate "NVMe loopback device support" depends on NVME_TARGET diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile index 2b33836f3d3e..ebf91fc4c72e 100644 --- a/drivers/nvme/target/Makefile +++ b/drivers/nvme/target/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ discovery.o io-cmd-file.o io-cmd-bdev.o +nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o nvme-loop-y += loop.o nvmet-rdma-y += rdma.o nvmet-fc-y += fc.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 95bb3bc4e335..e9fe91786bbb 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -113,11 +113,10 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, u64 data_units_read = 0, data_units_written = 0; struct nvmet_ns *ns; struct nvmet_ctrl *ctrl; + unsigned long idx; ctrl = req->sq->ctrl; - - rcu_read_lock(); - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + xa_for_each(&ctrl->subsys->namespaces, idx, ns) { /* we don't have the right data for file backed ns */ if (!ns->bdev) continue; @@ -127,9 +126,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); data_units_written += DIV_ROUND_UP( part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000); - } - rcu_read_unlock(); put_unaligned_le64(host_reads, &slog->host_reads[0]); put_unaligned_le64(data_units_read, &slog->data_units_read[0]); @@ -230,14 +227,13 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid, { struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmet_ns *ns; + unsigned long idx; u32 count = 0; if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) { - rcu_read_lock(); - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) + xa_for_each(&ctrl->subsys->namespaces, idx, ns) if (ns->anagrpid == grpid) desc->nsids[count++] = cpu_to_le32(ns->nsid); - rcu_read_unlock(); } desc->grpid = cpu_to_le32(grpid); @@ -556,6 +552,7 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req) static const int buf_size = NVME_IDENTIFY_DATA_SIZE; struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmet_ns *ns; + unsigned long idx; u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid); __le32 *list; u16 status = 0; @@ -567,15 +564,13 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req) goto out; } - rcu_read_lock(); - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + xa_for_each(&ctrl->subsys->namespaces, idx, ns) { if (ns->nsid <= min_nsid) continue; list[i++] = cpu_to_le32(ns->nsid); if (i == buf_size / sizeof(__le32)) break; } - rcu_read_unlock(); status = nvmet_copy_to_sgl(req, 0, list, buf_size); @@ -754,7 +749,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) return 0; } -static void nvmet_execute_set_features(struct nvmet_req *req) +void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); @@ -829,7 +824,7 @@ void nvmet_get_feat_async_event(struct nvmet_req *req) nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled)); } -static void nvmet_execute_get_features(struct nvmet_req *req) +void nvmet_execute_get_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); @@ -945,6 +940,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) 
if (unlikely(ret)) return ret; + if (nvmet_req_passthru_ctrl(req)) + return nvmet_parse_passthru_admin_cmd(req); + switch (cmd->common.opcode) { case nvme_admin_get_log_page: req->execute = nvmet_execute_get_log_page; diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index cdec47de89ed..74b2b61c773b 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -666,6 +666,103 @@ static const struct config_item_type nvmet_namespaces_type = { .ct_owner = THIS_MODULE, }; +#ifdef CONFIG_NVME_TARGET_PASSTHRU + +static ssize_t nvmet_passthru_device_path_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + + return snprintf(page, PAGE_SIZE, "%s\n", subsys->passthru_ctrl_path); +} + +static ssize_t nvmet_passthru_device_path_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + size_t len; + int ret; + + mutex_lock(&subsys->lock); + + ret = -EBUSY; + if (subsys->passthru_ctrl) + goto out_unlock; + + ret = -EINVAL; + len = strcspn(page, "\n"); + if (!len) + goto out_unlock; + + kfree(subsys->passthru_ctrl_path); + ret = -ENOMEM; + subsys->passthru_ctrl_path = kstrndup(page, len, GFP_KERNEL); + if (!subsys->passthru_ctrl_path) + goto out_unlock; + + mutex_unlock(&subsys->lock); + + return count; +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} +CONFIGFS_ATTR(nvmet_passthru_, device_path); + +static ssize_t nvmet_passthru_enable_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + + return sprintf(page, "%d\n", subsys->passthru_ctrl ? 1 : 0); +} + +static ssize_t nvmet_passthru_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + bool enable; + int ret = 0; + + if (strtobool(page, &enable)) + return -EINVAL; + + if (enable) + ret = nvmet_passthru_ctrl_enable(subsys); + else + nvmet_passthru_ctrl_disable(subsys); + + return ret ? 
ret : count; +} +CONFIGFS_ATTR(nvmet_passthru_, enable); + +static struct configfs_attribute *nvmet_passthru_attrs[] = { + &nvmet_passthru_attr_device_path, + &nvmet_passthru_attr_enable, + NULL, +}; + +static const struct config_item_type nvmet_passthru_type = { + .ct_attrs = nvmet_passthru_attrs, + .ct_owner = THIS_MODULE, +}; + +static void nvmet_add_passthru_group(struct nvmet_subsys *subsys) +{ + config_group_init_type_name(&subsys->passthru_group, + "passthru", &nvmet_passthru_type); + configfs_add_default_group(&subsys->passthru_group, + &subsys->group); +} + +#else /* CONFIG_NVME_TARGET_PASSTHRU */ + +static void nvmet_add_passthru_group(struct nvmet_subsys *subsys) +{ +} + +#endif /* CONFIG_NVME_TARGET_PASSTHRU */ + static int nvmet_port_subsys_allow_link(struct config_item *parent, struct config_item *target) { @@ -879,6 +976,10 @@ static ssize_t nvmet_subsys_attr_version_store(struct config_item *item, int major, minor, tertiary = 0; int ret; + /* passthru subsystems use the underlying controller's version */ + if (nvmet_passthru_ctrl(subsys)) + return -EINVAL; + ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary); if (ret != 2 && ret != 3) return -EINVAL; @@ -1121,6 +1222,8 @@ static struct config_group *nvmet_subsys_make(struct config_group *group, configfs_add_default_group(&subsys->allowed_hosts_group, &subsys->group); + nvmet_add_passthru_group(subsys); + return &subsys->group; } diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 9cdc39c8b729..b92f45f5cd5b 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -115,13 +115,14 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len) static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys) { - struct nvmet_ns *ns; + unsigned long nsid = 0; + struct nvmet_ns *cur; + unsigned long idx; - if (list_empty(&subsys->namespaces)) - return 0; + xa_for_each(&subsys->namespaces, idx, cur) + nsid = cur->nsid; - ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link); - return ns->nsid; + return nsid; } static u32 nvmet_async_event_result(struct nvmet_async_event *aen) @@ -410,28 +411,13 @@ static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) cancel_delayed_work_sync(&ctrl->ka_work); } -static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl, - __le32 nsid) -{ - struct nvmet_ns *ns; - - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { - if (ns->nsid == le32_to_cpu(nsid)) - return ns; - } - - return NULL; -} - struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid) { struct nvmet_ns *ns; - rcu_read_lock(); - ns = __nvmet_find_namespace(ctrl, nsid); + ns = xa_load(&ctrl->subsys->namespaces, le32_to_cpu(nsid)); if (ns) percpu_ref_get(&ns->ref); - rcu_read_unlock(); return ns; } @@ -558,6 +544,12 @@ int nvmet_ns_enable(struct nvmet_ns *ns) mutex_lock(&subsys->lock); ret = 0; + + if (nvmet_passthru_ctrl(subsys)) { + pr_info("cannot enable both passthru and regular namespaces for a single subsystem"); + goto out_unlock; + } + if (ns->enabled) goto out_unlock; @@ -586,24 +578,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns) if (ns->nsid > subsys->max_nsid) subsys->max_nsid = ns->nsid; - /* - * The namespaces list needs to be sorted to simplify the implementation - * of the Identify Namepace List subcommand. 
- */ - if (list_empty(&subsys->namespaces)) { - list_add_tail_rcu(&ns->dev_link, &subsys->namespaces); - } else { - struct nvmet_ns *old; - - list_for_each_entry_rcu(old, &subsys->namespaces, dev_link, - lockdep_is_held(&subsys->lock)) { - BUG_ON(ns->nsid == old->nsid); - if (ns->nsid < old->nsid) - break; - } + ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL); + if (ret) + goto out_restore_subsys_maxnsid; - list_add_tail_rcu(&ns->dev_link, &old->dev_link); - } subsys->nr_namespaces++; nvmet_ns_changed(subsys, ns->nsid); @@ -612,6 +590,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns) out_unlock: mutex_unlock(&subsys->lock); return ret; + +out_restore_subsys_maxnsid: + subsys->max_nsid = nvmet_max_nsid(subsys); + percpu_ref_exit(&ns->ref); out_dev_put: list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); @@ -630,7 +612,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns) goto out_unlock; ns->enabled = false; - list_del_rcu(&ns->dev_link); + xa_erase(&ns->subsys->namespaces, ns->nsid); if (ns->nsid == subsys->max_nsid) subsys->max_nsid = nvmet_max_nsid(subsys); @@ -681,7 +663,6 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) if (!ns) return NULL; - INIT_LIST_HEAD(&ns->dev_link); init_completion(&ns->disable_done); ns->nsid = nsid; @@ -874,6 +855,9 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) if (unlikely(ret)) return ret; + if (nvmet_req_passthru_ctrl(req)) + return nvmet_parse_passthru_io_cmd(req); + req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); if (unlikely(!req->ns)) { req->error_loc = offsetof(struct nvme_common_command, nsid); @@ -1263,14 +1247,14 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl, struct nvmet_req *req) { struct nvmet_ns *ns; + unsigned long idx; if (!req->p2p_client) return; ctrl->p2p_client = get_device(req->p2p_client); - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link, - lockdep_is_held(&ctrl->subsys->lock)) + xa_for_each(&ctrl->subsys->namespaces, idx, ns) nvmet_p2pmem_ns_add_p2p(ctrl, ns); } @@ -1495,7 +1479,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, if (!subsys) return ERR_PTR(-ENOMEM); - subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */ + subsys->ver = NVMET_DEFAULT_VS; /* generate a random serial number as our controllers are ephemeral: */ get_random_bytes(&subsys->serial, sizeof(subsys->serial)); @@ -1523,7 +1507,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, kref_init(&subsys->ref); mutex_init(&subsys->lock); - INIT_LIST_HEAD(&subsys->namespaces); + xa_init(&subsys->namespaces); INIT_LIST_HEAD(&subsys->ctrls); INIT_LIST_HEAD(&subsys->hosts); @@ -1535,7 +1519,10 @@ static void nvmet_subsys_free(struct kref *ref) struct nvmet_subsys *subsys = container_of(ref, struct nvmet_subsys, ref); - WARN_ON_ONCE(!list_empty(&subsys->namespaces)); + WARN_ON_ONCE(!xa_empty(&subsys->namespaces)); + + xa_destroy(&subsys->namespaces); + nvmet_passthru_subsys_free(subsys); kfree(subsys->subsysnqn); kfree_rcu(subsys->model, rcuhead); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 27fd3b5aa621..55bafd56166a 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -167,7 +167,6 @@ struct nvmet_fc_tgt_assoc { struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; struct kref ref; struct work_struct del_work; - atomic_t del_work_active; }; @@ -1090,7 +1089,6 @@ nvmet_fc_delete_assoc(struct work_struct *work) container_of(work, struct 
nvmet_fc_tgt_assoc, del_work); nvmet_fc_delete_target_assoc(assoc); - atomic_set(&assoc->del_work_active, 0); nvmet_fc_tgt_a_put(assoc); } @@ -1123,7 +1121,6 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle) INIT_LIST_HEAD(&assoc->a_list); kref_init(&assoc->ref); INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc); - atomic_set(&assoc->del_work_active, 0); atomic_set(&assoc->terminating, 0); while (needrandom) { @@ -1243,7 +1240,8 @@ nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport, list_for_each_entry(assoc, &tgtport->assoc_list, a_list) { if (association_id == assoc->association_id) { ret = assoc; - nvmet_fc_tgt_a_get(assoc); + if (!nvmet_fc_tgt_a_get(assoc)) + ret = NULL; break; } } @@ -1477,21 +1475,15 @@ __nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport) { struct nvmet_fc_tgt_assoc *assoc, *next; unsigned long flags; - int ret; spin_lock_irqsave(&tgtport->lock, flags); list_for_each_entry_safe(assoc, next, &tgtport->assoc_list, a_list) { if (!nvmet_fc_tgt_a_get(assoc)) continue; - ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1); - if (ret == 0) { - if (!schedule_work(&assoc->del_work)) - nvmet_fc_tgt_a_put(assoc); - } else { + if (!schedule_work(&assoc->del_work)) /* already deleting - release local reference */ nvmet_fc_tgt_a_put(assoc); - } } spin_unlock_irqrestore(&tgtport->lock, flags); } @@ -1533,7 +1525,6 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port, struct nvmet_fc_tgt_assoc *assoc, *next; unsigned long flags; bool noassoc = true; - int ret; spin_lock_irqsave(&tgtport->lock, flags); list_for_each_entry_safe(assoc, next, @@ -1545,14 +1536,9 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port, continue; assoc->hostport->invalid = 1; noassoc = false; - ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1); - if (ret == 0) { - if (!schedule_work(&assoc->del_work)) - nvmet_fc_tgt_a_put(assoc); - } else { + if (!schedule_work(&assoc->del_work)) /* already deleting - release local reference */ nvmet_fc_tgt_a_put(assoc); - } } spin_unlock_irqrestore(&tgtport->lock, flags); @@ -1573,7 +1559,6 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) struct nvmet_fc_tgt_queue *queue; unsigned long flags; bool found_ctrl = false; - int ret; /* this is a bit ugly, but don't want to make locks layered */ spin_lock_irqsave(&nvmet_fc_tgtlock, flags); @@ -1597,14 +1582,9 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) nvmet_fc_tgtport_put(tgtport); if (found_ctrl) { - ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1); - if (ret == 0) { - if (!schedule_work(&assoc->del_work)) - nvmet_fc_tgt_a_put(assoc); - } else { + if (!schedule_work(&assoc->del_work)) /* already deleting - release local reference */ nvmet_fc_tgt_a_put(assoc); - } return; } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f2c80a51985f..4884ef1e46a2 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -444,7 +444,6 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work) { struct nvme_loop_ctrl *ctrl = container_of(work, struct nvme_loop_ctrl, ctrl.reset_work); - bool changed; int ret; nvme_stop_ctrl(&ctrl->ctrl); @@ -471,8 +470,8 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work) blk_mq_update_nr_hw_queues(&ctrl->tag_set, ctrl->ctrl.queue_count - 1); - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) + WARN_ON_ONCE(1); nvme_start_ctrl(&ctrl->ctrl); @@ -567,7 +566,6 @@ 
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index f2c80a51985f..4884ef1e46a2 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -444,7 +444,6 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
 {
 	struct nvme_loop_ctrl *ctrl =
 		container_of(work, struct nvme_loop_ctrl, ctrl.reset_work);
-	bool changed;
 	int ret;
 
 	nvme_stop_ctrl(&ctrl->ctrl);
@@ -471,8 +470,8 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
 	blk_mq_update_nr_hw_queues(&ctrl->tag_set,
 			ctrl->ctrl.queue_count - 1);
 
-	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
-	WARN_ON_ONCE(!changed);
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
+		WARN_ON_ONCE(1);
 
 	nvme_start_ctrl(&ctrl->ctrl);
 
@@ -567,7 +566,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
 		struct nvmf_ctrl_options *opts)
 {
 	struct nvme_loop_ctrl *ctrl;
-	bool changed;
 	int ret;
 
 	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
@@ -583,6 +581,9 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
 	if (ret)
 		goto out_put_ctrl;
 
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
+		WARN_ON_ONCE(1);
+
 	ret = -ENOMEM;
 
 	ctrl->ctrl.sqsize = opts->queue_size - 1;
@@ -617,8 +618,8 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
 	dev_info(ctrl->ctrl.device,
 		 "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn);
 
-	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
-	WARN_ON_ONCE(!changed);
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
+		WARN_ON_ONCE(1);
 
 	mutex_lock(&nvme_loop_ctrl_mutex);
 	list_add_tail(&ctrl->list, &nvme_loop_ctrl_list);
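The loop.c hunks fold the changed/WARN_ON_ONCE pair into a direct test of nvme_change_ctrl_state() and, more importantly, move the controller into NVME_CTRL_CONNECTING before initialization, so the new DELETING_NOIO teardown logic sees a well-formed state sequence. A sketch of the guarded-transition pattern with hypothetical states, not the host driver's actual transition table:

#include <linux/spinlock.h>
#include <linux/kernel.h>

enum demo_state { DEMO_NEW, DEMO_CONNECTING, DEMO_LIVE, DEMO_DELETING };

struct demo_ctrl {
	spinlock_t lock;
	enum demo_state state;
};

/* Consult a transition table under the lock; refuse anything else. */
static bool demo_change_state(struct demo_ctrl *ctrl, enum demo_state new)
{
	bool changed = false;

	spin_lock_irq(&ctrl->lock);
	switch (new) {
	case DEMO_CONNECTING:
		changed = (ctrl->state == DEMO_NEW);
		break;
	case DEMO_LIVE:
		changed = (ctrl->state == DEMO_CONNECTING);
		break;
	case DEMO_DELETING:
		changed = true;	/* deletion allowed from any state here */
		break;
	default:
		break;
	}
	if (changed)
		ctrl->state = new;
	spin_unlock_irq(&ctrl->lock);
	return changed;
}

Callers that believe a transition must succeed then write, exactly as in the hunks above: if (!demo_change_state(ctrl, DEMO_LIVE)) WARN_ON_ONCE(1);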
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 6f8bd6a93575..47ee3fb193bd 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -21,6 +21,8 @@
 #include <linux/radix-tree.h>
 #include <linux/t10-pi.h>
 
+#define NVMET_DEFAULT_VS	NVME_VS(1, 3, 0)
+
 #define NVMET_ASYNC_EVENTS		4
 #define NVMET_ERROR_LOG_SLOTS		128
 #define NVMET_NO_ERROR_LOC		((u16)-1)
@@ -52,7 +54,6 @@
 	(cpu_to_le32(offsetof(struct nvmf_connect_command, x)))
 
 struct nvmet_ns {
-	struct list_head	dev_link;
 	struct percpu_ref	ref;
 	struct block_device	*bdev;
 	struct file		*file;
@@ -219,7 +220,7 @@ struct nvmet_subsys {
 	struct mutex		lock;
 	struct kref		ref;
 
-	struct list_head	namespaces;
+	struct xarray		namespaces;
 	unsigned int		nr_namespaces;
 	unsigned int		max_nsid;
 	u16			cntlid_min;
@@ -243,6 +244,12 @@ struct nvmet_subsys {
 	struct config_group	allowed_hosts_group;
 
 	struct nvmet_subsys_model	__rcu *model;
+
+#ifdef CONFIG_NVME_TARGET_PASSTHRU
+	struct nvme_ctrl	*passthru_ctrl;
+	char			*passthru_ctrl_path;
+	struct config_group	passthru_group;
+#endif /* CONFIG_NVME_TARGET_PASSTHRU */
 };
 
 static inline struct nvmet_subsys *to_subsys(struct config_item *item)
@@ -322,6 +329,11 @@ struct nvmet_req {
 			struct bio_vec		*bvec;
 			struct work_struct	work;
 		} f;
+		struct {
+			struct request		*rq;
+			struct work_struct	work;
+			bool			use_workqueue;
+		} p;
 	};
 	int			sg_cnt;
 	int			metadata_sg_cnt;
@@ -401,6 +413,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status);
 int nvmet_req_alloc_sgls(struct nvmet_req *req);
 void nvmet_req_free_sgls(struct nvmet_req *req);
 
+void nvmet_execute_set_features(struct nvmet_req *req);
+void nvmet_execute_get_features(struct nvmet_req *req);
 void nvmet_execute_keep_alive(struct nvmet_req *req);
 
 void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
@@ -533,6 +547,43 @@ static inline u32 nvmet_dsm_len(struct nvmet_req *req)
 		sizeof(struct nvme_dsm_range);
 }
 
+#ifdef CONFIG_NVME_TARGET_PASSTHRU
+void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys);
+int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys);
+void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys);
+u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req);
+u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req);
+static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
+{
+	return subsys->passthru_ctrl;
+}
+#else /* CONFIG_NVME_TARGET_PASSTHRU */
+static inline void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys)
+{
+}
+static inline void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
+{
+}
+static inline u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
+{
+	return 0;
+}
+static inline u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req)
+{
+	return 0;
+}
+static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
+{
+	return NULL;
+}
+#endif /* CONFIG_NVME_TARGET_PASSTHRU */
+
+static inline struct nvme_ctrl *
+nvmet_req_passthru_ctrl(struct nvmet_req *req)
+{
+	return nvmet_passthru_ctrl(req->sq->ctrl->subsys);
+}
+
 u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
 
 /* Convert a 32-bit number to a 16-bit 0's based number */
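The nvmet.h hunk above uses the usual Kconfig stub idiom: real declarations when CONFIG_NVME_TARGET_PASSTHRU is set, static inline no-ops otherwise, so callers like nvmet_parse_io_cmd() compile unchanged without any #ifdef of their own. A generic sketch of the idiom; CONFIG_DEMO_FEATURE and the demo_* names are hypothetical:

#include <linux/errno.h>
#include <linux/types.h>

struct demo_subsys;
struct demo_req;

#ifdef CONFIG_DEMO_FEATURE
int demo_feature_enable(struct demo_subsys *subsys);
u16 demo_feature_parse_cmd(struct demo_req *req);
#else /* CONFIG_DEMO_FEATURE */
static inline int demo_feature_enable(struct demo_subsys *subsys)
{
	return -EOPNOTSUPP;	/* feature not built in */
}
static inline u16 demo_feature_parse_cmd(struct demo_req *req)
{
	return 0;		/* nothing to do; use the regular path */
}
#endif /* CONFIG_DEMO_FEATURE */

The compiler discards the empty inlines entirely, so the disabled configuration carries no code or branch cost at the call sites.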
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
new file mode 100644
index 000000000000..89d91dc999a6
--- /dev/null
+++ b/drivers/nvme/target/passthru.c
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe Over Fabrics Target Passthrough command implementation.
+ *
+ * Copyright (c) 2017-2018 Western Digital Corporation or its
+ * affiliates.
+ * Copyright (c) 2019-2020, Eideticom Inc.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+
+#include "../host/nvme.h"
+#include "nvmet.h"
+
+MODULE_IMPORT_NS(NVME_TARGET_PASSTHRU);
+
+/*
+ * xarray to maintain one passthru subsystem per nvme controller.
+ */
+static DEFINE_XARRAY(passthru_subsystems);
+
+static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl;
+	u16 status = NVME_SC_SUCCESS;
+	struct nvme_id_ctrl *id;
+	u32 max_hw_sectors;
+	int page_shift;
+
+	id = kzalloc(sizeof(*id), GFP_KERNEL);
+	if (!id)
+		return NVME_SC_INTERNAL;
+
+	status = nvmet_copy_from_sgl(req, 0, id, sizeof(*id));
+	if (status)
+		goto out_free;
+
+	id->cntlid = cpu_to_le16(ctrl->cntlid);
+	id->ver = cpu_to_le32(ctrl->subsys->ver);
+
+	/*
+	 * The passthru NVMe driver may have a limit on the number of segments
+	 * which depends on the host's memory fragmentation. To solve this,
+	 * ensure mdts is limited to the number of pages equal to the number
+	 * of segments.
+	 */
+	max_hw_sectors = min_not_zero(pctrl->max_segments << (PAGE_SHIFT - 9),
+				      pctrl->max_hw_sectors);
+
+	page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+
+	id->mdts = ilog2(max_hw_sectors) + 9 - page_shift;
+
+	id->acl = 3;
+	/*
+	 * We export the aerl limit of the fabrics controller; update this
+	 * when passthru-based aerl support is added.
+	 */
+	id->aerl = NVMET_ASYNC_EVENTS - 1;
+
+	/* emulate kas, as most PCIe ctrls don't have support for kas */
+	id->kas = cpu_to_le16(NVMET_KAS);
+
+	/* don't support host memory buffer */
+	id->hmpre = 0;
+	id->hmmin = 0;
+
+	id->sqes = min_t(__u8, ((0x6 << 4) | 0x6), id->sqes);
+	id->cqes = min_t(__u8, ((0x4 << 4) | 0x4), id->cqes);
+	id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
+
+	/* don't support fuse commands */
+	id->fuses = 0;
+
+	id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
+	if (ctrl->ops->flags & NVMF_KEYED_SGLS)
+		id->sgls |= cpu_to_le32(1 << 2);
+	if (req->port->inline_data_size)
+		id->sgls |= cpu_to_le32(1 << 20);
+
+	/*
+	 * When the passthru controller is set up using the nvme-loop
+	 * transport it will export the passthru ctrl's subsysnqn (the PCIe
+	 * NVMe ctrl) and will fail in nvme/host/core.c, in the
+	 * nvme_init_subsystem()->nvme_active_ctrl() code path, with a
+	 * duplicate ctrl subsysnqn. In order to prevent that we mask the
+	 * passthru-ctrl subsysnqn with the target ctrl subsysnqn.
+	 */
+	memcpy(id->subnqn, ctrl->subsysnqn, sizeof(id->subnqn));
+
+	/* use fabric id-ctrl values */
+	id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) +
+				req->port->inline_data_size) / 16);
+	id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16);
+
+	id->msdbd = ctrl->ops->msdbd;
+
+	/* Support multipath connections with fabrics */
+	id->cmic |= 1 << 1;
+
+	/* Disable reservations, see nvmet_parse_passthru_io_cmd() */
+	id->oncs &= cpu_to_le16(~NVME_CTRL_ONCS_RESERVATIONS);
+
+	status = nvmet_copy_to_sgl(req, 0, id, sizeof(struct nvme_id_ctrl));
+
+out_free:
+	kfree(id);
+	return status;
+}
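The mdts computation above encodes the maximum transfer size as a power of two in units of the controller's minimum page size (CAP.MPSMIN), capped so a maximal transfer never needs more pages than the passthru device has scatter segments. A worked example, assuming PAGE_SHIFT = 12, max_segments = 128 and MPSMIN = 0; the figures are illustrative:

#include <linux/kernel.h>	/* min_not_zero() */
#include <linux/log2.h>		/* ilog2() */

/*
 * With max_segments = 128 and PAGE_SHIFT = 12:
 *   128 << (12 - 9) = 1024 512-byte sectors = 512 KiB cap
 * With MPSMIN = 0, page_shift = 12:
 *   mdts = ilog2(1024) + 9 - 12 = 10 + 9 - 12 = 7
 * and the host decodes 2^7 pages of 4 KiB = 512 KiB, matching the cap.
 */
static u8 demo_mdts(u32 max_segments, u32 max_hw_sectors, u32 mpsmin)
{
	u32 capped = min_not_zero(max_segments << (PAGE_SHIFT - 9),
				  max_hw_sectors);
	int page_shift = mpsmin + 12;

	return ilog2(capped) + 9 - page_shift;
}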
+static u16 nvmet_passthru_override_id_ns(struct nvmet_req *req)
+{
+	u16 status = NVME_SC_SUCCESS;
+	struct nvme_id_ns *id;
+	int i;
+
+	id = kzalloc(sizeof(*id), GFP_KERNEL);
+	if (!id)
+		return NVME_SC_INTERNAL;
+
+	status = nvmet_copy_from_sgl(req, 0, id, sizeof(struct nvme_id_ns));
+	if (status)
+		goto out_free;
+
+	for (i = 0; i < (id->nlbaf + 1); i++)
+		if (id->lbaf[i].ms)
+			memset(&id->lbaf[i], 0, sizeof(id->lbaf[i]));
+
+	id->flbas = id->flbas & ~(1 << 4);
+
+	/*
+	 * Presently the NVMeoF target code does not support sending
+	 * metadata, so we must disable it here. This should be updated
+	 * once the target starts supporting metadata.
+	 */
+	id->mc = 0;
+
+	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
+
+out_free:
+	kfree(id);
+	return status;
+}
+
+static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
+{
+	struct nvmet_req *req = container_of(w, struct nvmet_req, p.work);
+	struct request *rq = req->p.rq;
+	u16 status;
+
+	nvme_execute_passthru_rq(rq);
+
+	status = nvme_req(rq)->status;
+	if (status == NVME_SC_SUCCESS &&
+	    req->cmd->common.opcode == nvme_admin_identify) {
+		switch (req->cmd->identify.cns) {
+		case NVME_ID_CNS_CTRL:
+			nvmet_passthru_override_id_ctrl(req);
+			break;
+		case NVME_ID_CNS_NS:
+			nvmet_passthru_override_id_ns(req);
+			break;
+		}
+	}
+
+	req->cqe->result = nvme_req(rq)->result;
+	nvmet_req_complete(req, status);
+	blk_put_request(rq);
+}
+
+static void nvmet_passthru_req_done(struct request *rq,
+				    blk_status_t blk_status)
+{
+	struct nvmet_req *req = rq->end_io_data;
+
+	req->cqe->result = nvme_req(rq)->result;
+	nvmet_req_complete(req, nvme_req(rq)->status);
+	blk_put_request(rq);
+}
+
+static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
+{
+	int sg_cnt = req->sg_cnt;
+	struct scatterlist *sg;
+	int op_flags = 0;
+	struct bio *bio;
+	int i, ret;
+
+	if (req->cmd->common.opcode == nvme_cmd_flush)
+		op_flags = REQ_FUA;
+	else if (nvme_is_write(req->cmd))
+		op_flags = REQ_SYNC | REQ_IDLE;
+
+	bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
+	bio->bi_end_io = bio_put;
+	bio->bi_opf = req_op(rq) | op_flags;
+
+	for_each_sg(req->sg, sg, req->sg_cnt, i) {
+		if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length,
+				    sg->offset) < sg->length) {
+			bio_put(bio);
+			return -EINVAL;
+		}
+		sg_cnt--;
+	}
+
+	ret = blk_rq_append_bio(rq, &bio);
+	if (unlikely(ret)) {
+		bio_put(bio);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
+{
+	struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req);
+	struct request_queue *q = ctrl->admin_q;
+	struct nvme_ns *ns = NULL;
+	struct request *rq = NULL;
+	u32 effects;
+	u16 status;
+	int ret;
+
+	if (likely(req->sq->qid != 0)) {
+		u32 nsid = le32_to_cpu(req->cmd->common.nsid);
+
+		ns = nvme_find_get_ns(ctrl, nsid);
+		if (unlikely(!ns)) {
+			pr_err("failed to get passthru ns nsid:%u\n", nsid);
+			status = NVME_SC_INVALID_NS | NVME_SC_DNR;
+			goto fail_out;
+		}
+
+		q = ns->queue;
+	}
+
+	rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
+	if (IS_ERR(rq)) {
+		rq = NULL;
+		status = NVME_SC_INTERNAL;
+		goto fail_out;
+	}
+
+	if (req->sg_cnt) {
+		ret = nvmet_passthru_map_sg(req, rq);
+		if (unlikely(ret)) {
+			status = NVME_SC_INTERNAL;
+			goto fail_out;
+		}
+	}
+
+	/*
+	 * If the command we are about to execute has effects, or needs an
+	 * end_req function, run nvme_execute_passthru_rq() synchronously in
+	 * a work item, since the end_req function and nvme_passthru_end()
+	 * can't be called from the request done callback, which typically
+	 * runs in interrupt context.
+	 */
+	effects = nvme_command_effects(ctrl, ns, req->cmd->common.opcode);
+	if (req->p.use_workqueue || effects) {
+		INIT_WORK(&req->p.work, nvmet_passthru_execute_cmd_work);
+		req->p.rq = rq;
+		schedule_work(&req->p.work);
+	} else {
+		rq->end_io_data = req;
+		blk_execute_rq_nowait(rq->q, ns ? ns->disk : NULL, rq, 0,
+				      nvmet_passthru_req_done);
+	}
+
+	if (ns)
+		nvme_put_ns(ns);
+
+	return;
+
+fail_out:
+	if (ns)
+		nvme_put_ns(ns);
+	nvmet_req_complete(req, status);
+	if (rq)
+		blk_put_request(rq);
+}
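The dispatch at the end of nvmet_passthru_execute_cmd() is worth restating in isolation: commands whose effects (or identify overrides) require post-execution fixups must run in process context, because nvme_passthru_end() can sleep, while plain I/O completes straight from the block layer's done callback. A condensed sketch of that decision, reusing the names from the file above rather than introducing new machinery:

/* Condensed restatement of the submit decision, not additional code. */
static void demo_passthru_submit(struct nvmet_req *req, struct request *rq,
				 u32 effects)
{
	if (req->p.use_workqueue || effects) {
		/* process context: the handler may sleep after execution */
		INIT_WORK(&req->p.work, nvmet_passthru_execute_cmd_work);
		req->p.rq = rq;
		schedule_work(&req->p.work);
	} else {
		/* interrupt-safe path: complete from the done callback */
		rq->end_io_data = req;
		blk_execute_rq_nowait(rq->q, NULL, rq, 0,
				      nvmet_passthru_req_done);
	}
}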
+/*
+ * We need to emulate Set Features (Host Behavior Support) to ensure that
+ * the behaviour requested by the target's host matches the behaviour the
+ * passthru device's host already requested, and fail otherwise.
+ */
+static void nvmet_passthru_set_host_behaviour(struct nvmet_req *req)
+{
+	struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req);
+	struct nvme_feat_host_behavior *host;
+	u16 status = NVME_SC_INTERNAL;
+	int ret;
+
+	host = kzalloc(sizeof(*host) * 2, GFP_KERNEL);
+	if (!host)
+		goto out_complete_req;
+
+	ret = nvme_get_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
+				host, sizeof(*host), NULL);
+	if (ret)
+		goto out_free_host;
+
+	status = nvmet_copy_from_sgl(req, 0, &host[1], sizeof(*host));
+	if (status)
+		goto out_free_host;
+
+	if (memcmp(&host[0], &host[1], sizeof(host[0]))) {
+		pr_warn("target host has requested different behaviour from the local host\n");
+		status = NVME_SC_INTERNAL;
+	}
+
+out_free_host:
+	kfree(host);
+out_complete_req:
+	nvmet_req_complete(req, status);
+}
+
+static u16 nvmet_setup_passthru_command(struct nvmet_req *req)
+{
+	req->p.use_workqueue = false;
+	req->execute = nvmet_passthru_execute_cmd;
+	return NVME_SC_SUCCESS;
+}
+
+u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req)
+{
+	switch (req->cmd->common.opcode) {
+	case nvme_cmd_resv_register:
+	case nvme_cmd_resv_report:
+	case nvme_cmd_resv_acquire:
+	case nvme_cmd_resv_release:
+		/*
+		 * Reservations cannot be supported properly because the
+		 * underlying device has no way of differentiating different
+		 * hosts that connect via fabrics. This could potentially be
+		 * emulated in the future if regular targets grow support for
+		 * this feature.
+		 */
+		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+	}
+
+	return nvmet_setup_passthru_command(req);
+}
+
+/*
+ * Only features that are emulated or specifically allowed in the list are
+ * passed down to the controller. This function implements the allow list
+ * for both get and set features.
+ */
+static u16 nvmet_passthru_get_set_features(struct nvmet_req *req)
+{
+	switch (le32_to_cpu(req->cmd->features.fid)) {
+	case NVME_FEAT_ARBITRATION:
+	case NVME_FEAT_POWER_MGMT:
+	case NVME_FEAT_LBA_RANGE:
+	case NVME_FEAT_TEMP_THRESH:
+	case NVME_FEAT_ERR_RECOVERY:
+	case NVME_FEAT_VOLATILE_WC:
+	case NVME_FEAT_WRITE_ATOMIC:
+	case NVME_FEAT_AUTO_PST:
+	case NVME_FEAT_TIMESTAMP:
+	case NVME_FEAT_HCTM:
+	case NVME_FEAT_NOPSC:
+	case NVME_FEAT_RRL:
+	case NVME_FEAT_PLM_CONFIG:
+	case NVME_FEAT_PLM_WINDOW:
+	case NVME_FEAT_HOST_BEHAVIOR:
+	case NVME_FEAT_SANITIZE:
+	case NVME_FEAT_VENDOR_START ... NVME_FEAT_VENDOR_END:
+		return nvmet_setup_passthru_command(req);
+
+	case NVME_FEAT_ASYNC_EVENT:
+		/* There is no support for forwarding ASYNC events */
+	case NVME_FEAT_IRQ_COALESCE:
+	case NVME_FEAT_IRQ_CONFIG:
+		/* The IRQ settings will not apply to the target controller */
+	case NVME_FEAT_HOST_MEM_BUF:
+		/*
+		 * Any HMB that's set will not be passed through and will
+		 * not work as expected
+		 */
+	case NVME_FEAT_SW_PROGRESS:
+		/*
+		 * The Pre-Boot Software Load Count doesn't make much
+		 * sense for a target to export
+		 */
+	case NVME_FEAT_RESV_MASK:
+	case NVME_FEAT_RESV_PERSIST:
+		/* No reservations, see nvmet_parse_passthru_io_cmd() */
+	default:
+		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+	}
+}
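The feature allowlist above is deliberately default-deny: anything not known to be safe to forward is rejected, and the vendor-specific FID window is admitted with a GCC case-range. A generic sketch of the style, with hypothetical DEMO_* feature IDs rather than the NVMe ones:

#include <linux/types.h>

#define DEMO_FEAT_A		0x01
#define DEMO_FEAT_B		0x02
#define DEMO_FEAT_VENDOR_START	0xC0
#define DEMO_FEAT_VENDOR_END	0xFF

static bool demo_feature_allowed(u32 fid)
{
	switch (fid) {
	case DEMO_FEAT_A:
	case DEMO_FEAT_B:
	/* GCC extension: one case label covering a contiguous range */
	case DEMO_FEAT_VENDOR_START ... DEMO_FEAT_VENDOR_END:
		return true;
	default:
		return false;	/* anything unlisted is rejected */
	}
}

The default-deny shape means new spec features stay blocked until someone audits them, which is the safer failure mode for a passthru boundary.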
+u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
+{
+	/*
+	 * Pass through all vendor specific commands
+	 */
+	if (req->cmd->common.opcode >= nvme_admin_vendor_start)
+		return nvmet_setup_passthru_command(req);
+
+	switch (req->cmd->common.opcode) {
+	case nvme_admin_async_event:
+		req->execute = nvmet_execute_async_event;
+		return NVME_SC_SUCCESS;
+	case nvme_admin_keep_alive:
+		/*
+		 * Most PCIe ctrls don't support the keep alive cmd, so we
+		 * route keep alive to the non-passthru mode. Change this
+		 * code once PCIe ctrls with keep alive support become
+		 * available.
+		 */
+		req->execute = nvmet_execute_keep_alive;
+		return NVME_SC_SUCCESS;
+	case nvme_admin_set_features:
+		switch (le32_to_cpu(req->cmd->features.fid)) {
+		case NVME_FEAT_ASYNC_EVENT:
+		case NVME_FEAT_KATO:
+		case NVME_FEAT_NUM_QUEUES:
+		case NVME_FEAT_HOST_ID:
+			req->execute = nvmet_execute_set_features;
+			return NVME_SC_SUCCESS;
+		case NVME_FEAT_HOST_BEHAVIOR:
+			req->execute = nvmet_passthru_set_host_behaviour;
+			return NVME_SC_SUCCESS;
+		default:
+			return nvmet_passthru_get_set_features(req);
+		}
+		break;
+	case nvme_admin_get_features:
+		switch (le32_to_cpu(req->cmd->features.fid)) {
+		case NVME_FEAT_ASYNC_EVENT:
+		case NVME_FEAT_KATO:
+		case NVME_FEAT_NUM_QUEUES:
+		case NVME_FEAT_HOST_ID:
+			req->execute = nvmet_execute_get_features;
+			return NVME_SC_SUCCESS;
+		default:
+			return nvmet_passthru_get_set_features(req);
+		}
+		break;
+	case nvme_admin_identify:
+		switch (req->cmd->identify.cns) {
+		case NVME_ID_CNS_CTRL:
+			req->execute = nvmet_passthru_execute_cmd;
+			req->p.use_workqueue = true;
+			return NVME_SC_SUCCESS;
+		case NVME_ID_CNS_NS:
+			req->execute = nvmet_passthru_execute_cmd;
+			req->p.use_workqueue = true;
+			return NVME_SC_SUCCESS;
+		default:
+			return nvmet_setup_passthru_command(req);
+		}
+	case nvme_admin_get_log_page:
+		return nvmet_setup_passthru_command(req);
+	default:
+		/* Reject commands not in the allowlist above */
+		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+	}
+}
+
+int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys)
+{
+	struct nvme_ctrl *ctrl;
+	int ret = -EINVAL;
+	void *old;
+
+	mutex_lock(&subsys->lock);
+	if (!subsys->passthru_ctrl_path)
+		goto out_unlock;
+	if (subsys->passthru_ctrl)
+		goto out_unlock;
+
+	if (subsys->nr_namespaces) {
+		pr_info("cannot enable both passthru and regular namespaces for a single subsystem\n");
+		goto out_unlock;
+	}
+
+	ctrl = nvme_ctrl_get_by_path(subsys->passthru_ctrl_path);
+	if (IS_ERR(ctrl)) {
+		ret = PTR_ERR(ctrl);
+		pr_err("failed to open nvme controller %s\n",
+		       subsys->passthru_ctrl_path);
+
+		goto out_unlock;
+	}
+
+	old = xa_cmpxchg(&passthru_subsystems, ctrl->cntlid, NULL,
+			 subsys, GFP_KERNEL);
+	if (xa_is_err(old)) {
+		ret = xa_err(old);
+		goto out_put_ctrl;
+	}
+
+	if (old)
+		goto out_put_ctrl;
+
+	subsys->passthru_ctrl = ctrl;
+	subsys->ver = ctrl->vs;
+
+	if (subsys->ver < NVME_VS(1, 2, 1)) {
+		pr_warn("nvme controller version is too old: %llu.%llu.%llu, advertising 1.2.1\n",
+			NVME_MAJOR(subsys->ver), NVME_MINOR(subsys->ver),
+			NVME_TERTIARY(subsys->ver));
+		subsys->ver = NVME_VS(1, 2, 1);
+	}
+
+	mutex_unlock(&subsys->lock);
+	return 0;
+
+out_put_ctrl:
+	nvme_put_ctrl(ctrl);
+out_unlock:
+	mutex_unlock(&subsys->lock);
+	return ret;
+}
+
+static void __nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
+{
+	if (subsys->passthru_ctrl) {
+		xa_erase(&passthru_subsystems, subsys->passthru_ctrl->cntlid);
+		nvme_put_ctrl(subsys->passthru_ctrl);
+	}
+	subsys->passthru_ctrl = NULL;
+	subsys->ver = NVMET_DEFAULT_VS;
+}
+
+void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
+{
+	mutex_lock(&subsys->lock);
+	__nvmet_passthru_ctrl_disable(subsys);
+	mutex_unlock(&subsys->lock);
+}
+
+void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys)
+{
+	mutex_lock(&subsys->lock);
+	__nvmet_passthru_ctrl_disable(subsys);
+	mutex_unlock(&subsys->lock);
+	kfree(subsys->passthru_ctrl_path);
+}
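nvmet_passthru_ctrl_enable() claims the controller for exactly one passthru subsystem by keying the global passthru_subsystems xarray on cntlid: xa_cmpxchg() installs the subsystem only if the slot is still NULL, so a second enable attempt observes the existing owner and backs off. A sketch of that claim/release pairing, with demo_* names that are illustrative only:

#include <linux/xarray.h>
#include <linux/errno.h>

static DEFINE_XARRAY(demo_owners);

static int demo_claim(unsigned long key, void *owner)
{
	/* atomically install @owner at @key, but only if the slot is empty */
	void *old = xa_cmpxchg(&demo_owners, key, NULL, owner, GFP_KERNEL);

	if (xa_is_err(old))
		return xa_err(old);	/* e.g. allocation failure */
	if (old)
		return -EBUSY;		/* someone else already owns @key */
	return 0;
}

static void demo_unclaim(unsigned long key)
{
	xa_erase(&demo_owners, key);
}

Unlike a lookup-then-insert under a lock, the compare-and-exchange makes the "is it free?" check and the claim a single atomic step.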
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 6731e0349480..3ccb59260b4a 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -752,7 +752,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
 {
 	struct nvmet_rdma_rsp *rsp =
 		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
-	struct nvmet_rdma_queue *queue = cq->cq_context;
+	struct nvmet_rdma_queue *queue = wc->qp->qp_context;
 	u16 status = 0;
 
 	WARN_ON(rsp->n_rdma <= 0);
@@ -1008,7 +1008,7 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 {
 	struct nvmet_rdma_cmd *cmd =
 		container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
-	struct nvmet_rdma_queue *queue = cq->cq_context;
+	struct nvmet_rdma_queue *queue = wc->qp->qp_context;
 	struct nvmet_rdma_rsp *rsp;
 
 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
@@ -1258,9 +1258,8 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
 	 */
 	nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
 
-	queue->cq = ib_alloc_cq(ndev->device, queue,
-			nr_cqe + 1, queue->comp_vector,
-			IB_POLL_WORKQUEUE);
+	queue->cq = ib_cq_pool_get(ndev->device, nr_cqe + 1,
+				   queue->comp_vector, IB_POLL_WORKQUEUE);
 	if (IS_ERR(queue->cq)) {
 		ret = PTR_ERR(queue->cq);
 		pr_err("failed to create CQ cqe= %d ret= %d\n",
@@ -1322,7 +1321,7 @@ out:
 err_destroy_qp:
 	rdma_destroy_qp(queue->cm_id);
 err_destroy_cq:
-	ib_free_cq(queue->cq);
+	ib_cq_pool_put(queue->cq, nr_cqe + 1);
 	goto out;
 }
 
@@ -1332,7 +1331,8 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
 	if (queue->cm_id)
 		rdma_destroy_id(queue->cm_id);
 	ib_destroy_qp(queue->qp);
-	ib_free_cq(queue->cq);
+	ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 *
+		       queue->send_queue_size + 1);
 }
 
 static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
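The rdma.c hunks above switch from a completion queue owned by each nvmet queue (ib_alloc_cq()) to the shared CQ pool API that entered the RDMA core in the same kernel cycle. Because a pooled CQ may serve several QPs, per-queue state can no longer hang off the CQ; the two completion-handler hunks therefore recover the queue from wc->qp->qp_context instead of cq->cq_context. A sketch of the get/put pairing, as an illustrative wrapper around the v5.9-era ib_cq_pool_* calls from <rdma/ib_verbs.h>:

#include <rdma/ib_verbs.h>

static struct ib_cq *demo_get_cq(struct ib_device *dev, unsigned int nr_cqe,
				 int comp_vector)
{
	/*
	 * nr_cqe reserves capacity on a (possibly shared) pooled CQ; the
	 * comp_vector is only a hint, since the pool picks the least-used CQ.
	 */
	return ib_cq_pool_get(dev, nr_cqe, comp_vector, IB_POLL_WORKQUEUE);
}

static void demo_put_cq(struct ib_cq *cq, unsigned int nr_cqe)
{
	/* the same nr_cqe passed to get must be returned to the pool */
	ib_cq_pool_put(cq, nr_cqe);
}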