diff options
author | Linus Torvalds | 2022-05-23 13:06:15 -0700 |
---|---|---|
committer | Linus Torvalds | 2022-05-23 13:06:15 -0700 |
commit | 9836e93c0a7e031ac6a71c56171c229de1eea7cf (patch) | |
tree | f53f3460e86752c50aac9ee16b4426c84d277899 /drivers | |
parent | e1a8fde7203fa8a3e3f35d4f9df47477d23529c1 (diff) | |
parent | 3fe07bcd800d6e5e4e4263ca2564d69095c157bf (diff) |
Merge tag 'for-5.19/io_uring-passthrough-2022-05-22' of git://git.kernel.dk/linux-block
Pull io_uring NVMe command passthrough from Jens Axboe:
"On top of everything else, this adds support for passthrough for
io_uring.
The initial feature for this is NVMe passthrough support, which allows
non-filesystem-based I/O commands and admin commands.
To support this, io_uring grows support for SQE and CQE members that
are twice as big, allowing a full NVMe command to be passed in without
having to copy data around, and allowing a command to complete with
more than just a single 32-bit value as the output."
* tag 'for-5.19/io_uring-passthrough-2022-05-22' of git://git.kernel.dk/linux-block: (22 commits)
io_uring: cleanup handling of the two task_work lists
nvme: enable uring-passthrough for admin commands
nvme: helper for uring-passthrough checks
blk-mq: fix passthrough plugging
nvme: add vectored-io support for uring-cmd
nvme: wire-up uring-cmd support for io-passthru on char-device.
nvme: refactor nvme_submit_user_cmd()
block: wire-up support for passthrough plugging
fs,io_uring: add infrastructure for uring-cmd
io_uring: support CQE32 for nop operation
io_uring: enable CQE32
io_uring: support CQE32 in /proc info
io_uring: add tracing for additional CQE32 fields
io_uring: overflow processing for CQE32
io_uring: flush completions for CQE32
io_uring: modify io_get_cqe for CQE32
io_uring: add CQE32 completion processing
io_uring: add CQE32 setup processing
io_uring: change ring size calculation for CQE32
io_uring: store add. return values for CQE32
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/nvme/host/core.c | 2 | ||||
-rw-r--r-- | drivers/nvme/host/ioctl.c | 278 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 1 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 5 |
4 files changed, 274 insertions, 12 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e1846d04817f..1a984045e49c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3146,6 +3146,7 @@ static const struct file_operations nvme_dev_fops = { .release = nvme_dev_release, .unlocked_ioctl = nvme_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = nvme_dev_uring_cmd, }; static ssize_t nvme_sysfs_reset(struct device *dev, @@ -3699,6 +3700,7 @@ static const struct file_operations nvme_ns_chr_fops = { .release = nvme_ns_chr_release, .unlocked_ioctl = nvme_ns_chr_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = nvme_ns_chr_uring_cmd, }; static int nvme_add_ns_cdev(struct nvme_ns *ns) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 554566371ffa..096b1b47d750 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -5,6 +5,7 @@ */ #include <linux/ptrace.h> /* for force_successful_syscall_return */ #include <linux/nvme_ioctl.h> +#include <linux/io_uring.h> #include "nvme.h" /* @@ -53,10 +54,21 @@ out: return ERR_PTR(ret); } -static int nvme_submit_user_cmd(struct request_queue *q, +static int nvme_finish_user_metadata(struct request *req, void __user *ubuf, + void *meta, unsigned len, int ret) +{ + if (!ret && req_op(req) == REQ_OP_DRV_IN && + copy_to_user(ubuf, meta, len)) + ret = -EFAULT; + kfree(meta); + return ret; +} + +static struct request *nvme_alloc_user_request(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout, bool vec) + u32 meta_seed, void **metap, unsigned timeout, bool vec, + unsigned int rq_flags, blk_mq_req_flags_t blk_flags) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -66,9 +78,9 @@ static int nvme_submit_user_cmd(struct request_queue *q, void *meta = NULL; int ret; - req = blk_mq_alloc_request(q, nvme_req_op(cmd), 0); + req = 
blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags); if (IS_ERR(req)) - return PTR_ERR(req); + return req; nvme_init_request(req, cmd); if (timeout) @@ -105,26 +117,50 @@ static int nvme_submit_user_cmd(struct request_queue *q, goto out_unmap; } req->cmd_flags |= REQ_INTEGRITY; + *metap = meta; } } + return req; + +out_unmap: + if (bio) + blk_rq_unmap_user(bio); +out: + blk_mq_free_request(req); + return ERR_PTR(ret); +} + +static int nvme_submit_user_cmd(struct request_queue *q, + struct nvme_command *cmd, void __user *ubuffer, + unsigned bufflen, void __user *meta_buffer, unsigned meta_len, + u32 meta_seed, u64 *result, unsigned timeout, bool vec) +{ + struct request *req; + void *meta = NULL; + struct bio *bio; + int ret; + + req = nvme_alloc_user_request(q, cmd, ubuffer, bufflen, meta_buffer, + meta_len, meta_seed, &meta, timeout, vec, 0, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + bio = req->bio; + ret = nvme_execute_passthru_rq(req); + if (result) *result = le64_to_cpu(nvme_req(req)->result.u64); - if (meta && !ret && !write) { - if (copy_to_user(meta_buffer, meta, meta_len)) - ret = -EFAULT; - } - kfree(meta); - out_unmap: + if (meta) + ret = nvme_finish_user_metadata(req, meta_buffer, meta, + meta_len, ret); if (bio) blk_rq_unmap_user(bio); - out: blk_mq_free_request(req); return ret; } - static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_user_io io; @@ -296,6 +332,139 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return status; } +struct nvme_uring_data { + __u64 metadata; + __u64 addr; + __u32 data_len; + __u32 metadata_len; + __u32 timeout_ms; +}; + +/* + * This overlays struct io_uring_cmd pdu. + * Expect build errors if this grows larger than that. 
+ */ +struct nvme_uring_cmd_pdu { + union { + struct bio *bio; + struct request *req; + }; + void *meta; /* kernel-resident buffer */ + void __user *meta_buffer; + u32 meta_len; +}; + +static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( + struct io_uring_cmd *ioucmd) +{ + return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; +} + +static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd) +{ + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + struct request *req = pdu->req; + struct bio *bio = req->bio; + int status; + u64 result; + + if (nvme_req(req)->flags & NVME_REQ_CANCELLED) + status = -EINTR; + else + status = nvme_req(req)->status; + + result = le64_to_cpu(nvme_req(req)->result.u64); + + if (pdu->meta) + status = nvme_finish_user_metadata(req, pdu->meta_buffer, + pdu->meta, pdu->meta_len, status); + if (bio) + blk_rq_unmap_user(bio); + blk_mq_free_request(req); + + io_uring_cmd_done(ioucmd, status, result); +} + +static void nvme_uring_cmd_end_io(struct request *req, blk_status_t err) +{ + struct io_uring_cmd *ioucmd = req->end_io_data; + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + /* extract bio before reusing the same field for request */ + struct bio *bio = pdu->bio; + + pdu->req = req; + req->bio = bio; + /* this takes care of moving rest of completion-work to task context */ + io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb); +} + +static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec) +{ + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + const struct nvme_uring_cmd *cmd = ioucmd->cmd; + struct request_queue *q = ns ? 
ns->queue : ctrl->admin_q; + struct nvme_uring_data d; + struct nvme_command c; + struct request *req; + unsigned int rq_flags = 0; + blk_mq_req_flags_t blk_flags = 0; + void *meta = NULL; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + c.common.opcode = READ_ONCE(cmd->opcode); + c.common.flags = READ_ONCE(cmd->flags); + if (c.common.flags) + return -EINVAL; + + c.common.command_id = 0; + c.common.nsid = cpu_to_le32(cmd->nsid); + if (!nvme_validate_passthru_nsid(ctrl, ns, le32_to_cpu(c.common.nsid))) + return -EINVAL; + + c.common.cdw2[0] = cpu_to_le32(READ_ONCE(cmd->cdw2)); + c.common.cdw2[1] = cpu_to_le32(READ_ONCE(cmd->cdw3)); + c.common.metadata = 0; + c.common.dptr.prp1 = c.common.dptr.prp2 = 0; + c.common.cdw10 = cpu_to_le32(READ_ONCE(cmd->cdw10)); + c.common.cdw11 = cpu_to_le32(READ_ONCE(cmd->cdw11)); + c.common.cdw12 = cpu_to_le32(READ_ONCE(cmd->cdw12)); + c.common.cdw13 = cpu_to_le32(READ_ONCE(cmd->cdw13)); + c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); + c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); + + d.metadata = READ_ONCE(cmd->metadata); + d.addr = READ_ONCE(cmd->addr); + d.data_len = READ_ONCE(cmd->data_len); + d.metadata_len = READ_ONCE(cmd->metadata_len); + d.timeout_ms = READ_ONCE(cmd->timeout_ms); + + if (issue_flags & IO_URING_F_NONBLOCK) { + rq_flags = REQ_NOWAIT; + blk_flags = BLK_MQ_REQ_NOWAIT; + } + + req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr), + d.data_len, nvme_to_user_ptr(d.metadata), + d.metadata_len, 0, &meta, d.timeout_ms ? 
+ msecs_to_jiffies(d.timeout_ms) : 0, vec, rq_flags, + blk_flags); + if (IS_ERR(req)) + return PTR_ERR(req); + req->end_io_data = ioucmd; + + /* to free bio on completion, as req->bio will be null at that time */ + pdu->bio = req->bio; + pdu->meta = meta; + pdu->meta_buffer = nvme_to_user_ptr(d.metadata); + pdu->meta_len = d.metadata_len; + + blk_execute_rq_nowait(req, 0, nvme_uring_cmd_end_io); + return -EIOCBQUEUED; +} + static bool is_ctrl_ioctl(unsigned int cmd) { if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) @@ -387,6 +556,53 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return __nvme_ioctl(ns, cmd, (void __user *)arg); } +static int nvme_uring_cmd_checks(unsigned int issue_flags) +{ + /* IOPOLL not supported yet */ + if (issue_flags & IO_URING_F_IOPOLL) + return -EOPNOTSUPP; + + /* NVMe passthrough requires big SQE/CQE support */ + if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) != + (IO_URING_F_SQE128|IO_URING_F_CQE32)) + return -EOPNOTSUPP; + return 0; +} + +static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd, + unsigned int issue_flags) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + int ret; + + BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); + + ret = nvme_uring_cmd_checks(issue_flags); + if (ret) + return ret; + + switch (ioucmd->cmd_op) { + case NVME_URING_CMD_IO: + ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, false); + break; + case NVME_URING_CMD_IO_VEC: + ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, true); + break; + default: + ret = -ENOTTY; + } + + return ret; +} + +int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) +{ + struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev, + struct nvme_ns, cdev); + + return nvme_ns_uring_cmd(ns, ioucmd, issue_flags); +} + #ifdef CONFIG_NVME_MULTIPATH static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp, 
struct nvme_ns_head *head, int srcu_idx) @@ -453,8 +669,46 @@ out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; } + +int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, + unsigned int issue_flags) +{ + struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; + struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); + int srcu_idx = srcu_read_lock(&head->srcu); + struct nvme_ns *ns = nvme_find_path(head); + int ret = -EINVAL; + + if (ns) + ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags); + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} #endif /* CONFIG_NVME_MULTIPATH */ +int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) +{ + struct nvme_ctrl *ctrl = ioucmd->file->private_data; + int ret; + + ret = nvme_uring_cmd_checks(issue_flags); + if (ret) + return ret; + + switch (ioucmd->cmd_op) { + case NVME_URING_CMD_ADMIN: + ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, false); + break; + case NVME_URING_CMD_ADMIN_VEC: + ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, true); + break; + default: + ret = -ENOTTY; + } + + return ret; +} + static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d464fdf978fb..d3e2440d8abb 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -437,6 +437,7 @@ static const struct file_operations nvme_ns_head_chr_fops = { .release = nvme_ns_head_chr_release, .unlocked_ioctl = nvme_ns_head_chr_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = nvme_ns_head_chr_uring_cmd, }; static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a2b53ca63335..26d35c557588 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -782,7 +782,12 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 
long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, + unsigned int issue_flags); +int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, + unsigned int issue_flags); int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); +int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct pr_ops nvme_pr_ops; |