From 886d0137f104a440d9dfa1d16efc1db06c9a2c02 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Mar 2021 12:59:30 -0700 Subject: io-wq: fix race in freeing 'wq' and worker access Ran into a use-after-free on the main io-wq struct, wq. It has a worker ref and completion event, but the manager itself isn't holding a reference. This can lead to a race where the manager thinks there are no workers and exits, but a worker is being added. That leads to the following trace: BUG: KASAN: use-after-free in io_wqe_worker+0x4c0/0x5e0 Read of size 8 at addr ffff888108baa8a0 by task iou-wrk-3080422/3080425 CPU: 5 PID: 3080425 Comm: iou-wrk-3080422 Not tainted 5.12.0-rc1+ #110 Hardware name: Micro-Star International Co., Ltd. MS-7C60/TRX40 PRO 10G (MS-7C60), BIOS 1.60 05/13/2020 Call Trace: dump_stack+0x90/0xbe print_address_description.constprop.0+0x67/0x28d ? io_wqe_worker+0x4c0/0x5e0 kasan_report.cold+0x7b/0xd4 ? io_wqe_worker+0x4c0/0x5e0 __asan_load8+0x6d/0xa0 io_wqe_worker+0x4c0/0x5e0 ? io_worker_handle_work+0xc00/0xc00 ? recalc_sigpending+0xe5/0x120 ? io_worker_handle_work+0xc00/0xc00 ? io_worker_handle_work+0xc00/0xc00 ret_from_fork+0x1f/0x30 Allocated by task 3080422: kasan_save_stack+0x23/0x60 __kasan_kmalloc+0x80/0xa0 kmem_cache_alloc_node_trace+0xa0/0x480 io_wq_create+0x3b5/0x600 io_uring_alloc_task_context+0x13c/0x380 io_uring_add_task_file+0x109/0x140 __x64_sys_io_uring_enter+0x45f/0x660 do_syscall_64+0x32/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 3080422: kasan_save_stack+0x23/0x60 kasan_set_track+0x20/0x40 kasan_set_free_info+0x24/0x40 __kasan_slab_free+0xe8/0x120 kfree+0xa8/0x400 io_wq_put+0x14a/0x220 io_wq_put_and_exit+0x9a/0xc0 io_uring_clean_tctx+0x101/0x140 __io_uring_files_cancel+0x36e/0x3c0 do_exit+0x169/0x1340 __x64_sys_exit+0x34/0x40 do_syscall_64+0x32/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Have the manager itself hold a reference, and now both drop points drop and complete if we hit zero, and the manager can unconditionally do a wait_for_completion() instead of having a race between reading the ref count and waiting if it was non-zero. 
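The fix boils down to a "bias reference plus completion" pattern: the owner takes one reference up front, every drop point does a dec-and-test followed by complete(), and the owner can then wait unconditionally. Below is a minimal userspace C analogue of that pattern (pthreads and stdatomic stand in for the kernel's completion and atomic_t; it is an illustration of the idea, not the kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct completion {
        pthread_mutex_t lock;   /* initialise with pthread_mutex_init() */
        pthread_cond_t  cond;   /* initialise with pthread_cond_init() */
        bool            done;
};

static void complete(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        c->done = true;
        pthread_cond_broadcast(&c->cond);
        pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        while (!c->done)
                pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

struct wq_refs {
        atomic_int        worker_refs;  /* set to 1 up front: the manager's bias ref */
        struct completion worker_done;
};

/* every drop point, worker or manager, uses the same helper */
static void drop_worker_ref(struct wq_refs *r)
{
        if (atomic_fetch_sub(&r->worker_refs, 1) == 1)
                complete(&r->worker_done);
}

/* manager exit: drop the bias ref, then wait unconditionally */
static void manager_wait_for_workers(struct wq_refs *r)
{
        drop_worker_ref(r);
        wait_for_completion(&r->worker_done);
}

Because the manager owns a reference of its own, there is no window in which it can observe a zero count while a worker is still being added.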
Fixes: fb3a1f6c745c ("io-wq: have manager wait for all workers to exit") Signed-off-by: Jens Axboe --- fs/io-wq.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 28868eb4cd09..1bfdb86336e4 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -722,9 +722,9 @@ static int io_wq_manager(void *data) io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); rcu_read_unlock(); - /* we might not ever have created any workers */ - if (atomic_read(&wq->worker_refs)) - wait_for_completion(&wq->worker_done); + if (atomic_dec_and_test(&wq->worker_refs)) + complete(&wq->worker_done); + wait_for_completion(&wq->worker_done); spin_lock_irq(&wq->hash->wait.lock); for_each_node(node) @@ -774,7 +774,8 @@ static int io_wq_fork_manager(struct io_wq *wq) if (wq->manager) return 0; - reinit_completion(&wq->worker_done); + init_completion(&wq->worker_done); + atomic_set(&wq->worker_refs, 1); tsk = create_io_thread(io_wq_manager, wq, NUMA_NO_NODE); if (!IS_ERR(tsk)) { wq->manager = get_task_struct(tsk); @@ -782,6 +783,9 @@ static int io_wq_fork_manager(struct io_wq *wq) return 0; } + if (atomic_dec_and_test(&wq->worker_refs)) + complete(&wq->worker_done); + return PTR_ERR(tsk); } @@ -1018,13 +1022,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) init_completion(&wq->exited); refcount_set(&wq->refs, 1); - init_completion(&wq->worker_done); - atomic_set(&wq->worker_refs, 0); - ret = io_wq_fork_manager(wq); if (!ret) return wq; - err: io_wq_put_hash(data->hash); cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); -- cgit v1.2.3 From 003e8dccdb22712dae388e682182d5f08b32386f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 6 Mar 2021 09:22:27 -0700 Subject: io-wq: always track creds for async issue If we go async with a request, grab the creds that the task currently has assigned and make sure that the async side switches to them. This is handled in the same way that we do for registered personalities. 
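The issue-side counterpart is a plain override/revert bracket around the opcode dispatch. A minimal sketch of that shape follows; do_issue() is a hypothetical stand-in for the opcode switch, not a real io_uring function:

static int issue_with_creds(struct io_kiocb *req, unsigned int issue_flags)
{
        const struct cred *old_creds = NULL;
        int ret;

        /* adopt the creds captured at submit time only if they differ */
        if (req->work.creds && req->work.creds != current_cred())
                old_creds = override_creds(req->work.creds);

        ret = do_issue(req, issue_flags);       /* hypothetical: the big opcode switch */

        if (old_creds)
                revert_creds(old_creds);
        return ret;
}

override_creds() returns the previously current credentials, which is exactly what revert_creds() needs to restore afterwards.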
Signed-off-by: Jens Axboe --- fs/io-wq.h | 2 +- fs/io_uring.c | 33 +++++++++++++++++++-------------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/fs/io-wq.h b/fs/io-wq.h index 5fbf7997149e..1ac2f3248088 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -79,8 +79,8 @@ static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; + const struct cred *creds; unsigned flags; - unsigned short personality; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) diff --git a/fs/io_uring.c b/fs/io_uring.c index 92c25b5f1349..d51c6ba9268b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1183,6 +1183,9 @@ static void io_prep_async_work(struct io_kiocb *req) const struct io_op_def *def = &io_op_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; + if (!req->work.creds) + req->work.creds = get_current_cred(); + if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; @@ -1648,6 +1651,10 @@ static void io_dismantle_req(struct io_kiocb *req) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); if (req->fixed_rsrc_refs) percpu_ref_put(req->fixed_rsrc_refs); + if (req->work.creds) { + put_cred(req->work.creds); + req->work.creds = NULL; + } if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -5916,18 +5923,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) const struct cred *creds = NULL; int ret; - if (req->work.personality) { - const struct cred *new_creds; - - if (!(issue_flags & IO_URING_F_NONBLOCK)) - mutex_lock(&ctx->uring_lock); - new_creds = idr_find(&ctx->personality_idr, req->work.personality); - if (!(issue_flags & IO_URING_F_NONBLOCK)) - mutex_unlock(&ctx->uring_lock); - if (!new_creds) - return -EINVAL; - creds = override_creds(new_creds); - } + if (req->work.creds && req->work.creds != current_cred()) + creds = override_creds(req->work.creds); switch (req->opcode) { case IORING_OP_NOP: @@ -6291,7 +6288,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, { struct io_submit_state *state; unsigned int sqe_flags; - int ret = 0; + int personality, ret = 0; req->opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ @@ -6324,8 +6321,16 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, return -EOPNOTSUPP; req->work.list.next = NULL; + personality = READ_ONCE(sqe->personality); + if (personality) { + req->work.creds = idr_find(&ctx->personality_idr, personality); + if (!req->work.creds) + return -EINVAL; + get_cred(req->work.creds); + } else { + req->work.creds = NULL; + } req->work.flags = 0; - req->work.personality = READ_ONCE(sqe->personality); state = &ctx->submit_state; /* -- cgit v1.2.3 From 2941267bd3dad018de1d51fe2cd996b7bc1e5a5d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:11 +0000 Subject: io_uring: make del_task_file more forgiving Rework io_uring_del_task_file(), so it accepts an index to delete, and it's not necessarily have to be in the ->xa. Infer file from xa_erase() to maintain a single origin of truth. 
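The idiom is worth distilling: erase by index first and let the returned entry drive the teardown, so the xarray stays the single source of truth and a missing entry is simply a no-op. A minimal sketch, detached from io_uring specifics:

static void del_entry(struct xarray *xa, unsigned long index)
{
        struct file *file;

        file = xa_erase(xa, index);     /* returns the old entry, or NULL */
        if (!file)
                return;                 /* nothing stored there; nothing to undo */

        /* any bookkeeping keyed on the object happens only on this path */
        fput(file);
}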
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d51c6ba9268b..00a736867b76 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8785,15 +8785,18 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) /* * Remove this io_uring_file -> task mapping. */ -static void io_uring_del_task_file(struct file *file) +static void io_uring_del_task_file(unsigned long index) { struct io_uring_task *tctx = current->io_uring; + struct file *file; + + file = xa_erase(&tctx->xa, index); + if (!file) + return; if (tctx->last == file) tctx->last = NULL; - file = xa_erase(&tctx->xa, (unsigned long)file); - if (file) - fput(file); + fput(file); } static void io_uring_clean_tctx(struct io_uring_task *tctx) @@ -8802,7 +8805,7 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) unsigned long index; xa_for_each(&tctx->xa, index, file) - io_uring_del_task_file(file); + io_uring_del_task_file(index); if (tctx->io_wq) { io_wq_put_and_exit(tctx->io_wq); tctx->io_wq = NULL; -- cgit v1.2.3 From 13bf43f5f4739739751c0049a1582610c283bdde Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:12 +0000 Subject: io_uring: introduce ctx to tctx back map For each pair tcxt-ctx create an object and chain it into ctx, so we have a way to traverse all tctx that are using current ctx. Preparation patch, will be used later. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 58 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 00a736867b76..9a2cff0662e0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -454,6 +454,7 @@ struct io_ring_ctx { /* Keep this last, we don't need it for the fast path */ struct work_struct exit_work; + struct list_head tctx_list; }; /* @@ -805,6 +806,13 @@ struct io_kiocb { struct io_wq_work work; }; +struct io_tctx_node { + struct list_head ctx_node; + struct task_struct *task; + struct file *file; + struct io_ring_ctx *ctx; +}; + struct io_defer_entry { struct list_head list; struct io_kiocb *req; @@ -1144,6 +1152,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_llist_head(&ctx->rsrc_put_llist); + INIT_LIST_HEAD(&ctx->tctx_list); INIT_LIST_HEAD(&ctx->submit_state.comp.free_list); INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list); return ctx; @@ -8748,6 +8757,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) { struct io_uring_task *tctx = current->io_uring; + struct io_tctx_node *node; int ret; if (unlikely(!tctx)) { @@ -8760,13 +8770,25 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) void *old = xa_load(&tctx->xa, (unsigned long)file); if (!old) { + node = kmalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + node->ctx = ctx; + node->file = file; + node->task = current; + get_file(file); ret = xa_err(xa_store(&tctx->xa, (unsigned long)file, - file, GFP_KERNEL)); + node, GFP_KERNEL)); if (ret) { fput(file); + kfree(node); return ret; } + + mutex_lock(&ctx->uring_lock); + list_add(&node->ctx_node, &ctx->tctx_list); + mutex_unlock(&ctx->uring_lock); } tctx->last = file; } @@ -8788,23 +8810,31 @@ static int 
io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) static void io_uring_del_task_file(unsigned long index) { struct io_uring_task *tctx = current->io_uring; - struct file *file; + struct io_tctx_node *node; - file = xa_erase(&tctx->xa, index); - if (!file) + node = xa_erase(&tctx->xa, index); + if (!node) return; - if (tctx->last == file) + WARN_ON_ONCE(current != node->task); + WARN_ON_ONCE(list_empty(&node->ctx_node)); + + mutex_lock(&node->ctx->uring_lock); + list_del(&node->ctx_node); + mutex_unlock(&node->ctx->uring_lock); + + if (tctx->last == node->file) tctx->last = NULL; - fput(file); + fput(node->file); + kfree(node); } static void io_uring_clean_tctx(struct io_uring_task *tctx) { - struct file *file; + struct io_tctx_node *node; unsigned long index; - xa_for_each(&tctx->xa, index, file) + xa_for_each(&tctx->xa, index, node) io_uring_del_task_file(index); if (tctx->io_wq) { io_wq_put_and_exit(tctx->io_wq); @@ -8815,13 +8845,13 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) void __io_uring_files_cancel(struct files_struct *files) { struct io_uring_task *tctx = current->io_uring; - struct file *file; + struct io_tctx_node *node; unsigned long index; /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); - xa_for_each(&tctx->xa, index, file) - io_uring_cancel_task_requests(file->private_data, files); + xa_for_each(&tctx->xa, index, node) + io_uring_cancel_task_requests(node->ctx, files); atomic_dec(&tctx->in_idle); if (files) @@ -8884,11 +8914,11 @@ void __io_uring_task_cancel(void) atomic_inc(&tctx->in_idle); if (tctx->sqpoll) { - struct file *file; + struct io_tctx_node *node; unsigned long index; - xa_for_each(&tctx->xa, index, file) - io_uring_cancel_sqpoll(file->private_data); + xa_for_each(&tctx->xa, index, node) + io_uring_cancel_sqpoll(node->ctx); } do { -- cgit v1.2.3 From d56d938b4bef3e1421a42023cdcd6e13c1f50831 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:13 +0000 Subject: io_uring: do ctx initiated file note removal Another preparation patch. When full quiesce is done on ctx exit, use task_work infra to remove corresponding to the ctx io_uring->xa entries. For that we use the back tctx map. Also use ->in_idle to prevent removing it while we traversing ->xa on cancellation, just ignore it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9a2cff0662e0..8a4ab86ae64f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -987,6 +987,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_UNLINKAT] = {}, }; +static void io_uring_del_task_file(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files); @@ -8536,10 +8537,33 @@ static bool io_run_ctx_fallback(struct io_ring_ctx *ctx) return executed; } +struct io_tctx_exit { + struct callback_head task_work; + struct completion completion; + unsigned long index; +}; + +static void io_tctx_exit_cb(struct callback_head *cb) +{ + struct io_uring_task *tctx = current->io_uring; + struct io_tctx_exit *work; + + work = container_of(cb, struct io_tctx_exit, task_work); + /* + * When @in_idle, we're in cancellation and it's racy to remove the + * node. It'll be removed by the end of cancellation, just ignore it. 
+ */ + if (!atomic_read(&tctx->in_idle)) + io_uring_del_task_file(work->index); + complete(&work->completion); +} + static void io_ring_exit_work(struct work_struct *work) { - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, - exit_work); + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); + struct io_tctx_exit exit; + struct io_tctx_node *node; + int ret; /* * If we're doing polled IO and end up having requests being @@ -8550,6 +8574,26 @@ static void io_ring_exit_work(struct work_struct *work) do { io_uring_try_cancel_requests(ctx, NULL, NULL); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); + + mutex_lock(&ctx->uring_lock); + while (!list_empty(&ctx->tctx_list)) { + node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, + ctx_node); + exit.index = (unsigned long)node->file; + init_completion(&exit.completion); + init_task_work(&exit.task_work, io_tctx_exit_cb); + ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); + if (WARN_ON_ONCE(ret)) + continue; + wake_up_process(node->task); + + mutex_unlock(&ctx->uring_lock); + wait_for_completion(&exit.completion); + cond_resched(); + mutex_lock(&ctx->uring_lock); + } + mutex_unlock(&ctx->uring_lock); + io_ring_ctx_free(ctx); } -- cgit v1.2.3 From eebd2e37e662617a6b8041db75205f0a262ce870 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:14 +0000 Subject: io_uring: don't take task ring-file notes With ->flush() gone we're now leaving all uring file notes until the task dies/execs, so the ctx will not be freed until all tasks that have ever submit a request die. It was nicer with flush but not much, we could have locked as described ctx in many cases. Now we guarantee that ctx outlives all tctx in a sense that io_ring_exit_work() waits for all tctxs to drop their corresponding enties in ->xa, and ctx won't go away until then. Hence, additional io_uring file reference (a.k.a. task file notes) are not needed anymore. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a4ab86ae64f..f448213267c8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8821,11 +8821,9 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) node->file = file; node->task = current; - get_file(file); ret = xa_err(xa_store(&tctx->xa, (unsigned long)file, node, GFP_KERNEL)); if (ret) { - fput(file); kfree(node); return ret; } @@ -8856,6 +8854,8 @@ static void io_uring_del_task_file(unsigned long index) struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; + if (!tctx) + return; node = xa_erase(&tctx->xa, index); if (!node) return; @@ -8869,7 +8869,6 @@ static void io_uring_del_task_file(unsigned long index) if (tctx->last == node->file) tctx->last = NULL; - fput(node->file); kfree(node); } -- cgit v1.2.3 From baf186c4d345f5a105e63df01100936ad622f369 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:15 +0000 Subject: io_uring: index io_uring->xa by ctx not file We don't use task file notes anymore, and no need left in indexing task->io_uring->xa by file, and replace it with ctx. It's better design-wise, especially since we keep a dangling file, and so have to keep an eye on not dereferencing it. 
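With the per-task file note gone, the xarray key becomes the ctx pointer value itself. A short sketch of the resulting keying scheme (these helpers are illustrative, not the actual io_uring functions):

/* store: the ctx pointer doubles as the xarray index */
static int tctx_track(struct xarray *xa, struct io_ring_ctx *ctx,
                      struct io_tctx_node *node)
{
        return xa_err(xa_store(xa, (unsigned long)ctx, node, GFP_KERNEL));
}

/* lookup and removal use the same key, with no file reference involved */
static struct io_tctx_node *tctx_find(struct xarray *xa, struct io_ring_ctx *ctx)
{
        return xa_load(xa, (unsigned long)ctx);
}

static struct io_tctx_node *tctx_remove(struct xarray *xa, struct io_ring_ctx *ctx)
{
        return xa_erase(xa, (unsigned long)ctx);
}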
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 +++++++++++------------- include/linux/io_uring.h | 2 +- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f448213267c8..01a7fa4a4889 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -809,7 +809,6 @@ struct io_kiocb { struct io_tctx_node { struct list_head ctx_node; struct task_struct *task; - struct file *file; struct io_ring_ctx *ctx; }; @@ -8540,7 +8539,7 @@ static bool io_run_ctx_fallback(struct io_ring_ctx *ctx) struct io_tctx_exit { struct callback_head task_work; struct completion completion; - unsigned long index; + struct io_ring_ctx *ctx; }; static void io_tctx_exit_cb(struct callback_head *cb) @@ -8554,7 +8553,7 @@ static void io_tctx_exit_cb(struct callback_head *cb) * node. It'll be removed by the end of cancellation, just ignore it. */ if (!atomic_read(&tctx->in_idle)) - io_uring_del_task_file(work->index); + io_uring_del_task_file((unsigned long)work->ctx); complete(&work->completion); } @@ -8579,7 +8578,7 @@ static void io_ring_exit_work(struct work_struct *work) while (!list_empty(&ctx->tctx_list)) { node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, ctx_node); - exit.index = (unsigned long)node->file; + exit.ctx = ctx; init_completion(&exit.completion); init_task_work(&exit.task_work, io_tctx_exit_cb); ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); @@ -8798,7 +8797,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, /* * Note that this task has used io_uring. We use it for cancelation purposes. */ -static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) +static int io_uring_add_task_file(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; @@ -8810,18 +8809,17 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) return ret; tctx = current->io_uring; } - if (tctx->last != file) { - void *old = xa_load(&tctx->xa, (unsigned long)file); + if (tctx->last != ctx) { + void *old = xa_load(&tctx->xa, (unsigned long)ctx); if (!old) { node = kmalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; node->ctx = ctx; - node->file = file; node->task = current; - ret = xa_err(xa_store(&tctx->xa, (unsigned long)file, + ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, node, GFP_KERNEL)); if (ret) { kfree(node); @@ -8832,7 +8830,7 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) list_add(&node->ctx_node, &ctx->tctx_list); mutex_unlock(&ctx->uring_lock); } - tctx->last = file; + tctx->last = ctx; } /* @@ -8867,7 +8865,7 @@ static void io_uring_del_task_file(unsigned long index) list_del(&node->ctx_node); mutex_unlock(&node->ctx->uring_lock); - if (tctx->last == node->file) + if (tctx->last == node->ctx) tctx->last = NULL; kfree(node); } @@ -9166,7 +9164,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } submitted = to_submit; } else if (to_submit) { - ret = io_uring_add_task_file(ctx, f.file); + ret = io_uring_add_task_file(ctx); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); @@ -9375,7 +9373,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) if (fd < 0) return fd; - ret = io_uring_add_task_file(ctx, file); + ret = io_uring_add_task_file(ctx); if (ret) { put_unused_fd(fd); return ret; diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 7cb7bd0e334c..9761a0ec9f95 100644 --- a/include/linux/io_uring.h 
+++ b/include/linux/io_uring.h @@ -18,7 +18,7 @@ struct io_uring_task { /* submission side */ struct xarray xa; struct wait_queue_head wait; - struct file *last; + void *last; void *io_wq; struct percpu_counter inflight; atomic_t in_idle; -- cgit v1.2.3 From b5bb3a24f69da92e0ec2a301452364333e45be03 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:16 +0000 Subject: io_uring: warn when ring exit takes too long We use system_unbound_wq to run io_ring_exit_work(), so it's hard to monitor whether removal hang or not. Add WARN_ONCE to catch hangs. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 01a7fa4a4889..945e54690b81 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8560,6 +8560,7 @@ static void io_tctx_exit_cb(struct callback_head *cb) static void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); + unsigned long timeout = jiffies + HZ * 60 * 5; struct io_tctx_exit exit; struct io_tctx_node *node; int ret; @@ -8572,10 +8573,14 @@ static void io_ring_exit_work(struct work_struct *work) */ do { io_uring_try_cancel_requests(ctx, NULL, NULL); + + WARN_ON_ONCE(time_after(jiffies, timeout)); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); mutex_lock(&ctx->uring_lock); while (!list_empty(&ctx->tctx_list)) { + WARN_ON_ONCE(time_after(jiffies, timeout)); + node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, ctx_node); exit.ctx = ctx; -- cgit v1.2.3 From 1b00764f09b6912d25e188d972a7764a457926ba Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:17 +0000 Subject: io_uring: cancel reqs of all iowq's on ring exit io_ring_exit_work() have to cancel all requests, including those staying in io-wq, however it tries only cancellation of current tctx, which is NULL. If we've got task==NULL, use the ctx-to-tctx map to go over all tctx/io-wq and try cancellations on them. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 945e54690b81..8c74c7799960 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8688,19 +8688,55 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, } } +static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + return req->ctx == data; +} + +static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) +{ + struct io_tctx_node *node; + enum io_wq_cancel cret; + bool ret = false; + + mutex_lock(&ctx->uring_lock); + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + /* + * io_wq will stay alive while we hold uring_lock, because it's + * killed after ctx nodes, which requires to take the lock. 
+ */ + if (!tctx || !tctx->io_wq) + continue; + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } + mutex_unlock(&ctx->uring_lock); + + return ret; +} + static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files) { struct io_task_cancel cancel = { .task = task, .files = files, }; - struct task_struct *tctx_task = task ?: current; - struct io_uring_task *tctx = tctx_task->io_uring; + struct io_uring_task *tctx = task ? task->io_uring : NULL; while (1) { enum io_wq_cancel cret; bool ret = false; - if (tctx && tctx->io_wq) { + if (!task) { + ret |= io_uring_try_cancel_iowq(ctx); + } else if (tctx && tctx->io_wq) { + /* + * Cancels requests of all rings, not only @ctx, but + * it's fine as the task is in exit/exec. + */ cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, &cancel, true); ret |= (cret != IO_WQ_CANCEL_NOTFOUND); -- cgit v1.2.3 From 678eeba481d8c161203382832a4379d507050aed Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:18 +0000 Subject: io-wq: warn on creating manager while exiting Add a simple warning making sure that nobody tries to create a new manager while we're under IO_WQ_BIT_EXIT. That can potentially happen due to racy work submission after final put. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io-wq.c b/fs/io-wq.c index 1bfdb86336e4..1ab9324e602f 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -774,6 +774,8 @@ static int io_wq_fork_manager(struct io_wq *wq) if (wq->manager) return 0; + WARN_ON_ONCE(test_bit(IO_WQ_BIT_EXIT, &wq->state)); + init_completion(&wq->worker_done); atomic_set(&wq->worker_refs, 1); tsk = create_io_thread(io_wq_manager, wq, NUMA_NO_NODE); -- cgit v1.2.3 From 7c30f36a98ae488741178d69662e4f2baa53e7f6 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Sun, 7 Mar 2021 11:54:28 +0100 Subject: io_uring: run __io_sq_thread() with the initial creds from io_uring_setup() With IORING_SETUP_ATTACH_WQ we should let __io_sq_thread() use the initial creds from each ctx. 
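For context, a userspace sketch of the configuration this addresses: two SQPOLL rings sharing one submission thread via IORING_SETUP_ATTACH_WQ. Raw syscalls are used and error handling is trimmed; with this patch, work submitted on each ring runs under the credentials of the task that created that ring rather than whichever task created the shared thread.

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int ring_setup(unsigned entries, struct io_uring_params *p)
{
        return (int)syscall(__NR_io_uring_setup, entries, p);
}

int make_attached_rings(int *fd_a, int *fd_b)
{
        struct io_uring_params pa, pb;

        memset(&pa, 0, sizeof(pa));
        pa.flags = IORING_SETUP_SQPOLL;
        *fd_a = ring_setup(64, &pa);

        memset(&pb, 0, sizeof(pb));
        pb.flags = IORING_SETUP_SQPOLL | IORING_SETUP_ATTACH_WQ;
        pb.wq_fd = *fd_a;               /* share fd_a's SQPOLL thread */
        *fd_b = ring_setup(64, &pb);

        return (*fd_a < 0 || *fd_b < 0) ? -1 : 0;
}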
Signed-off-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8c74c7799960..4d3333ca27a3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -380,6 +380,7 @@ struct io_ring_ctx { /* Only used for accounting purposes */ struct mm_struct *mm_account; + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ struct io_sq_data *sq_data; /* if using sq thread polling */ struct wait_queue_head sqo_sq_wait; @@ -6719,7 +6720,13 @@ static int io_sq_thread(void *data) sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + const struct cred *creds = NULL; + + if (ctx->sq_creds != current_cred()) + creds = override_creds(ctx->sq_creds); ret = __io_sq_thread(ctx, cap_entries); + if (creds) + revert_creds(creds); if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) sqt_spin = true; } @@ -7152,6 +7159,8 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx) io_put_sq_data(sqd); ctx->sq_data = NULL; + if (ctx->sq_creds) + put_cred(ctx->sq_creds); } } @@ -7890,6 +7899,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, goto err; } + ctx->sq_creds = get_current_cred(); ctx->sq_data = sqd; io_sq_thread_park(sqd); mutex_lock(&sqd->ctx_lock); -- cgit v1.2.3 From 041474885e9707a38fad081abe30159eb6d463f9 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Sun, 7 Mar 2021 11:54:29 +0100 Subject: io_uring: kill io_sq_thread_fork() and return -EOWNERDEAD if the sq_thread is gone This brings the behavior back in line with what 5.11 and earlier did, and this is no longer needed with the improved handling of creds not needing to do unshare(). Signed-off-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4d3333ca27a3..7cf96be691d8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -336,7 +336,6 @@ struct io_ring_ctx { unsigned int drain_next: 1; unsigned int eventfd_async: 1; unsigned int restricted: 1; - unsigned int sqo_exec: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -6786,7 +6785,6 @@ static int io_sq_thread(void *data) sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - ctx->sqo_exec = 1; io_ring_set_wakeup_flag(ctx); } @@ -7846,26 +7844,6 @@ void __io_uring_free(struct task_struct *tsk) tsk->io_uring = NULL; } -static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx) -{ - struct task_struct *tsk; - int ret; - - clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - reinit_completion(&sqd->parked); - ctx->sqo_exec = 0; - sqd->task_pid = current->pid; - tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); - if (IS_ERR(tsk)) - return PTR_ERR(tsk); - ret = io_uring_alloc_task_context(tsk, ctx); - if (ret) - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - sqd->thread = tsk; - wake_up_new_task(tsk); - return ret; -} - static int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { @@ -9199,13 +9177,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (ctx->flags & IORING_SETUP_SQPOLL) { io_cqring_overflow_flush(ctx, false, NULL, NULL); - if (unlikely(ctx->sqo_exec)) { - ret = io_sq_thread_fork(ctx->sq_data, ctx); - if (ret) - goto out; - ctx->sqo_exec = 0; - } ret = -EOWNERDEAD; + if (unlikely(ctx->sq_data->thread == NULL)) { + goto out; + } if 
(flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) { -- cgit v1.2.3 From 05962f95f9ac7af25fea037ef51b37c0eccb5590 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 6 Mar 2021 13:58:48 -0700 Subject: io_uring: SQPOLL parking fixes We keep running into weird dependency issues between the sqd lock and the parking state. Disentangle the SQPOLL thread from the last bits of the kthread parking inheritance, and just replace the parking state, and two associated locks, with a single rw mutex. The SQPOLL thread keeps the mutex for read all the time, except if someone has marked us needing to park. Then we drop/re-acquire and try again. This greatly simplifies the parking state machine (by just getting rid of it), and makes it a lot more obvious how it works - if you need to modify the ctx list, then you simply park the thread which will grab the lock for writing. Fold in fix from Hillf Danton on not setting STOP on a fatal signal. Fixes: e54945ae947f ("io_uring: SQPOLL stop error handling fixes") Signed-off-by: Jens Axboe --- fs/io_uring.c | 133 +++++++++++++++------------------------------------------- 1 file changed, 34 insertions(+), 99 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7cf96be691d8..2a3542b487ff 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -258,12 +258,11 @@ enum { struct io_sq_data { refcount_t refs; - struct mutex lock; + struct rw_semaphore rw_lock; /* ctx's that are using this sqd */ struct list_head ctx_list; struct list_head ctx_new_list; - struct mutex ctx_lock; struct task_struct *thread; struct wait_queue_head wait; @@ -274,7 +273,6 @@ struct io_sq_data { unsigned long state; struct completion startup; - struct completion parked; struct completion exited; }; @@ -6638,45 +6636,6 @@ static void io_sqd_init_new(struct io_sq_data *sqd) io_sqd_update_thread_idle(sqd); } -static bool io_sq_thread_should_stop(struct io_sq_data *sqd) -{ - return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); -} - -static bool io_sq_thread_should_park(struct io_sq_data *sqd) -{ - return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); -} - -static void io_sq_thread_parkme(struct io_sq_data *sqd) -{ - for (;;) { - /* - * TASK_PARKED is a special state; we must serialize against - * possible pending wakeups to avoid store-store collisions on - * task->state. - * - * Such a collision might possibly result in the task state - * changin from TASK_PARKED and us failing the - * wait_task_inactive() in kthread_park(). - */ - set_special_state(TASK_PARKED); - if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) - break; - - /* - * Thread is going to call schedule(), do not preempt it, - * or the caller of kthread_park() may spend more time in - * wait_task_inactive(). - */ - preempt_disable(); - complete(&sqd->parked); - schedule_preempt_disabled(); - preempt_enable(); - } - __set_current_state(TASK_RUNNING); -} - static int io_sq_thread(void *data) { struct io_sq_data *sqd = data; @@ -6697,17 +6656,16 @@ static int io_sq_thread(void *data) wait_for_completion(&sqd->startup); - while (!io_sq_thread_should_stop(sqd)) { + down_read(&sqd->rw_lock); + + while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) { int ret; bool cap_entries, sqt_spin, needs_sched; - /* - * Any changes to the sqd lists are synchronized through the - * thread parking. This synchronizes the thread vs users, - * the users are synchronized on the sqd->ctx_lock. 
- */ - if (io_sq_thread_should_park(sqd)) { - io_sq_thread_parkme(sqd); + if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { + up_read(&sqd->rw_lock); + cond_resched(); + down_read(&sqd->rw_lock); continue; } if (unlikely(!list_empty(&sqd->ctx_new_list))) { @@ -6752,12 +6710,14 @@ static int io_sq_thread(void *data) } } - if (needs_sched && !io_sq_thread_should_park(sqd)) { + if (needs_sched && !test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); + up_read(&sqd->rw_lock); schedule(); try_to_freeze(); + down_read(&sqd->rw_lock); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); } @@ -6768,28 +6728,16 @@ static int io_sq_thread(void *data) list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_uring_cancel_sqpoll(ctx); + up_read(&sqd->rw_lock); io_run_task_work(); - /* - * Ensure that we park properly if racing with someone trying to park - * while we're exiting. If we fail to grab the lock, check park and - * park if necessary. The ordering with the park bit and the lock - * ensures that we catch this reliably. - */ - if (!mutex_trylock(&sqd->lock)) { - if (io_sq_thread_should_park(sqd)) - io_sq_thread_parkme(sqd); - mutex_lock(&sqd->lock); - } - + down_write(&sqd->rw_lock); sqd->thread = NULL; - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); - } - + up_write(&sqd->rw_lock); complete(&sqd->exited); - mutex_unlock(&sqd->lock); do_exit(0); } @@ -7088,44 +7036,40 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) } static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->lock) + __releases(&sqd->rw_lock) { if (sqd->thread == current) return; clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - if (sqd->thread) - wake_up_state(sqd->thread, TASK_PARKED); - mutex_unlock(&sqd->lock); + up_write(&sqd->rw_lock); } static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->lock) + __acquires(&sqd->rw_lock) { if (sqd->thread == current) return; set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - mutex_lock(&sqd->lock); - if (sqd->thread) { + down_write(&sqd->rw_lock); + /* set again for consistency, in case concurrent parks are happening */ + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + if (sqd->thread) wake_up_process(sqd->thread); - wait_for_completion(&sqd->parked); - } } static void io_sq_thread_stop(struct io_sq_data *sqd) { if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) return; - mutex_lock(&sqd->lock); - if (sqd->thread) { - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)); - wake_up_process(sqd->thread); - mutex_unlock(&sqd->lock); - wait_for_completion(&sqd->exited); - WARN_ON_ONCE(sqd->thread); - } else { - mutex_unlock(&sqd->lock); + down_write(&sqd->rw_lock); + if (!sqd->thread) { + up_write(&sqd->rw_lock); + return; } + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + wake_up_process(sqd->thread); + up_write(&sqd->rw_lock); + wait_for_completion(&sqd->exited); } static void io_put_sq_data(struct io_sq_data *sqd) @@ -7142,18 +7086,13 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx) if (sqd) { complete(&sqd->startup); - if (sqd->thread) { + if (sqd->thread) wait_for_completion(&ctx->sq_thread_comp); - io_sq_thread_park(sqd); - } - mutex_lock(&sqd->ctx_lock); + io_sq_thread_park(sqd); list_del(&ctx->sqd_list); io_sqd_update_thread_idle(sqd); - 
mutex_unlock(&sqd->ctx_lock); - - if (sqd->thread) - io_sq_thread_unpark(sqd); + io_sq_thread_unpark(sqd); io_put_sq_data(sqd); ctx->sq_data = NULL; @@ -7202,11 +7141,9 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) refcount_set(&sqd->refs, 1); INIT_LIST_HEAD(&sqd->ctx_list); INIT_LIST_HEAD(&sqd->ctx_new_list); - mutex_init(&sqd->ctx_lock); - mutex_init(&sqd->lock); + init_rwsem(&sqd->rw_lock); init_waitqueue_head(&sqd->wait); init_completion(&sqd->startup); - init_completion(&sqd->parked); init_completion(&sqd->exited); return sqd; } @@ -7880,9 +7817,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, ctx->sq_creds = get_current_cred(); ctx->sq_data = sqd; io_sq_thread_park(sqd); - mutex_lock(&sqd->ctx_lock); list_add(&ctx->sqd_list, &sqd->ctx_new_list); - mutex_unlock(&sqd->ctx_lock); io_sq_thread_unpark(sqd); ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); -- cgit v1.2.3 From f458dd8441e56d122ddf1d8e2af0b6ee62f52af9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Mar 2021 12:14:14 +0000 Subject: io_uring: fix unrelated ctx reqs cancellation io-wq now is per-task, so cancellations now should match against request's ctx. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2a3542b487ff..d4f018f5838d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5573,22 +5573,30 @@ add: return 0; } +struct io_cancel_data { + struct io_ring_ctx *ctx; + u64 user_data; +}; + static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_cancel_data *cd = data; - return req->user_data == (unsigned long) data; + return req->ctx == cd->ctx && req->user_data == cd->user_data; } -static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr) +static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, + struct io_ring_ctx *ctx) { + struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; enum io_wq_cancel cancel_ret; int ret = 0; - if (!tctx->io_wq) + if (!tctx || !tctx->io_wq) return -ENOENT; - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false); + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -5611,8 +5619,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, unsigned long flags; int ret; - ret = io_async_cancel_one(req->task->io_uring, - (void *) (unsigned long) sqe_addr); + ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); if (ret != -ENOENT) { spin_lock_irqsave(&ctx->completion_lock, flags); goto done; -- cgit v1.2.3 From 0298ef969a110ca03654f0cea9b50e3f3b331acc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Mar 2021 13:20:57 +0000 Subject: io_uring: clean R_DISABLED startup mess There are enough of problems with IORING_SETUP_R_DISABLED, including the burden of checking and kicking off the SQO task all over the codebase -- for exit/cancel/etc. Rework it, always start the thread but don't do submit unless the flag is gone, that's much easier. 
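For reference, the userspace flow this simplifies looks roughly like the sketch below: the ring is created disabled, restrictions and resources are registered, and IORING_REGISTER_ENABLE_RINGS flips it live. After this patch the SQPOLL thread is already running the whole time; enabling merely lets __io_sq_thread() start submitting. Raw syscalls, error handling trimmed.

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

int setup_disabled_then_enable(void)
{
        struct io_uring_params p;
        int fd;

        memset(&p, 0, sizeof(p));
        p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_R_DISABLED;
        fd = (int)syscall(__NR_io_uring_setup, 64, &p);
        if (fd < 0)
                return -1;

        /* ... register restrictions, files, buffers here ... */

        return (int)syscall(__NR_io_uring_register, fd,
                            IORING_REGISTER_ENABLE_RINGS, NULL, 0);
}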
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d4f018f5838d..3f6db813d670 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6606,7 +6606,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (!list_empty(&ctx->iopoll_list)) io_do_iopoll(ctx, &nr_events, 0); - if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs))) + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && + !(ctx->flags & IORING_SETUP_R_DISABLED)) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); } @@ -7861,6 +7862,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, wake_up_new_task(tsk); if (ret) goto err; + complete(&sqd->startup); } else if (p->flags & IORING_SETUP_SQ_AFF) { /* Can't have SQ_AFF without SQPOLL */ ret = -EINVAL; @@ -7873,15 +7875,6 @@ err: return ret; } -static void io_sq_offload_start(struct io_ring_ctx *ctx) -{ - struct io_sq_data *sqd = ctx->sq_data; - - ctx->flags &= ~IORING_SETUP_R_DISABLED; - if (ctx->flags & IORING_SETUP_SQPOLL) - complete(&sqd->startup); -} - static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) { @@ -8742,11 +8735,6 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, struct task_struct *task = current; if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { - /* never started, nothing to cancel */ - if (ctx->flags & IORING_SETUP_R_DISABLED) { - io_sq_offload_start(ctx); - return; - } io_sq_thread_park(ctx->sq_data); task = ctx->sq_data->thread; if (task) @@ -9449,9 +9437,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; - if (!(p->flags & IORING_SETUP_R_DISABLED)) - io_sq_offload_start(ctx); - memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); @@ -9668,7 +9653,9 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) if (ctx->restrictions.registered) ctx->restricted = 1; - io_sq_offload_start(ctx); + ctx->flags &= ~IORING_SETUP_R_DISABLED; + if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) + wake_up(&ctx->sq_data->wait); return 0; } -- cgit v1.2.3 From 61cf93700fe6359552848ed5e3becba6cd760efa Mon Sep 17 00:00:00 2001 From: Matthew Wilcox (Oracle) Date: Mon, 8 Mar 2021 14:16:16 +0000 Subject: io_uring: Convert personality_idr to XArray You can't call idr_remove() from within a idr_for_each() callback, but you can call xa_erase() from an xa_for_each() loop, so switch the entire personality_idr from the IDR to the XArray. This manifests as a use-after-free as idr_for_each() attempts to walk the rest of the node after removing the last entry from it. 
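The property being relied on is that xa_for_each() looks the next entry up afresh on every iteration, so erasing the current entry from inside the loop is allowed. A minimal sketch of the idiom that replaces the broken idr_for_each()/idr_remove() combination:

static void unregister_all(struct xarray *personalities)
{
        const struct cred *creds;
        unsigned long index;

        xa_for_each(personalities, index, creds) {
                xa_erase(personalities, index); /* legal mid-walk, unlike idr_remove() */
                put_cred(creds);
        }
}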
Fixes: 071698e13ac6 ("io_uring: allow registering credentials") Cc: stable@vger.kernel.org # 5.6+ Reported-by: yangerkun Signed-off-by: Matthew Wilcox (Oracle) [Pavel: rebased (creds load was moved into io_init_req())] Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7ccff36e1375f2b0ebf73d957f037b43becc0dde.1615212806.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3f6db813d670..84eb499368a4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -406,7 +406,8 @@ struct io_ring_ctx { struct idr io_buffer_idr; - struct idr personality_idr; + struct xarray personalities; + u32 pers_next; struct { unsigned cached_cq_tail; @@ -1137,7 +1138,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_completion(&ctx->ref_comp); init_completion(&ctx->sq_thread_comp); idr_init(&ctx->io_buffer_idr); - idr_init(&ctx->personality_idr); + xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); @@ -6337,7 +6338,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->work.list.next = NULL; personality = READ_ONCE(sqe->personality); if (personality) { - req->work.creds = idr_find(&ctx->personality_idr, personality); + req->work.creds = xa_load(&ctx->personalities, personality); if (!req->work.creds) return -EINVAL; get_cred(req->work.creds); @@ -8355,7 +8356,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); - idr_destroy(&ctx->personality_idr); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { @@ -8420,7 +8420,7 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { const struct cred *creds; - creds = idr_remove(&ctx->personality_idr, id); + creds = xa_erase(&ctx->personalities, id); if (creds) { put_cred(creds); return 0; @@ -8429,14 +8429,6 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) return -EINVAL; } -static int io_remove_personalities(int id, void *p, void *data) -{ - struct io_ring_ctx *ctx = data; - - io_unregister_personality(ctx, id); - return 0; -} - static bool io_run_ctx_fallback(struct io_ring_ctx *ctx) { struct callback_head *work, *next; @@ -8526,13 +8518,17 @@ static void io_ring_exit_work(struct work_struct *work) static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { + unsigned long index; + struct creds *creds; + mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); /* if force is set, the ring is going away. 
always drop after that */ ctx->cq_overflow_flushed = 1; if (ctx->rings) __io_cqring_overflow_flush(ctx, true, NULL, NULL); - idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); + xa_for_each(&ctx->personalities, index, creds) + io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); io_kill_timeouts(ctx, NULL, NULL); @@ -9162,10 +9158,9 @@ out_fput: } #ifdef CONFIG_PROC_FS -static int io_uring_show_cred(int id, void *p, void *data) +static int io_uring_show_cred(struct seq_file *m, unsigned int id, + const struct cred *cred) { - const struct cred *cred = p; - struct seq_file *m = data; struct user_namespace *uns = seq_user_ns(m); struct group_info *gi; kernel_cap_t cap; @@ -9233,9 +9228,13 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, (unsigned int) buf->len); } - if (has_lock && !idr_is_empty(&ctx->personality_idr)) { + if (has_lock && !xa_empty(&ctx->personalities)) { + unsigned long index; + const struct cred *cred; + seq_printf(m, "Personalities:\n"); - idr_for_each(&ctx->personality_idr, io_uring_show_cred, m); + xa_for_each(&ctx->personalities, index, cred) + io_uring_show_cred(m, index, cred); } seq_printf(m, "PollList:\n"); spin_lock_irq(&ctx->completion_lock); @@ -9564,14 +9563,16 @@ out: static int io_register_personality(struct io_ring_ctx *ctx) { const struct cred *creds; + u32 id; int ret; creds = get_current_cred(); - ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, - USHRT_MAX, GFP_KERNEL); - if (ret < 0) - put_cred(creds); + ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, + XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); + if (!ret) + return id; + put_cred(creds); return ret; } -- cgit v1.2.3 From cc20e3fec682700b673fcd286e6bef8e9da947e2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Mar 2021 09:34:43 -0700 Subject: io-wq: remove unused 'user' member of io_wq Previous patches killed the last user of this, now it's just a dead member in the struct. Get rid of it. Signed-off-by: Jens Axboe --- fs/io-wq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 1ab9324e602f..c2e7031f6d09 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -110,7 +110,6 @@ struct io_wq { io_wq_work_fn *do_work; struct task_struct *manager; - struct user_struct *user; struct io_wq_hash *hash; -- cgit v1.2.3 From 97a73a0f9fbfb2be682fd037814576dbfa0e0da8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Mar 2021 17:30:54 +0000 Subject: io_uring: fix io_sq_offload_create error handling Don't set IO_SQ_THREAD_SHOULD_STOP when io_sq_offload_create() has failed on io_uring_alloc_task_context() but leave everything to io_sq_thread_finish(), because currently io_sq_thread_finish() hangs on trying to park it. That's great it stalls there, because otherwise the following io_sq_thread_stop() would be skipped on IO_SQ_THREAD_SHOULD_STOP check and the sqo would race for sqd with freeing ctx. A simple error injection gives something like this. [ 245.463955] INFO: task sqpoll-test-hang:523 blocked for more than 122 seconds. 
[ 245.463983] Call Trace: [ 245.463990] __schedule+0x36b/0x950 [ 245.464005] schedule+0x68/0xe0 [ 245.464013] schedule_timeout+0x209/0x2a0 [ 245.464032] wait_for_completion+0x8b/0xf0 [ 245.464043] io_sq_thread_finish+0x44/0x1a0 [ 245.464049] io_uring_setup+0x9ea/0xc80 [ 245.464058] __x64_sys_io_uring_setup+0x16/0x20 [ 245.464064] do_syscall_64+0x38/0x50 [ 245.464073] entry_SYSCALL_64_after_hwframe+0x44/0xae Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 84eb499368a4..3299807894ec 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7856,10 +7856,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, ret = PTR_ERR(tsk); goto err; } - ret = io_uring_alloc_task_context(tsk, ctx); - if (ret) - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + sqd->thread = tsk; + ret = io_uring_alloc_task_context(tsk, ctx); wake_up_new_task(tsk); if (ret) goto err; -- cgit v1.2.3 From 33cc89a9fc248a486857381584cc6b67d9405fab Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 9 Mar 2021 00:37:58 +0000 Subject: io_uring: add io_disarm_next() helper A preparation patch placing all preparations before extracting a next request into a separate helper io_disarm_next(). Also, don't spuriously do ev_posted in a rare case where REQ_F_FAIL_LINK is set but there are no requests linked (i.e. after cancelling a linked timeout or setting IOSQE_IO_LINK on a last request of a submission batch). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/44ecff68d6b47e1c4e6b891bdde1ddc08cfc3590.1615250156.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 68 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3299807894ec..cc9a2cc95608 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1705,15 +1705,11 @@ static inline void io_remove_next_linked(struct io_kiocb *req) nxt->link = NULL; } -static void io_kill_linked_timeout(struct io_kiocb *req) +static bool io_kill_linked_timeout(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) { - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *link; + struct io_kiocb *link = req->link; bool cancelled = false; - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); - link = req->link; /* * Can happen if a linked timeout fired and link had been like @@ -1728,50 +1724,48 @@ static void io_kill_linked_timeout(struct io_kiocb *req) ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { io_cqring_fill_event(link, -ECANCELED); - io_commit_cqring(ctx); + io_put_req_deferred(link, 1); cancelled = true; } } req->flags &= ~REQ_F_LINK_TIMEOUT; - spin_unlock_irqrestore(&ctx->completion_lock, flags); - - if (cancelled) { - io_cqring_ev_posted(ctx); - io_put_req(link); - } + return cancelled; } - static void io_fail_links(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) { - struct io_kiocb *link, *nxt; - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; + struct io_kiocb *nxt, *link = req->link; - spin_lock_irqsave(&ctx->completion_lock, flags); - link = req->link; req->link = NULL; - while (link) { nxt = link->link; link->link = NULL; trace_io_uring_fail_link(req, link); io_cqring_fill_event(link, -ECANCELED); - io_put_req_deferred(link, 2); link = nxt; } - io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); +} - io_cqring_ev_posted(ctx); 
+static bool io_disarm_next(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) +{ + bool posted = false; + + if (likely(req->flags & REQ_F_LINK_TIMEOUT)) + posted = io_kill_linked_timeout(req); + if (unlikely(req->flags & REQ_F_FAIL_LINK)) { + posted |= (req->link != NULL); + io_fail_links(req); + } + return posted; } static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) { - if (req->flags & REQ_F_LINK_TIMEOUT) - io_kill_linked_timeout(req); + struct io_kiocb *nxt; /* * If LINK is set, we have dependent requests in this chain. If we @@ -1779,14 +1773,22 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (likely(!(req->flags & REQ_F_FAIL_LINK))) { - struct io_kiocb *nxt = req->link; + if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) { + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + bool posted; - req->link = NULL; - return nxt; + spin_lock_irqsave(&ctx->completion_lock, flags); + posted = io_disarm_next(req); + if (posted) + io_commit_cqring(req->ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (posted) + io_cqring_ev_posted(ctx); } - io_fail_links(req); - return NULL; + nxt = req->link; + req->link = NULL; + return nxt; } static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) -- cgit v1.2.3 From 7a612350a989866510dc5c874fd8ffe1f37555d2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 9 Mar 2021 00:37:59 +0000 Subject: io_uring: fix complete_post races for linked req Calling io_queue_next() after spin_unlock in io_req_complete_post() races with the other side extracting and reusing this request. Hand coded parts of io_req_find_next() considering that io_disarm_next() and io_req_task_queue() have (and safe) to be called with completion_lock held. It already does io_commit_cqring() and io_cqring_ev_posted(), so just reuse it for post io_disarm_next(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5672a62f3150ee7c55849f40c0037655c4f2840f.1615250156.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cc9a2cc95608..f7153483a3ac 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -985,6 +985,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_UNLINKAT] = {}, }; +static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_task_file(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, @@ -1525,15 +1526,14 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) __io_cqring_fill_event(req, res, 0); } -static inline void io_req_complete_post(struct io_kiocb *req, long res, - unsigned int cflags) +static void io_req_complete_post(struct io_kiocb *req, long res, + unsigned int cflags) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); __io_cqring_fill_event(req, res, cflags); - io_commit_cqring(ctx); /* * If we're the last reference to this request, add to our locked * free_list cache. 
@@ -1541,19 +1541,26 @@ static inline void io_req_complete_post(struct io_kiocb *req, long res, if (refcount_dec_and_test(&req->refs)) { struct io_comp_state *cs = &ctx->submit_state.comp; + if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { + if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) + io_disarm_next(req); + if (req->link) { + io_req_task_queue(req->link); + req->link = NULL; + } + } io_dismantle_req(req); io_put_task(req->task, 1); list_add(&req->compl.list, &cs->locked_free_list); cs->locked_free_nr++; } else req = NULL; + io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); - io_cqring_ev_posted(ctx); - if (req) { - io_queue_next(req); + + if (req) percpu_ref_put(&ctx->refs); - } } static void io_req_complete_state(struct io_kiocb *req, long res, -- cgit v1.2.3 From 70e35125093b05b0e607ba1f5358ddf76946756c Mon Sep 17 00:00:00 2001 From: yangerkun Date: Tue, 9 Mar 2021 11:04:10 +0800 Subject: io-wq: fix ref leak for req in case of exit cancelations do_work such as io_wq_submit_work that cancel the work may leave a ref of req as 1 if we have links. Fix it by call io_run_cancel. Fixes: 4fb6ac326204 ("io-wq: improve manager/worker handling over exec") Signed-off-by: yangerkun Link: https://lore.kernel.org/r/20210309030410.3294078-1-yangerkun@huawei.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index c2e7031f6d09..3d7060ba547a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -799,8 +799,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) /* Can only happen if manager creation fails after exec */ if (io_wq_fork_manager(wqe->wq) || test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) { - work->flags |= IO_WQ_WORK_CANCEL; - wqe->wq->do_work(work); + io_run_cancel(work, wqe); return; } -- cgit v1.2.3 From 93e68e036c2fc1ce18e784418e4e19975a5882b4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Mar 2021 07:02:21 -0700 Subject: io_uring: move all io_kiocb init early in io_init_req() If we hit an error path in the function, make sure that the io_kiocb is fully initialized at that point so that freeing the request always sees a valid state. Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f7153483a3ac..0f18e4a7bd08 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6327,6 +6327,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, refcount_set(&req->refs, 2); req->task = current; req->result = 0; + req->work.list.next = NULL; + req->work.creds = NULL; + req->work.flags = 0; /* enforce forwards compatibility on users */ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { @@ -6344,17 +6347,13 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, !io_op_defs[req->opcode].buffer_select) return -EOPNOTSUPP; - req->work.list.next = NULL; personality = READ_ONCE(sqe->personality); if (personality) { req->work.creds = xa_load(&ctx->personalities, personality); if (!req->work.creds) return -EINVAL; get_cred(req->work.creds); - } else { - req->work.creds = NULL; } - req->work.flags = 0; state = &ctx->submit_state; /* -- cgit v1.2.3 From 5199328a0d415b3e372633096b1b92f36b8ac9e5 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 9 Mar 2021 14:30:41 +0800 Subject: io_uring: remove unneeded variable 'ret' Fix the following coccicheck warning: ./fs/io_uring.c:8984:5-8: Unneeded variable: "ret". 
Return "0" on line 8998 Reported-by: Abaci Robot Signed-off-by: Yang Li Link: https://lore.kernel.org/r/1615271441-33649-1-git-send-email-yang.lee@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0f18e4a7bd08..6325f32ef6a3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9022,7 +9022,6 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) { - int ret = 0; DEFINE_WAIT(wait); do { @@ -9036,7 +9035,7 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) } while (!signal_pending(current)); finish_wait(&ctx->sqo_sq_wait, &wait); - return ret; + return 0; } static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, -- cgit v1.2.3 From e8f98f24549d62cc54bf608c815904a56d4437bc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Mar 2021 16:32:13 -0700 Subject: io_uring: always wait for sqd exited when stopping SQPOLL thread We have a tiny race where io_put_sq_data() calls io_sq_thead_stop() and finds the thread gone, but the thread has indeed not fully exited or called complete() yet. Close it up by always having io_sq_thread_stop() wait on completion of the exit event. Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6325f32ef6a3..62f998bf2ce8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7079,12 +7079,9 @@ static void io_sq_thread_stop(struct io_sq_data *sqd) if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) return; down_write(&sqd->rw_lock); - if (!sqd->thread) { - up_write(&sqd->rw_lock); - return; - } set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - wake_up_process(sqd->thread); + if (sqd->thread) + wake_up_process(sqd->thread); up_write(&sqd->rw_lock); wait_for_completion(&sqd->exited); } @@ -7849,9 +7846,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, ret = -EINVAL; if (cpu >= nr_cpu_ids) - goto err; + goto err_sqpoll; if (!cpu_online(cpu)) - goto err; + goto err_sqpoll; sqd->sq_cpu = cpu; } else { @@ -7862,7 +7859,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); if (IS_ERR(tsk)) { ret = PTR_ERR(tsk); - goto err; + goto err_sqpoll; } sqd->thread = tsk; @@ -7881,6 +7878,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, err: io_sq_thread_finish(ctx); return ret; +err_sqpoll: + complete(&ctx->sq_data->exited); + goto err; } static inline void __io_unaccount_mem(struct user_struct *user, -- cgit v1.2.3 From e22bc9b481a90d7898984ea17621f04a653e2cd1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Mar 2021 19:49:02 -0700 Subject: kernel: make IO threads unfreezable by default The io-wq threads were already marked as no-freeze, but the manager was not. On resume, we perpetually have signal_pending() being true, and hence the manager will loop and spin 100% of the time. Just mark the tasks created by create_io_thread() as PF_NOFREEZE by default, and remove any knowledge of it in io-wq and io_uring. 
Reported-by: Kevin Locke Tested-by: Kevin Locke Signed-off-by: Jens Axboe --- fs/io-wq.c | 3 +-- fs/io_uring.c | 1 - kernel/fork.c | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 3d7060ba547a..0ae9ecadf295 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -591,7 +591,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) tsk->pf_io_worker = worker; worker->task = tsk; set_cpus_allowed_ptr(tsk, cpumask_of_node(wqe->node)); - tsk->flags |= PF_NOFREEZE | PF_NO_SETAFFINITY; + tsk->flags |= PF_NO_SETAFFINITY; raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); @@ -709,7 +709,6 @@ static int io_wq_manager(void *data) set_current_state(TASK_INTERRUPTIBLE); io_wq_check_workers(wq); schedule_timeout(HZ); - try_to_freeze(); if (fatal_signal_pending(current)) set_bit(IO_WQ_BIT_EXIT, &wq->state); } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)); diff --git a/fs/io_uring.c b/fs/io_uring.c index 62f998bf2ce8..14165e18020c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6733,7 +6733,6 @@ static int io_sq_thread(void *data) up_read(&sqd->rw_lock); schedule(); - try_to_freeze(); down_read(&sqd->rw_lock); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); diff --git a/kernel/fork.c b/kernel/fork.c index d3171e8e88e5..72e444cd0ffe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2436,6 +2436,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) if (!IS_ERR(tsk)) { sigfillset(&tsk->blocked); sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); + tsk->flags |= PF_NOFREEZE; } return tsk; } -- cgit v1.2.3 From 78d7f6ba82edb7f8763390982be29051c4216772 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 10 Mar 2021 13:13:53 +0000 Subject: io_uring: fix invalid ctx->sq_thread_idle We have to set ctx->sq_thread_idle before adding a ring to an SQ task, otherwise sqd races for seeing zero and accounting it as such. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 14165e18020c..7072c0eb22c1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7829,14 +7829,14 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, ctx->sq_creds = get_current_cred(); ctx->sq_data = sqd; - io_sq_thread_park(sqd); - list_add(&ctx->sqd_list, &sqd->ctx_new_list); - io_sq_thread_unpark(sqd); - ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); if (!ctx->sq_thread_idle) ctx->sq_thread_idle = HZ; + io_sq_thread_park(sqd); + list_add(&ctx->sqd_list, &sqd->ctx_new_list); + io_sq_thread_unpark(sqd); + if (sqd->thread) return 0; -- cgit v1.2.3 From 7d41e8543d809c3c900d1212d6ea887eb284b69a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 10 Mar 2021 13:13:54 +0000 Subject: io_uring: remove indirect ctx into sqo injection We use ->ctx_new_list to notify sqo about new ctx pending, then sqo should stop and splice it to its sqd->ctx_list, paired with ->sq_thread_comp. The last one is broken because nobody reinitialises it, and trying to fix it would only add more complexity and bugs. And the first isn't really needed as is done under park(), that protects from races well. Add ctx into sqd->ctx_list directly (under park()), it's much simpler and allows to kill both, ctx_new_list and sq_thread_comp. 
note: apparently there is no real problem at the moment, because sq_thread_comp is used only by io_sq_thread_finish() followed by parking, where list_del(&ctx->sqd_list) removes it well regardless whether it's in the new or the active list. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7072c0eb22c1..5c045a9f7ffe 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -262,7 +262,6 @@ struct io_sq_data { /* ctx's that are using this sqd */ struct list_head ctx_list; - struct list_head ctx_new_list; struct task_struct *thread; struct wait_queue_head wait; @@ -398,7 +397,6 @@ struct io_ring_ctx { struct user_struct *user; struct completion ref_comp; - struct completion sq_thread_comp; #if defined(CONFIG_UNIX) struct socket *ring_sock; @@ -1137,7 +1135,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); - init_completion(&ctx->sq_thread_comp); idr_init(&ctx->io_buffer_idr); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); @@ -6640,19 +6637,6 @@ static void io_sqd_update_thread_idle(struct io_sq_data *sqd) sqd->sq_thread_idle = sq_thread_idle; } -static void io_sqd_init_new(struct io_sq_data *sqd) -{ - struct io_ring_ctx *ctx; - - while (!list_empty(&sqd->ctx_new_list)) { - ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list); - list_move_tail(&ctx->sqd_list, &sqd->ctx_list); - complete(&ctx->sq_thread_comp); - } - - io_sqd_update_thread_idle(sqd); -} - static int io_sq_thread(void *data) { struct io_sq_data *sqd = data; @@ -6683,11 +6667,8 @@ static int io_sq_thread(void *data) up_read(&sqd->rw_lock); cond_resched(); down_read(&sqd->rw_lock); - continue; - } - if (unlikely(!list_empty(&sqd->ctx_new_list))) { - io_sqd_init_new(sqd); timeout = jiffies + sqd->sq_thread_idle; + continue; } if (fatal_signal_pending(current)) break; @@ -7099,9 +7080,6 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx) if (sqd) { complete(&sqd->startup); - if (sqd->thread) - wait_for_completion(&ctx->sq_thread_comp); - io_sq_thread_park(sqd); list_del(&ctx->sqd_list); io_sqd_update_thread_idle(sqd); @@ -7153,7 +7131,6 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) refcount_set(&sqd->refs, 1); INIT_LIST_HEAD(&sqd->ctx_list); - INIT_LIST_HEAD(&sqd->ctx_new_list); init_rwsem(&sqd->rw_lock); init_waitqueue_head(&sqd->wait); init_completion(&sqd->startup); @@ -7834,7 +7811,8 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, ctx->sq_thread_idle = HZ; io_sq_thread_park(sqd); - list_add(&ctx->sqd_list, &sqd->ctx_new_list); + list_add(&ctx->sqd_list, &sqd->ctx_list); + io_sqd_update_thread_idle(sqd); io_sq_thread_unpark(sqd); if (sqd->thread) -- cgit v1.2.3 From 5c2469e0a22e035d52f3ba768151cc75e3d4a1cd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Mar 2021 10:17:56 -0700 Subject: io_uring: force creation of separate context for ATTACH_WQ and non-threads Earlier kernels had SQPOLL threads that could share across anything, as we grabbed the context we needed on a per-ring basis. This is no longer the case, so only allow attaching directly if we're in the same thread group. That is the common use case. For non-group tasks, just setup a new context and thread as we would've done if sharing wasn't set. 
This isn't 100% ideal in terms of CPU utilization for the forked and share case, but hopefully that isn't much of a concern. If it is, there are plans in motion for how to improve that. Most importantly, we want to avoid app side regressions where sharing worked before and now doesn't. With this patch, functionality is equivalent to previous kernels that supported IORING_SETUP_ATTACH_WQ with SQPOLL. Reported-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5c045a9f7ffe..472eab7359f2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -269,6 +269,7 @@ struct io_sq_data { unsigned sq_thread_idle; int sq_cpu; pid_t task_pid; + pid_t task_tgid; unsigned long state; struct completion startup; @@ -7112,6 +7113,10 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) fdput(f); return ERR_PTR(-EINVAL); } + if (sqd->task_tgid != current->tgid) { + fdput(f); + return ERR_PTR(-EPERM); + } refcount_inc(&sqd->refs); fdput(f); @@ -7122,8 +7127,14 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) { struct io_sq_data *sqd; - if (p->flags & IORING_SETUP_ATTACH_WQ) - return io_attach_sq_data(p); + if (p->flags & IORING_SETUP_ATTACH_WQ) { + sqd = io_attach_sq_data(p); + if (!IS_ERR(sqd)) + return sqd; + /* fall through for EPERM case, setup new sqd/task */ + if (PTR_ERR(sqd) != -EPERM) + return sqd; + } sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); if (!sqd) @@ -7833,6 +7844,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, } sqd->task_pid = current->pid; + sqd->task_tgid = current->tgid; tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); if (IS_ERR(tsk)) { ret = PTR_ERR(tsk); -- cgit v1.2.3 From d052d1d685f5125249ab4ff887562c88ba959638 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Mar 2021 10:49:20 -0700 Subject: io_uring: perform IOPOLL reaping if canceler is thread itself We bypass IOPOLL completion polling (and reaping) for the SQPOLL thread, but if it's the thread itself invoking cancelations, then we still need to perform it or no one will. Fixes: 9936c7c2bc76 ("io_uring: deduplicate core cancellations sequence") Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 472eab7359f2..49f85f49e1c3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8658,7 +8658,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } /* SQPOLL thread does its own polling */ - if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) { + if ((!(ctx->flags & IORING_SETUP_SQPOLL) && !files) || + (ctx->sq_data && ctx->sq_data->thread == current)) { while (!list_empty_careful(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; -- cgit v1.2.3 From e1915f76a8981f0a750cf56515df42582a37c4b0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 11 Mar 2021 23:29:35 +0000 Subject: io_uring: cancel deferred requests in try_cancel As io_uring_cancel_files() and others let SQO to run between io_uring_try_cancel_requests(), SQO may generate new deferred requests, so it's safer to try to cancel them in it. 
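Condensed, the retry loop in io_uring_try_cancel_requests() after this change keeps sweeping until a full pass makes no progress, with the deferred list now part of every pass. A sketch of the shape (the io-wq cancellation and IOPOLL reaping steps are elided, and the loop skeleton is reconstructed from the surrounding context rather than quoted):

	while (1) {
		bool ret = false;

		/* io-wq cancellation and IOPOLL reaping elided */
		ret |= io_cancel_defer_files(ctx, task, files);
		ret |= io_poll_remove_all(ctx, task, files);
		ret |= io_kill_timeouts(ctx, task, files);
		ret |= io_run_task_work();
		if (!ret)
			break;
		cond_resched();
	}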
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 49f85f49e1c3..56f3d8f408c9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8577,11 +8577,11 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) return ret; } -static void io_cancel_defer_files(struct io_ring_ctx *ctx, +static bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files) { - struct io_defer_entry *de = NULL; + struct io_defer_entry *de; LIST_HEAD(list); spin_lock_irq(&ctx->completion_lock); @@ -8592,6 +8592,8 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, } } spin_unlock_irq(&ctx->completion_lock); + if (list_empty(&list)) + return false; while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); @@ -8601,6 +8603,7 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, io_req_complete(de->req, -ECANCELED); kfree(de); } + return true; } static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) @@ -8666,6 +8669,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } + ret |= io_cancel_defer_files(ctx, task, files); ret |= io_poll_remove_all(ctx, task, files); ret |= io_kill_timeouts(ctx, task, files); ret |= io_run_task_work(); @@ -8734,8 +8738,6 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, atomic_inc(&task->io_uring->in_idle); } - io_cancel_defer_files(ctx, task, files); - io_uring_cancel_files(ctx, task, files); if (!files) io_uring_try_cancel_requests(ctx, task, NULL); -- cgit v1.2.3 From 0df8ea602b3fe80819a34361027ad40485e78909 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 11 Mar 2021 23:29:36 +0000 Subject: io_uring: remove useless ->startup completion We always do complete(&sqd->startup) almost right after sqd->thread creation, either in the success path or in io_sq_thread_finish(). It's specifically created not started for us to be able to set some stuff like sqd->thread and io_uring_alloc_task_context() before following right after wake_up_new_task(). 
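The ordering that makes ->startup redundant is already guaranteed by create_io_thread(): the new task does not run until wake_up_new_task(), so everything it needs can be published first. A minimal sketch of that window in io_sq_offload_create(), with error handling trimmed:

	tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);	/* the real code jumps to the sqpoll error path */

	/* tsk is not running yet, so these stores cannot race with io_sq_thread() */
	sqd->thread = tsk;
	ret = io_uring_alloc_task_context(tsk, ctx);

	wake_up_new_task(tsk);		/* only now does io_sq_thread() start executing */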
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 56f3d8f408c9..6349374d715d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -272,7 +272,6 @@ struct io_sq_data { pid_t task_tgid; unsigned long state; - struct completion startup; struct completion exited; }; @@ -6656,8 +6655,6 @@ static int io_sq_thread(void *data) set_cpus_allowed_ptr(current, cpu_online_mask); current->flags |= PF_NO_SETAFFINITY; - wait_for_completion(&sqd->startup); - down_read(&sqd->rw_lock); while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) { @@ -7080,7 +7077,6 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx) struct io_sq_data *sqd = ctx->sq_data; if (sqd) { - complete(&sqd->startup); io_sq_thread_park(sqd); list_del(&ctx->sqd_list); io_sqd_update_thread_idle(sqd); @@ -7144,7 +7140,6 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) INIT_LIST_HEAD(&sqd->ctx_list); init_rwsem(&sqd->rw_lock); init_waitqueue_head(&sqd->wait); - init_completion(&sqd->startup); init_completion(&sqd->exited); return sqd; } @@ -7856,7 +7851,6 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, wake_up_new_task(tsk); if (ret) goto err; - complete(&sqd->startup); } else if (p->flags & IORING_SETUP_SQ_AFF) { /* Can't have SQ_AFF without SQPOLL */ ret = -EINVAL; -- cgit v1.2.3 From 26984fbf3ad9d1c1fb56a0c1e0cdf9fa3b806f0c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 11 Mar 2021 23:29:37 +0000 Subject: io_uring: prevent racy sqd->thread checks SQPOLL thread to which we're trying to attach may be going away, it's not nice but a more serious problem is if io_sq_offload_create() sees sqd->thread==NULL, and tries to init it with a new thread. There are tons of ways it can be exploited or fail. 
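The rule enforced below is that sqd->thread may only be inspected while the sqd is parked (sqd->rw_lock held for writing), so the attach path re-checks it under park and bails out if the SQPOLL task it attached to is already gone. Condensed, with the idle-time update elided:

	io_sq_thread_park(sqd);			/* write-locks sqd->rw_lock */
	if (attached && !sqd->thread)		/* attached to a dying SQPOLL task */
		ret = -ENXIO;
	else
		list_add(&ctx->sqd_list, &sqd->ctx_list);
	io_sq_thread_unpark(sqd);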
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6349374d715d..cdec59510433 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7119,14 +7119,18 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) return sqd; } -static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) +static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, + bool *attached) { struct io_sq_data *sqd; + *attached = false; if (p->flags & IORING_SETUP_ATTACH_WQ) { sqd = io_attach_sq_data(p); - if (!IS_ERR(sqd)) + if (!IS_ERR(sqd)) { + *attached = true; return sqd; + } /* fall through for EPERM case, setup new sqd/task */ if (PTR_ERR(sqd) != -EPERM) return sqd; @@ -7799,12 +7803,13 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, if (ctx->flags & IORING_SETUP_SQPOLL) { struct task_struct *tsk; struct io_sq_data *sqd; + bool attached; ret = -EPERM; if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) goto err; - sqd = io_get_sq_data(p); + sqd = io_get_sq_data(p, &attached); if (IS_ERR(sqd)) { ret = PTR_ERR(sqd); goto err; @@ -7816,13 +7821,24 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, if (!ctx->sq_thread_idle) ctx->sq_thread_idle = HZ; + ret = 0; io_sq_thread_park(sqd); - list_add(&ctx->sqd_list, &sqd->ctx_list); - io_sqd_update_thread_idle(sqd); + /* don't attach to a dying SQPOLL thread, would be racy */ + if (attached && !sqd->thread) { + ret = -ENXIO; + } else { + list_add(&ctx->sqd_list, &sqd->ctx_list); + io_sqd_update_thread_idle(sqd); + } io_sq_thread_unpark(sqd); - if (sqd->thread) + if (ret < 0) { + io_put_sq_data(sqd); + ctx->sq_data = NULL; + return ret; + } else if (attached) { return 0; + } if (p->flags & IORING_SETUP_SQ_AFF) { int cpu = p->sq_thread_cpu; -- cgit v1.2.3 From 521d6a737a31c08dbab204a95cd4fb5bee725f0f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 11 Mar 2021 23:29:38 +0000 Subject: io_uring: cancel sqpoll via task_work 1) The first problem is io_uring_cancel_sqpoll() -> io_uring_cancel_task_requests() basically doing park(); park(); and so hanging. 2) Another one is more subtle, when the master task is doing cancellations, but SQPOLL task submits in-between the end of the cancellation but before finish() requests taking a ref to the ctx, and so eternally locking it up. 3) Yet another is a dying SQPOLL task doing io_uring_cancel_sqpoll() and same io_uring_cancel_sqpoll() from the owner task, they race for tctx->wait events. And there probably more of them. Instead do SQPOLL cancellations from within SQPOLL task context via task_work, see io_sqpoll_cancel_sync(). With that we don't need temporal park()/unpark() during cancellation, which is ugly, subtle and anyway doesn't allow to do io_run_task_work() properly. io_uring_cancel_sqpoll() is called only from SQPOLL task context and under sqd locking, so all parking is removed from there. And so, io_sq_thread_[un]park() and io_sq_thread_stop() are not used now by SQPOLL task, and that spare us from some headache. Also remove ctx->sqd_list early to avoid 2). And kill tctx->sqpoll, which is not used anymore. 
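Mechanically, the owner side now parks the sqd just long enough to queue a task_work item on the SQPOLL task, then waits for a completion that the callback fires after it has run io_uring_cancel_sqpoll() in SQPOLL context. A stripped-down sketch of that handshake (the full io_sqpoll_cancel_sync()/io_sqpoll_cancel_cb() pair is in the diff below; the real code also checks the task_work_add() return value):

	struct io_tctx_exit work = { .ctx = ctx, };
	struct task_struct *task;

	io_sq_thread_park(sqd);
	list_del_init(&ctx->sqd_list);		/* drop the ctx early, see problem 2) above */
	task = sqd->thread;
	if (task) {
		init_completion(&work.completion);
		init_task_work(&work.task_work, io_sqpoll_cancel_cb);
		task_work_add(task, &work.task_work, TWA_SIGNAL);
		wake_up_process(task);
	}
	io_sq_thread_unpark(sqd);

	if (task)
		wait_for_completion(&work.completion);	/* completed by the callback */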
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 151 +++++++++++++++++++++++++++------------------------------- 1 file changed, 71 insertions(+), 80 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cdec59510433..70286b393c0e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6665,6 +6665,7 @@ static int io_sq_thread(void *data) up_read(&sqd->rw_lock); cond_resched(); down_read(&sqd->rw_lock); + io_run_task_work(); timeout = jiffies + sqd->sq_thread_idle; continue; } @@ -6720,18 +6721,22 @@ static int io_sq_thread(void *data) finish_wait(&sqd->wait, &wait); timeout = jiffies + sqd->sq_thread_idle; } - - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_uring_cancel_sqpoll(ctx); up_read(&sqd->rw_lock); - + down_write(&sqd->rw_lock); + /* + * someone may have parked and added a cancellation task_work, run + * it first because we don't want it in io_uring_cancel_sqpoll() + */ io_run_task_work(); - down_write(&sqd->rw_lock); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_uring_cancel_sqpoll(ctx); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); up_write(&sqd->rw_lock); + + io_run_task_work(); complete(&sqd->exited); do_exit(0); } @@ -7033,8 +7038,8 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) static void io_sq_thread_unpark(struct io_sq_data *sqd) __releases(&sqd->rw_lock) { - if (sqd->thread == current) - return; + WARN_ON_ONCE(sqd->thread == current); + clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); up_write(&sqd->rw_lock); } @@ -7042,8 +7047,8 @@ static void io_sq_thread_unpark(struct io_sq_data *sqd) static void io_sq_thread_park(struct io_sq_data *sqd) __acquires(&sqd->rw_lock) { - if (sqd->thread == current) - return; + WARN_ON_ONCE(sqd->thread == current); + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); down_write(&sqd->rw_lock); /* set again for consistency, in case concurrent parks are happening */ @@ -7054,8 +7059,8 @@ static void io_sq_thread_park(struct io_sq_data *sqd) static void io_sq_thread_stop(struct io_sq_data *sqd) { - if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) - return; + WARN_ON_ONCE(sqd->thread == current); + down_write(&sqd->rw_lock); set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); if (sqd->thread) @@ -7078,7 +7083,7 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx) if (sqd) { io_sq_thread_park(sqd); - list_del(&ctx->sqd_list); + list_del_init(&ctx->sqd_list); io_sqd_update_thread_idle(sqd); io_sq_thread_unpark(sqd); @@ -7760,7 +7765,6 @@ static int io_uring_alloc_task_context(struct task_struct *task, init_waitqueue_head(&tctx->wait); tctx->last = NULL; atomic_set(&tctx->in_idle, 0); - tctx->sqpoll = false; task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); @@ -8719,43 +8723,12 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, io_uring_try_cancel_requests(ctx, task, files); - if (ctx->sq_data) - io_sq_thread_unpark(ctx->sq_data); prepare_to_wait(&task->io_uring->wait, &wait, TASK_UNINTERRUPTIBLE); if (inflight == io_uring_count_inflight(ctx, task, files)) schedule(); finish_wait(&task->io_uring->wait, &wait); - if (ctx->sq_data) - io_sq_thread_park(ctx->sq_data); - } -} - -/* - * We need to iteratively cancel requests, in case a request has dependent - * hard links. These persist even for failure of cancelations, hence keep - * looping until none are found. 
- */ -static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, - struct files_struct *files) -{ - struct task_struct *task = current; - - if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { - io_sq_thread_park(ctx->sq_data); - task = ctx->sq_data->thread; - if (task) - atomic_inc(&task->io_uring->in_idle); } - - io_uring_cancel_files(ctx, task, files); - if (!files) - io_uring_try_cancel_requests(ctx, task, NULL); - - if (task) - atomic_dec(&task->io_uring->in_idle); - if (ctx->sq_data) - io_sq_thread_unpark(ctx->sq_data); } /* @@ -8796,15 +8769,6 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx) } tctx->last = ctx; } - - /* - * This is race safe in that the task itself is doing this, hence it - * cannot be going through the exit/cancel paths at the same time. - * This cannot be modified while exit/cancel is running. - */ - if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL)) - tctx->sqpoll = true; - return 0; } @@ -8847,6 +8811,44 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) } } +static s64 tctx_inflight(struct io_uring_task *tctx) +{ + return percpu_counter_sum(&tctx->inflight); +} + +static void io_sqpoll_cancel_cb(struct callback_head *cb) +{ + struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work); + struct io_ring_ctx *ctx = work->ctx; + struct io_sq_data *sqd = ctx->sq_data; + + if (sqd->thread) + io_uring_cancel_sqpoll(ctx); + complete(&work->completion); +} + +static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + struct io_tctx_exit work = { .ctx = ctx, }; + struct task_struct *task; + + io_sq_thread_park(sqd); + list_del_init(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); + task = sqd->thread; + if (task) { + init_completion(&work.completion); + init_task_work(&work.task_work, io_sqpoll_cancel_cb); + WARN_ON_ONCE(task_work_add(task, &work.task_work, TWA_SIGNAL)); + wake_up_process(task); + } + io_sq_thread_unpark(sqd); + + if (task) + wait_for_completion(&work.completion); +} + void __io_uring_files_cancel(struct files_struct *files) { struct io_uring_task *tctx = current->io_uring; @@ -8855,41 +8857,40 @@ void __io_uring_files_cancel(struct files_struct *files) /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); - xa_for_each(&tctx->xa, index, node) - io_uring_cancel_task_requests(node->ctx, files); + xa_for_each(&tctx->xa, index, node) { + struct io_ring_ctx *ctx = node->ctx; + + if (ctx->sq_data) { + io_sqpoll_cancel_sync(ctx); + continue; + } + io_uring_cancel_files(ctx, current, files); + if (!files) + io_uring_try_cancel_requests(ctx, current, NULL); + } atomic_dec(&tctx->in_idle); if (files) io_uring_clean_tctx(tctx); } -static s64 tctx_inflight(struct io_uring_task *tctx) -{ - return percpu_counter_sum(&tctx->inflight); -} - +/* should only be called by SQPOLL task */ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) { struct io_sq_data *sqd = ctx->sq_data; - struct io_uring_task *tctx; + struct io_uring_task *tctx = current->io_uring; s64 inflight; DEFINE_WAIT(wait); - if (!sqd) - return; - io_sq_thread_park(sqd); - if (!sqd->thread || !sqd->thread->io_uring) { - io_sq_thread_unpark(sqd); - return; - } - tctx = ctx->sq_data->thread->io_uring; + WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current); + atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ inflight = tctx_inflight(tctx); if (!inflight) break; - io_uring_cancel_task_requests(ctx, NULL); + io_uring_try_cancel_requests(ctx, current, 
NULL); prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); /* @@ -8902,7 +8903,6 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) finish_wait(&tctx->wait, &wait); } while (1); atomic_dec(&tctx->in_idle); - io_sq_thread_unpark(sqd); } /* @@ -8917,15 +8917,6 @@ void __io_uring_task_cancel(void) /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); - - if (tctx->sqpoll) { - struct io_tctx_node *node; - unsigned long index; - - xa_for_each(&tctx->xa, index, node) - io_uring_cancel_sqpoll(node->ctx); - } - do { /* read completions before cancelations */ inflight = tctx_inflight(tctx); -- cgit v1.2.3 From 58f99373834151e1ca7edc49bc5578d9d40db099 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 12 Mar 2021 16:25:55 +0000 Subject: io_uring: fix OP_ASYNC_CANCEL across tasks IORING_OP_ASYNC_CANCEL tries io-wq cancellation only for current task. If it fails go over tctx_list and try it out for every single tctx. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 70286b393c0e..a4bce17af506 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5667,8 +5667,47 @@ static int io_async_cancel_prep(struct io_kiocb *req, static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + u64 sqe_addr = req->cancel.addr; + struct io_tctx_node *node; + int ret; - io_async_find_and_cancel(ctx, req, req->cancel.addr, 0); + /* tasks should wait for their io-wq threads, so safe w/o sync */ + ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); + spin_lock_irq(&ctx->completion_lock); + if (ret != -ENOENT) + goto done; + ret = io_timeout_cancel(ctx, sqe_addr); + if (ret != -ENOENT) + goto done; + ret = io_poll_cancel(ctx, sqe_addr); + if (ret != -ENOENT) + goto done; + spin_unlock_irq(&ctx->completion_lock); + + /* slow path, try all io-wq's */ + io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + ret = -ENOENT; + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + if (!tctx || !tctx->io_wq) + continue; + ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); + if (ret != -ENOENT) + break; + } + io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + + spin_lock_irq(&ctx->completion_lock); +done: + io_cqring_fill_event(req, ret); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + io_cqring_ev_posted(ctx); + + if (ret < 0) + req_set_fail_links(req); + io_put_req(req); return 0; } -- cgit v1.2.3
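The heart of the new slow path is the walk over every task context registered with the ring, retrying the io-wq cancellation for each until something other than -ENOENT comes back; condensed from the hunk above (ctx->tctx_list is walked under the ring submit lock, which the full patch takes via io_ring_submit_lock()):

	ret = -ENOENT;
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (!tctx || !tctx->io_wq)
			continue;
		ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
		if (ret != -ENOENT)
			break;		/* found (or definitively failed) in this io-wq */
	}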