From 713b9825a4c47897f66ad69409581e7734a8728e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 8 Sep 2021 10:09:29 +0100 Subject: io-wq: fix cancellation on create-worker failure WARNING: CPU: 0 PID: 10392 at fs/io_uring.c:1151 req_ref_put_and_test fs/io_uring.c:1151 [inline] WARNING: CPU: 0 PID: 10392 at fs/io_uring.c:1151 req_ref_put_and_test fs/io_uring.c:1146 [inline] WARNING: CPU: 0 PID: 10392 at fs/io_uring.c:1151 io_req_complete_post+0xf5b/0x1190 fs/io_uring.c:1794 Modules linked in: Call Trace: tctx_task_work+0x1e5/0x570 fs/io_uring.c:2158 task_work_run+0xe0/0x1a0 kernel/task_work.c:164 tracehook_notify_signal include/linux/tracehook.h:212 [inline] handle_signal_work kernel/entry/common.c:146 [inline] exit_to_user_mode_loop kernel/entry/common.c:172 [inline] exit_to_user_mode_prepare+0x232/0x2a0 kernel/entry/common.c:209 __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline] syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:302 do_syscall_64+0x42/0xb0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x44/0xae When io_wqe_enqueue() -> io_wqe_create_worker() fails, we can't just call io_run_cancel() to clean up the request, it's already enqueued via io_wqe_insert_work() and will be executed either by some other worker during cancellation (e.g. in io_wq_put_and_exit()). Reported-by: Hao Sun Fixes: 3146cba99aa28 ("io-wq: make worker creation resilient against signals") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/93b9de0fcf657affab0acfd675d4abcd273ee863.1631092071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index d80e4a735677..35e7ee26f7ea 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -832,6 +832,11 @@ append: wq_list_add_after(&work->list, &tail->list, &acct->work_list); } +static bool io_wq_work_match_item(struct io_wq_work *work, void *data) +{ + return work == data; +} + static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); @@ -844,7 +849,6 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) */ if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) || (work->flags & IO_WQ_WORK_CANCEL)) { -run_cancel: io_run_cancel(work, wqe); return; } @@ -864,15 +868,22 @@ run_cancel: bool did_create; did_create = io_wqe_create_worker(wqe, acct); - if (unlikely(!did_create)) { - raw_spin_lock(&wqe->lock); - /* fatal condition, failed to create the first worker */ - if (!acct->nr_workers) { - raw_spin_unlock(&wqe->lock); - goto run_cancel; - } - raw_spin_unlock(&wqe->lock); + if (likely(did_create)) + return; + + raw_spin_lock(&wqe->lock); + /* fatal condition, failed to create the first worker */ + if (!acct->nr_workers) { + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_item, + .data = work, + .cancel_all = false, + }; + + if (io_acct_cancel_pending_work(wqe, acct, &match)) + raw_spin_lock(&wqe->lock); } + raw_spin_unlock(&wqe->lock); } } -- cgit v1.2.3 From c57a91fb1ccfa203ba3e31e5a389cb04de5b0561 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 8 Sep 2021 20:49:17 +0100 Subject: io_uring: fix missing mb() before waitqueue_active In case of !SQPOLL, io_cqring_ev_posted_iopoll() doesn't provide a memory barrier required by waitqueue_active(&ctx->poll_wait). There is a wq_has_sleeper(), which does smb_mb() inside, but it's called only for SQPOLL. Fixes: 5fd4617840596 ("io_uring: be smarter about waking multiple CQ ring waiters") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2982e53bcea2274006ed435ee2a77197107d8a29.1631130542.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index d816c09c88a5..d80d8359501f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1619,8 +1619,11 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { + /* see waitqueue_active() comment */ + smp_mb(); + if (ctx->flags & IORING_SETUP_SQPOLL) { - if (wq_has_sleeper(&ctx->cq_wait)) + if (waitqueue_active(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); } if (io_should_trigger_evfd(ctx)) -- cgit v1.2.3 From 009ad9f0c6eed0caa7943bc46aa1ae2cb8c382fb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Sep 2021 19:07:26 -0600 Subject: io_uring: drop ctx->uring_lock before acquiring sqd->lock The SQPOLL thread dictates the lock order, and we hold the ctx->uring_lock for all the registration opcodes. We also hold a ref to the ctx, and we do drop the lock for other reasons to quiesce, so it's fine to drop the ctx lock temporarily to grab the sqd->lock. This fixes the following lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 5.14.0-syzkaller #0 Not tainted ------------------------------------------------------ syz-executor.5/25433 is trying to acquire lock: ffff888023426870 (&sqd->lock){+.+.}-{3:3}, at: io_register_iowq_max_workers fs/io_uring.c:10551 [inline] ffff888023426870 (&sqd->lock){+.+.}-{3:3}, at: __io_uring_register fs/io_uring.c:10757 [inline] ffff888023426870 (&sqd->lock){+.+.}-{3:3}, at: __do_sys_io_uring_register+0x10aa/0x2e70 fs/io_uring.c:10792 but task is already holding lock: ffff8880885b40a8 (&ctx->uring_lock){+.+.}-{3:3}, at: __do_sys_io_uring_register+0x2e1/0x2e70 fs/io_uring.c:10791 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&ctx->uring_lock){+.+.}-{3:3}: __mutex_lock_common kernel/locking/mutex.c:596 [inline] __mutex_lock+0x131/0x12f0 kernel/locking/mutex.c:729 __io_sq_thread fs/io_uring.c:7291 [inline] io_sq_thread+0x65a/0x1370 fs/io_uring.c:7368 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 -> #0 (&sqd->lock){+.+.}-{3:3}: check_prev_add kernel/locking/lockdep.c:3051 [inline] check_prevs_add kernel/locking/lockdep.c:3174 [inline] validate_chain kernel/locking/lockdep.c:3789 [inline] __lock_acquire+0x2a07/0x54a0 kernel/locking/lockdep.c:5015 lock_acquire kernel/locking/lockdep.c:5625 [inline] lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5590 __mutex_lock_common kernel/locking/mutex.c:596 [inline] __mutex_lock+0x131/0x12f0 kernel/locking/mutex.c:729 io_register_iowq_max_workers fs/io_uring.c:10551 [inline] __io_uring_register fs/io_uring.c:10757 [inline] __do_sys_io_uring_register+0x10aa/0x2e70 fs/io_uring.c:10792 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&ctx->uring_lock); lock(&sqd->lock); lock(&ctx->uring_lock); lock(&sqd->lock); *** DEADLOCK *** Fixes: 2e480058ddc2 ("io-wq: provide a way to limit max number of workers") Reported-by: syzbot+97fa56483f69d677969f@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index d80d8359501f..b21a423a4de8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10551,7 +10551,14 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, if (ctx->flags & IORING_SETUP_SQPOLL) { sqd = ctx->sq_data; if (sqd) { + /* + * Observe the correct sqd->lock -> ctx->uring_lock + * ordering. Fine to drop uring_lock here, we hold + * a ref to the ctx. + */ + mutex_unlock(&ctx->uring_lock); mutex_lock(&sqd->lock); + mutex_lock(&ctx->uring_lock); tctx = sqd->thread->io_uring; } } else { -- cgit v1.2.3 From 3b33e3f4a6c0296812a7ee757bdae83290e64aed Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Sep 2021 19:57:26 -0600 Subject: io-wq: fix silly logic error in io_task_work_match() We check for the func with an OR condition, which means it always ends up being false and we never match the task_work we want to cancel. In the unexpected case that we do exit with that pending, we can trigger a hang waiting for a worker to exit, but it was never created. syzbot reports that as such: INFO: task syz-executor687:8514 blocked for more than 143 seconds. Not tainted 5.14.0-syzkaller #0 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:syz-executor687 state:D stack:27296 pid: 8514 ppid: 8479 flags:0x00024004 Call Trace: context_switch kernel/sched/core.c:4940 [inline] __schedule+0x940/0x26f0 kernel/sched/core.c:6287 schedule+0xd3/0x270 kernel/sched/core.c:6366 schedule_timeout+0x1db/0x2a0 kernel/time/timer.c:1857 do_wait_for_common kernel/sched/completion.c:85 [inline] __wait_for_common kernel/sched/completion.c:106 [inline] wait_for_common kernel/sched/completion.c:117 [inline] wait_for_completion+0x176/0x280 kernel/sched/completion.c:138 io_wq_exit_workers fs/io-wq.c:1162 [inline] io_wq_put_and_exit+0x40c/0xc70 fs/io-wq.c:1197 io_uring_clean_tctx fs/io_uring.c:9607 [inline] io_uring_cancel_generic+0x5fe/0x740 fs/io_uring.c:9687 io_uring_files_cancel include/linux/io_uring.h:16 [inline] do_exit+0x265/0x2a30 kernel/exit.c:780 do_group_exit+0x125/0x310 kernel/exit.c:922 get_signal+0x47f/0x2160 kernel/signal.c:2868 arch_do_signal_or_restart+0x2a9/0x1c40 arch/x86/kernel/signal.c:865 handle_signal_work kernel/entry/common.c:148 [inline] exit_to_user_mode_loop kernel/entry/common.c:172 [inline] exit_to_user_mode_prepare+0x17d/0x290 kernel/entry/common.c:209 __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline] syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:302 do_syscall_64+0x42/0xb0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x445cd9 RSP: 002b:00007fc657f4b308 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: 0000000000000001 RBX: 00000000004cb448 RCX: 0000000000445cd9 RDX: 00000000000f4240 RSI: 0000000000000081 RDI: 00000000004cb44c RBP: 00000000004cb440 R08: 000000000000000e R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000049b154 R13: 0000000000000003 R14: 00007fc657f4b400 R15: 0000000000022000 While in there, also decrement accr->nr_workers. This isn't strictly needed as we're exiting, but let's make sure the accounting matches up. Fixes: 3146cba99aa2 ("io-wq: make worker creation resilient against signals") Reported-by: syzbot+f62d3e0a4ea4f38f5326@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 35e7ee26f7ea..c2e73ce6888a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1133,7 +1133,7 @@ static bool io_task_work_match(struct callback_head *cb, void *data) { struct io_worker *worker; - if (cb->func != create_worker_cb || cb->func != create_worker_cont) + if (cb->func != create_worker_cb && cb->func != create_worker_cont) return false; worker = container_of(cb, struct io_worker, create_work); return worker->wqe->wq == data; @@ -1154,9 +1154,14 @@ static void io_wq_exit_workers(struct io_wq *wq) while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { struct io_worker *worker; + struct io_wqe_acct *acct; worker = container_of(cb, struct io_worker, create_work); - atomic_dec(&worker->wqe->acct[worker->create_index].nr_running); + acct = io_wqe_get_acct(worker); + atomic_dec(&acct->nr_running); + raw_spin_lock(&worker->wqe->lock); + acct->nr_workers--; + raw_spin_unlock(&worker->wqe->lock); io_worker_ref_put(wq); clear_bit_unlock(0, &worker->create_state); io_worker_release(worker); -- cgit v1.2.3 From 66e70be722886e4f134350212baa13f217e39e42 Mon Sep 17 00:00:00 2001 From: Qiang.zhang Date: Thu, 9 Sep 2021 19:58:22 +0800 Subject: io-wq: fix memory leak in create_io_worker() BUG: memory leak unreferenced object 0xffff888126fcd6c0 (size 192): comm "syz-executor.1", pid 11934, jiffies 4294983026 (age 15.690s) backtrace: [] kmalloc_node include/linux/slab.h:609 [inline] [] kzalloc_node include/linux/slab.h:732 [inline] [] create_io_worker+0x41/0x1e0 fs/io-wq.c:739 [] io_wqe_create_worker fs/io-wq.c:267 [inline] [] io_wqe_enqueue+0x1fe/0x330 fs/io-wq.c:866 [] io_queue_async_work+0xc4/0x200 fs/io_uring.c:1473 [] __io_queue_sqe+0x34c/0x510 fs/io_uring.c:6933 [] io_req_task_submit+0x4b/0xa0 fs/io_uring.c:2233 [] io_async_task_func+0x108/0x1c0 fs/io_uring.c:5462 [] tctx_task_work+0x1b3/0x3a0 fs/io_uring.c:2158 [] task_work_run+0x73/0xb0 kernel/task_work.c:164 [] tracehook_notify_signal include/linux/tracehook.h:212 [inline] [] handle_signal_work kernel/entry/common.c:146 [inline] [] exit_to_user_mode_loop kernel/entry/common.c:172 [inline] [] exit_to_user_mode_prepare+0x151/0x180 kernel/entry/common.c:209 [] __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline] [] syscall_exit_to_user_mode+0x1d/0x40 kernel/entry/common.c:302 [] do_syscall_64+0x42/0xb0 arch/x86/entry/common.c:86 [] entry_SYSCALL_64_after_hwframe+0x44/0xae when create_io_thread() return error, and not retry, the worker object need to be freed. Reported-by: syzbot+65454c239241d3d647da@syzkaller.appspotmail.com Signed-off-by: Qiang.zhang Link: https://lore.kernel.org/r/20210909115822.181188-1-qiang.zhang@windriver.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index c2e73ce6888a..6c55362c1f99 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -709,6 +709,7 @@ static void create_worker_cont(struct callback_head *cb) } raw_spin_unlock(&wqe->lock); io_worker_ref_put(wqe->wq); + kfree(worker); return; } @@ -725,6 +726,7 @@ static void io_workqueue_create(struct work_struct *work) if (!io_queue_worker_create(worker, acct, create_worker_cont)) { clear_bit_unlock(0, &worker->create_state); io_worker_release(worker); + kfree(worker); } } @@ -759,6 +761,7 @@ fail: if (!IS_ERR(tsk)) { io_init_new_worker(wqe, worker, tsk); } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + kfree(worker); goto fail; } else { INIT_WORK(&worker->work, io_workqueue_create); -- cgit v1.2.3 From 2ae2eb9dde18979b40629dd413b9adbd6c894cdf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 9 Sep 2021 13:56:27 +0100 Subject: io_uring: fail links of cancelled timeouts When we cancel a timeout we should mark it with REQ_F_FAIL, so linked requests are cancelled as well, but not queued for further execution. Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/fff625b44eeced3a5cae79f60e6acf3fbdf8f990.1631192135.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index b21a423a4de8..ffd91844b2e5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1482,6 +1482,8 @@ static void io_kill_timeout(struct io_kiocb *req, int status) struct io_timeout_data *io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) != -1) { + if (status) + req_set_fail(req); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); -- cgit v1.2.3 From 32c2d33e0b7c4ea53284d5d9435dd022b582c8cf Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Tue, 7 Sep 2021 11:22:43 +0800 Subject: io_uring: fix off-by-one in BUILD_BUG_ON check of __REQ_F_LAST_BIT Build check of __REQ_F_LAST_BIT should be larger than, not equal or larger than. It's perfectly valid to have __REQ_F_LAST_BIT be 32, as that means that the last valid bit is 31 which does fit in the type. Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20210907032243.114190-1-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index ffd91844b2e5..f795ad281038 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10863,7 +10863,7 @@ static int __init io_uring_init(void) BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); - BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); + BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); -- cgit v1.2.3