From 4f64a6c9f6f11e8b7314f8e27e2c4568706009e6 Mon Sep 17 00:00:00 2001
From: James Clark
Date: Fri, 27 Jan 2023 14:31:41 +0000
Subject: perf: Fix perf_event_pmu_context serialization

Syzkaller triggered a WARN in put_pmu_ctx().

  WARNING: CPU: 1 PID: 2245 at kernel/events/core.c:4925 put_pmu_ctx+0x1f0/0x278

This is because there is no locking around the access of "if (!epc->ctx)"
in find_get_pmu_context() and when it is set to NULL in put_pmu_ctx().
The decrement of the reference count in put_pmu_ctx() also happens outside
of the spinlock, leading to the possibility of this order of events, and
the context being cleared in put_pmu_ctx() while its refcount is non-zero:

  CPU0                                     CPU1
  find_get_pmu_context()
    if (!epc->ctx) == false
                                           put_pmu_ctx()
                                           atomic_dec_and_test(&epc->refcount) == true
                                           epc->refcount == 0
    atomic_inc(&epc->refcount);
    epc->refcount == 1
                                           list_del_init(&epc->pmu_ctx_entry);
                                           epc->ctx = NULL;

Another issue is that the WARN_ON for no active PMU events in
put_pmu_ctx() is outside of the lock. If the perf_event_pmu_context is an
embedded one, even after clearing it, it won't be deleted and can be
re-used, so the warning can trigger. For this reason it also needs to be
moved inside the lock.

The above warning is very quick to trigger on Arm by running these two
commands at the same time:

  while true; do perf record -- ls; done
  while true; do perf record -- ls; done

[peterz: atomic_dec_and_raw_lock*()]
Fixes: bd2756811766 ("perf: Rewrite core context handling")
Reported-by: syzbot+697196bc0265049822bd@syzkaller.appspotmail.com
Signed-off-by: James Clark
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Ravi Bangoria
Link: https://lore.kernel.org/r/20230127143141.1782804-2-james.clark@arm.com
---
 kernel/events/core.c | 39 +++++++++++++++++----------------------
 1 file changed, 17 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d56328e5080e..c4be13e50547 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4813,19 +4813,17 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
 		cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
 		epc = &cpc->epc;
-
+		raw_spin_lock_irq(&ctx->lock);
 		if (!epc->ctx) {
 			atomic_set(&epc->refcount, 1);
 			epc->embedded = 1;
-			raw_spin_lock_irq(&ctx->lock);
 			list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
 			epc->ctx = ctx;
-			raw_spin_unlock_irq(&ctx->lock);
 		} else {
 			WARN_ON_ONCE(epc->ctx != ctx);
 			atomic_inc(&epc->refcount);
 		}
-
+		raw_spin_unlock_irq(&ctx->lock);
 		return epc;
 	}
 
@@ -4896,33 +4894,30 @@ static void free_epc_rcu(struct rcu_head *head)
 
 static void put_pmu_ctx(struct perf_event_pmu_context *epc)
 {
+	struct perf_event_context *ctx = epc->ctx;
 	unsigned long flags;
 
-	if (!atomic_dec_and_test(&epc->refcount))
+	/*
+	 * XXX
+	 *
+	 * lockdep_assert_held(&ctx->mutex);
+	 *
+	 * can't because of the call-site in _free_event()/put_event()
+	 * which isn't always called under ctx->mutex.
+	 */
+	if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
 		return;
 
-	if (epc->ctx) {
-		struct perf_event_context *ctx = epc->ctx;
+	WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
 
-		/*
-		 * XXX
-		 *
-		 * lockdep_assert_held(&ctx->mutex);
-		 *
-		 * can't because of the call-site in _free_event()/put_event()
-		 * which isn't always called under ctx->mutex.
-		 */
-
-		WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
-		raw_spin_lock_irqsave(&ctx->lock, flags);
-		list_del_init(&epc->pmu_ctx_entry);
-		epc->ctx = NULL;
-		raw_spin_unlock_irqrestore(&ctx->lock, flags);
-	}
+	list_del_init(&epc->pmu_ctx_entry);
+	epc->ctx = NULL;
 
 	WARN_ON_ONCE(!list_empty(&epc->pinned_active));
 	WARN_ON_ONCE(!list_empty(&epc->flexible_active));
 
+	raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
 	if (epc->embedded)
 		return;
--
cgit v1.2.3
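The key to the fix is atomic_dec_and_raw_lock_irqsave(): the refcount can
only be observed hitting zero with ctx->lock already held, so the lookup
path's locked check-and-increment can no longer slip in between the final
decrement and the teardown shown in the CPU0/CPU1 trace above. A rough
userspace sketch of the same dec-and-lock idea, using pthread and C11
atomics purely as stand-ins for the kernel primitives (an illustrative
analogy, not the kernel implementation):

  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdbool.h>

  /* Return true, with *lock held, only if this call dropped the count to
   * zero. The caller then tears the object down and unlocks; a concurrent
   * "get" must take the same lock before checking and re-incrementing. */
  static bool dec_and_lock(atomic_int *ref, pthread_mutex_t *lock)
  {
          int old = atomic_load(ref);

          /* Fast path: while the count stays above 1, decrement unlocked. */
          while (old > 1) {
                  if (atomic_compare_exchange_weak(ref, &old, old - 1))
                          return false;
          }

          /* Slow path: the count may reach zero, so take the lock first. */
          pthread_mutex_lock(lock);
          if (atomic_fetch_sub(ref, 1) == 1)
                  return true;            /* last reference; lock stays held */

          pthread_mutex_unlock(lock);     /* someone re-took a reference */
          return false;
  }

With this shape, the list_del_init() and epc->ctx = NULL in the put path,
and the !epc->ctx check plus atomic_inc() in the get path, all run under
the same lock, which is what the patch arranges by widening the
raw_spin_lock_irq() section in find_get_pmu_context().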
From 3e46d910d8acf94e5360126593b68bf4fee4c4a1 Mon Sep 17 00:00:00 2001
From: Shiju Jose
Date: Thu, 2 Feb 2023 18:23:09 +0000
Subject: tracing: Fix poll() and select() do not work on per_cpu trace_pipe and trace_pipe_raw

poll() and select() on per_cpu trace_pipe and trace_pipe_raw do not work
since kernel 6.1-rc6. This issue is seen after commit
42fb0a1e84ff525ebe560e2baf9451ab69127e2b ("tracing/ring-buffer: Have
polling block on watermark").

The issue was first detected and reported when testing the CXL error
events in rasdaemon, and was also verified using a test application for
poll() and select().

The issue occurs in the per_cpu case when ring_buffer_poll_wait(), in
kernel/trace/ring_buffer.c, is called with buffer_percent > 0 and then
waits until that percentage of pages is available. The default value for
buffer_percent is 50 in kernel/trace/trace.c.

As a fix, allow the userspace application to set buffer_percent to 0
through buffer_percent_fops, so that the task wakes up as soon as data is
added to any of the specific CPU buffers.

Link: https://lore.kernel.org/linux-trace-kernel/20230202182309.742-2-shiju.jose@huawei.com

Cc:
Cc:
Cc:
Cc: stable@vger.kernel.org
Fixes: 42fb0a1e84ff5 ("tracing/ring-buffer: Have polling block on watermark")
Signed-off-by: Shiju Jose
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 78ed5f1baa8c..c9e40f692650 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9148,9 +9148,6 @@ buffer_percent_write(struct file *filp, const char __user *ubuf,
 	if (val > 100)
 		return -EINVAL;
 
-	if (!val)
-		val = 1;
-
 	tr->buffer_percent = val;
 
 	(*ppos)++;
--
cgit v1.2.3
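With the lower bound removed, a monitoring task can ask for immediate
wakeups by writing 0 to buffer_percent and then polling a per-CPU pipe. A
minimal userspace sketch, assuming tracefs is mounted at
/sys/kernel/tracing and the program runs with enough privilege (both are
assumptions about the environment, not part of the patch):

  #include <fcntl.h>
  #include <poll.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          /* With buffer_percent at 0, poll() returns as soon as any data
           * lands in the ring buffer instead of waiting for the watermark. */
          int bp = open("/sys/kernel/tracing/buffer_percent", O_WRONLY);
          if (bp < 0 || write(bp, "0", 1) != 1) {
                  perror("buffer_percent");
                  return 1;
          }
          close(bp);

          int fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw",
                        O_RDONLY | O_NONBLOCK);
          if (fd < 0) {
                  perror("trace_pipe_raw");
                  return 1;
          }

          struct pollfd pfd = { .fd = fd, .events = POLLIN };
          int ret = poll(&pfd, 1, 10000);   /* wait up to 10s for data */
          printf("poll() returned %d, revents=0x%x\n",
                 ret, ret > 0 ? pfd.revents : 0);

          close(fd);
          return 0;
  }

Before this fix, the removed lines silently bumped a written value of 0 up
to 1, so the per-CPU poll still blocked on the watermark path.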
From d83d7ed260283560700d4034a80baad46620481b Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Thu, 2 Feb 2023 16:15:54 +0100
Subject: kernel/irq/irqdomain.c: fix memory leak with using debugfs_lookup()

When calling debugfs_lookup() the result must have dput() called on it,
otherwise the memory will leak over time. To make things simpler, just
call debugfs_lookup_and_remove() instead which handles all of the logic
at once.

Cc: Thomas Gleixner
Cc: stable
Reviewed-by: Marc Zyngier
Link: https://lore.kernel.org/r/20230202151554.2310273-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman
---
 kernel/irq/irqdomain.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8fe1da9614ee..e2096b51c004 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1915,7 +1915,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d)
 
 static void debugfs_remove_domain_dir(struct irq_domain *d)
 {
-	debugfs_remove(debugfs_lookup(d->name, domain_dir));
+	debugfs_lookup_and_remove(d->name, domain_dir);
 }
 
 void __init irq_domain_debugfs_init(struct dentry *root)
--
cgit v1.2.3
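The leak comes from the reference that debugfs_lookup() takes on the
dentry it returns; debugfs_remove() does not drop that extra reference. A
short sketch of the two spellings (the wrapper function names here are
hypothetical, only the debugfs and dput calls are real kernel API):

  #include <linux/dcache.h>
  #include <linux/debugfs.h>

  /* Open-coded form: the lookup reference must be dropped explicitly. */
  static void remove_debugfs_dir_open_coded(const char *name, struct dentry *parent)
  {
          struct dentry *de = debugfs_lookup(name, parent);

          if (de) {
                  debugfs_remove(de);
                  dput(de);       /* the step the old one-liner was missing */
          }
  }

  /* What the patch switches to: lookup, removal and the reference drop
   * handled in a single call. */
  static void remove_debugfs_dir(const char *name, struct dentry *parent)
  {
          debugfs_lookup_and_remove(name, parent);
  }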
From 3fb906e7fabbb5b76c3c5256b10dc46ef80a0bfe Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Sun, 5 Feb 2023 22:48:53 -0500
Subject: cgroup/cpuset: Don't filter offline CPUs in cpuset_cpus_allowed() for top cpuset tasks

Since commit 8f9ea86fdf99 ("sched: Always preserve the user requested
cpumask"), relax_compatible_cpus_allowed_ptr() is calling
__sched_setaffinity() unconditionally. This helps to expose a bug in the
current cpuset hotplug code where the cpumasks of the tasks in the top
cpuset are not updated at all when some CPUs become online or offline. It
is likely caused by the fact that some of the tasks in the top cpuset,
like percpu kthreads, cannot have their cpu affinity changed.

One way to reproduce this as suggested by Peter is:

 - boot machine
 - offline all CPUs except one
 - taskset -p ffffffff $$
 - online all CPUs

Fix this by allowing cpuset_cpus_allowed() to return a wider mask that
includes offline CPUs for those tasks that are in the top cpuset. For
tasks not in the top cpuset, the old rule applies and only online CPUs
will be returned in the mask since hotplug events will update their
cpumasks accordingly.

Fixes: 8f9ea86fdf99 ("sched: Always preserve the user requested cpumask")
Reported-by: Will Deacon
Originally-from: Peter Zijlstra (Intel)
Tested-by: Will Deacon
Signed-off-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 205dc9edcaa9..528285a91499 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3692,15 +3692,38 @@ void __init cpuset_init_smp(void)
  * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
  * attached to the specified @tsk.  Guaranteed to return some non-empty
  * subset of cpu_online_mask, even if this means going outside the
- * tasks cpuset.
+ * tasks cpuset, except when the task is in the top cpuset.
  **/
 
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
 	unsigned long flags;
+	struct cpuset *cs;
 
 	spin_lock_irqsave(&callback_lock, flags);
-	guarantee_online_cpus(tsk, pmask);
+	rcu_read_lock();
+
+	cs = task_cs(tsk);
+	if (cs != &top_cpuset)
+		guarantee_online_cpus(tsk, pmask);
+	/*
+	 * Tasks in the top cpuset won't get update to their cpumasks
+	 * when a hotplug online/offline event happens. So we include all
+	 * offline cpus in the allowed cpu list.
+	 */
+	if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
+		const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+
+		/*
+		 * We first exclude cpus allocated to partitions. If there is no
+		 * allowable online cpu left, we fall back to all possible cpus.
+		 */
+		cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus);
+		if (!cpumask_intersects(pmask, cpu_online_mask))
+			cpumask_copy(pmask, possible_mask);
+	}
+
+	rcu_read_unlock();
 	spin_unlock_irqrestore(&callback_lock, flags);
 }
--
cgit v1.2.3
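After the reproduction steps above, the symptom is that the shell (a task
in the top cpuset) stays pinned to the single CPU that remained online. A
small userspace check of that symptom, offered only as a sketch of how one
might observe it (the use of sched_setaffinity()/sched_getaffinity() here
is an assumption about the test setup, not part of the patch):

  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          cpu_set_t want, got;
          long ncpus = sysconf(_SC_NPROCESSORS_CONF);

          /* Ask for every CPU the system knows about, like "taskset -p ffffffff $$". */
          CPU_ZERO(&want);
          for (long cpu = 0; cpu < ncpus; cpu++)
                  CPU_SET(cpu, &want);

          if (sched_setaffinity(0, sizeof(want), &want))
                  perror("sched_setaffinity");

          if (sched_getaffinity(0, sizeof(got), &got) == 0)
                  printf("pid %d may now run on %d of %ld CPUs\n",
                         getpid(), CPU_COUNT(&got), ncpus);
          return 0;
  }

With the fix, the affinity request for a top-cpuset task is validated
against the wider mask returned by cpuset_cpus_allowed(), so the reported
count can grow back once the CPUs are online again.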
From 7a2127e66a00e073db8d90f9aac308f4a8a64226 Mon Sep 17 00:00:00 2001
From: Will Deacon
Date: Tue, 31 Jan 2023 22:17:19 +0000
Subject: cpuset: Call set_cpus_allowed_ptr() with appropriate mask for task

set_cpus_allowed_ptr() will fail with -EINVAL if the requested affinity
mask is not a subset of the task_cpu_possible_mask() for the task being
updated. Consequently, on a heterogeneous system with cpusets spanning
the different CPU types, updates to the cgroup hierarchy can silently fail
to update task affinities when the effective affinity mask for the cpuset
is expanded.

For example, consider an arm64 system with 4 CPUs, where CPUs 2-3 are the
only cores capable of executing 32-bit tasks. Attaching a 32-bit task to a
cpuset containing CPUs 0-2 will correctly affine the task to CPU 2.
Extending the cpuset to CPUs 0-3, however, will fail to extend the
affinity mask of the 32-bit task because update_tasks_cpumask() will pass
the full 0-3 mask to set_cpus_allowed_ptr().

Extend update_tasks_cpumask() to take a temporary 'cpumask' parameter and
use it to mask the 'effective_cpus' mask with the possible mask for each
task being updated.

Fixes: 431c69fac05b ("cpuset: Honour task_cpu_possible_mask() in guarantee_online_cpus()")
Signed-off-by: Will Deacon
Acked-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 528285a91499..ca826bd1eba3 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1205,12 +1205,13 @@ void rebuild_sched_domains(void)
 /**
  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @new_cpus: the temp variable for the new effective_cpus mask
  *
  * Iterate through each task of @cs updating its cpus_allowed to the
  * effective cpuset's.  As this function is called with cpuset_rwsem held,
  * cpuset membership stays stable.
  */
-static void update_tasks_cpumask(struct cpuset *cs)
+static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
 {
 	struct css_task_iter it;
 	struct task_struct *task;
@@ -1224,7 +1225,10 @@ static void update_tasks_cpumask(struct cpuset *cs)
 		if (top_cs && (task->flags & PF_KTHREAD) &&
 		    kthread_is_per_cpu(task))
 			continue;
-		set_cpus_allowed_ptr(task, cs->effective_cpus);
+
+		cpumask_and(new_cpus, cs->effective_cpus,
+			    task_cpu_possible_mask(task));
+		set_cpus_allowed_ptr(task, new_cpus);
 	}
 	css_task_iter_end(&it);
 }
@@ -1509,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 	spin_unlock_irq(&callback_lock);
 
 	if (adding || deleting)
-		update_tasks_cpumask(parent);
+		update_tasks_cpumask(parent, tmp->new_cpus);
 
 	/*
 	 * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
@@ -1661,7 +1665,7 @@ update_parent_subparts:
 		WARN_ON(!is_in_v2_mode() &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
 
-		update_tasks_cpumask(cp);
+		update_tasks_cpumask(cp, tmp->new_cpus);
 
 		/*
 		 * On legacy hierarchy, if the effective cpumask of any non-
@@ -2309,7 +2313,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		}
 	}
 
-	update_tasks_cpumask(parent);
+	update_tasks_cpumask(parent, tmpmask.new_cpus);
 
 	if (parent->child_ecpus_count)
 		update_sibling_cpumasks(parent, cs, &tmpmask);
@@ -3348,7 +3352,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 	 * as the tasks will be migrated to an ancestor.
 	 */
 	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
-		update_tasks_cpumask(cs);
+		update_tasks_cpumask(cs, new_cpus);
 
 	if (mems_updated && !nodes_empty(cs->mems_allowed))
 		update_tasks_nodemask(cs);
@@ -3385,7 +3389,7 @@ hotplug_update_tasks(struct cpuset *cs,
 	spin_unlock_irq(&callback_lock);
 
 	if (cpus_updated)
-		update_tasks_cpumask(cs);
+		update_tasks_cpumask(cs, new_cpus);
 	if (mems_updated)
 		update_tasks_nodemask(cs);
 }
--
cgit v1.2.3
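The essence of the fix is that the mask handed to set_cpus_allowed_ptr()
is now the intersection of the cpuset's effective_cpus and
task_cpu_possible_mask(task). A userspace analogy of that intersection,
using the commit's arm64 example (the CPU numbers and the CPU_* macros are
illustrative stand-ins for the kernel's cpumask_and(), not the patched
code itself):

  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>

  /* 4 CPUs, only CPUs 2-3 can run 32-bit tasks: the per-task mask must be
   * the intersection of the cpuset's effective CPUs and the CPUs the task
   * can actually execute on, otherwise the request is rejected. */
  int main(void)
  {
          cpu_set_t effective, possible, allowed;

          CPU_ZERO(&effective);           /* cpuset now spans CPUs 0-3 */
          for (int cpu = 0; cpu <= 3; cpu++)
                  CPU_SET(cpu, &effective);

          CPU_ZERO(&possible);            /* 32-bit task: only CPUs 2-3 */
          CPU_SET(2, &possible);
          CPU_SET(3, &possible);

          CPU_AND(&allowed, &effective, &possible);
          printf("mask handed to the scheduler covers %d CPU(s)\n",
                 CPU_COUNT(&allowed));
          return 0;
  }

For the 32-bit task in the example this yields CPUs 2-3 rather than 0-3,
so extending the cpuset no longer makes set_cpus_allowed_ptr() fail with
-EINVAL.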