From f466ae66fa6a599f9a53b5f9bafea4b8cfffa7fb Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Mon, 9 Jan 2017 23:23:15 -0800 Subject: rcu: Remove cond_resched() from Tiny synchronize_sched() It is now legal to invoke synchronize_sched() at early boot, which causes Tiny RCU's synchronize_sched() to emit spurious splats. This commit therefore removes the cond_resched() from Tiny RCU's synchronize_sched(). Fixes: 8b355e3bc140 ("rcu: Drive expedited grace periods from workqueue") Signed-off-by: Paul E. McKenney Cc: # 4.9.0- --- kernel/rcu/tiny.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 1898559e6b60..b23a4d076f3d 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -185,9 +185,6 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused * benefits of doing might_sleep() to reduce latency.) * * Cool, huh? (Due to Josh Triplett.) - * - * But we want to make this a static inline later. The cond_resched() - * currently makes this problematic. */ void synchronize_sched(void) { @@ -195,7 +192,6 @@ void synchronize_sched(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_sched() in RCU read-side critical section"); - cond_resched(); } EXPORT_SYMBOL_GPL(synchronize_sched); -- cgit v1.2.3 From 52d7e48b86fc108e45a656d8e53e4237993c481d Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 10 Jan 2017 02:28:26 -0800 Subject: rcu: Narrow early boot window of illegal synchronous grace periods The current preemptible RCU implementation goes through three phases during bootup. In the first phase, there is only one CPU that is running with preemption disabled, so that a no-op is a synchronous grace period. In the second mid-boot phase, the scheduler is running, but RCU has not yet gotten its kthreads spawned (and, for expedited grace periods, workqueues are not yet running. During this time, any attempt to do a synchronous grace period will hang the system (or complain bitterly, depending). In the third and final phase, RCU is fully operational and everything works normally. This has been OK for some time, but there has recently been some synchronous grace periods showing up during the second mid-boot phase. This code worked "by accident" for awhile, but started failing as soon as expedited RCU grace periods switched over to workqueues in commit 8b355e3bc140 ("rcu: Drive expedited grace periods from workqueue"). Note that the code was buggy even before this commit, as it was subject to failure on real-time systems that forced all expedited grace periods to run as normal grace periods (for example, using the rcu_normal ksysfs parameter). The callchain from the failure case is as follows: early_amd_iommu_init() |-> acpi_put_table(ivrs_base); |-> acpi_tb_put_table(table_desc); |-> acpi_tb_invalidate_table(table_desc); |-> acpi_tb_release_table(...) |-> acpi_os_unmap_memory |-> acpi_os_unmap_iomem |-> acpi_os_map_cleanup |-> synchronize_rcu_expedited The kernel showing this callchain was built with CONFIG_PREEMPT_RCU=y, which caused the code to try using workqueues before they were initialized, which did not go well. This commit therefore reworks RCU to permit synchronous grace periods to proceed during this mid-boot phase. This commit is therefore a fix to a regression introduced in v4.9, and is therefore being put forward post-merge-window in v4.10. This commit sets a flag from the existing rcu_scheduler_starting() function which causes all synchronous grace periods to take the expedited path. The expedited path now checks this flag, using the requesting task to drive the expedited grace period forward during the mid-boot phase. Finally, this flag is updated by a core_initcall() function named rcu_exp_runtime_mode(), which causes the runtime codepaths to be used. Note that this arrangement assumes that tasks are not sent POSIX signals (or anything similar) from the time that the first task is spawned through core_initcall() time. Fixes: 8b355e3bc140 ("rcu: Drive expedited grace periods from workqueue") Reported-by: "Zheng, Lv" Reported-by: Borislav Petkov Signed-off-by: Paul E. McKenney Tested-by: Stan Kain Tested-by: Ivan Tested-by: Emanuel Castelo Tested-by: Bruno Pesavento Tested-by: Borislav Petkov Tested-by: Frederic Bezies Cc: # 4.9.0- --- include/linux/rcupdate.h | 4 ++++ kernel/rcu/rcu.h | 1 + kernel/rcu/tiny_plugin.h | 9 +++++++-- kernel/rcu/tree.c | 33 ++++++++++++++++++------------ kernel/rcu/tree_exp.h | 52 ++++++++++++++++++++++++++++++++++++++---------- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/update.c | 38 +++++++++++++++++++++++++++-------- 7 files changed, 104 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 321f9ed552a9..01f71e1d2e94 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -444,6 +444,10 @@ bool __rcu_is_watching(void); #error "Unknown RCU implementation specified to kernel configuration" #endif +#define RCU_SCHEDULER_INACTIVE 0 +#define RCU_SCHEDULER_INIT 1 +#define RCU_SCHEDULER_RUNNING 2 + /* * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic * initialization and destruction of rcu_head on the stack. rcu_head structures diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 80adef7d4c3d..0d6ff3e471be 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -136,6 +136,7 @@ int rcu_jiffies_till_stall_check(void); #define TPS(x) tracepoint_string(x) void rcu_early_boot_tests(void); +void rcu_test_sync_prims(void); /* * This function really isn't for public consumption, but RCU is special in diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 196f0302e2f4..c64b827ecbca 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -60,12 +60,17 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); /* * During boot, we forgive RCU lockdep issues. After this function is - * invoked, we start taking RCU lockdep issues seriously. + * invoked, we start taking RCU lockdep issues seriously. Note that unlike + * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE + * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. + * The reason for this is that Tiny RCU does not need kthreads, so does + * not have to care about the fact that the scheduler is half-initialized + * at a certain phase of the boot process. */ void __init rcu_scheduler_starting(void) { WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; } #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 96c52e43f7ca..cb4e2056ccf3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -127,13 +127,16 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ int sysctl_panic_on_rcu_stall __read_mostly; /* - * The rcu_scheduler_active variable transitions from zero to one just - * before the first task is spawned. So when this variable is zero, RCU - * can assume that there is but one task, allowing RCU to (for example) + * The rcu_scheduler_active variable is initialized to the value + * RCU_SCHEDULER_INACTIVE and transitions RCU_SCHEDULER_INIT just before the + * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE, + * RCU can assume that there is but one task, allowing RCU to (for example) * optimize synchronize_rcu() to a simple barrier(). When this variable - * is one, RCU must actually do all the hard work required to detect real - * grace periods. This variable is also used to suppress boot-time false - * positives from lockdep-RCU error checking. + * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required + * to detect real grace periods. This variable is also used to suppress + * boot-time false positives from lockdep-RCU error checking. Finally, it + * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU + * is fully initialized, including all of its kthreads having been spawned. */ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); @@ -3980,18 +3983,22 @@ static int __init rcu_spawn_gp_kthread(void) early_initcall(rcu_spawn_gp_kthread); /* - * This function is invoked towards the end of the scheduler's initialization - * process. Before this is called, the idle task might contain - * RCU read-side critical sections (during which time, this idle - * task is booting the system). After this function is called, the - * idle tasks are prohibited from containing RCU read-side critical - * sections. This function also enables RCU lockdep checking. + * This function is invoked towards the end of the scheduler's + * initialization process. Before this is called, the idle task might + * contain synchronous grace-period primitives (during which time, this idle + * task is booting the system, and such primitives are no-ops). After this + * function is called, any synchronous grace-period primitives are run as + * expedited, with the requesting task driving the grace period forward. + * A later core_initcall() rcu_exp_runtime_mode() will switch to full + * runtime RCU functionality. */ void rcu_scheduler_starting(void) { WARN_ON(num_online_cpus() != 1); WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; + rcu_test_sync_prims(); + rcu_scheduler_active = RCU_SCHEDULER_INIT; + rcu_test_sync_prims(); } /* diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d3053e99fdb6..e59e1849b89a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -531,6 +531,20 @@ struct rcu_exp_work { struct work_struct rew_work; }; +/* + * Common code to drive an expedited grace period forward, used by + * workqueues and mid-boot-time tasks. + */ +static void rcu_exp_sel_wait_wake(struct rcu_state *rsp, + smp_call_func_t func, unsigned long s) +{ + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, func); + + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); +} + /* * Work-queue handler to drive an expedited grace period forward. */ @@ -538,12 +552,8 @@ static void wait_rcu_exp_gp(struct work_struct *wp) { struct rcu_exp_work *rewp; - /* Initialize the rcu_node tree in preparation for the wait. */ rewp = container_of(wp, struct rcu_exp_work, rew_work); - sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func); - - /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s); + rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s); } /* @@ -569,12 +579,18 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - /* Marshall arguments and schedule the expedited grace period. */ - rew.rew_func = func; - rew.rew_rsp = rsp; - rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - schedule_work(&rew.rew_work); + /* Ensure that load happens before action based on it. */ + if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { + /* Direct call during scheduler init and early_initcalls(). */ + rcu_exp_sel_wait_wake(rsp, func, s); + } else { + /* Marshall arguments & schedule the expedited grace period. */ + rew.rew_func = func; + rew.rew_rsp = rsp; + rew.rew_s = s; + INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); + schedule_work(&rew.rew_work); + } /* Wait for expedited grace period to complete. */ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); @@ -676,6 +692,8 @@ void synchronize_rcu_expedited(void) { struct rcu_state *rsp = rcu_state_p; + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) + return; _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); @@ -693,3 +711,15 @@ void synchronize_rcu_expedited(void) EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + +/* + * Switch to run-time mode once Tree RCU has fully initialized. + */ +static int __init rcu_exp_runtime_mode(void) +{ + rcu_test_sync_prims(); + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + rcu_test_sync_prims(); + return 0; +} +core_initcall(rcu_exp_runtime_mode); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 85c5a883c6e3..56583e764ebf 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -670,7 +670,7 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); - if (!rcu_scheduler_active) + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f19271dce0a9..4f6db7e6a117 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -121,11 +121,14 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); * Should expedited grace-period primitives always fall back to their * non-expedited counterparts? Intended for use within RCU. Note * that if the user specifies both rcu_expedited and rcu_normal, then - * rcu_normal wins. + * rcu_normal wins. (Except during the time period during boot from + * when the first task is spawned until the rcu_exp_runtime_mode() + * core_initcall() is invoked, at which point everything is expedited.) */ bool rcu_gp_is_normal(void) { - return READ_ONCE(rcu_normal); + return READ_ONCE(rcu_normal) && + rcu_scheduler_active != RCU_SCHEDULER_INIT; } EXPORT_SYMBOL_GPL(rcu_gp_is_normal); @@ -135,13 +138,14 @@ static atomic_t rcu_expedited_nesting = /* * Should normal grace-period primitives be expedited? Intended for * use within RCU. Note that this function takes the rcu_expedited - * sysfs/boot variable into account as well as the rcu_expedite_gp() - * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited() - * returns false is a -really- bad idea. + * sysfs/boot variable and rcu_scheduler_active into account as well + * as the rcu_expedite_gp() nesting. So looping on rcu_unexpedite_gp() + * until rcu_gp_is_expedited() returns false is a -really- bad idea. */ bool rcu_gp_is_expedited(void) { - return rcu_expedited || atomic_read(&rcu_expedited_nesting); + return rcu_expedited || atomic_read(&rcu_expedited_nesting) || + rcu_scheduler_active == RCU_SCHEDULER_INIT; } EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); @@ -257,7 +261,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map); int notrace debug_lockdep_rcu_enabled(void) { - return rcu_scheduler_active && debug_locks && + return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks && current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); @@ -591,7 +595,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); void synchronize_rcu_tasks(void) { /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(!rcu_scheduler_active, + RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, "synchronize_rcu_tasks called too soon"); /* Wait for the grace period. */ @@ -813,6 +817,23 @@ static void rcu_spawn_tasks_kthread(void) #endif /* #ifdef CONFIG_TASKS_RCU */ +/* + * Test each non-SRCU synchronous grace-period wait API. This is + * useful just after a change in mode for these primitives, and + * during early boot. + */ +void rcu_test_sync_prims(void) +{ + if (!IS_ENABLED(CONFIG_PROVE_RCU)) + return; + synchronize_rcu(); + synchronize_rcu_bh(); + synchronize_sched(); + synchronize_rcu_expedited(); + synchronize_rcu_bh_expedited(); + synchronize_sched_expedited(); +} + #ifdef CONFIG_PROVE_RCU /* @@ -865,6 +886,7 @@ void rcu_early_boot_tests(void) early_boot_test_call_rcu_bh(); if (rcu_self_test_sched) early_boot_test_call_rcu_sched(); + rcu_test_sync_prims(); } static int rcu_verify_early_boot_tests(void) -- cgit v1.2.3 From 4205e4786d0b9fc3b4fec7b1910cf645a0468307 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jan 2017 14:01:05 +0100 Subject: cpu/hotplug: Provide dynamic range for prepare stage Mathieu reported that the LTTNG modules are broken as of 4.10-rc1 due to the removal of the cpu hotplug notifiers. Usually I don't care much about out of tree modules, but LTTNG is widely used in distros. There are two ways to solve that: 1) Reserve a hotplug state for LTTNG 2) Add a dynamic range for the prepare states. While #1 is the simplest solution, #2 is the proper one as we can convert in tree users, which do not care about ordering, to the dynamic range as well. Add a dynamic range which allows LTTNG to request states in the prepare stage. Reported-and-tested-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Sebastian Sewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1701101353010.3401@nanos Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 2 ++ kernel/cpu.c | 22 ++++++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 20bfefbe7594..d936a0021839 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -74,6 +74,8 @@ enum cpuhp_state { CPUHP_ZCOMP_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_MIPS_SOC_PREPARE, + CPUHP_BP_PREPARE_DYN, + CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, diff --git a/kernel/cpu.c b/kernel/cpu.c index f75c4d031eeb..c47506357519 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1302,10 +1302,24 @@ static int cpuhp_cb_check(enum cpuhp_state state) */ static int cpuhp_reserve_state(enum cpuhp_state state) { - enum cpuhp_state i; + enum cpuhp_state i, end; + struct cpuhp_step *step; - for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) { - if (!cpuhp_ap_states[i].name) + switch (state) { + case CPUHP_AP_ONLINE_DYN: + step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN; + end = CPUHP_AP_ONLINE_DYN_END; + break; + case CPUHP_BP_PREPARE_DYN: + step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN; + end = CPUHP_BP_PREPARE_DYN_END; + break; + default: + return -EINVAL; + } + + for (i = state; i <= end; i++, step++) { + if (!step->name) return i; } WARN(1, "No more dynamic states available for CPU hotplug\n"); @@ -1323,7 +1337,7 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name, mutex_lock(&cpuhp_state_mutex); - if (state == CPUHP_AP_ONLINE_DYN) { + if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) { ret = cpuhp_reserve_state(state); if (ret < 0) goto out; -- cgit v1.2.3 From 5eb7c0d04f04a667c049fe090a95494a8de2955c Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Sun, 1 Jan 2017 20:25:25 -0600 Subject: taint/module: Fix problems when out-of-kernel driver defines true or false Commit 7fd8329ba502 ("taint/module: Clean up global and module taint flags handling") used the key words true and false as character members of a new struct. These names cause problems when out-of-kernel modules such as VirtualBox include their own definitions of true and false. Fixes: 7fd8329ba502 ("taint/module: Clean up global and module taint flags handling") Signed-off-by: Larry Finger Cc: Petr Mladek Cc: Jessica Yu Cc: Rusty Russell Reported-by: Valdis Kletnieks Reviewed-by: Petr Mladek Acked-by: Rusty Russell Signed-off-by: Jessica Yu --- include/linux/kernel.h | 4 ++-- kernel/module.c | 2 +- kernel/panic.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 56aec84237ad..cb09238f6d32 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -514,8 +514,8 @@ extern enum system_states { #define TAINT_FLAGS_COUNT 16 struct taint_flag { - char true; /* character printed when tainted */ - char false; /* character printed when not tainted */ + char c_true; /* character printed when tainted */ + char c_false; /* character printed when not tainted */ bool module; /* also show as a per-module taint flag */ }; diff --git a/kernel/module.c b/kernel/module.c index 5088784c0cf9..38d4270925d4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1145,7 +1145,7 @@ static size_t module_flags_taint(struct module *mod, char *buf) for (i = 0; i < TAINT_FLAGS_COUNT; i++) { if (taint_flags[i].module && test_bit(i, &mod->taints)) - buf[l++] = taint_flags[i].true; + buf[l++] = taint_flags[i].c_true; } return l; diff --git a/kernel/panic.c b/kernel/panic.c index c51edaa04fce..901c4fb46002 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -355,7 +355,7 @@ const char *print_tainted(void) for (i = 0; i < TAINT_FLAGS_COUNT; i++) { const struct taint_flag *t = &taint_flags[i]; *s++ = test_bit(i, &tainted_mask) ? - t->true : t->false; + t->c_true : t->c_false; } *s = 0; } else -- cgit v1.2.3 From 0fec9557fd0c5349e3bd1a2141612a60bc20bb71 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 17 Jan 2017 15:35:01 +0100 Subject: cpu/hotplug: Remove unused but set variable in _cpu_down() After the recent removal of the hotplug notifiers the variable 'hasdied' in _cpu_down() is set but no longer read, leading to the following GCC warning when building with 'make W=1': kernel/cpu.c:767:7: warning: variable ‘hasdied’ set but not used [-Wunused-but-set-variable] Fix it by removing the variable. Fixes: 530e9b76ae8f ("cpu/hotplug: Remove obsolete cpu hotplug register/unregister functions") Signed-off-by: Tobias Klauser Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20170117143501.20893-1-tklauser@distanz.ch Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index c47506357519..0a5f630f5c54 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -764,7 +764,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int prev_state, ret = 0; - bool hasdied = false; if (num_online_cpus() == 1) return -EBUSY; @@ -809,7 +808,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, cpuhp_kick_ap_work(cpu); } - hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; out: cpu_hotplug_done(); return ret; -- cgit v1.2.3 From d407bd25a204bd66b7346dde24bd3d37ef0e0b05 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 18 Jan 2017 15:14:17 +0100 Subject: bpf: don't trigger OOM killer under pressure with map alloc This patch adds two helpers, bpf_map_area_alloc() and bpf_map_area_free(), that are to be used for map allocations. Using kmalloc() for very large allocations can cause excessive work within the page allocator, so i) fall back earlier to vmalloc() when the attempt is considered costly anyway, and even more importantly ii) don't trigger OOM killer with any of the allocators. Since this is based on a user space request, for example, when creating maps with element pre-allocation, we really want such requests to fail instead of killing other user space processes. Also, don't spam the kernel log with warnings should any of the allocations fail under pressure. Given that, we can make backend selection in bpf_map_area_alloc() generic, and convert all maps over to use this API for spots with potentially large allocation requests. Note, replacing the one kmalloc_array() is fine as overflow checks happen earlier in htab_map_alloc(), since it must also protect the multiplication for vmalloc() should kmalloc_array() fail. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 18 +++++++----------- kernel/bpf/hashtab.c | 22 +++++++++------------- kernel/bpf/stackmap.c | 20 ++++++++------------ kernel/bpf/syscall.c | 26 ++++++++++++++++++++++++++ 5 files changed, 52 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 05cf951df3fe..3ed1f3b1d594 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -247,6 +247,8 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); +void *bpf_map_area_alloc(size_t size); +void bpf_map_area_free(void *base); extern int sysctl_unprivileged_bpf_disabled; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 229a5d5df977..3d55d95dcf49 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -11,7 +11,6 @@ */ #include #include -#include #include #include #include @@ -74,14 +73,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) if (array_size >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); - /* allocate all map elements and zero-initialize them */ - array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); - if (!array) { - array = vzalloc(array_size); - if (!array) - return ERR_PTR(-ENOMEM); - } + array = bpf_map_area_alloc(array_size); + if (!array) + return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ array->map.map_type = attr->map_type; @@ -97,7 +92,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) if (array_size >= U32_MAX - PAGE_SIZE || elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { - kvfree(array); + bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } out: @@ -262,7 +257,7 @@ static void array_map_free(struct bpf_map *map) if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); - kvfree(array); + bpf_map_area_free(array); } static const struct bpf_map_ops array_ops = { @@ -319,7 +314,8 @@ static void fd_array_map_free(struct bpf_map *map) /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) BUG_ON(array->ptrs[i] != NULL); - kvfree(array); + + bpf_map_area_free(array); } static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3f2bb58952d8..a753bbe7df0a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "percpu_freelist.h" #include "bpf_lru_list.h" @@ -103,7 +102,7 @@ static void htab_free_elems(struct bpf_htab *htab) free_percpu(pptr); } free_elems: - vfree(htab->elems); + bpf_map_area_free(htab->elems); } static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, @@ -125,7 +124,8 @@ static int prealloc_init(struct bpf_htab *htab) { int err = -ENOMEM, i; - htab->elems = vzalloc(htab->elem_size * htab->map.max_entries); + htab->elems = bpf_map_area_alloc(htab->elem_size * + htab->map.max_entries); if (!htab->elems) return -ENOMEM; @@ -320,14 +320,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) goto free_htab; err = -ENOMEM; - htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), - GFP_USER | __GFP_NOWARN); - - if (!htab->buckets) { - htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket)); - if (!htab->buckets) - goto free_htab; - } + htab->buckets = bpf_map_area_alloc(htab->n_buckets * + sizeof(struct bucket)); + if (!htab->buckets) + goto free_htab; for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_HEAD(&htab->buckets[i].head); @@ -354,7 +350,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) free_extra_elems: free_percpu(htab->extra_elems); free_buckets: - kvfree(htab->buckets); + bpf_map_area_free(htab->buckets); free_htab: kfree(htab); return ERR_PTR(err); @@ -1014,7 +1010,7 @@ static void htab_map_free(struct bpf_map *map) prealloc_destroy(htab); free_percpu(htab->extra_elems); - kvfree(htab->buckets); + bpf_map_area_free(htab->buckets); kfree(htab); } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 732ae16d12b7..be8519148c25 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include "percpu_freelist.h" @@ -32,7 +31,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; int err; - smap->elems = vzalloc(elem_size * smap->map.max_entries); + smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries); if (!smap->elems) return -ENOMEM; @@ -45,7 +44,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) return 0; free_elems: - vfree(smap->elems); + bpf_map_area_free(smap->elems); return err; } @@ -76,12 +75,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); - smap = kzalloc(cost, GFP_USER | __GFP_NOWARN); - if (!smap) { - smap = vzalloc(cost); - if (!smap) - return ERR_PTR(-ENOMEM); - } + smap = bpf_map_area_alloc(cost); + if (!smap) + return ERR_PTR(-ENOMEM); err = -E2BIG; cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); @@ -112,7 +108,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) put_buffers: put_callchain_buffers(); free_smap: - kvfree(smap); + bpf_map_area_free(smap); return ERR_PTR(err); } @@ -262,9 +258,9 @@ static void stack_map_free(struct bpf_map *map) /* wait for bpf programs to complete before freeing stack map */ synchronize_rcu(); - vfree(smap->elems); + bpf_map_area_free(smap->elems); pcpu_freelist_destroy(&smap->freelist); - kvfree(smap); + bpf_map_area_free(smap); put_callchain_buffers(); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1d6b29e4e2c3..19b6129eab23 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -49,6 +51,30 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +void *bpf_map_area_alloc(size_t size) +{ + /* We definitely need __GFP_NORETRY, so OOM killer doesn't + * trigger under memory pressure as we really just want to + * fail instead. + */ + const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO; + void *area; + + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + area = kmalloc(size, GFP_USER | flags); + if (area != NULL) + return area; + } + + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags, + PAGE_KERNEL); +} + +void bpf_map_area_free(void *area) +{ + kvfree(area); +} + int bpf_map_precharge_memlock(u32 pages) { struct user_struct *user = get_current_user(); -- cgit v1.2.3 From e326ce013a8e851193eb337aafb1aa396c533a61 Mon Sep 17 00:00:00 2001 From: Rafael J. Wysocki Date: Fri, 20 Jan 2017 03:25:34 +0100 Subject: Revert "PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag" Revert commit 08b98d329165 (PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag) as it caused system suspend (in the default configuration) to fail on Dell XPS13 (9360) with the Kaby Lake processor. Fixes: 08b98d329165 (PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag) Reported-by: Paul Menzel Signed-off-by: Rafael J. Wysocki --- Documentation/power/states.txt | 4 +--- drivers/acpi/sleep.c | 8 -------- include/linux/suspend.h | 2 -- kernel/power/suspend.c | 4 ++-- 4 files changed, 3 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt index 8a39ce45d8a0..008ecb588317 100644 --- a/Documentation/power/states.txt +++ b/Documentation/power/states.txt @@ -35,9 +35,7 @@ only one way to cause the system to go into the Suspend-To-RAM state (write The default suspend mode (ie. the one to be used without writing anything into /sys/power/mem_sleep) is either "deep" (if Suspend-To-RAM is supported) or "s2idle", but it can be overridden by the value of the "mem_sleep_default" -parameter in the kernel command line. On some ACPI-based systems, depending on -the information in the FADT, the default may be "s2idle" even if Suspend-To-RAM -is supported. +parameter in the kernel command line. The properties of all of the sleep states are described below. diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index ce1855fd584b..deb0ff78eba8 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -691,14 +691,6 @@ static void acpi_sleep_suspend_setup(void) if (acpi_sleep_state_supported(i)) sleep_states[i] = 1; - /* - * Use suspend-to-idle by default if ACPI_FADT_LOW_POWER_S0 is set and - * the default suspend mode was not selected from the command line. - */ - if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0 && - mem_sleep_default > PM_SUSPEND_MEM) - mem_sleep_default = PM_SUSPEND_FREEZE; - suspend_set_ops(old_suspend_ordering ? &acpi_suspend_ops_old : &acpi_suspend_ops); freeze_set_ops(&acpi_freeze_ops); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 0c729c3c8549..d9718378a8be 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -194,8 +194,6 @@ struct platform_freeze_ops { }; #ifdef CONFIG_SUSPEND -extern suspend_state_t mem_sleep_default; - /** * suspend_set_ops - set platform dependent suspend operations * @ops: The new suspend operations to set. diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f67ceb7768b8..15e6baef5c73 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -46,7 +46,7 @@ static const char * const mem_sleep_labels[] = { const char *mem_sleep_states[PM_SUSPEND_MAX]; suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; -suspend_state_t mem_sleep_default = PM_SUSPEND_MAX; +static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM; unsigned int pm_suspend_global_flags; EXPORT_SYMBOL_GPL(pm_suspend_global_flags); @@ -168,7 +168,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) } if (valid_state(PM_SUSPEND_MEM)) { mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; - if (mem_sleep_default >= PM_SUSPEND_MEM) + if (mem_sleep_default == PM_SUSPEND_MEM) mem_sleep_current = PM_SUSPEND_MEM; } -- cgit v1.2.3 From 880a38547ff08715ce4f1daf9a4bb30c87676e68 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 20 Jan 2017 15:21:35 +0200 Subject: userns: Make ucounts lock irq-safe The ucounts_lock is being used to protect various ucounts lifecycle management functionalities. However, those services can also be invoked when a pidns is being freed in an RCU callback (e.g. softirq context). This can lead to deadlocks. There were already efforts trying to prevent similar deadlocks in add7c65ca426 ("pid: fix lockdep deadlock warning due to ucount_lock"), however they just moved the context from hardirq to softrq. Fix this issue once and for all by explictly making the lock disable irqs altogether. Dmitry Vyukov reported: > I've got the following deadlock report while running syzkaller fuzzer > on eec0d3d065bfcdf9cd5f56dd2a36b94d12d32297 of linux-next (on odroid > device if it matters): > > ================================= > [ INFO: inconsistent lock state ] > 4.10.0-rc3-next-20170112-xc2-dirty #6 Not tainted > --------------------------------- > inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. > swapper/2/0 [HC0[0]:SC1[1]:HE1:SE0] takes: > (ucounts_lock){+.?...}, at: [< inline >] spin_lock > ./include/linux/spinlock.h:302 > (ucounts_lock){+.?...}, at: [] > put_ucounts+0x60/0x138 kernel/ucount.c:162 > {SOFTIRQ-ON-W} state was registered at: > [] mark_lock+0x220/0xb60 kernel/locking/lockdep.c:3054 > [< inline >] mark_irqflags kernel/locking/lockdep.c:2941 > [] __lock_acquire+0x388/0x3260 kernel/locking/lockdep.c:3295 > [] lock_acquire+0xa4/0x138 kernel/locking/lockdep.c:3753 > [< inline >] __raw_spin_lock ./include/linux/spinlock_api_smp.h:144 > [] _raw_spin_lock+0x90/0xd0 kernel/locking/spinlock.c:151 > [< inline >] spin_lock ./include/linux/spinlock.h:302 > [< inline >] get_ucounts kernel/ucount.c:131 > [] inc_ucount+0x80/0x6c8 kernel/ucount.c:189 > [< inline >] inc_mnt_namespaces fs/namespace.c:2818 > [] alloc_mnt_ns+0x78/0x3a8 fs/namespace.c:2849 > [] create_mnt_ns+0x28/0x200 fs/namespace.c:2959 > [< inline >] init_mount_tree fs/namespace.c:3199 > [] mnt_init+0x258/0x384 fs/namespace.c:3251 > [] vfs_caches_init+0x6c/0x80 fs/dcache.c:3626 > [] start_kernel+0x414/0x460 init/main.c:648 > [] __primary_switched+0x6c/0x70 arch/arm64/kernel/head.S:456 > irq event stamp: 2316924 > hardirqs last enabled at (2316924): [< inline >] rcu_do_batch > kernel/rcu/tree.c:2911 > hardirqs last enabled at (2316924): [< inline >] > invoke_rcu_callbacks kernel/rcu/tree.c:3182 > hardirqs last enabled at (2316924): [< inline >] > __rcu_process_callbacks kernel/rcu/tree.c:3149 > hardirqs last enabled at (2316924): [] > rcu_process_callbacks+0x7a4/0xc28 kernel/rcu/tree.c:3166 > hardirqs last disabled at (2316923): [< inline >] rcu_do_batch > kernel/rcu/tree.c:2900 > hardirqs last disabled at (2316923): [< inline >] > invoke_rcu_callbacks kernel/rcu/tree.c:3182 > hardirqs last disabled at (2316923): [< inline >] > __rcu_process_callbacks kernel/rcu/tree.c:3149 > hardirqs last disabled at (2316923): [] > rcu_process_callbacks+0x210/0xc28 kernel/rcu/tree.c:3166 > softirqs last enabled at (2316912): [] > _local_bh_enable+0x4c/0x80 kernel/softirq.c:155 > softirqs last disabled at (2316913): [< inline >] > do_softirq_own_stack ./include/linux/interrupt.h:488 > softirqs last disabled at (2316913): [< inline >] > invoke_softirq kernel/softirq.c:371 > softirqs last disabled at (2316913): [] > irq_exit+0x264/0x308 kernel/softirq.c:405 > > other info that might help us debug this: > Possible unsafe locking scenario: > > CPU0 > ---- > lock(ucounts_lock); > > lock(ucounts_lock); > > *** DEADLOCK *** > > 1 lock held by swapper/2/0: > #0: (rcu_callback){......}, at: [< inline >] __rcu_reclaim > kernel/rcu/rcu.h:108 > #0: (rcu_callback){......}, at: [< inline >] rcu_do_batch > kernel/rcu/tree.c:2919 > #0: (rcu_callback){......}, at: [< inline >] > invoke_rcu_callbacks kernel/rcu/tree.c:3182 > #0: (rcu_callback){......}, at: [< inline >] > __rcu_process_callbacks kernel/rcu/tree.c:3149 > #0: (rcu_callback){......}, at: [] > rcu_process_callbacks+0x720/0xc28 kernel/rcu/tree.c:3166 > > stack backtrace: > CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.10.0-rc3-next-20170112-xc2-dirty #6 > Hardware name: Hardkernel ODROID-C2 (DT) > Call trace: > [] dump_backtrace+0x0/0x440 arch/arm64/kernel/traps.c:500 > [] show_stack+0x20/0x30 arch/arm64/kernel/traps.c:225 > [] dump_stack+0x110/0x168 > [] print_usage_bug.part.27+0x49c/0x4bc > kernel/locking/lockdep.c:2387 > [< inline >] print_usage_bug kernel/locking/lockdep.c:2357 > [< inline >] valid_state kernel/locking/lockdep.c:2400 > [< inline >] mark_lock_irq kernel/locking/lockdep.c:2617 > [] mark_lock+0x934/0xb60 kernel/locking/lockdep.c:3065 > [< inline >] mark_irqflags kernel/locking/lockdep.c:2923 > [] __lock_acquire+0x640/0x3260 kernel/locking/lockdep.c:3295 > [] lock_acquire+0xa4/0x138 kernel/locking/lockdep.c:3753 > [< inline >] __raw_spin_lock ./include/linux/spinlock_api_smp.h:144 > [] _raw_spin_lock+0x90/0xd0 kernel/locking/spinlock.c:151 > [< inline >] spin_lock ./include/linux/spinlock.h:302 > [] put_ucounts+0x60/0x138 kernel/ucount.c:162 > [] dec_ucount+0xf4/0x158 kernel/ucount.c:214 > [< inline >] dec_pid_namespaces kernel/pid_namespace.c:89 > [] delayed_free_pidns+0x40/0xe0 kernel/pid_namespace.c:156 > [< inline >] __rcu_reclaim kernel/rcu/rcu.h:118 > [< inline >] rcu_do_batch kernel/rcu/tree.c:2919 > [< inline >] invoke_rcu_callbacks kernel/rcu/tree.c:3182 > [< inline >] __rcu_process_callbacks kernel/rcu/tree.c:3149 > [] rcu_process_callbacks+0x768/0xc28 kernel/rcu/tree.c:3166 > [] __do_softirq+0x324/0x6e0 kernel/softirq.c:284 > [< inline >] do_softirq_own_stack ./include/linux/interrupt.h:488 > [< inline >] invoke_softirq kernel/softirq.c:371 > [] irq_exit+0x264/0x308 kernel/softirq.c:405 > [] __handle_domain_irq+0xc0/0x150 kernel/irq/irqdesc.c:636 > [] gic_handle_irq+0x68/0xd8 > Exception stack(0xffff8000648e7dd0 to 0xffff8000648e7f00) > 7dc0: ffff8000648d4b3c 0000000000000007 > 7de0: 0000000000000000 1ffff0000c91a967 1ffff0000c91a967 1ffff0000c91a967 > 7e00: ffff20000a4b6b68 0000000000000001 0000000000000007 0000000000000001 > 7e20: 1fffe4000149ae90 ffff200009d35000 0000000000000000 0000000000000002 > 7e40: 0000000000000000 0000000000000000 0000000002624a1a 0000000000000000 > 7e60: 0000000000000000 ffff200009cbcd88 000060006d2ed000 0000000000000140 > 7e80: ffff200009cff000 ffff200009cb6000 ffff200009cc2020 ffff200009d2159d > 7ea0: 0000000000000000 ffff8000648d4380 0000000000000000 ffff8000648e7f00 > 7ec0: ffff20000820a478 ffff8000648e7f00 ffff20000820a47c 0000000010000145 > 7ee0: 0000000000000140 dfff200000000000 ffffffffffffffff ffff20000820a478 > [] el1_irq+0xb8/0x130 arch/arm64/kernel/entry.S:486 > [< inline >] arch_local_irq_restore > ./arch/arm64/include/asm/irqflags.h:81 > [] rcu_idle_exit+0x64/0xa8 kernel/rcu/tree.c:1030 > [< inline >] cpuidle_idle_call kernel/sched/idle.c:200 > [] do_idle+0x1dc/0x2d0 kernel/sched/idle.c:243 > [] cpu_startup_entry+0x24/0x28 kernel/sched/idle.c:345 > [] secondary_start_kernel+0x2cc/0x358 > arch/arm64/kernel/smp.c:276 > [<000000000279f1a4>] 0x279f1a4 Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Fixes: add7c65ca426 ("pid: fix lockdep deadlock warning due to ucount_lock") Fixes: f333c700c610 ("pidns: Add a limit on the number of pid namespaces") Cc: stable@vger.kernel.org Link: https://www.spinics.net/lists/kernel/msg2426637.html Signed-off-by: Nikolay Borisov Signed-off-by: Eric W. Biederman --- kernel/ucount.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index 9d20d5dd298a..4bbd38ec3788 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -128,10 +128,10 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) struct hlist_head *hashent = ucounts_hashentry(ns, uid); struct ucounts *ucounts, *new; - spin_lock(&ucounts_lock); + spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); if (!ucounts) { - spin_unlock(&ucounts_lock); + spin_unlock_irq(&ucounts_lock); new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) @@ -141,7 +141,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) new->uid = uid; atomic_set(&new->count, 0); - spin_lock(&ucounts_lock); + spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); if (ucounts) { kfree(new); @@ -152,16 +152,18 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) } if (!atomic_add_unless(&ucounts->count, 1, INT_MAX)) ucounts = NULL; - spin_unlock(&ucounts_lock); + spin_unlock_irq(&ucounts_lock); return ucounts; } static void put_ucounts(struct ucounts *ucounts) { + unsigned long flags; + if (atomic_dec_and_test(&ucounts->count)) { - spin_lock(&ucounts_lock); + spin_lock_irqsave(&ucounts_lock, flags); hlist_del_init(&ucounts->node); - spin_unlock(&ucounts_lock); + spin_unlock_irqrestore(&ucounts_lock, flags); kfree(ucounts); } -- cgit v1.2.3 From b94f51183b0617e7b9b4fb4137d4cf1cab7547c2 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 24 Jan 2017 15:17:53 -0800 Subject: kernel/watchdog: prevent false hardlockup on overloaded system On an overloaded system, it is possible that a change in the watchdog threshold can be delayed long enough to trigger a false positive. This can easily be achieved by having a cpu spinning indefinitely on a task, while another cpu updates watchdog threshold. What happens is while trying to park the watchdog threads, the hrtimers on the other cpus trigger and reprogram themselves with the new slower watchdog threshold. Meanwhile, the nmi watchdog is still programmed with the old faster threshold. Because the one cpu is blocked, it prevents the thread parking on the other cpus from completing, which is needed to shutdown the nmi watchdog and reprogram it correctly. As a result, a false positive from the nmi watchdog is reported. Fix this by setting a park_in_progress flag to block all lockups until the parking is complete. Fix provided by Ulrich Obergfell. [akpm@linux-foundation.org: s/park_in_progress/watchdog_park_in_progress/] Link: http://lkml.kernel.org/r/1481041033-192236-1-git-send-email-dzickus@redhat.com Signed-off-by: Don Zickus Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 1 + kernel/watchdog.c | 9 +++++++++ kernel/watchdog_hld.c | 3 +++ 3 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index aacca824a6ae..0a3fadc32693 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -110,6 +110,7 @@ extern int watchdog_user_enabled; extern int watchdog_thresh; extern unsigned long watchdog_enabled; extern unsigned long *watchdog_cpumask_bits; +extern atomic_t watchdog_park_in_progress; #ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d4b0fa01cae3..63177be0159e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -49,6 +49,8 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) +atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); + /* * The 'watchdog_running' variable is set to 1 when the watchdog threads * are registered/started and is set to 0 when the watchdog threads are @@ -260,6 +262,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; + if (atomic_read(&watchdog_park_in_progress) != 0) + return HRTIMER_NORESTART; + /* kick the hardlockup detector */ watchdog_interrupt_count(); @@ -467,12 +472,16 @@ static int watchdog_park_threads(void) { int cpu, ret = 0; + atomic_set(&watchdog_park_in_progress, 1); + for_each_watchdog_cpu(cpu) { ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); if (ret) break; } + atomic_set(&watchdog_park_in_progress, 0); + return ret; } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 84016c8aee6b..12b8dd640786 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -84,6 +84,9 @@ static void watchdog_overflow_callback(struct perf_event *event, /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; + if (atomic_read(&watchdog_park_in_progress) != 0) + return; + if (__this_cpu_read(watchdog_nmi_touch) == true) { __this_cpu_write(watchdog_nmi_touch, false); return; -- cgit v1.2.3 From ff7a28a074ccbea999dadbb58c46212cf90984c6 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 24 Jan 2017 15:18:29 -0800 Subject: kernel/panic.c: add missing \n When a system panics, the "Rebooting in X seconds.." message is never printed because it lacks a new line. Fix it. Link: http://lkml.kernel.org/r/20170119114751.2724-1-jslaby@suse.cz Signed-off-by: Jiri Slaby Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 901c4fb46002..08aa88dde7de 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -249,7 +249,7 @@ void panic(const char *fmt, ...) * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked. */ - pr_emerg("Rebooting in %d seconds..", panic_timeout); + pr_emerg("Rebooting in %d seconds..\n", panic_timeout); for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); -- cgit v1.2.3 From ff9f8a7cf935468a94d9927c68b00daae701667e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Jan 2017 18:20:55 -0800 Subject: sysctl: fix proc_doulongvec_ms_jiffies_minmax() We perform the conversion between kernel jiffies and ms only when exporting kernel value to user space. We need to do the opposite operation when value is written by user. Only matters when HZ != 1000 Signed-off-by: Eric Dumazet Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8dbaec0e4f7f..1aea594a54db 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2475,6 +2475,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int break; if (neg) continue; + val = convmul * val / convdiv; if ((min && val < *min) || (max && val > *max)) continue; *i = val; -- cgit v1.2.3