From 1f5320d5972aa50d3e8d2b227b636b370e608359 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 4 Oct 2012 16:37:16 +0900 Subject: cgroup: notify_on_release may not be triggered in some cases notify_on_release must be triggered when the last process in a cgroup is move to another. But if the first(and only) process in a cgroup is moved to another, notify_on_release is not triggered. # mkdir /cgroup/cpu/SRC # mkdir /cgroup/cpu/DST # # echo 1 >/cgroup/cpu/SRC/notify_on_release # echo 1 >/cgroup/cpu/DST/notify_on_release # # sleep 300 & [1] 8629 # # echo 8629 >/cgroup/cpu/SRC/tasks # echo 8629 >/cgroup/cpu/DST/tasks -> notify_on_release for /SRC must be triggered at this point, but it isn't. This is because put_css_set() is called before setting CGRP_RELEASABLE in cgroup_task_migrate(), and is a regression introduce by the commit:74a1166d(cgroups: make procs file writable), which was merged into v3.0. Cc: Ben Blum Cc: # v3.0.x and later Acked-by: Li Zefan Signed-off-by: Daisuke Nishimura Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 13774b3b39aa..d1739fc7eb94 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1962,9 +1962,8 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, * trading it for newcg is protected by cgroup_mutex, we're safe to drop * it here; it will be freed under RCU. */ - put_css_set(oldcg); - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + put_css_set(oldcg); } /** -- cgit v1.2.3 From 9bb71308b8133d643648776243e4d5599b1c193d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Oct 2012 17:52:07 -0700 Subject: Revert "cgroup: Drop task_lock(parent) on cgroup_fork()" This reverts commit 7e381b0eb1e1a9805c37335562e8dc02e7d7848c. The commit incorrectly assumed that fork path always performed threadgroup_change_begin/end() and depended on that for synchronization against task exit and cgroup migration paths instead of explicitly grabbing task_lock(). threadgroup_change is not locked when forking a new process (as opposed to a new thread in the same process) and even if it were it wouldn't be effective as different processes use different threadgroup locks. Revert the incorrect optimization. Signed-off-by: Tejun Heo LKML-Reference: <20121008020000.GB2575@localhost> Acked-by: Li Zefan Bitterly-Acked-by: Frederic Weisbecker Cc: stable@vger.kernel.org --- kernel/cgroup.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1739fc7eb94..75aec12c78a0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4814,31 +4814,20 @@ static const struct file_operations proc_cgroupstats_operations = { * * A pointer to the shared css_set was automatically copied in * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU, cgroup_mutex or - * threadgroup_change_begin(), so it might no longer be a valid - * cgroup pointer. cgroup_attach_task() might have already changed - * current->cgroups, allowing the previously referenced cgroup - * group to be removed and freed. - * - * Outside the pointer validity we also need to process the css_set - * inheritance between threadgoup_change_begin() and - * threadgoup_change_end(), this way there is no leak in any process - * wide migration performed by cgroup_attach_proc() that could otherwise - * miss a thread because it is too early or too late in the fork stage. + * it was not made under the protection of RCU or cgroup_mutex, so + * might no longer be a valid cgroup pointer. cgroup_attach_task() might + * have already changed current->cgroups, allowing the previously + * referenced cgroup group to be removed and freed. * * At the point that cgroup_fork() is called, 'current' is the parent * task, and the passed argument 'child' points to the child task. */ void cgroup_fork(struct task_struct *child) { - /* - * We don't need to task_lock() current because current->cgroups - * can't be changed concurrently here. The parent obviously hasn't - * exited and called cgroup_exit(), and we are synchronized against - * cgroup migration through threadgroup_change_begin(). - */ + task_lock(current); child->cgroups = current->cgroups; get_css_set(child->cgroups); + task_unlock(current); INIT_LIST_HEAD(&child->cg_list); } -- cgit v1.2.3 From d87838321124061f6c935069d97f37010fa417e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Oct 2012 17:40:30 -0700 Subject: Revert "cgroup: Remove task_lock() from cgroup_post_fork()" This reverts commit 7e3aa30ac8c904a706518b725c451bb486daaae9. The commit incorrectly assumed that fork path always performed threadgroup_change_begin/end() and depended on that for synchronization against task exit and cgroup migration paths instead of explicitly grabbing task_lock(). threadgroup_change is not locked when forking a new process (as opposed to a new thread in the same process) and even if it were it wouldn't be effective as different processes use different threadgroup locks. Revert the incorrect optimization. Signed-off-by: Tejun Heo LKML-Reference: <20121008020000.GB2575@localhost> Acked-by: Li Zefan Cc: Frederic Weisbecker Cc: stable@vger.kernel.org --- kernel/cgroup.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 75aec12c78a0..f24f724620dd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4883,19 +4883,10 @@ void cgroup_post_fork(struct task_struct *child) */ if (use_task_css_set_links) { write_lock(&css_set_lock); - if (list_empty(&child->cg_list)) { - /* - * It's safe to use child->cgroups without task_lock() - * here because we are protected through - * threadgroup_change_begin() against concurrent - * css_set change in cgroup_task_migrate(). Also - * the task can't exit at that point until - * wake_up_new_task() is called, so we are protected - * against cgroup_exit() setting child->cgroup to - * init_css_set. - */ + task_lock(child); + if (list_empty(&child->cg_list)) list_add(&child->cg_list, &child->cgroups->tasks); - } + task_unlock(child); write_unlock(&css_set_lock); } } -- cgit v1.2.3 From c0158ca64da5732dfb86a3f28944e9626776692f Mon Sep 17 00:00:00 2001 From: Dan Magenheimer Date: Thu, 18 Oct 2012 16:31:37 -0700 Subject: workqueue: cancel_delayed_work() should return %false if work item is idle 57b30ae77b ("workqueue: reimplement cancel_delayed_work() using try_to_grab_pending()") made cancel_delayed_work() always return %true unless someone else is also trying to cancel the work item, which is broken - if the target work item is idle, the return value should be %false. try_to_grab_pending() indicates that the target work item was idle by zero return value. Use it for return. Note that this brings cancel_delayed_work() in line with __cancel_work_timer() in return value handling. Signed-off-by: Dan Magenheimer Signed-off-by: Tejun Heo LKML-Reference: <444a6439-b1a4-4740-9e7e-bc37267cfe73@default> --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d951daa0ca9a..042d221d33cc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2982,7 +2982,7 @@ bool cancel_delayed_work(struct delayed_work *dwork) set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); local_irq_restore(flags); - return true; + return ret; } EXPORT_SYMBOL(cancel_delayed_work); -- cgit v1.2.3 From f2302505775fd13ba93f034206f1e2a587017929 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Thu, 25 Oct 2012 13:38:07 -0700 Subject: pidns: limit the nesting depth of pid namespaces 'struct pid' is a "variable sized struct" - a header with an array of upids at the end. The size of the array depends on a level (depth) of pid namespaces. Now a level of pidns is not limited, so 'struct pid' can be more than one page. Looks reasonable, that it should be less than a page. MAX_PIS_NS_LEVEL is not calculated from PAGE_SIZE, because in this case it depends on architectures, config options and it will be reduced, if someone adds a new fields in struct pid or struct upid. I suggest to set MAX_PIS_NS_LEVEL = 32, because it saves ability to expand "struct pid" and it's more than enough for all known for me use-cases. When someone finds a reasonable use case, we can add a config option or a sysctl parameter. In addition it will reduce the effect of another problem, when we have many nested namespaces and the oldest one starts dying. zap_pid_ns_processe will be called for each namespace and find_vpid will be called for each process in a namespace. find_vpid will be called minimum max_level^2 / 2 times. The reason of that is that when we found a bit in pidmap, we can't determine this pidns is top for this process or it isn't. vpid is a heavy operation, so a fork bomb, which create many nested namespace, can make a system inaccessible for a long time. For example my system becomes inaccessible for a few minutes with 4000 processes. [akpm@linux-foundation.org: return -EINVAL in response to excessive nesting, not -ENOMEM] Signed-off-by: Andrew Vagin Acked-by: Oleg Nesterov Cc: Cyrill Gorcunov Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index eb00be205811..7b07cc0dfb75 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -71,12 +71,22 @@ err_alloc: return NULL; } +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 + static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; - int i, err = -ENOMEM; + int i; + int err; + + if (level > MAX_PID_NS_LEVEL) { + err = -EINVAL; + goto out; + } + err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) goto out; -- cgit v1.2.3 From 2008713c7174e5c0f207bac684c6df0939047009 Mon Sep 17 00:00:00 2001 From: H. Peter Anvin Date: Wed, 24 Oct 2012 14:11:48 -0700 Subject: Makefile: Documentation for external tool should be correct If one includes documentation for an external tool, it should be correct. This is not: 1. Overriding the input to rngd should typically be neither necessary nor desired. This is especially so since newer versions of rngd support a number of different *types* of sources. 2. The default kernel-exported device is called /dev/hwrng not /dev/hwrandom nor /dev/hw_random (both of which were used in the past; however, kernel and udev seem to have converged on /dev/hwrng.) Overall it is better if the documentation for rngd is kept with rngd rather than in a kernel Makefile. Signed-off-by: H. Peter Anvin Cc: David Howells Cc: Jeff Garzik Signed-off-by: Linus Torvalds --- kernel/Makefile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0dfeca4324ee..86e3285ae7e5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -174,10 +174,8 @@ signing_key.priv signing_key.x509: x509.genkey @echo "###" @echo "### If this takes a long time, you might wish to run rngd in the" @echo "### background to keep the supply of entropy topped up. It" - @echo "### needs to be run as root, and should use a hardware random" - @echo "### number generator if one is available, eg:" - @echo "###" - @echo "### rngd -r /dev/hwrandom" + @echo "### needs to be run as root, and uses a hardware random" + @echo "### number generator if one is available." @echo "###" openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ -x509 -config x509.genkey \ -- cgit v1.2.3 From 0d855354ea351bec6b222e9fea86a876cfafdcb6 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 26 Oct 2012 18:28:56 +0200 Subject: perf, powerpc: Fix hw breakpoints returning -ENOSPC I've been trying to get hardware breakpoints with perf to work on POWER7 but I'm getting the following: % perf record -e mem:0x10000000 true Error: sys_perf_event_open() syscall returned with 28 (No space left on device). /bin/dmesg may provide additional information. Fatal: No CONFIG_PERF_EVENTS=y kernel support configured? true: Terminated (FWIW adding -a and it works fine) Debugging it seems that __reserve_bp_slot() is returning ENOSPC because it thinks there are no free breakpoint slots on this CPU. I have a 2 CPUs, so perf userspace is doing two perf_event_open syscalls to add a counter to each CPU [1]. The first syscall succeeds but the second is failing. On this second syscall, fetch_bp_busy_slots() sets slots.pinned to be 1, despite there being no breakpoint on this CPU. This is because the call the task_bp_pinned, checks all CPUs, rather than just the current CPU. POWER7 only has one hardware breakpoint per CPU (ie. HBP_NUM=1), so we return ENOSPC. The following patch fixes this by checking the associated CPU for each breakpoint in task_bp_pinned. I'm not familiar with this code, so it's provided as a reference to the above issue. Signed-off-by: Michael Neuling Acked-by: Peter Zijlstra Cc: Michael Ellerman Cc: Jovi Zhang Cc: K Prasad Link: http://lkml.kernel.org/r/1351268936-2956-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9a7b487c6fe2..fe8a916507ed 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -111,14 +111,16 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) * Count the number of breakpoints of the same type and same task. * The given event must be not on the list. */ -static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) +static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { struct task_struct *tsk = bp->hw.bp_target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) + if (iter->hw.bp_target == tsk && + find_slot_idx(iter) == type && + cpu == iter->cpu) count += hw_breakpoint_weight(iter); } @@ -141,7 +143,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(bp, type); + slots->pinned += task_bp_pinned(cpu, bp, type); slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; @@ -154,7 +156,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(bp, type); + nr += task_bp_pinned(cpu, bp, type); if (nr > slots->pinned) slots->pinned = nr; @@ -188,7 +190,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, int old_idx = 0; int idx = 0; - old_count = task_bp_pinned(bp, type); + old_count = task_bp_pinned(cpu, bp, type); old_idx = old_count - 1; idx = old_idx + weight; -- cgit v1.2.3 From 59ef28b1f14899b10d6b2682c7057ca00a9a3f47 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 25 Oct 2012 10:49:25 +1030 Subject: module: fix out-by-one error in kallsyms Masaki found and patched a kallsyms issue: the last symbol in a module's symtab wasn't transferred. This is because we manually copy the zero'th entry (which is always empty) then copy the rest in a loop starting at 1, though from src[0]. His fix was minimal, I prefer to rewrite the loops in more standard form. There are two loops: one to get the size, and one to copy. Make these identical: always count entry 0 and any defined symbol in an allocated non-init section. This bug exists since the following commit was introduced. module: reduce symbol table for loaded modules (v2) commit: 4a4962263f07d14660849ec134ee42b63e95ea9a LKML: http://lkml.org/lkml/2012/10/24/27 Reported-by: Masaki Kimura Cc: stable@kernel.org --- kernel/module.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6085f5ef88ea..6e48c3a43599 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2293,12 +2293,17 @@ static void layout_symtab(struct module *mod, struct load_info *info) src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); + /* strtab always starts with a nul, so offset 0 is the empty string. */ + strtab_size = 1; + /* Compute total space required for the core symbols' strtab. */ - for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) - if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { - strtab_size += strlen(&info->strtab[src->st_name]) + 1; + for (ndst = i = 0; i < nsrc; i++) { + if (i == 0 || + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + strtab_size += strlen(&info->strtab[src[i].st_name])+1; ndst++; } + } /* Append room for core symbols at end of core part. */ info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); @@ -2332,15 +2337,15 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_symtab = dst = mod->module_core + info->symoffs; mod->core_strtab = s = mod->module_core + info->stroffs; src = mod->symtab; - *dst = *src; *s++ = 0; - for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { - if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) - continue; - - dst[ndst] = *src; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; + for (ndst = i = 0; i < mod->num_symtab; i++) { + if (i == 0 || + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + dst[ndst] = src[i]; + dst[ndst++].st_name = s - mod->core_strtab; + s += strlcpy(s, &mod->strtab[src[i].st_name], + KSYM_NAME_LEN) + 1; + } } mod->core_num_syms = ndst; } -- cgit v1.2.3 From 59fa6245192159ab5e1e17b8e31f15afa9cff4bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 Oct 2012 22:29:38 +0200 Subject: futex: Handle futex_pi OWNER_DIED take over correctly Siddhesh analyzed a failure in the take over of pi futexes in case the owner died and provided a workaround. See: http://sourceware.org/bugzilla/show_bug.cgi?id=14076 The detailed problem analysis shows: Futex F is initialized with PTHREAD_PRIO_INHERIT and PTHREAD_MUTEX_ROBUST_NP attributes. T1 lock_futex_pi(F); T2 lock_futex_pi(F); --> T2 blocks on the futex and creates pi_state which is associated to T1. T1 exits --> exit_robust_list() runs --> Futex F userspace value TID field is set to 0 and FUTEX_OWNER_DIED bit is set. T3 lock_futex_pi(F); --> Succeeds due to the check for F's userspace TID field == 0 --> Claims ownership of the futex and sets its own TID into the userspace TID field of futex F --> returns to user space T1 --> exit_pi_state_list() --> Transfers pi_state to waiter T2 and wakes T2 via rt_mutex_unlock(&pi_state->mutex) T2 --> acquires pi_state->mutex and gains real ownership of the pi_state --> Claims ownership of the futex and sets its own TID into the userspace TID field of futex F --> returns to user space T3 --> observes inconsistent state This problem is independent of UP/SMP, preemptible/non preemptible kernels, or process shared vs. private. The only difference is that certain configurations are more likely to expose it. So as Siddhesh correctly analyzed the following check in futex_lock_pi_atomic() is the culprit: if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { We check the userspace value for a TID value of 0 and take over the futex unconditionally if that's true. AFAICT this check is there as it is correct for a different corner case of futexes: the WAITERS bit became stale. Now the proposed change - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { + if (unlikely(ownerdied || + !(curval & (FUTEX_TID_MASK | FUTEX_WAITERS)))) { solves the problem, but it's not obvious why and it wreckages the "stale WAITERS bit" case. What happens is, that due to the WAITERS bit being set (T2 is blocked on that futex) it enforces T3 to go through lookup_pi_state(), which in the above case returns an existing pi_state and therefor forces T3 to legitimately fight with T2 over the ownership of the pi_state (via pi_state->mutex). Probelm solved! Though that does not work for the "WAITERS bit is stale" problem because if lookup_pi_state() does not find existing pi_state it returns -ERSCH (due to TID == 0) which causes futex_lock_pi() to return -ESRCH to user space because the OWNER_DIED bit is not set. Now there is a different solution to that problem. Do not look at the user space value at all and enforce a lookup of possibly available pi_state. If pi_state can be found, then the new incoming locker T3 blocks on that pi_state and legitimately races with T2 to acquire the rt_mutex and the pi_state and therefor the proper ownership of the user space futex. lookup_pi_state() has the correct order of checks. It first tries to find a pi_state associated with the user space futex and only if that fails it checks for futex TID value = 0. If no pi_state is available nothing can create new state at that point because this happens with the hash bucket lock held. So the above scenario changes to: T1 lock_futex_pi(F); T2 lock_futex_pi(F); --> T2 blocks on the futex and creates pi_state which is associated to T1. T1 exits --> exit_robust_list() runs --> Futex F userspace value TID field is set to 0 and FUTEX_OWNER_DIED bit is set. T3 lock_futex_pi(F); --> Finds pi_state and blocks on pi_state->rt_mutex T1 --> exit_pi_state_list() --> Transfers pi_state to waiter T2 and wakes it via rt_mutex_unlock(&pi_state->mutex) T2 --> acquires pi_state->mutex and gains ownership of the pi_state --> Claims ownership of the futex and sets its own TID into the userspace TID field of futex F --> returns to user space This covers all gazillion points on which T3 might come in between T1's exit_robust_list() clearing the TID field and T2 fixing it up. It also solves the "WAITERS bit stale" problem by forcing the take over. Another benefit of changing the code this way is that it makes it less dependent on untrusted user space values and therefor minimizes the possible wreckage which might be inflicted. As usual after staring for too long at the futex code my brain hurts so much that I really want to ditch that whole optimization of avoiding the syscall for the non contended case for PI futexes and rip out the maze of corner case handling code. Unfortunately we can't as user space relies on that existing behaviour, but at least thinking about it helps me to preserve my mental sanity. Maybe we should nevertheless :) Reported-and-tested-by: Siddhesh Poyarekar Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1210232138540.2756@ionos Acked-by: Darren Hart Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/futex.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 3717e7b306e0..20ef219bbe9b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -716,7 +716,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct futex_pi_state **ps, struct task_struct *task, int set_waiters) { - int lock_taken, ret, ownerdied = 0; + int lock_taken, ret, force_take = 0; u32 uval, newval, curval, vpid = task_pid_vnr(task); retry: @@ -755,17 +755,15 @@ retry: newval = curval | FUTEX_WAITERS; /* - * There are two cases, where a futex might have no owner (the - * owner TID is 0): OWNER_DIED. We take over the futex in this - * case. We also do an unconditional take over, when the owner - * of the futex died. - * - * This is safe as we are protected by the hash bucket lock ! + * Should we force take the futex? See below. */ - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { - /* Keep the OWNER_DIED bit */ + if (unlikely(force_take)) { + /* + * Keep the OWNER_DIED and the WAITERS bit and set the + * new TID value. + */ newval = (curval & ~FUTEX_TID_MASK) | vpid; - ownerdied = 0; + force_take = 0; lock_taken = 1; } @@ -775,7 +773,7 @@ retry: goto retry; /* - * We took the lock due to owner died take over. + * We took the lock due to forced take over. */ if (unlikely(lock_taken)) return 1; @@ -790,20 +788,25 @@ retry: switch (ret) { case -ESRCH: /* - * No owner found for this futex. Check if the - * OWNER_DIED bit is set to figure out whether - * this is a robust futex or not. + * We failed to find an owner for this + * futex. So we have no pi_state to block + * on. This can happen in two cases: + * + * 1) The owner died + * 2) A stale FUTEX_WAITERS bit + * + * Re-read the futex value. */ if (get_futex_value_locked(&curval, uaddr)) return -EFAULT; /* - * We simply start over in case of a robust - * futex. The code above will take the futex - * and return happy. + * If the owner died or we have a stale + * WAITERS bit the owner TID in the user space + * futex is 0. */ - if (curval & FUTEX_OWNER_DIED) { - ownerdied = 1; + if (!(curval & FUTEX_TID_MASK)) { + force_take = 1; goto retry; } default: -- cgit v1.2.3 From 8ffeb9b0e6369135bf03a073514f571ef10606b9 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Mon, 26 Nov 2012 16:29:54 -0800 Subject: watchdog: using u64 in get_sample_period() In get_sample_period(), unsigned long is not enough: watchdog_thresh * 2 * (NSEC_PER_SEC / 5) case1: watchdog_thresh is 10 by default, the sample value will be: 0xEE6B2800 case2: set watchdog_thresh is 20, the sample value will be: 0x1 DCD6 5000 In case2, we need use u64 to express the sample period. Otherwise, changing the threshold thru proc often can not be successful. Signed-off-by: liu chuansheng Acked-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9d4c8d5a1f53..dd4b80a9f1a9 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -116,7 +116,7 @@ static unsigned long get_timestamp(int this_cpu) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -static unsigned long get_sample_period(void) +static u64 get_sample_period(void) { /* * convert watchdog_thresh from seconds to ns @@ -125,7 +125,7 @@ static unsigned long get_sample_period(void) * and hard thresholds) to increment before the * hardlockup detector generates a warning */ - return get_softlockup_thresh() * (NSEC_PER_SEC / 5); + return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ -- cgit v1.2.3 From aa10990e028cac3d5e255711fb9fb47e00700e35 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Mon, 26 Nov 2012 16:29:56 -0800 Subject: futex: avoid wake_futex() for a PI futex_q Dave Jones reported a bug with futex_lock_pi() that his trinity test exposed. Sometime between queue_me() and taking the q.lock_ptr, the lock_ptr became NULL, resulting in a crash. While futex_wake() is careful to not call wake_futex() on futex_q's with a pi_state or an rt_waiter (which are either waiting for a futex_unlock_pi() or a PI futex_requeue()), futex_wake_op() and futex_requeue() do not perform the same test. Update futex_wake_op() and futex_requeue() to test for q.pi_state and q.rt_waiter and abort with -EINVAL if detected. To ensure any future breakage is caught, add a WARN() to wake_futex() if the same condition is true. This fix has seen 3 hours of testing with "trinity -c futex" on an x86_64 VM with 4 CPUS. [akpm@linux-foundation.org: tidy up the WARN()] Signed-off-by: Darren Hart Reported-by: Dave Jones Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: John Kacur Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 20ef219bbe9b..19eb089ca003 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -843,6 +843,9 @@ static void wake_futex(struct futex_q *q) { struct task_struct *p = q->task; + if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) + return; + /* * We set q->lock_ptr = NULL _before_ we wake up the task. If * a non-futex wake up happens on another CPU then the task @@ -1078,6 +1081,10 @@ retry_private: plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } wake_futex(this); if (++ret >= nr_wake) break; @@ -1090,6 +1097,10 @@ retry_private: op_ret = 0; plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key2)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } wake_futex(this); if (++op_ret >= nr_wake2) break; @@ -1098,6 +1109,7 @@ retry_private: ret += op_ret; } +out_unlock: double_unlock_hb(hb1, hb2); out_put_keys: put_futex_key(&key2); @@ -1387,9 +1399,13 @@ retry_private: /* * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always * be paired with each other and no other futex ops. + * + * We should never be requeueing a futex_q with a pi_state, + * which is awaiting a futex_unlock_pi(). */ if ((requeue_pi && !this->rt_waiter) || - (!requeue_pi && this->rt_waiter)) { + (!requeue_pi && this->rt_waiter) || + this->pi_state) { ret = -EINVAL; break; } -- cgit v1.2.3 From 412d32e6c98527078779e5b515823b2810e40324 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 28 Nov 2012 07:17:18 +0100 Subject: workqueue: exit rescuer_thread() as TASK_RUNNING A rescue thread exiting TASK_INTERRUPTIBLE can lead to a task scheduling off, never to be seen again. In the case where this occurred, an exiting thread hit reiserfs homebrew conditional resched while holding a mutex, bringing the box to its knees. PID: 18105 TASK: ffff8807fd412180 CPU: 5 COMMAND: "kdmflush" #0 [ffff8808157e7670] schedule at ffffffff8143f489 #1 [ffff8808157e77b8] reiserfs_get_block at ffffffffa038ab2d [reiserfs] #2 [ffff8808157e79a8] __block_write_begin at ffffffff8117fb14 #3 [ffff8808157e7a98] reiserfs_write_begin at ffffffffa0388695 [reiserfs] #4 [ffff8808157e7ad8] generic_perform_write at ffffffff810ee9e2 #5 [ffff8808157e7b58] generic_file_buffered_write at ffffffff810eeb41 #6 [ffff8808157e7ba8] __generic_file_aio_write at ffffffff810f1a3a #7 [ffff8808157e7c58] generic_file_aio_write at ffffffff810f1c88 #8 [ffff8808157e7cc8] do_sync_write at ffffffff8114f850 #9 [ffff8808157e7dd8] do_acct_process at ffffffff810a268f [exception RIP: kernel_thread_helper] RIP: ffffffff8144a5c0 RSP: ffff8808157e7f58 RFLAGS: 00000202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff8107af60 RDI: ffff8803ee491d18 RBP: 0000000000000000 R8: 0000000000000000 R9: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 Signed-off-by: Mike Galbraith Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 042d221d33cc..ac25db111f48 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2407,8 +2407,10 @@ static int rescuer_thread(void *__wq) repeat: set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); return 0; + } /* * See whether any cpu is asking for help. Unbounded -- cgit v1.2.3 From 8852aac25e79e38cc6529f20298eed154f60b574 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 1 Dec 2012 16:23:42 -0800 Subject: workqueue: mod_delayed_work_on() shouldn't queue timer on 0 delay 8376fe22c7 ("workqueue: implement mod_delayed_work[_on]()") implemented mod_delayed_work[_on]() using the improved try_to_grab_pending(). The function is later used, among others, to replace [__]candel_delayed_work() + queue_delayed_work() combinations. Unfortunately, a delayed_work item w/ zero @delay is handled slightly differently by mod_delayed_work_on() compared to queue_delayed_work_on(). The latter skips timer altogether and directly queues it using queue_work_on() while the former schedules timer which will expire on the closest tick. This means, when @delay is zero, that [__]cancel_delayed_work() + queue_delayed_work_on() makes the target item immediately executable while mod_delayed_work_on() may induce delay of upto a full tick. This somewhat subtle difference breaks some of the converted users. e.g. block queue plugging uses delayed_work for deferred processing and uses mod_delayed_work_on() when the queue needs to be immediately unplugged. The above problem manifested as noticeably higher number of context switches under certain circumstances. The difference in behavior was caused by missing special case handling for 0 delay in mod_delayed_work_on() compared to queue_delayed_work_on(). Joonsoo Kim posted a patch to add it - ("workqueue: optimize mod_delayed_work_on() when @delay == 0")[1]. The patch was queued for 3.8 but it was described as optimization and I missed that it was a correctness issue. As both queue_delayed_work_on() and mod_delayed_work_on() use __queue_delayed_work() for queueing, it seems that the better approach is to move the 0 delay special handling to the function instead of duplicating it in mod_delayed_work_on(). Fix the problem by moving 0 delay special case handling from queue_delayed_work_on() to __queue_delayed_work(). This replaces Joonsoo's patch. [1] http://thread.gmane.org/gmane.linux.kernel/1379011/focus=1379012 Signed-off-by: Tejun Heo Reported-and-tested-by: Anders Kaseorg Reported-and-tested-by: Zlatko Calusic LKML-Reference: LKML-Reference: <50A78AA9.5040904@iskon.hr> Cc: Joonsoo Kim --- kernel/workqueue.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ac25db111f48..084aa47bac82 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1364,6 +1364,17 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); + /* + * If @delay is 0, queue @dwork->work immediately. This is for + * both optimization and correctness. The earliest @timer can + * expire is on the closest next tick and delayed_work users depend + * on that there's no such delay when @delay is 0. + */ + if (!delay) { + __queue_work(cpu, wq, &dwork->work); + return; + } + timer_stats_timer_set_start_info(&dwork->timer); /* @@ -1417,9 +1428,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, bool ret = false; unsigned long flags; - if (!delay) - return queue_work_on(cpu, wq, &dwork->work); - /* read the comment in __queue_work() */ local_irq_save(flags); -- cgit v1.2.3 From 84ecfd15f5547c992c901df6ec14b4d507eb2c6e Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 23 Nov 2012 12:08:16 +0000 Subject: modsign: add symbol prefix to certificate list Add the arch symbol prefix (if applicable) to the asm definition of modsign_certificate_list and modsign_certificate_list_end. This uses the recently defined SYMBOL_PREFIX which is derived from CONFIG_SYMBOL_PREFIX. This fixes the build of module signing on the blackfin and metag architectures. Signed-off-by: James Hogan Cc: Rusty Russell Cc: David Howells Cc: Mike Frysinger Signed-off-by: Rusty Russell --- kernel/modsign_pubkey.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 4646eb2c3820..767e559dfb10 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c @@ -21,10 +21,10 @@ struct key *modsign_keyring; extern __initdata const u8 modsign_certificate_list[]; extern __initdata const u8 modsign_certificate_list_end[]; asm(".section .init.data,\"aw\"\n" - "modsign_certificate_list:\n" + SYMBOL_PREFIX "modsign_certificate_list:\n" ".incbin \"signing_key.x509\"\n" ".incbin \"extra_certificates\"\n" - "modsign_certificate_list_end:" + SYMBOL_PREFIX "modsign_certificate_list_end:" ); /* -- cgit v1.2.3 From fd8ef11730f1d03d5d6555aa53126e9e34f52f12 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 3 Dec 2012 06:25:25 +0100 Subject: Revert "sched, autogroup: Stop going ahead if autogroup is disabled" This reverts commit 800d4d30c8f20bd728e5741a3b77c4859a613f7c. Between commits 8323f26ce342 ("sched: Fix race in task_group()") and 800d4d30c8f2 ("sched, autogroup: Stop going ahead if autogroup is disabled"), autogroup is a wreck. With both applied, all you have to do to crash a box is disable autogroup during boot up, then reboot.. boom, NULL pointer dereference due to commit 800d4d30c8f2 not allowing autogroup to move things, and commit 8323f26ce342 making that the only way to switch runqueues: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] effective_load.isra.43+0x50/0x90 Pid: 7047, comm: systemd-user-se Not tainted 3.6.8-smp #7 MEDIONPC MS-7502/MS-7502 RIP: effective_load.isra.43+0x50/0x90 Process systemd-user-se (pid: 7047, threadinfo ffff880221dde000, task ffff88022618b3a0) Call Trace: select_task_rq_fair+0x255/0x780 try_to_wake_up+0x156/0x2c0 wake_up_state+0xb/0x10 signal_wake_up+0x28/0x40 complete_signal+0x1d6/0x250 __send_signal+0x170/0x310 send_signal+0x40/0x80 do_send_sig_info+0x47/0x90 group_send_sig_info+0x4a/0x70 kill_pid_info+0x3a/0x60 sys_kill+0x97/0x1a0 ? vfs_read+0x120/0x160 ? sys_read+0x45/0x90 system_call_fastpath+0x16/0x1b Code: 49 0f af 41 50 31 d2 49 f7 f0 48 83 f8 01 48 0f 46 c6 48 2b 07 48 8b bf 40 01 00 00 48 85 ff 74 3a 45 31 c0 48 8b 8f 50 01 00 00 <48> 8b 11 4c 8b 89 80 00 00 00 49 89 d2 48 01 d0 45 8b 59 58 4c RIP [] effective_load.isra.43+0x50/0x90 RSP CR2: 0000000000000000 Signed-off-by: Mike Galbraith Acked-by: Ingo Molnar Cc: Yong Zhang Cc: Peter Zijlstra Cc: stable@vger.kernel.org # 2.6.39+ Signed-off-by: Linus Torvalds --- kernel/sched/auto_group.c | 4 ---- kernel/sched/auto_group.h | 5 ----- 2 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0984a21076a3..15f60d01198b 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -143,15 +143,11 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); - if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - t = p; do { sched_move_task(t); } while_each_thread(p, t); -out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h index 8bd047142816..443232ebbb53 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/auto_group.h @@ -4,11 +4,6 @@ #include struct autogroup { - /* - * reference doesn't mean how many thread attach to this - * autogroup now. It just stands for the number of task - * could use this autogroup. - */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; -- cgit v1.2.3 From fc4b514f2727f74a4587c31db87e0e93465518c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 4 Dec 2012 07:40:39 -0800 Subject: workqueue: convert BUG_ON()s in __queue_delayed_work() to WARN_ON_ONCE()s 8852aac25e ("workqueue: mod_delayed_work_on() shouldn't queue timer on 0 delay") unexpectedly uncovered a very nasty abuse of delayed_work in megaraid - it allocated work_struct, casted it to delayed_work and then pass that into queue_delayed_work(). Previously, this was okay because 0 @delay short-circuited to queue_work() before doing anything with delayed_work. 8852aac25e moved 0 @delay test into __queue_delayed_work() after sanity check on delayed_work making megaraid trigger BUG_ON(). Although megaraid is already fixed by c1d390d8e6 ("megaraid: fix BUG_ON() from incorrect use of delayed work"), this patch converts BUG_ON()s in __queue_delayed_work() to WARN_ON_ONCE()s so that such abusers, if there are more, trigger warning but don't crash the machine. Signed-off-by: Tejun Heo Cc: Xiaotian Feng --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 084aa47bac82..1dae900df798 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1361,8 +1361,8 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, WARN_ON_ONCE(timer->function != delayed_work_timer_fn || timer->data != (unsigned long)dwork); - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); + WARN_ON_ONCE(timer_pending(timer)); + WARN_ON_ONCE(!list_empty(&work->entry)); /* * If @delay is 0, queue @dwork->work immediately. This is for -- cgit v1.2.3 From 8d4516904b39507458bee8115793528e12b1d8dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Dec 2012 18:59:34 +0100 Subject: watchdog: Fix CPU hotplug regression Norbert reported: "3.7-rc6 booted with nmi_watchdog=0 fails to suspend to RAM or offline CPUs. It's reproducable with a KVM guest and physical system." The reason is that commit bcd951cf(watchdog: Use hotplug thread infrastructure) missed to take this into account. So the cpu offline code gets stuck in the teardown function because it accesses non initialized data structures. Add a check for watchdog_enabled into that path to cure the issue. Reported-and-tested-by: Norbert Warmuth Tested-by: Joseph Salisbury Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1211231033230.2701@ionos Link: http://bugs.launchpad.net/bugs/1079534 Signed-off-by: Thomas Gleixner --- kernel/watchdog.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index dd4b80a9f1a9..c8c21be11ab4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -368,6 +368,9 @@ static void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + if (!watchdog_enabled) + return; + watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); /* disable the perf event */ -- cgit v1.2.3 From 12e130b04580532aa099893158aa2776b321ae7f Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Oct 2012 15:05:48 +0100 Subject: MODSIGN: Don't use enum-type bitfields in module signature info block Don't use enum-type bitfields in the module signature info block as we can't be certain how the compiler will handle them. As I understand it, it is arch dependent, and it is possible for the compiler to rearrange them based on endianness and to insert a byte of padding to pad the three enums out to four bytes. Instead use u8 fields for these, which the compiler should emit in the right order without padding. Signed-off-by: David Howells Signed-off-by: Rusty Russell --- kernel/module_signing.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module_signing.c b/kernel/module_signing.c index ea1b1df5dbb0..f2970bddc5ea 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -27,13 +27,13 @@ * - Information block */ struct module_signature { - enum pkey_algo algo : 8; /* Public-key crypto algorithm */ - enum pkey_hash_algo hash : 8; /* Digest algorithm */ - enum pkey_id_type id_type : 8; /* Key identifier type */ - u8 signer_len; /* Length of signer's name */ - u8 key_id_len; /* Length of key identifier */ - u8 __pad[3]; - __be32 sig_len; /* Length of signature data */ + u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ + u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ + u8 id_type; /* Key identifier type [enum pkey_id_type] */ + u8 signer_len; /* Length of signer's name */ + u8 key_id_len; /* Length of key identifier */ + u8 __pad[3]; + __be32 sig_len; /* Length of signature data */ }; /* -- cgit v1.2.3