From 1f5320d5972aa50d3e8d2b227b636b370e608359 Mon Sep 17 00:00:00 2001
From: Daisuke Nishimura
Date: Thu, 4 Oct 2012 16:37:16 +0900
Subject: cgroup: notify_on_release may not be triggered in some cases

notify_on_release must be triggered when the last process in a cgroup is
move to another. But if the first(and only) process in a cgroup is moved to
another, notify_on_release is not triggered.

	# mkdir /cgroup/cpu/SRC
	# mkdir /cgroup/cpu/DST
	#
	# echo 1 >/cgroup/cpu/SRC/notify_on_release
	# echo 1 >/cgroup/cpu/DST/notify_on_release
	#
	# sleep 300 &
	[1] 8629
	#
	# echo 8629 >/cgroup/cpu/SRC/tasks
	# echo 8629 >/cgroup/cpu/DST/tasks
	-> notify_on_release for /SRC must be triggered at this point,
	   but it isn't.

This is because put_css_set() is called before setting CGRP_RELEASABLE
in cgroup_task_migrate(), and is a regression introduce by the
commit:74a1166d(cgroups: make procs file writable), which was merged
into v3.0.

Cc: Ben Blum <bblum@andrew.cmu.edu>
Cc: <stable@vger.kernel.org> # v3.0.x and later
Acked-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 13774b3b39aa..d1739fc7eb94 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1962,9 +1962,8 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 	 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
 	 * it here; it will be freed under RCU.
 	 */
-	put_css_set(oldcg);
-
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+	put_css_set(oldcg);
 }
 
 /**
-- 
cgit v1.2.3


From 9bb71308b8133d643648776243e4d5599b1c193d Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 18 Oct 2012 17:52:07 -0700
Subject: Revert "cgroup: Drop task_lock(parent) on cgroup_fork()"

This reverts commit 7e381b0eb1e1a9805c37335562e8dc02e7d7848c.

The commit incorrectly assumed that fork path always performed
threadgroup_change_begin/end() and depended on that for
synchronization against task exit and cgroup migration paths instead
of explicitly grabbing task_lock().

threadgroup_change is not locked when forking a new process (as
opposed to a new thread in the same process) and even if it were it
wouldn't be effective as different processes use different threadgroup
locks.

Revert the incorrect optimization.

Signed-off-by: Tejun Heo <tj@kernel.org>
LKML-Reference: <20121008020000.GB2575@localhost>
Acked-by: Li Zefan <lizefan@huawei.com>
Bitterly-Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: stable@vger.kernel.org
---
 kernel/cgroup.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d1739fc7eb94..75aec12c78a0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4814,31 +4814,20 @@ static const struct file_operations proc_cgroupstats_operations = {
  *
  * A pointer to the shared css_set was automatically copied in
  * fork.c by dup_task_struct().  However, we ignore that copy, since
- * it was not made under the protection of RCU, cgroup_mutex or
- * threadgroup_change_begin(), so it might no longer be a valid
- * cgroup pointer.  cgroup_attach_task() might have already changed
- * current->cgroups, allowing the previously referenced cgroup
- * group to be removed and freed.
- *
- * Outside the pointer validity we also need to process the css_set
- * inheritance between threadgoup_change_begin() and
- * threadgoup_change_end(), this way there is no leak in any process
- * wide migration performed by cgroup_attach_proc() that could otherwise
- * miss a thread because it is too early or too late in the fork stage.
+ * it was not made under the protection of RCU or cgroup_mutex, so
+ * might no longer be a valid cgroup pointer.  cgroup_attach_task() might
+ * have already changed current->cgroups, allowing the previously
+ * referenced cgroup group to be removed and freed.
  *
  * At the point that cgroup_fork() is called, 'current' is the parent
  * task, and the passed argument 'child' points to the child task.
  */
 void cgroup_fork(struct task_struct *child)
 {
-	/*
-	 * We don't need to task_lock() current because current->cgroups
-	 * can't be changed concurrently here. The parent obviously hasn't
-	 * exited and called cgroup_exit(), and we are synchronized against
-	 * cgroup migration through threadgroup_change_begin().
-	 */
+	task_lock(current);
 	child->cgroups = current->cgroups;
 	get_css_set(child->cgroups);
+	task_unlock(current);
 	INIT_LIST_HEAD(&child->cg_list);
 }
 
-- 
cgit v1.2.3


From d87838321124061f6c935069d97f37010fa417e6 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 18 Oct 2012 17:40:30 -0700
Subject: Revert "cgroup: Remove task_lock() from cgroup_post_fork()"

This reverts commit 7e3aa30ac8c904a706518b725c451bb486daaae9.

The commit incorrectly assumed that fork path always performed
threadgroup_change_begin/end() and depended on that for
synchronization against task exit and cgroup migration paths instead
of explicitly grabbing task_lock().

threadgroup_change is not locked when forking a new process (as
opposed to a new thread in the same process) and even if it were it
wouldn't be effective as different processes use different threadgroup
locks.

Revert the incorrect optimization.

Signed-off-by: Tejun Heo <tj@kernel.org>
LKML-Reference: <20121008020000.GB2575@localhost>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: stable@vger.kernel.org
---
 kernel/cgroup.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 75aec12c78a0..f24f724620dd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4883,19 +4883,10 @@ void cgroup_post_fork(struct task_struct *child)
 	 */
 	if (use_task_css_set_links) {
 		write_lock(&css_set_lock);
-		if (list_empty(&child->cg_list)) {
-			/*
-			 * It's safe to use child->cgroups without task_lock()
-			 * here because we are protected through
-			 * threadgroup_change_begin() against concurrent
-			 * css_set change in cgroup_task_migrate(). Also
-			 * the task can't exit at that point until
-			 * wake_up_new_task() is called, so we are protected
-			 * against cgroup_exit() setting child->cgroup to
-			 * init_css_set.
-			 */
+		task_lock(child);
+		if (list_empty(&child->cg_list))
 			list_add(&child->cg_list, &child->cgroups->tasks);
-		}
+		task_unlock(child);
 		write_unlock(&css_set_lock);
 	}
 }
-- 
cgit v1.2.3


From c0158ca64da5732dfb86a3f28944e9626776692f Mon Sep 17 00:00:00 2001
From: Dan Magenheimer
Date: Thu, 18 Oct 2012 16:31:37 -0700
Subject: workqueue: cancel_delayed_work() should return %false if work item is
 idle

57b30ae77b ("workqueue: reimplement cancel_delayed_work() using
try_to_grab_pending()") made cancel_delayed_work() always return %true
unless someone else is also trying to cancel the work item, which is
broken - if the target work item is idle, the return value should be
%false.

try_to_grab_pending() indicates that the target work item was idle by
zero return value.  Use it for return.  Note that this brings
cancel_delayed_work() in line with __cancel_work_timer() in return
value handling.

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
LKML-Reference: <444a6439-b1a4-4740-9e7e-bc37267cfe73@default>
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d951daa0ca9a..042d221d33cc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2982,7 +2982,7 @@ bool cancel_delayed_work(struct delayed_work *dwork)
 
 	set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work));
 	local_irq_restore(flags);
-	return true;
+	return ret;
 }
 EXPORT_SYMBOL(cancel_delayed_work);
 
-- 
cgit v1.2.3


From f2302505775fd13ba93f034206f1e2a587017929 Mon Sep 17 00:00:00 2001
From: Andrew Vagin
Date: Thu, 25 Oct 2012 13:38:07 -0700
Subject: pidns: limit the nesting depth of pid namespaces

'struct pid' is a "variable sized struct" - a header with an array of
upids at the end.

The size of the array depends on a level (depth) of pid namespaces.  Now a
level of pidns is not limited, so 'struct pid' can be more than one page.

Looks reasonable, that it should be less than a page.  MAX_PIS_NS_LEVEL is
not calculated from PAGE_SIZE, because in this case it depends on
architectures, config options and it will be reduced, if someone adds a
new fields in struct pid or struct upid.

I suggest to set MAX_PIS_NS_LEVEL = 32, because it saves ability to expand
"struct pid" and it's more than enough for all known for me use-cases.
When someone finds a reasonable use case, we can add a config option or a
sysctl parameter.

In addition it will reduce the effect of another problem, when we have
many nested namespaces and the oldest one starts dying.
zap_pid_ns_processe will be called for each namespace and find_vpid will
be called for each process in a namespace.  find_vpid will be called
minimum max_level^2 / 2 times.  The reason of that is that when we found a
bit in pidmap, we can't determine this pidns is top for this process or it
isn't.

vpid is a heavy operation, so a fork bomb, which create many nested
namespace, can make a system inaccessible for a long time.  For example my
system becomes inaccessible for a few minutes with 4000 processes.

[akpm@linux-foundation.org: return -EINVAL in response to excessive nesting, not -ENOMEM]
Signed-off-by: Andrew Vagin <avagin@openvz.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid_namespace.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index eb00be205811..7b07cc0dfb75 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -71,12 +71,22 @@ err_alloc:
 	return NULL;
 }
 
+/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
+#define MAX_PID_NS_LEVEL 32
+
 static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
 {
 	struct pid_namespace *ns;
 	unsigned int level = parent_pid_ns->level + 1;
-	int i, err = -ENOMEM;
+	int i;
+	int err;
+
+	if (level > MAX_PID_NS_LEVEL) {
+		err = -EINVAL;
+		goto out;
+	}
 
+	err = -ENOMEM;
 	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
 	if (ns == NULL)
 		goto out;
-- 
cgit v1.2.3


From 2008713c7174e5c0f207bac684c6df0939047009 Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Wed, 24 Oct 2012 14:11:48 -0700
Subject: Makefile: Documentation for external tool should be correct

If one includes documentation for an external tool, it should be
correct.  This is not:

1. Overriding the input to rngd should typically be neither
   necessary nor desired.  This is especially so since newer
   versions of rngd support a number of different *types* of sources.
2. The default kernel-exported device is called /dev/hwrng not
   /dev/hwrandom nor /dev/hw_random (both of which were used in the
   past; however, kernel and udev seem to have converged on
   /dev/hwrng.)

Overall it is better if the documentation for rngd is kept with rngd
rather than in a kernel Makefile.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Jeff Garzik <jgarzik@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/Makefile | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 0dfeca4324ee..86e3285ae7e5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -174,10 +174,8 @@ signing_key.priv signing_key.x509: x509.genkey
 	@echo "###"
 	@echo "### If this takes a long time, you might wish to run rngd in the"
 	@echo "### background to keep the supply of entropy topped up.  It"
-	@echo "### needs to be run as root, and should use a hardware random"
-	@echo "### number generator if one is available, eg:"
-	@echo "###"
-	@echo "###     rngd -r /dev/hwrandom"
+	@echo "### needs to be run as root, and uses a hardware random"
+	@echo "### number generator if one is available."
 	@echo "###"
 	openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
 		-x509 -config x509.genkey \
-- 
cgit v1.2.3


From 0d855354ea351bec6b222e9fea86a876cfafdcb6 Mon Sep 17 00:00:00 2001
From: Michael Neuling
Date: Fri, 26 Oct 2012 18:28:56 +0200
Subject: perf, powerpc: Fix hw breakpoints returning -ENOSPC

I've been trying to get hardware breakpoints with perf to work
on POWER7 but I'm getting the following:

  % perf record -e mem:0x10000000 true

    Error: sys_perf_event_open() syscall returned with 28 (No space left on device).  /bin/dmesg may provide additional information.

    Fatal: No CONFIG_PERF_EVENTS=y kernel support configured?

  true: Terminated

(FWIW adding -a and it works fine)

Debugging it seems that __reserve_bp_slot() is returning ENOSPC
because it thinks there are no free breakpoint slots on this
CPU.

I have a 2 CPUs, so perf userspace is doing two perf_event_open
syscalls to add a counter to each CPU [1].  The first syscall
succeeds but the second is failing.

On this second syscall, fetch_bp_busy_slots() sets slots.pinned
to be 1, despite there being no breakpoint on this CPU.  This is
because the call the task_bp_pinned, checks all CPUs, rather
than just the current CPU. POWER7 only has one hardware
breakpoint per CPU (ie. HBP_NUM=1), so we return ENOSPC.

The following patch fixes this by checking the associated CPU
for each breakpoint in task_bp_pinned.  I'm not familiar with
this code, so it's provided as a reference to the above issue.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Michael Ellerman <michael@ellerman.id.au>
Cc: Jovi Zhang <bookjovi@gmail.com>
Cc: K Prasad <prasad@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/1351268936-2956-1-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/hw_breakpoint.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 9a7b487c6fe2..fe8a916507ed 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -111,14 +111,16 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
  * Count the number of breakpoints of the same type and same task.
  * The given event must be not on the list.
  */
-static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
+static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
 {
 	struct task_struct *tsk = bp->hw.bp_target;
 	struct perf_event *iter;
 	int count = 0;
 
 	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
-		if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
+		if (iter->hw.bp_target == tsk &&
+		    find_slot_idx(iter) == type &&
+		    cpu == iter->cpu)
 			count += hw_breakpoint_weight(iter);
 	}
 
@@ -141,7 +143,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		if (!tsk)
 			slots->pinned += max_task_bp_pinned(cpu, type);
 		else
-			slots->pinned += task_bp_pinned(bp, type);
+			slots->pinned += task_bp_pinned(cpu, bp, type);
 		slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
 
 		return;
@@ -154,7 +156,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		if (!tsk)
 			nr += max_task_bp_pinned(cpu, type);
 		else
-			nr += task_bp_pinned(bp, type);
+			nr += task_bp_pinned(cpu, bp, type);
 
 		if (nr > slots->pinned)
 			slots->pinned = nr;
@@ -188,7 +190,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
 	int old_idx = 0;
 	int idx = 0;
 
-	old_count = task_bp_pinned(bp, type);
+	old_count = task_bp_pinned(cpu, bp, type);
 	old_idx = old_count - 1;
 	idx = old_idx + weight;
 
-- 
cgit v1.2.3


From 59ef28b1f14899b10d6b2682c7057ca00a9a3f47 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Thu, 25 Oct 2012 10:49:25 +1030
Subject: module: fix out-by-one error in kallsyms

Masaki found and patched a kallsyms issue: the last symbol in a
module's symtab wasn't transferred.  This is because we manually copy
the zero'th entry (which is always empty) then copy the rest in a loop
starting at 1, though from src[0].  His fix was minimal, I prefer to
rewrite the loops in more standard form.

There are two loops: one to get the size, and one to copy.  Make these
identical: always count entry 0 and any defined symbol in an allocated
non-init section.

This bug exists since the following commit was introduced.
   module: reduce symbol table for loaded modules (v2)
   commit: 4a4962263f07d14660849ec134ee42b63e95ea9a

LKML: http://lkml.org/lkml/2012/10/24/27
Reported-by: Masaki Kimura <masaki.kimura.kz@hitachi.com>
Cc: stable@kernel.org
---
 kernel/module.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 6085f5ef88ea..6e48c3a43599 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2293,12 +2293,17 @@ static void layout_symtab(struct module *mod, struct load_info *info)
 	src = (void *)info->hdr + symsect->sh_offset;
 	nsrc = symsect->sh_size / sizeof(*src);
 
+	/* strtab always starts with a nul, so offset 0 is the empty string. */
+	strtab_size = 1;
+
 	/* Compute total space required for the core symbols' strtab. */
-	for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src)
-		if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
-			strtab_size += strlen(&info->strtab[src->st_name]) + 1;
+	for (ndst = i = 0; i < nsrc; i++) {
+		if (i == 0 ||
+		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+			strtab_size += strlen(&info->strtab[src[i].st_name])+1;
 			ndst++;
 		}
+	}
 
 	/* Append room for core symbols at end of core part. */
 	info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
@@ -2332,15 +2337,15 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
 	mod->core_symtab = dst = mod->module_core + info->symoffs;
 	mod->core_strtab = s = mod->module_core + info->stroffs;
 	src = mod->symtab;
-	*dst = *src;
 	*s++ = 0;
-	for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
-		if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
-			continue;
-
-		dst[ndst] = *src;
-		dst[ndst++].st_name = s - mod->core_strtab;
-		s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
+	for (ndst = i = 0; i < mod->num_symtab; i++) {
+		if (i == 0 ||
+		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+			dst[ndst] = src[i];
+			dst[ndst++].st_name = s - mod->core_strtab;
+			s += strlcpy(s, &mod->strtab[src[i].st_name],
+				     KSYM_NAME_LEN) + 1;
+		}
 	}
 	mod->core_num_syms = ndst;
 }
-- 
cgit v1.2.3


From 59fa6245192159ab5e1e17b8e31f15afa9cff4bf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 23 Oct 2012 22:29:38 +0200
Subject: futex: Handle futex_pi OWNER_DIED take over correctly

Siddhesh analyzed a failure in the take over of pi futexes in case the
owner died and provided a workaround.
See: http://sourceware.org/bugzilla/show_bug.cgi?id=14076

The detailed problem analysis shows:

Futex F is initialized with PTHREAD_PRIO_INHERIT and
PTHREAD_MUTEX_ROBUST_NP attributes.

T1 lock_futex_pi(F);

T2 lock_futex_pi(F);
   --> T2 blocks on the futex and creates pi_state which is associated
       to T1.

T1 exits
   --> exit_robust_list() runs
       --> Futex F userspace value TID field is set to 0 and
           FUTEX_OWNER_DIED bit is set.

T3 lock_futex_pi(F);
   --> Succeeds due to the check for F's userspace TID field == 0
   --> Claims ownership of the futex and sets its own TID into the
       userspace TID field of futex F
   --> returns to user space

T1 --> exit_pi_state_list()
       --> Transfers pi_state to waiter T2 and wakes T2 via
       	   rt_mutex_unlock(&pi_state->mutex)

T2 --> acquires pi_state->mutex and gains real ownership of the
       pi_state
   --> Claims ownership of the futex and sets its own TID into the
       userspace TID field of futex F
   --> returns to user space

T3 --> observes inconsistent state

This problem is independent of UP/SMP, preemptible/non preemptible
kernels, or process shared vs. private. The only difference is that
certain configurations are more likely to expose it.

So as Siddhesh correctly analyzed the following check in
futex_lock_pi_atomic() is the culprit:

	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {

We check the userspace value for a TID value of 0 and take over the
futex unconditionally if that's true.

AFAICT this check is there as it is correct for a different corner
case of futexes: the WAITERS bit became stale.

Now the proposed change

-	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+       if (unlikely(ownerdied ||
+                       !(curval & (FUTEX_TID_MASK | FUTEX_WAITERS)))) {

solves the problem, but it's not obvious why and it wreckages the
"stale WAITERS bit" case.

What happens is, that due to the WAITERS bit being set (T2 is blocked
on that futex) it enforces T3 to go through lookup_pi_state(), which
in the above case returns an existing pi_state and therefor forces T3
to legitimately fight with T2 over the ownership of the pi_state (via
pi_state->mutex). Probelm solved!

Though that does not work for the "WAITERS bit is stale" problem
because if lookup_pi_state() does not find existing pi_state it
returns -ERSCH (due to TID == 0) which causes futex_lock_pi() to
return -ESRCH to user space because the OWNER_DIED bit is not set.

Now there is a different solution to that problem. Do not look at the
user space value at all and enforce a lookup of possibly available
pi_state. If pi_state can be found, then the new incoming locker T3
blocks on that pi_state and legitimately races with T2 to acquire the
rt_mutex and the pi_state and therefor the proper ownership of the
user space futex.

lookup_pi_state() has the correct order of checks. It first tries to
find a pi_state associated with the user space futex and only if that
fails it checks for futex TID value = 0. If no pi_state is available
nothing can create new state at that point because this happens with
the hash bucket lock held.

So the above scenario changes to:

T1 lock_futex_pi(F);

T2 lock_futex_pi(F);
   --> T2 blocks on the futex and creates pi_state which is associated
       to T1.

T1 exits
   --> exit_robust_list() runs
       --> Futex F userspace value TID field is set to 0 and
           FUTEX_OWNER_DIED bit is set.

T3 lock_futex_pi(F);
   --> Finds pi_state and blocks on pi_state->rt_mutex

T1 --> exit_pi_state_list()
       --> Transfers pi_state to waiter T2 and wakes it via
       	   rt_mutex_unlock(&pi_state->mutex)

T2 --> acquires pi_state->mutex and gains ownership of the pi_state
   --> Claims ownership of the futex and sets its own TID into the
       userspace TID field of futex F
   --> returns to user space

This covers all gazillion points on which T3 might come in between
T1's exit_robust_list() clearing the TID field and T2 fixing it up. It
also solves the "WAITERS bit stale" problem by forcing the take over.

Another benefit of changing the code this way is that it makes it less
dependent on untrusted user space values and therefor minimizes the
possible wreckage which might be inflicted.

As usual after staring for too long at the futex code my brain hurts
so much that I really want to ditch that whole optimization of
avoiding the syscall for the non contended case for PI futexes and rip
out the maze of corner case handling code. Unfortunately we can't as
user space relies on that existing behaviour, but at least thinking
about it helps me to preserve my mental sanity. Maybe we should
nevertheless :)

Reported-and-tested-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1210232138540.2756@ionos
Acked-by: Darren Hart <dvhart@linux.intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 3717e7b306e0..20ef219bbe9b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -716,7 +716,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 				struct futex_pi_state **ps,
 				struct task_struct *task, int set_waiters)
 {
-	int lock_taken, ret, ownerdied = 0;
+	int lock_taken, ret, force_take = 0;
 	u32 uval, newval, curval, vpid = task_pid_vnr(task);
 
 retry:
@@ -755,17 +755,15 @@ retry:
 	newval = curval | FUTEX_WAITERS;
 
 	/*
-	 * There are two cases, where a futex might have no owner (the
-	 * owner TID is 0): OWNER_DIED. We take over the futex in this
-	 * case. We also do an unconditional take over, when the owner
-	 * of the futex died.
-	 *
-	 * This is safe as we are protected by the hash bucket lock !
+	 * Should we force take the futex? See below.
 	 */
-	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
-		/* Keep the OWNER_DIED bit */
+	if (unlikely(force_take)) {
+		/*
+		 * Keep the OWNER_DIED and the WAITERS bit and set the
+		 * new TID value.
+		 */
 		newval = (curval & ~FUTEX_TID_MASK) | vpid;
-		ownerdied = 0;
+		force_take = 0;
 		lock_taken = 1;
 	}
 
@@ -775,7 +773,7 @@ retry:
 		goto retry;
 
 	/*
-	 * We took the lock due to owner died take over.
+	 * We took the lock due to forced take over.
 	 */
 	if (unlikely(lock_taken))
 		return 1;
@@ -790,20 +788,25 @@ retry:
 		switch (ret) {
 		case -ESRCH:
 			/*
-			 * No owner found for this futex. Check if the
-			 * OWNER_DIED bit is set to figure out whether
-			 * this is a robust futex or not.
+			 * We failed to find an owner for this
+			 * futex. So we have no pi_state to block
+			 * on. This can happen in two cases:
+			 *
+			 * 1) The owner died
+			 * 2) A stale FUTEX_WAITERS bit
+			 *
+			 * Re-read the futex value.
 			 */
 			if (get_futex_value_locked(&curval, uaddr))
 				return -EFAULT;
 
 			/*
-			 * We simply start over in case of a robust
-			 * futex. The code above will take the futex
-			 * and return happy.
+			 * If the owner died or we have a stale
+			 * WAITERS bit the owner TID in the user space
+			 * futex is 0.
 			 */
-			if (curval & FUTEX_OWNER_DIED) {
-				ownerdied = 1;
+			if (!(curval & FUTEX_TID_MASK)) {
+				force_take = 1;
 				goto retry;
 			}
 		default:
-- 
cgit v1.2.3


From 8ffeb9b0e6369135bf03a073514f571ef10606b9 Mon Sep 17 00:00:00 2001
From: Chuansheng Liu
Date: Mon, 26 Nov 2012 16:29:54 -0800
Subject: watchdog: using u64 in get_sample_period()

In get_sample_period(), unsigned long is not enough:

  watchdog_thresh * 2 * (NSEC_PER_SEC / 5)

case1:
  watchdog_thresh is 10 by default, the sample value will be: 0xEE6B2800

case2:
 set watchdog_thresh is 20, the sample value will be: 0x1 DCD6 5000

In case2, we need use u64 to express the sample period.  Otherwise,
changing the threshold thru proc often can not be successful.

Signed-off-by: liu chuansheng <chuansheng.liu@intel.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/watchdog.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9d4c8d5a1f53..dd4b80a9f1a9 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -116,7 +116,7 @@ static unsigned long get_timestamp(int this_cpu)
 	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
 }
 
-static unsigned long get_sample_period(void)
+static u64 get_sample_period(void)
 {
 	/*
 	 * convert watchdog_thresh from seconds to ns
@@ -125,7 +125,7 @@ static unsigned long get_sample_period(void)
 	 * and hard thresholds) to increment before the
 	 * hardlockup detector generates a warning
 	 */
-	return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
+	return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
 }
 
 /* Commands for resetting the watchdog */
-- 
cgit v1.2.3


From aa10990e028cac3d5e255711fb9fb47e00700e35 Mon Sep 17 00:00:00 2001
From: Darren Hart
Date: Mon, 26 Nov 2012 16:29:56 -0800
Subject: futex: avoid wake_futex() for a PI futex_q

Dave Jones reported a bug with futex_lock_pi() that his trinity test
exposed.  Sometime between queue_me() and taking the q.lock_ptr, the
lock_ptr became NULL, resulting in a crash.

While futex_wake() is careful to not call wake_futex() on futex_q's with
a pi_state or an rt_waiter (which are either waiting for a
futex_unlock_pi() or a PI futex_requeue()), futex_wake_op() and
futex_requeue() do not perform the same test.

Update futex_wake_op() and futex_requeue() to test for q.pi_state and
q.rt_waiter and abort with -EINVAL if detected.  To ensure any future
breakage is caught, add a WARN() to wake_futex() if the same condition
is true.

This fix has seen 3 hours of testing with "trinity -c futex" on an
x86_64 VM with 4 CPUS.

[akpm@linux-foundation.org: tidy up the WARN()]
Signed-off-by: Darren Hart <dvhart@linux.intel.com>
Reported-by: Dave Jones <davej@redat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: John Kacur <jkacur@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/futex.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 20ef219bbe9b..19eb089ca003 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -843,6 +843,9 @@ static void wake_futex(struct futex_q *q)
 {
 	struct task_struct *p = q->task;
 
+	if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+		return;
+
 	/*
 	 * We set q->lock_ptr = NULL _before_ we wake up the task. If
 	 * a non-futex wake up happens on another CPU then the task
@@ -1078,6 +1081,10 @@ retry_private:
 
 	plist_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key1)) {
+			if (this->pi_state || this->rt_waiter) {
+				ret = -EINVAL;
+				goto out_unlock;
+			}
 			wake_futex(this);
 			if (++ret >= nr_wake)
 				break;
@@ -1090,6 +1097,10 @@ retry_private:
 		op_ret = 0;
 		plist_for_each_entry_safe(this, next, head, list) {
 			if (match_futex (&this->key, &key2)) {
+				if (this->pi_state || this->rt_waiter) {
+					ret = -EINVAL;
+					goto out_unlock;
+				}
 				wake_futex(this);
 				if (++op_ret >= nr_wake2)
 					break;
@@ -1098,6 +1109,7 @@ retry_private:
 		ret += op_ret;
 	}
 
+out_unlock:
 	double_unlock_hb(hb1, hb2);
 out_put_keys:
 	put_futex_key(&key2);
@@ -1387,9 +1399,13 @@ retry_private:
 		/*
 		 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
 		 * be paired with each other and no other futex ops.
+		 *
+		 * We should never be requeueing a futex_q with a pi_state,
+		 * which is awaiting a futex_unlock_pi().
 		 */
 		if ((requeue_pi && !this->rt_waiter) ||
-		    (!requeue_pi && this->rt_waiter)) {
+		    (!requeue_pi && this->rt_waiter) ||
+		    this->pi_state) {
 			ret = -EINVAL;
 			break;
 		}
-- 
cgit v1.2.3


From 412d32e6c98527078779e5b515823b2810e40324 Mon Sep 17 00:00:00 2001
From: Mike Galbraith
Date: Wed, 28 Nov 2012 07:17:18 +0100
Subject: workqueue: exit rescuer_thread() as TASK_RUNNING

A rescue thread exiting TASK_INTERRUPTIBLE can lead to a task scheduling
off, never to be seen again.  In the case where this occurred, an exiting
thread hit reiserfs homebrew conditional resched while holding a mutex,
bringing the box to its knees.

PID: 18105  TASK: ffff8807fd412180  CPU: 5   COMMAND: "kdmflush"
 #0 [ffff8808157e7670] schedule at ffffffff8143f489
 #1 [ffff8808157e77b8] reiserfs_get_block at ffffffffa038ab2d [reiserfs]
 #2 [ffff8808157e79a8] __block_write_begin at ffffffff8117fb14
 #3 [ffff8808157e7a98] reiserfs_write_begin at ffffffffa0388695 [reiserfs]
 #4 [ffff8808157e7ad8] generic_perform_write at ffffffff810ee9e2
 #5 [ffff8808157e7b58] generic_file_buffered_write at ffffffff810eeb41
 #6 [ffff8808157e7ba8] __generic_file_aio_write at ffffffff810f1a3a
 #7 [ffff8808157e7c58] generic_file_aio_write at ffffffff810f1c88
 #8 [ffff8808157e7cc8] do_sync_write at ffffffff8114f850
 #9 [ffff8808157e7dd8] do_acct_process at ffffffff810a268f
    [exception RIP: kernel_thread_helper]
    RIP: ffffffff8144a5c0  RSP: ffff8808157e7f58  RFLAGS: 00000202
    RAX: 0000000000000000  RBX: 0000000000000000  RCX: 0000000000000000
    RDX: 0000000000000000  RSI: ffffffff8107af60  RDI: ffff8803ee491d18
    RBP: 0000000000000000   R8: 0000000000000000   R9: 0000000000000000
    R10: 0000000000000000  R11: 0000000000000000  R12: 0000000000000000
    R13: 0000000000000000  R14: 0000000000000000  R15: 0000000000000000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018

Signed-off-by: Mike Galbraith <mgalbraith@suse.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
---
 kernel/workqueue.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 042d221d33cc..ac25db111f48 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2407,8 +2407,10 @@ static int rescuer_thread(void *__wq)
 repeat:
 	set_current_state(TASK_INTERRUPTIBLE);
 
-	if (kthread_should_stop())
+	if (kthread_should_stop()) {
+		__set_current_state(TASK_RUNNING);
 		return 0;
+	}
 
 	/*
 	 * See whether any cpu is asking for help.  Unbounded
-- 
cgit v1.2.3


From 8852aac25e79e38cc6529f20298eed154f60b574 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Sat, 1 Dec 2012 16:23:42 -0800
Subject: workqueue: mod_delayed_work_on() shouldn't queue timer on 0 delay

8376fe22c7 ("workqueue: implement mod_delayed_work[_on]()")
implemented mod_delayed_work[_on]() using the improved
try_to_grab_pending().  The function is later used, among others, to
replace [__]candel_delayed_work() + queue_delayed_work() combinations.

Unfortunately, a delayed_work item w/ zero @delay is handled slightly
differently by mod_delayed_work_on() compared to
queue_delayed_work_on().  The latter skips timer altogether and
directly queues it using queue_work_on() while the former schedules
timer which will expire on the closest tick.  This means, when @delay
is zero, that [__]cancel_delayed_work() + queue_delayed_work_on()
makes the target item immediately executable while
mod_delayed_work_on() may induce delay of upto a full tick.

This somewhat subtle difference breaks some of the converted users.
e.g. block queue plugging uses delayed_work for deferred processing
and uses mod_delayed_work_on() when the queue needs to be immediately
unplugged.  The above problem manifested as noticeably higher number
of context switches under certain circumstances.

The difference in behavior was caused by missing special case handling
for 0 delay in mod_delayed_work_on() compared to
queue_delayed_work_on().  Joonsoo Kim posted a patch to add it -
("workqueue: optimize mod_delayed_work_on() when @delay == 0")[1].
The patch was queued for 3.8 but it was described as optimization and
I missed that it was a correctness issue.

As both queue_delayed_work_on() and mod_delayed_work_on() use
__queue_delayed_work() for queueing, it seems that the better approach
is to move the 0 delay special handling to the function instead of
duplicating it in mod_delayed_work_on().

Fix the problem by moving 0 delay special case handling from
queue_delayed_work_on() to __queue_delayed_work().  This replaces
Joonsoo's patch.

[1] http://thread.gmane.org/gmane.linux.kernel/1379011/focus=1379012

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Anders Kaseorg <andersk@MIT.EDU>
Reported-and-tested-by: Zlatko Calusic <zlatko.calusic@iskon.hr>
LKML-Reference: <alpine.DEB.2.00.1211280953350.26602@dr-wily.mit.edu>
LKML-Reference: <50A78AA9.5040904@iskon.hr>
Cc: Joonsoo Kim <js1304@gmail.com>
---
 kernel/workqueue.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ac25db111f48..084aa47bac82 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1364,6 +1364,17 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 	BUG_ON(timer_pending(timer));
 	BUG_ON(!list_empty(&work->entry));
 
+	/*
+	 * If @delay is 0, queue @dwork->work immediately.  This is for
+	 * both optimization and correctness.  The earliest @timer can
+	 * expire is on the closest next tick and delayed_work users depend
+	 * on that there's no such delay when @delay is 0.
+	 */
+	if (!delay) {
+		__queue_work(cpu, wq, &dwork->work);
+		return;
+	}
+
 	timer_stats_timer_set_start_info(&dwork->timer);
 
 	/*
@@ -1417,9 +1428,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	bool ret = false;
 	unsigned long flags;
 
-	if (!delay)
-		return queue_work_on(cpu, wq, &dwork->work);
-
 	/* read the comment in __queue_work() */
 	local_irq_save(flags);
 
-- 
cgit v1.2.3


From 84ecfd15f5547c992c901df6ec14b4d507eb2c6e Mon Sep 17 00:00:00 2001
From: James Hogan
Date: Fri, 23 Nov 2012 12:08:16 +0000
Subject: modsign: add symbol prefix to certificate list

Add the arch symbol prefix (if applicable) to the asm definition of
modsign_certificate_list and modsign_certificate_list_end. This uses the
recently defined SYMBOL_PREFIX which is derived from
CONFIG_SYMBOL_PREFIX.

This fixes the build of module signing on the blackfin and metag
architectures.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: David Howells <dhowells@redhat.com>
Cc: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/modsign_pubkey.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
index 4646eb2c3820..767e559dfb10 100644
--- a/kernel/modsign_pubkey.c
+++ b/kernel/modsign_pubkey.c
@@ -21,10 +21,10 @@ struct key *modsign_keyring;
 extern __initdata const u8 modsign_certificate_list[];
 extern __initdata const u8 modsign_certificate_list_end[];
 asm(".section .init.data,\"aw\"\n"
-    "modsign_certificate_list:\n"
+    SYMBOL_PREFIX "modsign_certificate_list:\n"
     ".incbin \"signing_key.x509\"\n"
     ".incbin \"extra_certificates\"\n"
-    "modsign_certificate_list_end:"
+    SYMBOL_PREFIX "modsign_certificate_list_end:"
     );
 
 /*
-- 
cgit v1.2.3


From fd8ef11730f1d03d5d6555aa53126e9e34f52f12 Mon Sep 17 00:00:00 2001
From: Mike Galbraith
Date: Mon, 3 Dec 2012 06:25:25 +0100
Subject: Revert "sched, autogroup: Stop going ahead if autogroup is disabled"

This reverts commit 800d4d30c8f20bd728e5741a3b77c4859a613f7c.

Between commits 8323f26ce342 ("sched: Fix race in task_group()") and
800d4d30c8f2 ("sched, autogroup: Stop going ahead if autogroup is
disabled"), autogroup is a wreck.

With both applied, all you have to do to crash a box is disable
autogroup during boot up, then reboot..  boom, NULL pointer dereference
due to commit 800d4d30c8f2 not allowing autogroup to move things, and
commit 8323f26ce342 making that the only way to switch runqueues:

  BUG: unable to handle kernel NULL pointer dereference at           (null)
  IP: [<ffffffff81063ac0>] effective_load.isra.43+0x50/0x90
  Pid: 7047, comm: systemd-user-se Not tainted 3.6.8-smp #7 MEDIONPC MS-7502/MS-7502
  RIP: effective_load.isra.43+0x50/0x90
  Process systemd-user-se (pid: 7047, threadinfo ffff880221dde000, task ffff88022618b3a0)
  Call Trace:
    select_task_rq_fair+0x255/0x780
    try_to_wake_up+0x156/0x2c0
    wake_up_state+0xb/0x10
    signal_wake_up+0x28/0x40
    complete_signal+0x1d6/0x250
    __send_signal+0x170/0x310
    send_signal+0x40/0x80
    do_send_sig_info+0x47/0x90
    group_send_sig_info+0x4a/0x70
    kill_pid_info+0x3a/0x60
    sys_kill+0x97/0x1a0
    ? vfs_read+0x120/0x160
    ? sys_read+0x45/0x90
    system_call_fastpath+0x16/0x1b
  Code: 49 0f af 41 50 31 d2 49 f7 f0 48 83 f8 01 48 0f 46 c6 48 2b 07 48 8b bf 40 01 00 00 48 85 ff 74 3a 45 31 c0 48 8b 8f 50 01 00 00 <48> 8b 11 4c 8b 89 80 00 00 00 49 89 d2 48 01 d0 45 8b 59 58 4c
  RIP  [<ffffffff81063ac0>] effective_load.isra.43+0x50/0x90
   RSP <ffff880221ddfbd8>
  CR2: 0000000000000000

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Yong Zhang <yong.zhang0@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stable@vger.kernel.org # 2.6.39+
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/auto_group.c | 4 ----
 kernel/sched/auto_group.h | 5 -----
 2 files changed, 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21076a3..15f60d01198b 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -143,15 +143,11 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 
 	p->signal->autogroup = autogroup_kref_get(ag);
 
-	if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
-		goto out;
-
 	t = p;
 	do {
 		sched_move_task(t);
 	} while_each_thread(p, t);
 
-out:
 	unlock_task_sighand(p, &flags);
 	autogroup_kref_put(prev);
 }
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142816..443232ebbb53 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -4,11 +4,6 @@
 #include <linux/rwsem.h>
 
 struct autogroup {
-	/*
-	 * reference doesn't mean how many thread attach to this
-	 * autogroup now. It just stands for the number of task
-	 * could use this autogroup.
-	 */
 	struct kref		kref;
 	struct task_group	*tg;
 	struct rw_semaphore	lock;
-- 
cgit v1.2.3


From fc4b514f2727f74a4587c31db87e0e93465518c3 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 4 Dec 2012 07:40:39 -0800
Subject: workqueue: convert BUG_ON()s in __queue_delayed_work() to
 WARN_ON_ONCE()s

8852aac25e ("workqueue: mod_delayed_work_on() shouldn't queue timer on
0 delay") unexpectedly uncovered a very nasty abuse of delayed_work in
megaraid - it allocated work_struct, casted it to delayed_work and
then pass that into queue_delayed_work().

Previously, this was okay because 0 @delay short-circuited to
queue_work() before doing anything with delayed_work.  8852aac25e
moved 0 @delay test into __queue_delayed_work() after sanity check on
delayed_work making megaraid trigger BUG_ON().

Although megaraid is already fixed by c1d390d8e6 ("megaraid: fix
BUG_ON() from incorrect use of delayed work"), this patch converts
BUG_ON()s in __queue_delayed_work() to WARN_ON_ONCE()s so that such
abusers, if there are more, trigger warning but don't crash the
machine.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Xiaotian Feng <xtfeng@gmail.com>
---
 kernel/workqueue.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 084aa47bac82..1dae900df798 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1361,8 +1361,8 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 
 	WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
 		     timer->data != (unsigned long)dwork);
-	BUG_ON(timer_pending(timer));
-	BUG_ON(!list_empty(&work->entry));
+	WARN_ON_ONCE(timer_pending(timer));
+	WARN_ON_ONCE(!list_empty(&work->entry));
 
 	/*
 	 * If @delay is 0, queue @dwork->work immediately.  This is for
-- 
cgit v1.2.3


From 8d4516904b39507458bee8115793528e12b1d8dd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 4 Dec 2012 18:59:34 +0100
Subject: watchdog: Fix CPU hotplug regression

Norbert reported:
"3.7-rc6 booted with nmi_watchdog=0 fails to suspend to RAM or
 offline CPUs. It's reproducable with a KVM guest and physical
 system."

The reason is that commit bcd951cf(watchdog: Use hotplug thread
infrastructure) missed to take this into account. So the cpu offline
code gets stuck in the teardown function because it accesses non
initialized data structures.

Add a check for watchdog_enabled into that path to cure the issue.

Reported-and-tested-by: Norbert Warmuth <nwarmuth@t-online.de>
Tested-by: Joseph Salisbury <joseph.salisbury@canonical.com>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1211231033230.2701@ionos
Link: http://bugs.launchpad.net/bugs/1079534
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/watchdog.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index dd4b80a9f1a9..c8c21be11ab4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -368,6 +368,9 @@ static void watchdog_disable(unsigned int cpu)
 {
 	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
 
+	if (!watchdog_enabled)
+		return;
+
 	watchdog_set_prio(SCHED_NORMAL, 0);
 	hrtimer_cancel(hrtimer);
 	/* disable the perf event */
-- 
cgit v1.2.3


From 12e130b04580532aa099893158aa2776b321ae7f Mon Sep 17 00:00:00 2001
From: David Howells
Date: Mon, 22 Oct 2012 15:05:48 +0100
Subject: MODSIGN: Don't use enum-type bitfields in module signature info block

Don't use enum-type bitfields in the module signature info block as we can't be
certain how the compiler will handle them.  As I understand it, it is arch
dependent, and it is possible for the compiler to rearrange them based on
endianness and to insert a byte of padding to pad the three enums out to four
bytes.

Instead use u8 fields for these, which the compiler should emit in the right
order without padding.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module_signing.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index ea1b1df5dbb0..f2970bddc5ea 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -27,13 +27,13 @@
  *	- Information block
  */
 struct module_signature {
-	enum pkey_algo		algo : 8;	/* Public-key crypto algorithm */
-	enum pkey_hash_algo	hash : 8;	/* Digest algorithm */
-	enum pkey_id_type	id_type : 8;	/* Key identifier type */
-	u8			signer_len;	/* Length of signer's name */
-	u8			key_id_len;	/* Length of key identifier */
-	u8			__pad[3];
-	__be32			sig_len;	/* Length of signature data */
+	u8	algo;		/* Public-key crypto algorithm [enum pkey_algo] */
+	u8	hash;		/* Digest algorithm [enum pkey_hash_algo] */
+	u8	id_type;	/* Key identifier type [enum pkey_id_type] */
+	u8	signer_len;	/* Length of signer's name */
+	u8	key_id_len;	/* Length of key identifier */
+	u8	__pad[3];
+	__be32	sig_len;	/* Length of signature data */
 };
 
 /*
-- 
cgit v1.2.3