From 9abf24d465180f5f2eb26a43545348262f16b771 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju
Date: Tue, 12 Nov 2013 22:11:26 +0530
Subject: sched: Check sched_domain before computing group power

After commit 863bffc80898 ("sched/fair: Fix group power_orig
computation"), we can dereference rq->sd before it is set.

Fix this by falling back to power_of() in this case and add a comment
explaining things.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
[ Added comment and tweaked patch. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: mikey@neuling.org
Link: http://lkml.kernel.org/r/20131113151718.GN21461@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e8b652ebe027..fd773ade1a31 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5379,10 +5379,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
 		 */
 
 		for_each_cpu(cpu, sched_group_cpus(sdg)) {
-			struct sched_group *sg = cpu_rq(cpu)->sd->groups;
+			struct sched_group_power *sgp;
+			struct rq *rq = cpu_rq(cpu);
 
-			power_orig += sg->sgp->power_orig;
-			power += sg->sgp->power;
+			/*
+			 * build_sched_domains() -> init_sched_groups_power()
+			 * gets here before we've attached the domains to the
+			 * runqueues.
+			 *
+			 * Use power_of(), which is set irrespective of domains
+			 * in update_cpu_power().
+			 *
+			 * This avoids power/power_orig from being 0 and
+			 * causing divide-by-zero issues on boot.
+			 *
+			 * Runtime updates will correct power_orig.
+			 */
+			if (unlikely(!rq->sd)) {
+				power_orig += power_of(cpu);
+				power += power_of(cpu);
+				continue;
+			}
+
+			sgp = rq->sd->groups->sgp;
+			power_orig += sgp->power_orig;
+			power += sgp->power;
 		}
 	} else  {
 		/*
-- 
cgit v1.2.3


From 42eb088ed246a5a817bb45a8b32fe234cf1c0f8b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 19 Nov 2013 16:41:49 +0100
Subject: sched: Avoid NULL dereference on sd_busy

Commit 37dc6b50cee9 ("sched: Remove unnecessary iteration over sched
domains to update nr_busy_cpus") forgot to clear 'sd_busy' under some
conditions leading to a possible NULL deref in set_cpu_sd_state_idle().

Reported-by: Anton Blanchard <anton@samba.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20131118113701.GF3866@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1808606ee5f..a1591ca7eb5a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4910,8 +4910,9 @@ static void update_top_cache_domain(int cpu)
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
-		rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
+		sd = sd->parent; /* sd_busy */
 	}
+	rcu_assign_pointer(per_cpu(sd_busy, cpu), sd);
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
-- 
cgit v1.2.3


From 0515973ffb16c2852a1bb1df2ca1456556faaaa5 Mon Sep 17 00:00:00 2001
From: Shigeru Yoshida
Date: Sun, 17 Nov 2013 12:12:36 +0900
Subject: sched: Fix a trivial typo in comments

Fix a trivial typo in rq_attach_root().

Signed-off-by: Shigeru Yoshida <shigeru.yoshida@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20131117.121236.1990617639803941055.shigeru.yoshida@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a1591ca7eb5a..718730dd0480 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4762,7 +4762,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 		cpumask_clear_cpu(rq->cpu, old_rd->span);
 
 		/*
-		 * If we dont want to free the old_rt yet then
+		 * If we dont want to free the old_rd yet then
 		 * set old_rd to NULL to skip the freeing later
 		 * in this function:
 		 */
-- 
cgit v1.2.3


From 32e475d76a3e40879cd9ee4f69b19615062280d7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 21 Nov 2013 12:41:44 +0100
Subject: sched: Expose preempt_schedule_irq()

Tony reported that aa0d53260596 ("ia64: Use preempt_schedule_irq")
broke PREEMPT=n builds on ia64.

Ok, wrapped my brain around it. I tripped over the magic asm foo which
has a single need_resched check and schedule point for both sys call
return and interrupt return.

So you need the schedule_preempt_irq() for kernel preemption from
interrupt return while on a normal syscall preemption a schedule would
be sufficient. But using schedule_preempt_irq() is not harmful here in
any way. It just sets the preempt_active bit also in cases where it
would not be required.

Even on preempt=n kernels adding the preempt_active bit is completely
harmless. So instead of having an extra function, moving the existing
one out of the ifdef PREEMPT looks like the sanest thing to do.

It would also allow getting rid of various other sti/schedule/cli asm
magic in other archs.

Reported-and-Tested-by: Tony Luck <tony.luck@gmail.com>
Fixes: aa0d53260596 ("ia64: Use preempt_schedule_irq")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[slightly edited Changelog]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1311211230030.30673@ionos.tec.linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 718730dd0480..e85cda20ab2b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2660,6 +2660,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
 	} while (need_resched());
 }
 EXPORT_SYMBOL(preempt_schedule);
+#endif /* CONFIG_PREEMPT */
 
 /*
  * this is the entry point to schedule() from kernel preemption
@@ -2693,8 +2694,6 @@ asmlinkage void __sched preempt_schedule_irq(void)
 	exception_exit(prev_state);
 }
 
-#endif /* CONFIG_PREEMPT */
-
 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
-- 
cgit v1.2.3


From 8e8339a3a1069141985daaa2521ba304509ddecd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 11 Dec 2013 11:09:53 +0100
Subject: sched: Initialize power_orig for overlapping groups

Yinghai reported that he saw a /0 in sg_capacity on his EX parts.
Make sure to always initialize power_orig now that we actually use it.

Ideally build_sched_domains() -> init_sched_groups_power() would also
initialize this; but for some yet unexplained reason some setups seem
to miss updates there.

Reported-by: Yinghai Lu <yinghai@kernel.org>
Tested-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-l8ng2m9uml6fhibln8wqpom7@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e85cda20ab2b..19af58f3a261 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5112,6 +5112,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 		 * die on a /0 trap.
 		 */
 		sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+		sg->sgp->power_orig = sg->sgp->power;
 
 		/*
 		 * Make sure the first group of this domain contains the
-- 
cgit v1.2.3


From 9dbdb155532395ba000c5d5d187658b0e17e529f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 18 Nov 2013 18:27:06 +0100
Subject: sched/fair: Rework sched_fair time accounting

Christian suffers from a bad BIOS that wrecks his i5's TSC sync. This
results in him occasionally seeing time going backwards - which
crashes the scheduler ...

Most of our time accounting can actually handle that except the most
common one; the tick time update of sched_fair.

There is a further problem with that code; previously we assumed that
because we get a tick every TICK_NSEC our time delta could never
exceed 32bits and math was simpler.

However, ever since Frederic managed to get NO_HZ_FULL merged; this is
no longer the case since now a task can run for a long time indeed
without getting a tick. It only takes about ~4.2 seconds to overflow
our u32 in nanoseconds.

This means we not only need to better deal with time going backwards;
but also means we need to be able to deal with large deltas.

This patch reworks the entire code and uses mul_u64_u32_shr() as
proposed by Andy a long while ago.

We express our virtual time scale factor in a u32 multiplier and shift
right and the 32bit mul_u64_u32_shr() implementation reduces to a
single 32x32->64 multiply if the time delta is still short (common
case).

For 64bit a 64x64->128 multiply can be used if ARCH_SUPPORTS_INT128.

Reported-and-Tested-by: Christian Engelmayer <cengelma@gmx.at>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: fweisbec@gmail.com
Cc: Paul Turner <pjt@google.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20131118172706.GI3866@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |   3 +-
 kernel/sched/fair.c   | 144 ++++++++++++++++++++++----------------------------
 2 files changed, 66 insertions(+), 81 deletions(-)

(limited to 'kernel/sched')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 96d674ba3876..53f97eb8dbc7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -930,7 +930,8 @@ struct pipe_inode_info;
 struct uts_namespace;
 
 struct load_weight {
-	unsigned long weight, inv_weight;
+	unsigned long weight;
+	u32 inv_weight;
 };
 
 struct sched_avg {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd773ade1a31..9030da7bcb15 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
 	update_sysctl();
 }
 
-#if BITS_PER_LONG == 32
-# define WMULT_CONST	(~0UL)
-#else
-# define WMULT_CONST	(1UL << 32)
-#endif
-
+#define WMULT_CONST	(~0U)
 #define WMULT_SHIFT	32
 
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+	unsigned long w;
+
+	if (likely(lw->inv_weight))
+		return;
+
+	w = scale_load_down(lw->weight);
+
+	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+		lw->inv_weight = 1;
+	else if (unlikely(!w))
+		lw->inv_weight = WMULT_CONST;
+	else
+		lw->inv_weight = WMULT_CONST / w;
+}
 
 /*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
  */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-		struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 {
-	u64 tmp;
+	u64 fact = scale_load_down(weight);
+	int shift = WMULT_SHIFT;
 
-	/*
-	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-	 * 2^SCHED_LOAD_RESOLUTION.
-	 */
-	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-		tmp = (u64)delta_exec * scale_load_down(weight);
-	else
-		tmp = (u64)delta_exec;
+	__update_inv_weight(lw);
 
-	if (!lw->inv_weight) {
-		unsigned long w = scale_load_down(lw->weight);
-
-		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-			lw->inv_weight = 1;
-		else if (unlikely(!w))
-			lw->inv_weight = WMULT_CONST;
-		else
-			lw->inv_weight = WMULT_CONST / w;
+	if (unlikely(fact >> 32)) {
+		while (fact >> 32) {
+			fact >>= 1;
+			shift--;
+		}
 	}
 
-	/*
-	 * Check whether we'd overflow the 64-bit multiplication:
-	 */
-	if (unlikely(tmp > WMULT_CONST))
-		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-			WMULT_SHIFT/2);
-	else
-		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+	/* hint to use a 32x32->64 mul */
+	fact = (u64)(u32)fact * lw->inv_weight;
+
+	while (fact >> 32) {
+		fact >>= 1;
+		shift--;
+	}
 
-	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+	return mul_u64_u32_shr(delta_exec, fact, shift);
 }
 
 
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 /*
  * delta /= w
  */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
 	if (unlikely(se->load.weight != NICE_0_LOAD))
-		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 
 	return delta;
 }
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			update_load_add(&lw, se->load.weight);
 			load = &lw;
 		}
-		slice = calc_delta_mine(slice, se->load.weight, load);
+		slice = __calc_delta(slice, se->load.weight, load);
 	}
 	return slice;
 }
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
 #endif
 
 /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
  */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-	      unsigned long delta_exec)
-{
-	unsigned long delta_exec_weighted;
-
-	schedstat_set(curr->statistics.exec_max,
-		      max((u64)delta_exec, curr->statistics.exec_max));
-
-	curr->sum_exec_runtime += delta_exec;
-	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-	curr->vruntime += delta_exec_weighted;
-	update_min_vruntime(cfs_rq);
-}
-
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	u64 now = rq_clock_task(rq_of(cfs_rq));
-	unsigned long delta_exec;
+	u64 delta_exec;
 
 	if (unlikely(!curr))
 		return;
 
-	/*
-	 * Get the amount of time the current task was running
-	 * since the last time we changed load (this cannot
-	 * overflow on 32 bits):
-	 */
-	delta_exec = (unsigned long)(now - curr->exec_start);
-	if (!delta_exec)
+	delta_exec = now - curr->exec_start;
+	if (unlikely((s64)delta_exec <= 0))
 		return;
 
-	__update_curr(cfs_rq, curr, delta_exec);
 	curr->exec_start = now;
 
+	schedstat_set(curr->statistics.exec_max,
+		      max(delta_exec, curr->statistics.exec_max));
+
+	curr->sum_exec_runtime += delta_exec;
+	schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+	curr->vruntime += calc_delta_fair(delta_exec, curr);
+	update_min_vruntime(cfs_rq);
+
 	if (entity_is_task(curr)) {
 		struct task_struct *curtask = task_of(curr);
 
@@ -3015,8 +3001,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	}
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				     unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3019,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
 		return;
@@ -3574,8 +3559,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 	return rq_clock_task(rq_of(cfs_rq));
 }
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				     unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
-- 
cgit v1.2.3


From 5d4cf996cf134e8ddb4f906b8197feb9267c2b77 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Tue, 17 Dec 2013 09:21:25 +0000
Subject: sched: Assign correct scheduling domain to 'sd_llc'

Commit 42eb088e (sched: Avoid NULL dereference on sd_busy) corrected a NULL
dereference on sd_busy but the fix also altered what scheduling domain it
used for the 'sd_llc' percpu variable.

One impact of this is that a task selecting a runqueue may consider
idle CPUs that are not cache siblings as candidates for running.
Tasks are then running on CPUs that are not cache hot.

This was found through bisection where ebizzy threads were not seeing equal
performance and it looked like a scheduling fairness issue. This patch
mitigates but does not completely fix the problem on all machines tested
implying there may be an additional bug or a common root cause. Here are
the average range of performance seen by individual ebizzy threads. It
was tested on top of candidate patches related to x86 TLB range flushing.

	4-core machine
			    3.13.0-rc3            3.13.0-rc3
			       vanilla            fixsd-v3r3
	Mean   1        0.00 (  0.00%)        0.00 (  0.00%)
	Mean   2        0.34 (  0.00%)        0.10 ( 70.59%)
	Mean   3        1.29 (  0.00%)        0.93 ( 27.91%)
	Mean   4        7.08 (  0.00%)        0.77 ( 89.12%)
	Mean   5      193.54 (  0.00%)        2.14 ( 98.89%)
	Mean   6      151.12 (  0.00%)        2.06 ( 98.64%)
	Mean   7      115.38 (  0.00%)        2.04 ( 98.23%)
	Mean   8      108.65 (  0.00%)        1.92 ( 98.23%)

	8-core machine
	Mean   1         0.00 (  0.00%)        0.00 (  0.00%)
	Mean   2         0.40 (  0.00%)        0.21 ( 47.50%)
	Mean   3        23.73 (  0.00%)        0.89 ( 96.25%)
	Mean   4        12.79 (  0.00%)        1.04 ( 91.87%)
	Mean   5        13.08 (  0.00%)        2.42 ( 81.50%)
	Mean   6        23.21 (  0.00%)       69.46 (-199.27%)
	Mean   7        15.85 (  0.00%)      101.72 (-541.77%)
	Mean   8       109.37 (  0.00%)       19.13 ( 82.51%)
	Mean   12      124.84 (  0.00%)       28.62 ( 77.07%)
	Mean   16      113.50 (  0.00%)       24.16 ( 78.71%)

It's eliminated for one machine and reduced for another.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Alex Shi <alex.shi@linaro.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: H Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20131217092124.GV11295@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 19af58f3a261..a88f4a485c5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4902,6 +4902,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain *sd;
+	struct sched_domain *busy_sd = NULL;
 	int id = cpu;
 	int size = 1;
 
@@ -4909,9 +4910,9 @@ static void update_top_cache_domain(int cpu)
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
-		sd = sd->parent; /* sd_busy */
+		busy_sd = sd->parent; /* sd_busy */
 	}
-	rcu_assign_pointer(per_cpu(sd_busy, cpu), sd);
+	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
-- 
cgit v1.2.3


From 757dfcaa41844595964f1220f1d33182dae49976 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai
Date: Wed, 27 Nov 2013 19:59:13 +0400
Subject: sched/rt: Fix rq's cpupri leak while enqueue/dequeue child RT
 entities

This patch touches the RT group scheduling case.

Functions inc_rt_prio_smp() and dec_rt_prio_smp() change (global) rq's
priority, while rt_rq passed to them may be not the top-level rt_rq.
This is wrong, because changing of priority on a child level does not
guarantee that the priority is the highest all over the rq. So, this
leak makes RT balancing unusable.

The short example: the task having the highest priority among all rq's
RT tasks (no one other task has the same priority) are waking on a
throttle rt_rq.  The rq's cpupri is set to the task's priority
equivalent, but real rq->rt.highest_prio.curr is less.

The patch below fixes the problem.

Signed-off-by: Kirill Tkhai <tkhai@yandex.ru>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/49231385567953@web4m.yandex.ru
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/rt.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel/sched')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7d57275fc396..1c4065575fa2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -901,6 +901,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
 	struct rq *rq = rq_of_rt_rq(rt_rq);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	/*
+	 * Change rq's cpupri only if rt_rq is the top queue.
+	 */
+	if (&rq->rt != rt_rq)
+		return;
+#endif
 	if (rq->online && prio < prev_prio)
 		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 }
@@ -910,6 +917,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
 	struct rq *rq = rq_of_rt_rq(rt_rq);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	/*
+	 * Change rq's cpupri only if rt_rq is the top queue.
+	 */
+	if (&rq->rt != rt_rq)
+		return;
+#endif
 	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
 		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 }
-- 
cgit v1.2.3


From 3c67f474558748b604e247d92b55dfe89654c81d Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Wed, 18 Dec 2013 17:08:40 -0800
Subject: sched: numa: skip inaccessible VMAs

Inaccessible VMA should not be trapping NUMA hint faults. Skip them.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/fair.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel/sched')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9030da7bcb15..c7395d97e4cb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1738,6 +1738,13 @@ void task_numa_work(struct callback_head *work)
 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
 			continue;
 
+		/*
+		 * Skip inaccessible VMAs to avoid any confusion between
+		 * PROT_NONE and NUMA hinting ptes
+		 */
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+			continue;
+
 		do {
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
-- 
cgit v1.2.3