From a2d477778e82a60a0b7114cefdb70aa43af28782 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 12 Nov 2008 16:19:00 +0530 Subject: sched: fix stale value in average load per task Impact: fix load balancer load average calculation accuracy cpu_avg_load_per_task() returns a stale value when nr_running is 0. It returns an older stale (caculated when nr_running was non zero) value. This patch returns and sets rq->avg_load_per_task to zero when nr_running is 0. Compile and boot tested on a x86_64 box. Signed-off-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 50a21f964679..3bafbe350f4f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1456,6 +1456,8 @@ static unsigned long cpu_avg_load_per_task(int cpu) if (rq->nr_running) rq->avg_load_per_task = rq->load.weight / rq->nr_running; + else + rq->avg_load_per_task = 0; return rq->avg_load_per_task; } -- cgit v1.2.3 From 5cbd54ef470d880fc37fbe4b21eb514806d51e0d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 Nov 2008 20:05:50 +0100 Subject: sched: fix init_idle()'s use of sched_clock() Maciej Rutecki reported: > I have this bug during suspend to disk: > > [ 188.592151] Enabling non-boot CPUs ... > [ 188.592151] SMP alternatives: switching to SMP code > [ 188.666058] BUG: using smp_processor_id() in preemptible > [00000000] > code: suspend_to_disk/2934 > [ 188.666064] caller is native_sched_clock+0x2b/0x80 Which, as noted by Linus, was caused by me, via: 7cbaef9c "sched: optimize sched_clock() a bit" Move the rq locking a bit earlier in the initialization sequence, that will make the sched_clock() call in init_idle() non-preemptible. Reported-by: Maciej Rutecki Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3bafbe350f4f..c94baf2969e7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5870,6 +5870,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; + spin_lock_irqsave(&rq->lock, flags); + __sched_fork(idle); idle->se.exec_start = sched_clock(); @@ -5877,7 +5879,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->cpus_allowed = cpumask_of_cpu(cpu); __set_task_cpu(idle, cpu); - spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) idle->oncpu = 1; -- cgit v1.2.3 From 700018e0a77b4113172257fcdaa1c58e27a5074f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 18 Nov 2008 14:02:03 +0800 Subject: cpuset: fix regression when failed to generate sched domains Impact: properly rebuild sched-domains on kmalloc() failure When cpuset failed to generate sched domains due to kmalloc() failure, the scheduler should fallback to the single partition 'fallback_doms' and rebuild sched domains, but now it only destroys but not rebuilds sched domains. The regression was introduced by: | commit dfb512ec4834116124da61d6c1ee10fd0aa32bd6 | Author: Max Krasnyansky | Date: Fri Aug 29 13:11:41 2008 -0700 | | sched: arch_reinit_sched_domains() must destroy domains to force rebuild After the above commit, partition_sched_domains(0, NULL, NULL) will only destroy sched domains and partition_sched_domains(1, NULL, NULL) will create the default sched domain. Signed-off-by: Li Zefan Cc: Max Krasnyansky Cc: Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 12 ++++++++---- kernel/sched.c | 13 +++++++------ 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e00526f52ec..81fc6791a296 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -587,7 +587,6 @@ static int generate_sched_domains(cpumask_t **domains, int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ - ndoms = 0; doms = NULL; dattr = NULL; csa = NULL; @@ -674,10 +673,8 @@ restart: * Convert to and populate cpu masks. */ doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); - if (!doms) { - ndoms = 0; + if (!doms) goto done; - } /* * The rest of the code, including the scheduler, can deal with @@ -732,6 +729,13 @@ restart: done: kfree(csa); + /* + * Fallback to the default domain if kmalloc() failed. + * See comments in partition_sched_domains(). + */ + if (doms == NULL) + ndoms = 1; + *domains = doms; *attributes = dattr; return ndoms; diff --git a/kernel/sched.c b/kernel/sched.c index c94baf2969e7..9b1e79371c20 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7789,13 +7789,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * The passed in 'doms_new' should be kmalloc'd. This routine takes * ownership of it and will kfree it when done with it. If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL, - * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms', it also forces the domains to be rebuilt. + * failed the kmalloc call, then it can pass in doms_new == NULL && + * ndoms_new == 1, and partition_sched_domains() will fallback to + * the single partition 'fallback_doms', it also forces the domains + * to be rebuilt. * - * If doms_new==NULL it will be replaced with cpu_online_map. - * ndoms_new==0 is a special case for destroying existing domains. - * It will not create the default domain. + * If doms_new == NULL it will be replaced with cpu_online_map. + * ndoms_new == 0 is a special case for destroying existing domains, + * and it will not create the default domain. * * Call with hotplug lock held */ -- cgit v1.2.3 From 4cd4262034849da01eb88659af677b69f8169f06 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 21:04:24 -0500 Subject: sched: prevent divide by zero error in cpu_avg_load_per_task Impact: fix divide by zero crash in scheduler rebalance irq While testing the branch profiler, I hit this crash: divide error: 0000 [#1] PREEMPT SMP [...] RIP: 0010:[] [] cpu_avg_load_per_task+0x50/0x7f [...] Call Trace: <0> [] find_busiest_group+0x3e5/0xcaa [] rebalance_domains+0x2da/0xa21 [] ? find_next_bit+0x1b2/0x1e6 [] run_rebalance_domains+0x112/0x19f [] __do_softirq+0xa8/0x232 [] call_softirq+0x1c/0x3e [] do_softirq+0x94/0x1cd [] irq_exit+0x6b/0x10e [] smp_apic_timer_interrupt+0xd3/0xff [] apic_timer_interrupt+0x13/0x20 The code for cpu_avg_load_per_task has: if (rq->nr_running) rq->avg_load_per_task = rq->load.weight / rq->nr_running; The runqueue lock is not held here, and there is nothing that prevents the rq->nr_running from going to zero after it passes the if condition. The branch profiler simply made the race window bigger. This patch saves off the rq->nr_running to a local variable and uses that for both the condition and the division. Signed-off-by: Steven Rostedt Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9b1e79371c20..700aa9a1413f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1453,9 +1453,10 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long nr_running = rq->nr_running; - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; + if (nr_running) + rq->avg_load_per_task = rq->load.weight / nr_running; else rq->avg_load_per_task = 0; -- cgit v1.2.3 From af6d596fd603219b054c1c90fb16672a9fd441bd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 29 Nov 2008 20:45:15 +0100 Subject: sched: prevent divide by zero error in cpu_avg_load_per_task, update Regarding the bug addressed in: 4cd4262: sched: prevent divide by zero error in cpu_avg_load_per_task Linus points out that the fix is not complete: > There's nothing that keeps gcc from deciding not to reload > rq->nr_running. > > Of course, in _practice_, I don't think gcc ever will (if it decides > that it will spill, gcc is likely going to decide that it will > literally spill the local variable to the stack rather than decide to > reload off the pointer), but it's a valid compiler optimization, and > it even has a name (rematerialization). > > So I suspect that your patch does fix the bug, but it still leaves the > fairly unlikely _potential_ for it to re-appear at some point. > > We have ACCESS_ONCE() as a macro to guarantee that the compiler > doesn't rematerialize a pointer access. That also would clarify > the fact that we access something unsafe outside a lock. So make sure our nr_running value is immutable and cannot change after we check it for nonzero. Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 700aa9a1413f..b7480fb5c3dc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1453,7 +1453,7 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = rq->nr_running; + unsigned long nr_running = ACCESS_ONCE(rq->nr_running); if (nr_running) rq->avg_load_per_task = rq->load.weight / nr_running; -- cgit v1.2.3