From bd425d4bfc7a1a6064dbbadfbac9c7eec0e426ec Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 22 Jun 2016 18:03:12 +0100 Subject: sched/core: Fix power to capacity renaming in comment It seems that this one escaped Nico's renaming of cpu_power to cpu_capacity a while back. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: linux-kernel@vger.kernel.org Cc: mgalbraith@suse.de Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1466615004-3503-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 62c68e513e39..f3db596efd2c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1022,7 +1022,7 @@ extern void wake_up_q(struct wake_q_head *head); #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ -- cgit v1.2.3 From d1c6d149cf04d6c7c3c3ebf4b66c82500cbcf6e1 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 23 Jul 2016 09:46:39 +0200 Subject: sched/debug: Make the "Preemption disabled at ..." message more useful This message is currently really useless since it always prints a value that comes from the printk() we just did, e.g.: BUG: sleeping function called from invalid context at mm/slab.h:388 in_atomic(): 0, irqs_disabled(): 0, pid: 31996, name: trinity-c1 Preemption disabled at:[] down_trylock+0x13/0x80 BUG: sleeping function called from invalid context at include/linux/freezer.h:56 in_atomic(): 0, irqs_disabled(): 0, pid: 31996, name: trinity-c1 Preemption disabled at:[] console_unlock+0x2f7/0x930 Here, both down_trylock() and console_unlock() are somewhere in the printk() path. We should save the value before calling printk() and use the saved value instead. That immediately reveals the offending callsite: BUG: sleeping function called from invalid context at mm/slab.h:388 in_atomic(): 0, irqs_disabled(): 0, pid: 14971, name: trinity-c2 Preemption disabled at:[] rhashtable_walk_start+0x46/0x150 Bug report: http://marc.info/?l=linux-netdev&m=146925979821849&w=2 Signed-off-by: Vegard Nossum Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Rusty Russel Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 +++++++++ kernel/sched/core.c | 21 +++++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f3db596efd2c..7f64e89a5873 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3236,6 +3236,15 @@ static inline void cond_resched_rcu(void) #endif } +static inline unsigned long get_preempt_disable_ip(struct task_struct *p) +{ +#ifdef CONFIG_DEBUG_PREEMPT + return p->preempt_disable_ip; +#else + return 0; +#endif +} + /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPT, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 10f2595c408a..a65681605aef 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3171,6 +3171,9 @@ static inline void preempt_latency_stop(int val) { } */ static noinline void __schedule_bug(struct task_struct *prev) { + /* Save this before calling printk(), since that will clobber it */ + unsigned long preempt_disable_ip = get_preempt_disable_ip(current); + if (oops_in_progress) return; @@ -3181,13 +3184,12 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } -#endif if (panic_on_warn) panic("scheduling while atomic\n"); @@ -7571,6 +7573,7 @@ EXPORT_SYMBOL(__might_sleep); void ___might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ + unsigned long preempt_disable_ip; rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && @@ -7581,6 +7584,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; + /* Save this before calling printk(), since that will clobber it */ + preempt_disable_ip = get_preempt_disable_ip(current); + printk(KERN_ERR "BUG: sleeping function called from invalid context at %s:%d\n", file, line); @@ -7595,13 +7601,12 @@ void ___might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); -#ifdef CONFIG_DEBUG_PREEMPT - if (!preempt_count_equals(preempt_offset)) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } -#endif dump_stack(); } EXPORT_SYMBOL(___might_sleep); -- cgit v1.2.3 From 1f6e6c7cb9bcd58abb5ee11243e0eefe6b36fc8e Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:22 +0100 Subject: sched/core: Introduce SD_ASYM_CPUCAPACITY sched_domain topology flag Add a topology flag to the sched_domain hierarchy indicating the lowest domain level where the full range of CPU capacities is represented by the domain members for asymmetric capacity topologies (e.g. ARM big.LITTLE). 
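As an illustration only (not part of this patch): an architecture that knows its top topology level spans both the little and the big CPUs could advertise the new flag through the existing sched_domain_flags_f callback of its topology table and install it with set_sched_topology(). The callback and table names below are invented for the sketch; SD_ASYM_CPUCAPACITY, SD_INIT_NAME(), cpu_core_flags() and the cpu_*_mask() helpers are existing kernel symbols.

static int example_die_flags(void)
{
        /* this level sees the full range of CPU capacities */
        return SD_ASYM_CPUCAPACITY;
}

static struct sched_domain_topology_level example_topology[] = {
#ifdef CONFIG_SCHED_MC
        { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
        { cpu_cpu_mask, example_die_flags, SD_INIT_NAME(DIE) },
        { NULL, },
};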
The flag is intended to indicate that extra care should be taken when placing tasks on CPUs and this level spans all the different types of CPUs found in the system (no need to look further up the domain hierarchy). This information is currently only available through iterating through the capacities of all the CPUs at parent levels in the sched_domain hierarchy. SD 2 [ 0 1 2 3] SD_ASYM_CPUCAPACITY SD 1 [ 0 1] [ 2 3] !SD_ASYM_CPUCAPACITY CPU: 0 1 2 3 capacity: 756 756 1024 1024 If the topology in the example above is duplicated to create an eight CPU example with third sched_domain level on top (SD 3), this level should not have the flag set (!SD_ASYM_CPUCAPACITY) as its two group would both have all CPU capacities represented within them. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-6-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 7f64e89a5873..d75024053e9b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1022,6 +1022,7 @@ extern void wake_up_q(struct wake_q_head *head); #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ +#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */ #define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1b2dd5220170..46bfb90aec00 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5716,6 +5716,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | + SD_ASYM_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) @@ -5746,6 +5747,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | @@ -6363,6 +6365,7 @@ static int sched_domains_curr_level; * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -6374,6 +6377,7 @@ static int sched_domains_curr_level; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * -- cgit v1.2.3 From 9af6528ee9b682df7f29dbee86fbba0b67eab944 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 13 Sep 2016 18:37:29 +0200 Subject: sched/core: Optimize __schedule() Oleg noted that by making do_exit() use __schedule() for the TASK_DEAD context switch, we can 
avoid the TASK_DEAD special case currently in __schedule() because that avoids the extra preempt_disable() from schedule(). In order to facilitate this, create a do_task_dead() helper which we place in the scheduler code, such that it can access __schedule(). Also add some __noreturn annotations to the functions, there's no coming back from do_exit(). Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Cheng Chao Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: chris@chris-wilson.co.uk Cc: tj@kernel.org Link: http://lkml.kernel.org/r/20160913163729.GB5012@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 9 +++------ include/linux/sched.h | 2 ++ kernel/exit.c | 26 ++------------------------ kernel/sched/core.c | 38 +++++++++++++++++++++++++++----------- 4 files changed, 34 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d96a6118d26a..74fd6f05bc5b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -259,17 +259,14 @@ static inline void might_fault(void) { } extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); __printf(1, 2) -void panic(const char *fmt, ...) - __noreturn __cold; +void panic(const char *fmt, ...) __noreturn __cold; void nmi_panic(struct pt_regs *regs, const char *msg); extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); extern int oops_may_print(void); -void do_exit(long error_code) - __noreturn; -void complete_and_exit(struct completion *, long) - __noreturn; +void do_exit(long error_code) __noreturn; +void complete_and_exit(struct completion *, long) __noreturn; /* Internal, do not use. */ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); diff --git a/include/linux/sched.h b/include/linux/sched.h index d75024053e9b..f00ee8e90a29 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -448,6 +448,8 @@ static inline void io_schedule(void) io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); } +void __noreturn do_task_dead(void); + struct nsproxy; struct user_namespace; diff --git a/kernel/exit.c b/kernel/exit.c index 091a78be3b09..1e1d913914c0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -725,7 +725,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -void do_exit(long code) +void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -882,29 +882,7 @@ void do_exit(long code) exit_rcu(); TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); - /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD - * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() - */ - smp_mb(); - raw_spin_unlock_wait(&tsk->pi_lock); - - /* causes final put_task_struct in finish_task_switch(). */ - tsk->state = TASK_DEAD; - tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ - schedule(); - BUG(); - /* Avoid "noreturn function does return". 
*/ - for (;;) - cpu_relax(); /* For when BUG is null */ + do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ff4e3c066dc2..b2ec53c1a974 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3331,17 +3331,6 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - /* - * do_exit() calls schedule() with preemption disabled as an exception; - * however we must fix that up, otherwise the next task will see an - * inconsistent (higher) preempt count. - * - * It also avoids the below schedule_debug() test from complaining - * about this. - */ - if (unlikely(prev->state == TASK_DEAD)) - preempt_enable_no_resched_notrace(); - schedule_debug(prev); if (sched_feat(HRTICK)) @@ -3409,6 +3398,33 @@ static void __sched notrace __schedule(bool preempt) } STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ +void __noreturn do_task_dead(void) +{ + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNINNG. + * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(¤t->pi_lock); + + /* causes final put_task_struct in finish_task_switch(). */ + __set_current_state(TASK_DEAD); + current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ + __schedule(false); + BUG(); + /* Avoid "noreturn function does return". */ + for (;;) + cpu_relax(); /* For when BUG is null */ +} + static inline void sched_submit_work(struct task_struct *tsk) { if (!tsk->state || tsk_is_pi_blocked(tsk)) -- cgit v1.2.3 From 35a773a07926a22bf19d77ee00024522279c4e68 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Sep 2016 12:57:53 +0200 Subject: sched/core: Avoid _cond_resched() for PREEMPT=y On fully preemptible kernels _cond_resched() is pointless, so avoid emitting any code for it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mikulas Patocka Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/sched/core.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f00ee8e90a29..b99fcd1b341e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3209,7 +3209,11 @@ static inline int signal_pending_state(long state, struct task_struct *p) * cond_resched_lock() will drop the spinlock before scheduling, * cond_resched_softirq() will enable bhs before scheduling. 
*/ +#ifndef CONFIG_PREEMPT extern int _cond_resched(void); +#else +static inline int _cond_resched(void) { return 0; } +#endif #define cond_resched() ({ \ ___might_sleep(__FILE__, __LINE__, 0); \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b2ec53c1a974..d7babcc7cb76 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4883,6 +4883,7 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +#ifndef CONFIG_PREEMPT int __sched _cond_resched(void) { if (should_resched(0)) { @@ -4892,6 +4893,7 @@ int __sched _cond_resched(void) return 0; } EXPORT_SYMBOL(_cond_resched); +#endif /* * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -- cgit v1.2.3 From 38a3e1fc1dac480f3672ab22fc97e1f995c80ed7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 6 Sep 2016 16:00:47 +0200 Subject: sched/wait: Fix abort_exclusive_wait(), it should pass TASK_NORMAL to wake_up() Otherwise this logic only works if mode is "compatible" with another exclusive waiter. If some wq has both TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE waiters, abort_exclusive_wait() won't wait an uninterruptible waiter. The main user is __wait_on_bit_lock() and currently it is fine but only because TASK_KILLABLE includes TASK_UNINTERRUPTIBLE and we do not have lock_page_interruptible() yet. Just use TASK_NORMAL and remove the "mode" arg from abort_exclusive_wait(). Yes, this means that (say) wake_up_interruptible() can wake up the non- interruptible waiter(s), but I think this is fine. And in fact I think that abort_exclusive_wait() must die, see the next change. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Al Viro Cc: Bart Van Assche Cc: Johannes Weiner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Neil Brown Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160906140047.GA6157@redhat.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 6 +++--- kernel/sched/wait.c | 8 +++----- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index c3ff74d764fa..e4cfd1ed726e 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -281,8 +281,8 @@ wait_queue_head_t *bit_waitqueue(void *, int); if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ if (exclusive) { \ - abort_exclusive_wait(&wq, &__wait, \ - state, NULL); \ + abort_exclusive_wait(&wq, &__wait, \ + NULL); \ goto __out; \ } \ break; \ @@ -989,7 +989,7 @@ void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key); +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, void *key); long wait_woken(wait_queue_t *wait, unsigned mode, long timeout); int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f15d6b6a538a..2bbba0175ab2 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -259,7 +259,6 @@ EXPORT_SYMBOL(finish_wait); * abort_exclusive_wait - abort exclusive waiting in a queue * @q: waitqueue waited on * @wait: wait descriptor - * @mode: runstate of the waiter to be woken * @key: 
key to identify a wait bit queue or %NULL * * Sets current thread back to running state and removes @@ -273,8 +272,7 @@ EXPORT_SYMBOL(finish_wait); * aborts and is woken up concurrently and no one wakes up * the next waiter. */ -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, - unsigned int mode, void *key) +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, void *key) { unsigned long flags; @@ -283,7 +281,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, key); + __wake_up_locked_key(q, TASK_NORMAL, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); @@ -434,7 +432,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, ret = action(&q->key, mode); if (!ret) continue; - abort_exclusive_wait(wq, &q->wait, mode, &q->key); + abort_exclusive_wait(wq, &q->wait, &q->key); return ret; } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); finish_wait(wq, &q->wait); -- cgit v1.2.3 From b1ea06a90f528e516929a4da1d9b8838752bceb9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 Sep 2016 18:48:15 +0200 Subject: sched/wait: Avoid abort_exclusive_wait() in ___wait_event() ___wait_event() doesn't really need abort_exclusive_wait(), we can simply change prepare_to_wait_event() to remove the waiter from q->task_list if it was interrupted. This simplifies the code/logic, and this way prepare_to_wait_event() can have more users, see the next change. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Al Viro Cc: Bart Van Assche Cc: Johannes Weiner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Neil Brown Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160908164815.GA18801@redhat.com Signed-off-by: Ingo Molnar -- include/linux/wait.h | 7 +------ kernel/sched/wait.c | 35 +++++++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 16 deletions(-) --- include/linux/wait.h | 7 +------ kernel/sched/wait.c | 35 +++++++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index e4cfd1ed726e..7261dcbe5afe 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -280,12 +280,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ - if (exclusive) { \ - abort_exclusive_wait(&wq, &__wait, \ - NULL); \ - goto __out; \ - } \ - break; \ + goto __out; \ } \ \ cmd; \ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 2bbba0175ab2..261239392258 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -199,24 +199,39 @@ EXPORT_SYMBOL(prepare_to_wait_exclusive); long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) { unsigned long flags; - - if (signal_pending_state(state, current)) - return -ERESTARTSYS; + long ret = 0; wait->private = current; wait->func = autoremove_wake_function; spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) { - if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_tail(q, wait); - else - __add_wait_queue(q, wait); + if (unlikely(signal_pending_state(state, current))) { + /* + * Exclusive waiter must not fail if it was selected by wakeup, + * it should "consume" the condition we were waiting for. 
+ * + * The caller will recheck the condition and return success if + * we were already woken up, we can not miss the event because + * wakeup locks/unlocks the same q->lock. + * + * But we need to ensure that set-condition + wakeup after that + * can't see us, it should wake up another exclusive waiter if + * we fail. + */ + list_del_init(&wait->task_list); + ret = -ERESTARTSYS; + } else { + if (list_empty(&wait->task_list)) { + if (wait->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_tail(q, wait); + else + __add_wait_queue(q, wait); + } + set_current_state(state); } - set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); - return 0; + return ret; } EXPORT_SYMBOL(prepare_to_wait_event); -- cgit v1.2.3 From eaf9ef52241b545fe63621266bfc6fd8b06559ff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 6 Sep 2016 16:00:53 +0200 Subject: sched/wait: Avoid abort_exclusive_wait() in __wait_on_bit_lock() __wait_on_bit_lock() doesn't need abort_exclusive_wait() too. Right now it can't use prepare_to_wait_event() (see the next change), but it can do the additional finish_wait() if action() fails. abort_exclusive_wait() no longer has callers, remove it. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Al Viro Cc: Bart Van Assche Cc: Johannes Weiner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Neil Brown Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160906140053.GA6164@redhat.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 1 - kernel/sched/wait.c | 64 +++++++++++++++++----------------------------------- 2 files changed, 21 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 7261dcbe5afe..19c75f9545ce 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -984,7 +984,6 @@ void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, void *key); long wait_woken(wait_queue_t *wait, unsigned mode, long timeout); int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 261239392258..0cb615d69013 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -270,37 +270,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(finish_wait); -/** - * abort_exclusive_wait - abort exclusive waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * @key: key to identify a wait bit queue or %NULL - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. - * - * Wakes up the next waiter if the caller is concurrently - * woken up through the queue. - * - * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and no one wakes up - * the next waiter. 
- */ -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, void *key) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - spin_lock_irqsave(&q->lock, flags); - if (!list_empty(&wait->task_list)) - list_del_init(&wait->task_list); - else if (waitqueue_active(q)) - __wake_up_locked_key(q, TASK_NORMAL, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(abort_exclusive_wait); - int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); @@ -438,20 +407,29 @@ int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, wait_bit_action_f *action, unsigned mode) { - do { - int ret; + int ret = 0; + for (;;) { prepare_to_wait_exclusive(wq, &q->wait, mode); - if (!test_bit(q->key.bit_nr, q->key.flags)) - continue; - ret = action(&q->key, mode); - if (!ret) - continue; - abort_exclusive_wait(wq, &q->wait, &q->key); - return ret; - } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); - finish_wait(wq, &q->wait); - return 0; + if (test_bit(q->key.bit_nr, q->key.flags)) { + ret = action(&q->key, mode); + /* + * See the comment in prepare_to_wait_event(). + * finish_wait() does not necessarily takes wq->lock, + * but test_and_set_bit() implies mb() which pairs with + * smp_mb__after_atomic() before wake_up_page(). + */ + if (ret) + finish_wait(wq, &q->wait); + } + if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { + if (!ret) + finish_wait(wq, &q->wait); + return 0; + } else if (ret) { + return ret; + } + } } EXPORT_SYMBOL(__wait_on_bit_lock); -- cgit v1.2.3 From 0176beaffbe9ed627b6a4dfa61d640f1a848086f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 6 Sep 2016 16:00:55 +0200 Subject: sched/wait: Introduce init_wait_entry() The partial initialization of wait_queue_t in prepare_to_wait_event() looks ugly. This was done to shrink .text, but we can simply add the new helper which does the full initialization and shrink the compiled code a bit more. And. This way prepare_to_wait_event() can have more users. In particular we are ready to remove the signal_pending_state() checks from wait_bit_action_f helpers and change __wait_on_bit_lock() to use prepare_to_wait_event(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Al Viro Cc: Bart Van Assche Cc: Johannes Weiner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Neil Brown Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160906140055.GA6167@redhat.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 9 +++------ kernel/sched/wait.c | 12 +++++++++--- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 19c75f9545ce..2408e8d5c05c 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -248,6 +248,8 @@ wait_queue_head_t *bit_waitqueue(void *, int); (!__builtin_constant_p(state) || \ state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ +extern void init_wait_entry(wait_queue_t *__wait, int flags); + /* * The below macro ___wait_event() has an explicit shadow of the __ret * variable when used from the wait_event_*() macros. @@ -266,12 +268,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); wait_queue_t __wait; \ long __ret = ret; /* explicit shadow */ \ \ - INIT_LIST_HEAD(&__wait.task_list); \ - if (exclusive) \ - __wait.flags = WQ_FLAG_EXCLUSIVE; \ - else \ - __wait.flags = 0; \ - \ + init_wait_entry(&__wait, exclusive ? 
WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(&wq, &__wait, state);\ \ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 0cb615d69013..4f7053579fe3 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -196,14 +196,20 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); +void init_wait_entry(wait_queue_t *wait, int flags) +{ + wait->flags = flags; + wait->private = current; + wait->func = autoremove_wake_function; + INIT_LIST_HEAD(&wait->task_list); +} +EXPORT_SYMBOL(init_wait_entry); + long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) { unsigned long flags; long ret = 0; - wait->private = current; - wait->func = autoremove_wake_function; - spin_lock_irqsave(&q->lock, flags); if (unlikely(signal_pending_state(state, current))) { /* -- cgit v1.2.3 From 24fc7edb92eea05946119cc0258c891c26b3b469 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 May 2016 10:37:59 +0200 Subject: sched/core: Introduce 'struct sched_domain_shared' Since struct sched_domain is strictly per cpu; introduce a structure that is shared between all 'identical' sched_domains. Limit to SD_SHARE_PKG_RESOURCES domains for now, as we'll only use it for shared cache state; if another use comes up later we can easily relax this. While the sched_group's are normally shared between CPUs, these are not natural to use when we need some shared state on a domain level -- since that would require the domain to have a parent, which is not a given. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++++ kernel/sched/core.c | 44 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index b99fcd1b341e..8a878b9649a1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1067,6 +1067,10 @@ extern int sched_domain_level_max; struct sched_group; +struct sched_domain_shared { + atomic_t ref; +}; + struct sched_domain { /* These fields must be setup */ struct sched_domain *parent; /* top domain must be null terminated */ @@ -1135,6 +1139,7 @@ struct sched_domain { void *private; /* used during construction */ struct rcu_head rcu; /* used during destruction */ }; + struct sched_domain_shared *shared; unsigned int span_weight; /* @@ -1168,6 +1173,7 @@ typedef int (*sched_domain_flags_f)(void); struct sd_data { struct sched_domain **__percpu sd; + struct sched_domain_shared **__percpu sds; struct sched_group **__percpu sg; struct sched_group_capacity **__percpu sgc; }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 662d08d7b1df..97d3c18d00bf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5947,6 +5947,8 @@ static void destroy_sched_domain(struct sched_domain *sd) kfree(sd->groups->sgc); kfree(sd->groups); } + if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) + kfree(sd->shared); kfree(sd); } @@ -6385,6 +6387,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) + *per_cpu_ptr(sdd->sds, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; @@ -6429,10 +6434,12 @@ static 
int sched_domains_curr_level; static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, struct sched_domain *child, int cpu) { - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); - int sd_weight, sd_flags = 0; + struct sd_data *sdd = &tl->data; + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + int sd_id, sd_weight, sd_flags = 0; #ifdef CONFIG_NUMA /* @@ -6487,6 +6494,9 @@ sd_init(struct sched_domain_topology_level *tl, #endif }; + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + sd_id = cpumask_first(sched_domain_span(sd)); + /* * Convert topological properties into behaviour. */ @@ -6529,7 +6539,16 @@ sd_init(struct sched_domain_topology_level *tl, sd->idle_idx = 1; } - sd->private = &tl->data; + /* + * For all levels sharing cache; connect a sched_domain_shared + * instance. + */ + if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->shared = *per_cpu_ptr(sdd->sds, sd_id); + atomic_inc(&sd->shared->ref); + } + + sd->private = sdd; return sd; } @@ -6839,6 +6858,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sd) return -ENOMEM; + sdd->sds = alloc_percpu(struct sched_domain_shared *); + if (!sdd->sds) + return -ENOMEM; + sdd->sg = alloc_percpu(struct sched_group *); if (!sdd->sg) return -ENOMEM; @@ -6849,6 +6872,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) for_each_cpu(j, cpu_map) { struct sched_domain *sd; + struct sched_domain_shared *sds; struct sched_group *sg; struct sched_group_capacity *sgc; @@ -6859,6 +6883,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sd, j) = sd; + sds = kzalloc_node(sizeof(struct sched_domain_shared), + GFP_KERNEL, cpu_to_node(j)); + if (!sds) + return -ENOMEM; + + *per_cpu_ptr(sdd->sds, j) = sds; + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sg) @@ -6898,6 +6929,8 @@ static void __sdt_free(const struct cpumask *cpu_map) kfree(*per_cpu_ptr(sdd->sd, j)); } + if (sdd->sds) + kfree(*per_cpu_ptr(sdd->sds, j)); if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); if (sdd->sgc) @@ -6905,6 +6938,8 @@ static void __sdt_free(const struct cpumask *cpu_map) } free_percpu(sdd->sd); sdd->sd = NULL; + free_percpu(sdd->sds); + sdd->sds = NULL; free_percpu(sdd->sg); sdd->sg = NULL; free_percpu(sdd->sgc); @@ -6916,9 +6951,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, child, cpu); + struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); -- cgit v1.2.3 From 0e369d757578b23ac50b893f920aa50fdbc45fb6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 May 2016 10:38:01 +0200 Subject: sched/core: Replace sd_busy/nr_busy_cpus with sched_domain_shared Move the nr_busy_cpus thing from its hacky sd->parent->groups->sgc location into the much more natural sched_domain_shared location. 
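As a quick illustration of the new access pattern (the helper name is invented; its body mirrors the nohz_kick_needed() hunk below), the per-LLC busy count is now reached through the sd_llc_shared pointer instead of via sd->parent->groups->sgc:

static int llc_nr_busy_cpus(int cpu)
{
        struct sched_domain_shared *sds;
        int nr_busy = 0;

        rcu_read_lock();
        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
        if (sds)
                nr_busy = atomic_read(&sds->nr_busy_cpus);
        rcu_read_unlock();

        return nr_busy;
}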
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 10 +++++----- kernel/sched/fair.c | 22 ++++++++++++---------- kernel/sched/sched.h | 6 +----- 4 files changed, 19 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8a878b9649a1..98888f1a03bc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1069,6 +1069,7 @@ struct sched_group; struct sched_domain_shared { atomic_t ref; + atomic_t nr_busy_cpus; }; struct sched_domain { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 97d3c18d00bf..26826eac5545 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5981,14 +5981,14 @@ static void destroy_sched_domains(struct sched_domain *sd) DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_busy); DEFINE_PER_CPU(struct sched_domain *, sd_asym); static void update_top_cache_domain(int cpu) { + struct sched_domain_shared *sds = NULL; struct sched_domain *sd; - struct sched_domain *busy_sd = NULL; int id = cpu; int size = 1; @@ -5996,13 +5996,13 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - busy_sd = sd->parent; /* sd_busy */ + sds = sd->shared; } - rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -6299,7 +6299,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) return; update_group_capacity(sd, cpu); - atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } /* @@ -6546,6 +6545,7 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->shared = *per_cpu_ptr(sdd->sds, sd_id); atomic_inc(&sd->shared->ref); + atomic_set(&sd->shared->nr_busy_cpus, sd_weight); } sd->private = sdd; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 786ef94197e0..15902cdb27b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8008,13 +8008,13 @@ static inline void set_cpu_sd_state_busy(void) int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (!sd || !sd->nohz_idle) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->groups->sgc->nr_busy_cpus); + atomic_inc(&sd->shared->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -8025,13 +8025,13 @@ void set_cpu_sd_state_idle(void) int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (!sd || sd->nohz_idle) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->groups->sgc->nr_busy_cpus); + atomic_dec(&sd->shared->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -8258,8 +8258,8 @@ end: static inline bool nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; + struct sched_domain_shared *sds; struct sched_domain *sd; - struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; bool kick 
= false; @@ -8287,11 +8287,13 @@ static inline bool nohz_kick_needed(struct rq *rq) return true; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { - sgc = sd->groups->sgc; - nr_busy = atomic_read(&sgc->nr_busy_cpus); - + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) { + /* + * XXX: write a coherent comment on why we do this. + * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com + */ + nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { kick = true; goto unlock; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 420c05d099c3..4fc6e9876d9c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -858,8 +858,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_capacity { @@ -871,10 +871,6 @@ struct sched_group_capacity { unsigned int capacity; unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ - /* - * Number of busy cpus in this group. - */ - atomic_t nr_busy_cpus; unsigned long cpumask[0]; /* iteration mask */ }; -- cgit v1.2.3 From 10e2f1acd0106c05229f94c70a344ce3a2c8008b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 May 2016 10:38:05 +0200 Subject: sched/core: Rewrite and improve select_idle_siblings() select_idle_siblings() is a known pain point for a number of workloads; it either does too much or not enough and sometimes just does plain wrong. This rewrite attempts to address a number of issues (but sadly not all). The current code does an unconditional sched_domain iteration; with the intent of finding an idle core (on SMT hardware). The problems which this patch tries to address are: - its pointless to look for idle cores if the machine is real busy; at which point you're just wasting cycles. - it's behaviour is inconsistent between SMT and !SMT hardware in that !SMT hardware ends up doing a scan for any idle CPU in the LLC domain, while SMT hardware does a scan for idle cores and if that fails, falls back to a scan for idle threads on the 'target' core. The new code replaces the sched_domain scan with 3 explicit scans: 1) search for an idle core in the LLC 2) search for an idle CPU in the LLC 3) search for an idle thread in the 'target' core where 1 and 3 are conditional on SMT support and 1 and 2 have runtime heuristics to skip the step. Step 1) is conditional on sd_llc_shared->has_idle_cores; when a cpu goes idle and sd_llc_shared->has_idle_cores is false, we scan all SMT siblings of the CPU going idle. Similarly, we clear sd_llc_shared->has_idle_cores when we fail to find an idle core. Step 2) tracks the average cost of the scan and compares this to the average idle time guestimate for the CPU doing the wakeup. There is a significant fudge factor involved to deal with the variability of the averages. Esp. hackbench was sensitive to this. Step 3) is unconditional; we assume (also per step 1) that scanning all SMT siblings in a core is 'cheap'. With this; SMT systems gain step 2, which cures a few benchmarks -- notably one from Facebook. 
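In outline, the wakeup path then becomes the three scans below. This is a condensed, editorial restatement of the select_idle_sibling() added further down in this patch; the name pick_idle() is invented and the idle-target/prev-CPU fast paths are omitted:

static int pick_idle(struct task_struct *p, int target)
{
        struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, target));
        int i;

        if (!sd)
                return target;

        i = select_idle_core(p, sd, target);    /* 1) idle core in the LLC */
        if ((unsigned)i < nr_cpumask_bits)
                return i;

        i = select_idle_cpu(p, sd, target);     /* 2) idle CPU in the LLC */
        if ((unsigned)i < nr_cpumask_bits)
                return i;

        i = select_idle_smt(p, sd, target);     /* 3) idle sibling of target */
        if ((unsigned)i < nr_cpumask_bits)
                return i;

        return target;
}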
One 'feature' of the sched_domain iteration, which we preserve in the new code, is that it would start scanning from the 'target' CPU, instead of scanning the cpumask in cpu id order. This avoids multiple CPUs in the LLC scanning for idle to gang up and find the same CPU quite as much. The down side is that tasks can end up hopping across the LLC for no apparent reason. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 + kernel/sched/core.c | 3 + kernel/sched/fair.c | 267 +++++++++++++++++++++++++++++++++++++++-------- kernel/sched/idle_task.c | 2 +- kernel/sched/sched.h | 6 ++ 5 files changed, 234 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 98888f1a03bc..2c30ed860d66 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1070,6 +1070,7 @@ struct sched_group; struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; + int has_idle_cores; }; struct sched_domain { @@ -1102,6 +1103,8 @@ struct sched_domain { u64 max_newidle_lb_cost; unsigned long next_decay_max_lb_cost; + u64 avg_scan_cost; /* select_idle_sibling */ + #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 26826eac5545..75ecd4f29199 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7478,6 +7478,7 @@ static struct kmem_cache *task_group_cache __read_mostly; #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); +DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); void __init sched_init(void) { @@ -7514,6 +7515,8 @@ void __init sched_init(void) for_each_possible_cpu(i) { per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( cpumask_size(), GFP_KERNEL, cpu_to_node(i)); + per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( + cpumask_size(), GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_CPUMASK_OFFSTACK */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 15902cdb27b3..6b41589c41e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1582,9 +1582,16 @@ balance: * One idle CPU per node is evaluated for a task numa move. * Call select_idle_sibling to maybe find a better one. */ - if (!cur) + if (!cur) { + /* + * select_idle_siblings() uses an per-cpu cpumask that + * can be used from IRQ context. + */ + local_irq_disable(); env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, env->dst_cpu); + local_irq_enable(); + } assign: task_numa_assign(env, cur, imp); @@ -4616,6 +4623,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +/* Working cpumask for: load_balance, load_balance_newidle. */ +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); + #ifdef CONFIG_NO_HZ_COMMON /* * per rq 'load' arrray crap; XXX kill this. @@ -5280,65 +5292,231 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } /* - * Try and locate an idle CPU in the sched_domain. + * Implement a for_each_cpu() variant that starts the scan at a given cpu + * (@start), and wraps around. + * + * This is used to scan for idle CPUs; such that not all CPUs looking for an + * idle CPU find the same CPU. The down-side is that tasks tend to cycle + * through the LLC domain. + * + * Especially tbench is found sensitive to this. 
+ */ + +static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) +{ + int next; + +again: + next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); + + if (*wrapped) { + if (next >= start) + return nr_cpumask_bits; + } else { + if (next >= nr_cpumask_bits) { + *wrapped = 1; + n = -1; + goto again; + } + } + + return next; +} + +#define for_each_cpu_wrap(cpu, mask, start, wrap) \ + for ((wrap) = 0, (cpu) = (start)-1; \ + (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ + (cpu) < nr_cpumask_bits; ) + +#ifdef CONFIG_SCHED_SMT + +static inline void set_idle_cores(int cpu, int val) +{ + struct sched_domain_shared *sds; + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) + WRITE_ONCE(sds->has_idle_cores, val); +} + +static inline bool test_idle_cores(int cpu, bool def) +{ + struct sched_domain_shared *sds; + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) + return READ_ONCE(sds->has_idle_cores); + + return def; +} + +/* + * Scans the local SMT mask to see if the entire core is idle, and records this + * information in sd_llc_shared->has_idle_cores. + * + * Since SMT siblings share all cache levels, inspecting this limited remote + * state should be fairly cheap. + */ +void update_idle_core(struct rq *rq) +{ + int core = cpu_of(rq); + int cpu; + + rcu_read_lock(); + if (test_idle_cores(core, true)) + goto unlock; + + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (cpu == core) + continue; + + if (!idle_cpu(cpu)) + goto unlock; + } + + set_idle_cores(core, 1); +unlock: + rcu_read_unlock(); +} + +/* + * Scan the entire LLC domain for idle cores; this dynamically switches off if + * there are no idle cores left in the system; tracked through + * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. + */ +static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +{ + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + int core, cpu, wrap; + + if (!test_idle_cores(target, false)) + return -1; + + cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p)); + + for_each_cpu_wrap(core, cpus, target, wrap) { + bool idle = true; + + for_each_cpu(cpu, cpu_smt_mask(core)) { + cpumask_clear_cpu(cpu, cpus); + if (!idle_cpu(cpu)) + idle = false; + } + + if (idle) + return core; + } + + /* + * Failed to find an idle core; stop looking for one. + */ + set_idle_cores(target, 0); + + return -1; +} + +/* + * Scan the local SMT mask for idle CPUs. + */ +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ + int cpu; + + for_each_cpu(cpu, cpu_smt_mask(target)) { + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(cpu)) + return cpu; + } + + return -1; +} + +#else /* CONFIG_SCHED_SMT */ + +static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +{ + return -1; +} + +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ + return -1; +} + +#endif /* CONFIG_SCHED_SMT */ + +/* + * Scan the LLC domain for idle CPUs; this is dynamically regulated by + * comparing the average scan cost (tracked in sd->avg_scan_cost) against the + * average idle time for this rq (as found in rq->avg_idle). 
+ */ +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) +{ + struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + u64 avg_idle = this_rq()->avg_idle; + u64 avg_cost = this_sd->avg_scan_cost; + u64 time, cost; + s64 delta; + int cpu, wrap; + + /* + * Due to large variance we need a large fuzz factor; hackbench in + * particularly is sensitive here. + */ + if ((avg_idle / 512) < avg_cost) + return -1; + + time = local_clock(); + + for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(cpu)) + break; + } + + time = local_clock() - time; + cost = this_sd->avg_scan_cost; + delta = (s64)(time - cost) / 8; + this_sd->avg_scan_cost += delta; + + return cpu; +} + +/* + * Try and locate an idle core/thread in the LLC cache domain. */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; - struct sched_group *sg; + int i; if (idle_cpu(target)) return target; /* - * If the prevous cpu is cache affine and idle, don't be stupid. + * If the previous cpu is cache affine and idle, don't be stupid. */ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) return prev; - /* - * Otherwise, iterate the domains and find an eligible idle cpu. - * - * A completely idle sched group at higher domains is more - * desirable than an idle group at a lower level, because lower - * domains have smaller groups and usually share hardware - * resources which causes tasks to contend on them, e.g. x86 - * hyperthread siblings in the lowest domain (SMT) can contend - * on the shared cpu pipeline. - * - * However, while we prefer idle groups at higher domains - * finding an idle cpu at the lowest domain is still better than - * returning 'target', which we've already established, isn't - * idle. - */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - sg = sd->groups; - do { - int i; + if (!sd) + return target; - if (!cpumask_intersects(sched_group_cpus(sg), - tsk_cpus_allowed(p))) - goto next; + i = select_idle_core(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; - /* Ensure the entire group is idle */ - for_each_cpu(i, sched_group_cpus(sg)) { - if (i == target || !idle_cpu(i)) - goto next; - } + i = select_idle_cpu(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + + i = select_idle_smt(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; - /* - * It doesn't matter which cpu we pick, the - * whole group is idle. - */ - target = cpumask_first_and(sched_group_cpus(sg), - tsk_cpus_allowed(p)); - goto done; -next: - sg = sg->next; - } while (sg != sd->groups); - } -done: return target; } @@ -7397,9 +7575,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ #define MAX_PINNED_INTERVAL 512 -/* Working cpumask for load_balance and load_balance_newidle. 
*/ -DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); - static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index dedc81ecbb2e..5405d3feb112 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -27,7 +27,7 @@ static struct task_struct * pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { put_prev_task(rq, prev); - + update_idle_core(rq); schedstat_inc(rq->sched_goidle); return rq->idle; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4fc6e9876d9c..c917dcad82ad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -36,6 +36,12 @@ extern void cpu_load_update_active(struct rq *this_rq); static inline void cpu_load_update_active(struct rq *this_rq) { } #endif +#ifdef CONFIG_SCHED_SMT +extern void update_idle_core(struct rq *rq); +#else +static inline void update_idle_core(struct rq *rq) { } +#endif + /* * Helpers for converting nanosecond timing to jiffy resolution */ -- cgit v1.2.3 From a458ae2ea616420f74480f0f5ed67ca0f3b5dbf7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Sep 2016 20:29:40 +0200 Subject: sched/core, ia64: Rename set_curr_task() Rename the ia64 only set_curr_task() function to free up the name. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/ia64/kernel/mca.c | 10 +++++----- include/linux/sched.h | 2 +- kernel/sched/core.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index eb9220cde76c..d47616c8b885 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -986,7 +986,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, int cpu = smp_processor_id(); previous_current = curr_task(cpu); - set_curr_task(cpu, current); + ia64_set_curr_task(cpu, current); if ((p = strchr(current->comm, ' '))) *p = '\0'; @@ -1360,14 +1360,14 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, cpumask_clear_cpu(i, &mca_cpu); /* wake next cpu */ while (monarch_cpu != -1) cpu_relax(); /* spin until last cpu leaves */ - set_curr_task(cpu, previous_current); + ia64_set_curr_task(cpu, previous_current); ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; return; } } } - set_curr_task(cpu, previous_current); + ia64_set_curr_task(cpu, previous_current); ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; monarch_cpu = -1; /* This frees the slaves and previous monarchs */ } @@ -1729,7 +1729,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1); mprintk("Slave on cpu %d returning to normal service.\n", cpu); - set_curr_task(cpu, previous_current); + ia64_set_curr_task(cpu, previous_current); ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; atomic_dec(&slaves); return; @@ -1756,7 +1756,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, mprintk("\nINIT dump complete. 
Monarch on cpu %d returning to normal service.\n", cpu); atomic_dec(&monarchs); - set_curr_task(cpu, previous_current); + ia64_set_curr_task(cpu, previous_current); monarch_cpu = -1; return; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c30ed860d66..ad51978ff15e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2581,7 +2581,7 @@ static inline bool is_idle_task(const struct task_struct *p) return p->pid == 0; } extern struct task_struct *curr_task(int cpu); -extern void set_curr_task(int cpu, struct task_struct *p); +extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b5e150e070b..6ec0cfe56ddd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7818,7 +7818,7 @@ struct task_struct *curr_task(int cpu) * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! */ -void set_curr_task(int cpu, struct task_struct *p) +void ia64_set_curr_task(int cpu, struct task_struct *p) { cpu_curr(cpu) = p; } -- cgit v1.2.3 From 68107df5f2cb5dc3785be40162bfe2f19a178bbb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Sep 2016 02:29:19 +0200 Subject: u64_stats: Introduce IRQs disabled helpers Introduce light versions of u64_stats helpers for context where either preempt or IRQs are disabled. This way we can make this library usable by scheduler irqtime accounting which currenty implement its ad-hoc version. Signed-off-by: Frederic Weisbecker Cc: Eric Dumazet Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1474849761-12678-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/u64_stats_sync.h | 45 ++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index d3a2bb712af3..650f3dd6b800 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -103,31 +103,42 @@ static inline void u64_stats_update_end_raw(struct u64_stats_sync *syncp) #endif } -static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) return read_seqcount_begin(&syncp->seq); #else -#if BITS_PER_LONG==32 - preempt_disable(); -#endif return 0; #endif } -static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +{ +#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) + preempt_disable(); +#endif + return __u64_stats_fetch_begin(syncp); +} + +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) return read_seqcount_retry(&syncp->seq, start); #else -#if BITS_PER_LONG==32 - preempt_enable(); -#endif return false; #endif } +static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) +{ +#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) + preempt_enable(); +#endif + return __u64_stats_fetch_retry(syncp, start); +} + /* * In case irq handlers can update u64 counters, readers can use following helpers * - SMP 32bit arches use seqcount protection, irq safe. 
@@ -136,27 +147,19 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, */ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - return read_seqcount_begin(&syncp->seq); -#else -#if BITS_PER_LONG==32 +#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) local_irq_disable(); #endif - return 0; -#endif + return __u64_stats_fetch_begin(syncp); } static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, - unsigned int start) + unsigned int start) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - return read_seqcount_retry(&syncp->seq, start); -#else -#if BITS_PER_LONG==32 +#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) local_irq_enable(); #endif - return false; -#endif + return __u64_stats_fetch_retry(syncp, start); } #endif /* _LINUX_U64_STATS_SYNC_H */ -- cgit v1.2.3
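A minimal usage sketch for the new lightweight helpers (the struct and function names are invented; only the helpers from <linux/u64_stats_sync.h> are real): a reader that already runs with IRQs or preemption disabled, such as the scheduler irqtime accounting this series is aimed at, can call the __ variants directly and skip the preempt/IRQ toggling done by the regular fetch helpers.

struct example_irqtime {
        u64 total;
        struct u64_stats_sync sync;
};

/* caller guarantees IRQs (or preemption) are already disabled */
static u64 example_read_irqtime(struct example_irqtime *stat)
{
        unsigned int seq;
        u64 total;

        do {
                seq = __u64_stats_fetch_begin(&stat->sync);
                total = stat->total;
        } while (__u64_stats_fetch_retry(&stat->sync, seq));

        return total;
}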