diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c | 101 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 101 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 40 | ||||
-rw-r--r-- | kernel/sched/debug.c | 48 | ||||
-rw-r--r-- | kernel/sched/fair.c | 937 | ||||
-rw-r--r-- | kernel/sched/features.h | 18 | ||||
-rw-r--r-- | kernel/sched/idle.c | 14 | ||||
-rw-r--r-- | kernel/sched/idle_task.c | 1 | ||||
-rw-r--r-- | kernel/sched/rt.c | 42 | ||||
-rw-r--r-- | kernel/sched/sched.h | 39 | ||||
-rw-r--r-- | kernel/sched/stop_task.c | 1 |
11 files changed, 562 insertions, 780 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e73c79fadd0..a585c7b2ccf0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1151,15 +1151,45 @@ static int migration_cpu_stop(void *data) return 0; } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +/* + * sched_class::set_cpus_allowed must do the below, but is not required to + * actually call this function. + */ +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - cpumask_copy(&p->cpus_allowed, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + bool queued, running; + + lockdep_assert_held(&p->pi_lock); + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + dequeue_task(rq, p, 0); + } + if (running) + put_prev_task(rq, p); + + p->sched_class->set_cpus_allowed(p, new_mask); + + if (running) + p->sched_class->set_curr_task(rq); + if (queued) + enqueue_task(rq, p, 0); +} + /* * Change a given task's CPU affinity. Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on @@ -1169,7 +1199,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) { unsigned long flags; struct rq *rq; @@ -1178,6 +1209,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) rq = task_rq_lock(p, &flags); + /* + * Must re-check here, to close a race against __kthread_bind(), + * sched_setaffinity() is not guaranteed to observe the flag. + */ + if (check && (p->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out; + } + if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; @@ -1214,6 +1254,11 @@ out: return ret; } + +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + return __set_cpus_allowed_ptr(p, new_mask, false); +} EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); void set_task_cpu(struct task_struct *p, unsigned int new_cpu) @@ -1595,6 +1640,15 @@ static void update_avg(u64 *avg, u64 sample) s64 diff = sample - *avg; *avg += diff >> 3; } + +#else + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + return set_cpus_allowed_ptr(p, new_mask); +} + #endif /* CONFIG_SMP */ static void @@ -1654,9 +1708,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { check_preempt_curr(rq, p, wake_flags); - trace_sched_wakeup(p, true); - p->state = TASK_RUNNING; + trace_sched_wakeup(p); + #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* @@ -1874,6 +1928,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (!(p->state & state)) goto out; + trace_sched_waking(p); + success = 1; /* we're going to change ->state */ cpu = task_cpu(p); @@ -1949,6 +2005,8 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) goto out; + trace_sched_waking(p); + if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); @@ -2016,9 +2074,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; -#ifdef CONFIG_SMP - p->se.avg.decay_count = 0; -#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS @@ -2303,11 +2358,11 @@ void wake_up_new_task(struct task_struct *p) #endif /* Initialize new task's runnable average */ - init_task_runnable_average(p); + init_entity_runnable_average(&p->se); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; - trace_sched_wakeup_new(p, true); + trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) @@ -2469,7 +2524,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) */ prev_state = prev->state; vtime_task_switch(prev); - finish_arch_switch(prev); perf_event_task_sched_in(prev, current); finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -4340,7 +4394,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = set_cpus_allowed_ptr(p, new_mask); + retval = __set_cpus_allowed_ptr(p, new_mask, true); if (!retval) { cpuset_cpus_allowed(p, cpus_allowed); @@ -4492,7 +4546,7 @@ SYSCALL_DEFINE0(sched_yield) int __sched _cond_resched(void) { - if (should_resched()) { + if (should_resched(0)) { preempt_schedule_common(); return 1; } @@ -4510,7 +4564,7 @@ EXPORT_SYMBOL(_cond_resched); */ int __cond_resched_lock(spinlock_t *lock) { - int resched = should_resched(); + int resched = should_resched(PREEMPT_LOCK_OFFSET); int ret = 0; lockdep_assert_held(lock); @@ -4532,7 +4586,7 @@ int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (should_resched()) { + if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { local_bh_enable(); preempt_schedule_common(); local_bh_disable(); @@ -4865,7 +4919,8 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&idle->pi_lock, flags); + raw_spin_lock(&rq->lock); __sched_fork(0, idle); idle->state = TASK_RUNNING; @@ -4891,7 +4946,8 @@ void init_idle(struct task_struct *idle, int cpu) #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); @@ -5311,8 +5367,7 @@ static void register_sched_domain_sysctl(void) /* may be called multiple times per register */ static void unregister_sched_domain_sysctl(void) { - if (sd_sysctl_header) - unregister_sysctl_table(sd_sysctl_header); + unregister_sysctl_table(sd_sysctl_header); sd_sysctl_header = NULL; if (sd_ctl_dir[0].child) sd_free_ctl_entry(&sd_ctl_dir[0].child); @@ -6445,8 +6500,10 @@ static void init_numa_topology_type(void) n = sched_max_numa_distance; - if (n <= 1) + if (sched_domains_numa_levels <= 1) { sched_numa_topology_type = NUMA_DIRECT; + return; + } for_each_online_node(a) { for_each_online_node(b) { diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f5a64ffad176..8cbc3db671df 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -555,48 +555,43 @@ drop_precision: } /* - * Atomically advance counter to the new value. Interrupts, vcpu - * scheduling, and scaling inaccuracies can cause cputime_advance - * to be occasionally called with a new value smaller than counter. - * Let's enforce atomicity. + * Adjust tick based cputime random precision against scheduler runtime + * accounting. * - * Normally a caller will only go through this loop once, or not - * at all in case a previous caller updated counter the same jiffy. - */ -static void cputime_advance(cputime_t *counter, cputime_t new) -{ - cputime_t old; - - while (new > (old = READ_ONCE(*counter))) - cmpxchg_cputime(counter, old, new); -} - -/* - * Adjust tick based cputime random precision against scheduler - * runtime accounting. + * Tick based cputime accounting depend on random scheduling timeslices of a + * task to be interrupted or not by the timer. Depending on these + * circumstances, the number of these interrupts may be over or + * under-optimistic, matching the real user and system cputime with a variable + * precision. + * + * Fix this by scaling these tick based values against the total runtime + * accounted by the CFS scheduler. + * + * This code provides the following guarantees: + * + * stime + utime == rtime + * stime_i+1 >= stime_i, utime_i+1 >= utime_i + * + * Assuming that rtime_i+1 >= rtime_i. */ static void cputime_adjust(struct task_cputime *curr, - struct cputime *prev, + struct prev_cputime *prev, cputime_t *ut, cputime_t *st) { cputime_t rtime, stime, utime; + unsigned long flags; - /* - * Tick based cputime accounting depend on random scheduling - * timeslices of a task to be interrupted or not by the timer. - * Depending on these circumstances, the number of these interrupts - * may be over or under-optimistic, matching the real user and system - * cputime with a variable precision. - * - * Fix this by scaling these tick based values against the total - * runtime accounted by the CFS scheduler. - */ + /* Serialize concurrent callers such that we can honour our guarantees */ + raw_spin_lock_irqsave(&prev->lock, flags); rtime = nsecs_to_cputime(curr->sum_exec_runtime); /* - * Update userspace visible utime/stime values only if actual execution - * time is bigger than already exported. Note that can happen, that we - * provided bigger values due to scaling inaccuracy on big numbers. + * This is possible under two circumstances: + * - rtime isn't monotonic after all (a bug); + * - we got reordered by the lock. + * + * In both cases this acts as a filter such that the rest of the code + * can assume it is monotonic regardless of anything else. */ if (prev->stime + prev->utime >= rtime) goto out; @@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr, if (utime == 0) { stime = rtime; - } else if (stime == 0) { - utime = rtime; - } else { - cputime_t total = stime + utime; + goto update; + } - stime = scale_stime((__force u64)stime, - (__force u64)rtime, (__force u64)total); - utime = rtime - stime; + if (stime == 0) { + utime = rtime; + goto update; } - cputime_advance(&prev->stime, stime); - cputime_advance(&prev->utime, utime); + stime = scale_stime((__force u64)stime, (__force u64)rtime, + (__force u64)(stime + utime)); + + /* + * Make sure stime doesn't go backwards; this preserves monotonicity + * for utime because rtime is monotonic. + * + * utime_i+1 = rtime_i+1 - stime_i + * = rtime_i+1 - (rtime_i - utime_i) + * = (rtime_i+1 - rtime_i) + utime_i + * >= utime_i + */ + if (stime < prev->stime) + stime = prev->stime; + utime = rtime - stime; + + /* + * Make sure utime doesn't go backwards; this still preserves + * monotonicity for stime, analogous argument to above. + */ + if (utime < prev->utime) { + utime = prev->utime; + stime = rtime - utime; + } +update: + prev->stime = stime; + prev->utime = utime; out: *ut = prev->utime; *st = prev->stime; + raw_spin_unlock_irqrestore(&prev->lock, flags); } void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0a17af35670a..fc8f01083527 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -953,7 +953,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) /* * Use the scheduling parameters of the top pi-waiter - * task if we have one and its (relative) deadline is + * task if we have one and its (absolute) deadline is * smaller than our one... OTW we keep our runtime and * deadline. */ @@ -1563,7 +1563,7 @@ out: static void push_dl_tasks(struct rq *rq) { - /* Terminates as it moves a -deadline task */ + /* push_dl_task() will return true if it moved a -deadline task */ while (push_dl_task(rq)) ; } @@ -1657,7 +1657,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - has_pushable_dl_tasks(rq) && p->nr_cpus_allowed > 1 && dl_task(rq->curr) && (rq->curr->nr_cpus_allowed < 2 || @@ -1669,9 +1668,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_dl(struct task_struct *p, const struct cpumask *new_mask) { - struct rq *rq; struct root_domain *src_rd; - int weight; + struct rq *rq; BUG_ON(!dl_task(p)); @@ -1697,37 +1695,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - /* - * Update only if the task is actually running (i.e., - * it is on the rq AND it is not throttled). - */ - if (!on_dl_rq(&p->dl)) - return; - - weight = cpumask_weight(new_mask); - - /* - * Only update if the process changes its state from whether it - * can migrate or not. - */ - if ((p->nr_cpus_allowed > 1) == (weight > 1)) - return; - - /* - * The process used to be able to migrate OR it can now migrate - */ - if (weight <= 1) { - if (!task_current(rq, p)) - dequeue_pushable_dl_task(rq, p); - BUG_ON(!rq->dl.dl_nr_migratory); - rq->dl.dl_nr_migratory--; - } else { - if (!task_current(rq, p)) - enqueue_pushable_dl_task(rq, p); - rq->dl.dl_nr_migratory++; - } - - update_dl_migration(&rq->dl); + set_cpus_allowed_common(p, new_mask); } /* Assumes rq->lock is held */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4222ec50ab88..641511771ae6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) - if (!se) { - struct sched_avg *avg = &cpu_rq(cpu)->avg; - P(avg->runnable_avg_sum); - P(avg->avg_period); + if (!se) return; - } - PN(se->exec_start); PN(se->vruntime); @@ -93,12 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #endif P(se->load.weight); #ifdef CONFIG_SMP - P(se->avg.runnable_avg_sum); - P(se->avg.running_avg_sum); - P(se->avg.avg_period); - P(se->avg.load_avg_contrib); - P(se->avg.utilization_avg_contrib); - P(se->avg.decay_count); + P(se->avg.load_avg); + P(se->avg.util_avg); #endif #undef PN #undef P @@ -214,21 +205,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", + SEQ_printf(m, " .%-30s: %lu\n", "load_avg", + cfs_rq->avg.load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", cfs_rq->runnable_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", - cfs_rq->blocked_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", - cfs_rq->utilization_load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "util_avg", + cfs_rq->avg.util_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", + atomic_long_read(&cfs_rq->removed_load_avg)); + SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", + atomic_long_read(&cfs_rq->removed_util_avg)); #ifdef CONFIG_FAIR_GROUP_SCHED - SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", - cfs_rq->tg_load_contrib); - SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", - cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", + cfs_rq->tg_load_avg_contrib); SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); - SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", - atomic_read(&cfs_rq->tg->runnable_avg)); #endif #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -636,12 +627,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); #ifdef CONFIG_SMP - P(se.avg.runnable_avg_sum); - P(se.avg.running_avg_sum); - P(se.avg.avg_period); - P(se.avg.load_avg_contrib); - P(se.avg.utilization_avg_contrib); - P(se.avg.decay_count); + P(se.avg.load_sum); + P(se.avg.util_sum); + P(se.avg.load_avg); + P(se.avg.util_avg); + P(se.avg.last_update_time); #endif P(policy); P(prio); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d113c3ba8bc4..6e2e3483b1ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, - int force_update); - static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) } cfs_rq->on_list = 1; - /* We should have no load, but we need to update last_decay. */ - update_cfs_rq_blocked_load(cfs_rq, 0); } } @@ -616,15 +611,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) */ static u64 __sched_period(unsigned long nr_running) { - u64 period = sysctl_sched_latency; - unsigned long nr_latency = sched_nr_latency; - - if (unlikely(nr_running > nr_latency)) { - period = sysctl_sched_min_granularity; - period *= nr_running; - } - - return period; + if (unlikely(nr_running > sched_nr_latency)) + return nr_running * sysctl_sched_min_granularity; + else + return sysctl_sched_latency; } /* @@ -669,22 +659,37 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); -static inline void __update_task_entity_contrib(struct sched_entity *se); -static inline void __update_task_entity_utilization(struct sched_entity *se); +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ -/* Give new task start runnable values to heavy its load in infant time */ -void init_task_runnable_average(struct task_struct *p) +/* Give new sched_entity start runnable values to heavy its load in infant time */ +void init_entity_runnable_average(struct sched_entity *se) { - u32 slice; + struct sched_avg *sa = &se->avg; - slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; - p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; - p->se.avg.avg_period = slice; - __update_task_entity_contrib(&p->se); - __update_task_entity_utilization(&p->se); + sa->last_update_time = 0; + /* + * sched_avg's period_contrib should be strictly less then 1024, so + * we give it 1023 to make sure it is almost a period (1024us), and + * will definitely be update (after enqueue). + */ + sa->period_contrib = 1023; + sa->load_avg = scale_load_down(se->load.weight); + sa->load_sum = sa->load_avg * LOAD_AVG_MAX; + sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + sa->util_sum = LOAD_AVG_MAX; + /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } + +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else -void init_task_runnable_average(struct task_struct *p) +void init_entity_runnable_average(struct sched_entity *se) { } #endif @@ -1415,8 +1420,9 @@ static bool numa_has_capacity(struct task_numa_env *env) * --------------------- vs --------------------- * src->compute_capacity dst->compute_capacity */ - if (src->load * dst->compute_capacity > - dst->load * src->compute_capacity) + if (src->load * dst->compute_capacity * env->imbalance_pct > + + dst->load * src->compute_capacity * 100) return true; return false; @@ -1702,8 +1708,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; } else { - delta = p->se.avg.runnable_avg_sum; - *period = p->se.avg.avg_period; + delta = p->se.avg.load_sum / p->se.load.weight; + *period = LOAD_AVG_MAX; } p->last_sum_exec_runtime = runtime; @@ -2351,13 +2357,13 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) long tg_weight; /* - * Use this CPU's actual weight instead of the last load_contribution - * to gain a more accurate current total weight. See - * update_cfs_rq_load_contribution(). + * Use this CPU's real-time load instead of the last load contribution + * as the updating of the contribution is delayed, and we will use the + * the real-time load to calc the share. See update_tg_load_avg(). */ tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_contrib; - tg_weight += cfs_rq->load.weight; + tg_weight -= cfs_rq->tg_load_avg_contrib; + tg_weight += cfs_rq_load_avg(cfs_rq); return tg_weight; } @@ -2367,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) long tg_weight, load, shares; tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; + load = cfs_rq_load_avg(cfs_rq); shares = (tg->shares * load); if (tg_weight) @@ -2429,14 +2435,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* - * We choose a half-life close to 1 scheduling period. - * Note: The tables below are dependent on this value. - */ -#define LOAD_AVG_PERIOD 32 -#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ - /* Precomputed fixed inverse multiplies for multiplication by y^n */ static const u32 runnable_avg_yN_inv[] = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, @@ -2485,9 +2483,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) local_n %= LOAD_AVG_PERIOD; } - val *= runnable_avg_yN_inv[local_n]; - /* We don't use SRR here since we always want to round down. */ - return val >> 32; + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); + return val; } /* @@ -2546,23 +2543,22 @@ static u32 __compute_runnable_contrib(u64 n) * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ -static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, - struct sched_avg *sa, - int runnable, - int running) +static __always_inline int +__update_load_avg(u64 now, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) { u64 delta, periods; - u32 runnable_contrib; + u32 contrib; int delta_w, decayed = 0; unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); - delta = now - sa->last_runnable_update; + delta = now - sa->last_update_time; /* * This should only happen when time goes backwards, which it * unfortunately does during sched clock init when we swap over to TSC. */ if ((s64)delta < 0) { - sa->last_runnable_update = now; + sa->last_update_time = now; return 0; } @@ -2573,26 +2569,29 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, delta >>= 10; if (!delta) return 0; - sa->last_runnable_update = now; + sa->last_update_time = now; /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->avg_period % 1024; + delta_w = sa->period_contrib; if (delta + delta_w >= 1024) { - /* period roll-over */ decayed = 1; + /* how much left for next period will start over, we don't know yet */ + sa->period_contrib = 0; + /* * Now that we know we're crossing a period boundary, figure * out how much from delta we need to complete the current * period and accrue it. */ delta_w = 1024 - delta_w; - if (runnable) - sa->runnable_avg_sum += delta_w; + if (weight) { + sa->load_sum += weight * delta_w; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * delta_w; + } if (running) - sa->running_avg_sum += delta_w * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += delta_w; + sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; delta -= delta_w; @@ -2600,341 +2599,186 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, periods = delta / 1024; delta %= 1024; - sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, - periods + 1); - sa->running_avg_sum = decay_load(sa->running_avg_sum, - periods + 1); - sa->avg_period = decay_load(sa->avg_period, - periods + 1); + sa->load_sum = decay_load(sa->load_sum, periods + 1); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods + 1); + } + sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - runnable_contrib = __compute_runnable_contrib(periods); - if (runnable) - sa->runnable_avg_sum += runnable_contrib; + contrib = __compute_runnable_contrib(periods); + if (weight) { + sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } if (running) - sa->running_avg_sum += runnable_contrib * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += runnable_contrib; + sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; } /* Remainder of delta accrued against u_0` */ - if (runnable) - sa->runnable_avg_sum += delta; + if (weight) { + sa->load_sum += weight * delta; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * delta; + } if (running) - sa->running_avg_sum += delta * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += delta; - - return decayed; -} + sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; -/* Synchronize an entity's decay with its parenting cfs_rq.*/ -static inline u64 __synchronize_entity_decay(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 decays = atomic64_read(&cfs_rq->decay_counter); - - decays -= se->avg.decay_count; - se->avg.decay_count = 0; - if (!decays) - return 0; + sa->period_contrib += delta; - se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); - se->avg.utilization_avg_contrib = - decay_load(se->avg.utilization_avg_contrib, decays); + if (decayed) { + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); + } + sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; + } - return decays; + return decayed; } #ifdef CONFIG_FAIR_GROUP_SCHED -static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, - int force_update) -{ - struct task_group *tg = cfs_rq->tg; - long tg_contrib; - - tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; - tg_contrib -= cfs_rq->tg_load_contrib; - - if (!tg_contrib) - return; - - if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { - atomic_long_add(tg_contrib, &tg->load_avg); - cfs_rq->tg_load_contrib += tg_contrib; - } -} - /* - * Aggregate cfs_rq runnable averages into an equivalent task_group - * representation for computing load contributions. + * Updating tg's load_avg is necessary before update_cfs_share (which is done) + * and effective_load (which is not done because it is too costly). */ -static inline void __update_tg_runnable_avg(struct sched_avg *sa, - struct cfs_rq *cfs_rq) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { - struct task_group *tg = cfs_rq->tg; - long contrib; + long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; - /* The fraction of a cpu used by this cfs_rq */ - contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, - sa->avg_period + 1); - contrib -= cfs_rq->tg_runnable_contrib; - - if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { - atomic_add(contrib, &tg->runnable_avg); - cfs_rq->tg_runnable_contrib += contrib; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { + atomic_long_add(delta, &cfs_rq->tg->load_avg); + cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } -static inline void __update_group_entity_contrib(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = group_cfs_rq(se); - struct task_group *tg = cfs_rq->tg; - int runnable_avg; - - u64 contrib; - - contrib = cfs_rq->tg_load_contrib * tg->shares; - se->avg.load_avg_contrib = div_u64(contrib, - atomic_long_read(&tg->load_avg) + 1); - - /* - * For group entities we need to compute a correction term in the case - * that they are consuming <1 cpu so that we would contribute the same - * load as a task of equal weight. - * - * Explicitly co-ordinating this measurement would be expensive, but - * fortunately the sum of each cpus contribution forms a usable - * lower-bound on the true value. - * - * Consider the aggregate of 2 contributions. Either they are disjoint - * (and the sum represents true value) or they are disjoint and we are - * understating by the aggregate of their overlap. - * - * Extending this to N cpus, for a given overlap, the maximum amount we - * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of - * cpus that overlap for this interval and w_i is the interval width. - * - * On a small machine; the first term is well-bounded which bounds the - * total error since w_i is a subset of the period. Whereas on a - * larger machine, while this first term can be larger, if w_i is the - * of consequential size guaranteed to see n_i*w_i quickly converge to - * our upper bound of 1-cpu. - */ - runnable_avg = atomic_read(&tg->runnable_avg); - if (runnable_avg < NICE_0_LOAD) { - se->avg.load_avg_contrib *= runnable_avg; - se->avg.load_avg_contrib >>= NICE_0_SHIFT; - } -} - -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) -{ - __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, - runnable, runnable); - __update_tg_runnable_avg(&rq->avg, &rq->cfs); -} #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, - int force_update) {} -static inline void __update_tg_runnable_avg(struct sched_avg *sa, - struct cfs_rq *cfs_rq) {} -static inline void __update_group_entity_contrib(struct sched_entity *se) {} -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void __update_task_entity_contrib(struct sched_entity *se) -{ - u32 contrib; - - /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ - contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); - contrib /= (se->avg.avg_period + 1); - se->avg.load_avg_contrib = scale_load(contrib); -} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -/* Compute the current contribution to load_avg by se, return any delta */ -static long __update_entity_load_avg_contrib(struct sched_entity *se) +/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ +static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - long old_contrib = se->avg.load_avg_contrib; + int decayed; + struct sched_avg *sa = &cfs_rq->avg; - if (entity_is_task(se)) { - __update_task_entity_contrib(se); - } else { - __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); - __update_group_entity_contrib(se); + if (atomic_long_read(&cfs_rq->removed_load_avg)) { + long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); + sa->load_avg = max_t(long, sa->load_avg - r, 0); + sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); } - return se->avg.load_avg_contrib - old_contrib; -} - - -static inline void __update_task_entity_utilization(struct sched_entity *se) -{ - u32 contrib; + if (atomic_long_read(&cfs_rq->removed_util_avg)) { + long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); + sa->util_avg = max_t(long, sa->util_avg - r, 0); + sa->util_sum = max_t(s32, sa->util_sum - + ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0); + } - /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ - contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); - contrib /= (se->avg.avg_period + 1); - se->avg.utilization_avg_contrib = scale_load(contrib); -} + decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, + scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); -static long __update_entity_utilization_avg_contrib(struct sched_entity *se) -{ - long old_contrib = se->avg.utilization_avg_contrib; - - if (entity_is_task(se)) - __update_task_entity_utilization(se); - else - se->avg.utilization_avg_contrib = - group_cfs_rq(se)->utilization_load_avg; +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->load_last_update_time_copy = sa->last_update_time; +#endif - return se->avg.utilization_avg_contrib - old_contrib; + return decayed; } -static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, - long load_contrib) -{ - if (likely(load_contrib < cfs_rq->blocked_load_avg)) - cfs_rq->blocked_load_avg -= load_contrib; - else - cfs_rq->blocked_load_avg = 0; -} - -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); - -/* Update a sched_entity's runnable average */ -static inline void update_entity_load_avg(struct sched_entity *se, - int update_cfs_rq) +/* Update task and its cfs_rq load average */ +static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta, utilization_delta; int cpu = cpu_of(rq_of(cfs_rq)); - u64 now; + u64 now = cfs_rq_clock_task(cfs_rq); /* - * For a group entity we need to use their owned cfs_rq_clock_task() in - * case they are the parent of a throttled hierarchy. + * Track task load average for carrying it to new CPU after migrated, and + * track group sched_entity load average for task_h_load calc in migration */ - if (entity_is_task(se)) - now = cfs_rq_clock_task(cfs_rq); - else - now = cfs_rq_clock_task(group_cfs_rq(se)); - - if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, - cfs_rq->curr == se)) - return; - - contrib_delta = __update_entity_load_avg_contrib(se); - utilization_delta = __update_entity_utilization_avg_contrib(se); - - if (!update_cfs_rq) - return; + __update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - if (se->on_rq) { - cfs_rq->runnable_load_avg += contrib_delta; - cfs_rq->utilization_load_avg += utilization_delta; - } else { - subtract_blocked_load_contrib(cfs_rq, -contrib_delta); - } + if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + update_tg_load_avg(cfs_rq, 0); } -/* - * Decay the load contributed by all blocked children and account this so that - * their contribution may appropriately discounted when they wake up. - */ -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +/* Add the load generated by se into cfs_rq's load average */ +static inline void +enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 now = cfs_rq_clock_task(cfs_rq) >> 20; - u64 decays; - - decays = now - cfs_rq->last_decay; - if (!decays && !force_update) - return; + struct sched_avg *sa = &se->avg; + u64 now = cfs_rq_clock_task(cfs_rq); + int migrated = 0, decayed; - if (atomic_long_read(&cfs_rq->removed_load)) { - unsigned long removed_load; - removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); - subtract_blocked_load_contrib(cfs_rq, removed_load); + if (sa->last_update_time == 0) { + sa->last_update_time = now; + migrated = 1; } + else { + __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); + } + + decayed = update_cfs_rq_load_avg(now, cfs_rq); - if (decays) { - cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, - decays); - atomic64_add(decays, &cfs_rq->decay_counter); - cfs_rq->last_decay = now; + cfs_rq->runnable_load_avg += sa->load_avg; + cfs_rq->runnable_load_sum += sa->load_sum; + + if (migrated) { + cfs_rq->avg.load_avg += sa->load_avg; + cfs_rq->avg.load_sum += sa->load_sum; + cfs_rq->avg.util_avg += sa->util_avg; + cfs_rq->avg.util_sum += sa->util_sum; } - __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); + if (decayed || migrated) + update_tg_load_avg(cfs_rq, 0); } -/* Add the load generated by se into cfs_rq's child load-average */ -static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int wakeup) +/* Remove the runnable load generated by se from cfs_rq's runnable load average */ +static inline void +dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* - * We track migrations using entity decay_count <= 0, on a wake-up - * migration we use a negative decay count to track the remote decays - * accumulated while sleeping. - * - * Newly forked tasks are enqueued with se->avg.decay_count == 0, they - * are seen by enqueue_entity_load_avg() as a migration with an already - * constructed load_avg_contrib. - */ - if (unlikely(se->avg.decay_count <= 0)) { - se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); - if (se->avg.decay_count) { - /* - * In a wake-up migration we have to approximate the - * time sleeping. This is because we can't synchronize - * clock_task between the two cpus, and it is not - * guaranteed to be read-safe. Instead, we can - * approximate this using our carried decays, which are - * explicitly atomically readable. - */ - se->avg.last_runnable_update -= (-se->avg.decay_count) - << 20; - update_entity_load_avg(se, 0); - /* Indicate that we're now synchronized and on-rq */ - se->avg.decay_count = 0; - } - wakeup = 0; - } else { - __synchronize_entity_decay(se); - } + update_load_avg(se, 1); - /* migrated tasks did not contribute to our blocked load */ - if (wakeup) { - subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); - update_entity_load_avg(se, 0); - } - - cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; - cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; - /* we force update consideration on load-balancer moves */ - update_cfs_rq_blocked_load(cfs_rq, !wakeup); + cfs_rq->runnable_load_avg = + max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); + cfs_rq->runnable_load_sum = + max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } /* - * Remove se's load from this cfs_rq child load-average, if the entity is - * transitioning to a blocked state we track its projected decay using - * blocked_load_avg. + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). */ -static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int sleep) +void remove_entity_load_avg(struct sched_entity *se) { - update_entity_load_avg(se, 1); - /* we force update consideration on load-balancer moves */ - update_cfs_rq_blocked_load(cfs_rq, !sleep); + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + +#ifndef CONFIG_64BIT + u64 last_update_time_copy; - cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; - cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; - if (sleep) { - cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; - se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - } /* migrations, e.g. sleep=0 leave decay_count == 0 */ + do { + last_update_time_copy = cfs_rq->load_last_update_time_copy; + smp_rmb(); + last_update_time = cfs_rq->avg.last_update_time; + } while (last_update_time != last_update_time_copy); +#else + last_update_time = cfs_rq->avg.last_update_time; +#endif + + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); + atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } /* @@ -2944,7 +2788,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, */ void idle_enter_fair(struct rq *this_rq) { - update_rq_runnable_avg(this_rq, 1); } /* @@ -2954,24 +2797,28 @@ void idle_enter_fair(struct rq *this_rq) */ void idle_exit_fair(struct rq *this_rq) { - update_rq_runnable_avg(this_rq, 0); +} + +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->runnable_load_avg; +} + +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.load_avg; } static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_entity_load_avg(struct sched_entity *se, - int update_cfs_rq) {} -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} -static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int wakeup) {} -static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int sleep) {} -static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, - int force_update) {} +static inline void update_load_avg(struct sched_entity *se, int update_tg) {} +static inline void +enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline int idle_balance(struct rq *rq) { @@ -3103,7 +2950,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3178,7 +3025,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); + dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -3268,7 +3115,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_entity_load_avg(se, 1); + update_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -3368,7 +3215,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ - update_entity_load_avg(prev, 1); + update_load_avg(prev, 0); } cfs_rq->curr = NULL; } @@ -3384,8 +3231,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_entity_load_avg(curr, 1); - update_cfs_rq_blocked_load(cfs_rq, 1); + update_load_avg(curr, 1); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -4258,14 +4104,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_load_avg(se, 1); update_cfs_shares(cfs_rq); - update_entity_load_avg(se, 1); } - if (!se) { - update_rq_runnable_avg(rq, rq->nr_running); + if (!se) add_nr_running(rq, 1); - } + hrtick_update(rq); } @@ -4319,14 +4164,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_load_avg(se, 1); update_cfs_shares(cfs_rq); - update_entity_load_avg(se, 1); } - if (!se) { + if (!se) sub_nr_running(rq, 1); - update_rq_runnable_avg(rq, 1); - } + hrtick_update(rq); } @@ -4439,6 +4283,12 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); +} + #ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the @@ -4460,7 +4310,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, static void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long load = weighted_cpuload(cpu_of(this_rq)); unsigned long pending_updates; /* @@ -4506,7 +4356,7 @@ void update_cpu_load_nohz(void) */ void update_cpu_load_active(struct rq *this_rq) { - unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long load = weighted_cpuload(cpu_of(this_rq)); /* * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ @@ -4514,12 +4364,6 @@ void update_cpu_load_active(struct rq *this_rq) __update_cpu_load(this_rq, load, 1); } -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->cfs.runnable_load_avg; -} - /* * Return a low guess at the load of a migration-source cpu weighted * according to the scheduling class and "nice" value. @@ -4567,7 +4411,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = rq->cfs.runnable_load_avg; + unsigned long load_avg = weighted_cpuload(cpu); if (nr_running) return load_avg / nr_running; @@ -4686,7 +4530,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * w = rw_i + @wl */ - w = se->my_q->load.weight + wl; + w = cfs_rq_load_avg(se->my_q) + wl; /* * wl = S * s'_i; see (2) @@ -4707,7 +4551,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * wl = dw_i = S * (s'_i - s_i); see (3) */ - wl -= se->load.weight; + wl -= se->avg.load_avg; /* * Recursively apply this logic to all parent groups to compute @@ -4730,26 +4574,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Detect M:N waker/wakee relationships via a switching-frequency heuristic. + * A waker of many should wake a different task than the one last awakened + * at a frequency roughly N times higher than one of its wakees. In order + * to determine whether we should let the load spread vs consolodating to + * shared cache, we look for a minimum 'flip' frequency of llc_size in one + * partner, and a factor of lls_size higher frequency in the other. With + * both conditions met, we can be relatively sure that the relationship is + * non-monogamous, with partner count exceeding socket size. Waker/wakee + * being client/server, worker/dispatcher, interrupt source or whatever is + * irrelevant, spread criteria is apparent partner count exceeds socket size. + */ static int wake_wide(struct task_struct *p) { + unsigned int master = current->wakee_flips; + unsigned int slave = p->wakee_flips; int factor = this_cpu_read(sd_llc_size); - /* - * Yeah, it's the switching-frequency, could means many wakee or - * rapidly switch, use factor here will just help to automatically - * adjust the loose-degree, so bigger node will lead to more pull. - */ - if (p->wakee_flips > factor) { - /* - * wakee is somewhat hot, it needs certain amount of cpu - * resource, so if waker is far more hot, prefer to leave - * it alone. - */ - if (current->wakee_flips > (factor * p->wakee_flips)) - return 1; - } - - return 0; + if (master < slave) + swap(master, slave); + if (slave < factor || master < slave * factor) + return 0; + return 1; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) @@ -4761,13 +4608,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long weight; int balanced; - /* - * If we wake multiple tasks be careful to not bounce - * ourselves around too much. - */ - if (wake_wide(p)) - return 0; - idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); @@ -4781,14 +4621,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ if (sync) { tg = task_group(current); - weight = current->se.load.weight; + weight = current->se.avg.load_avg; this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); - weight = p->se.load.weight; + weight = p->se.avg.load_avg; /* * In low-load situations, where prev_cpu is idle and this_cpu is idle @@ -4981,12 +4821,12 @@ done: * tasks. The unit of the return value must be the one of capacity so we can * compare the usage with the capacity of the CPU that is available for CFS * task (ie cpu_capacity). - * cfs.utilization_load_avg is the sum of running time of runnable tasks on a + * cfs.avg.util_avg is the sum of running time of runnable tasks on a * CPU. It represents the amount of utilization of a CPU in the range * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full * capacity of the CPU because it's about the running time on this CPU. - * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE - * because of unfortunate rounding in avg_period and running_load_avg or just + * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE + * because of unfortunate rounding in util_avg or just * after migrating tasks until the average stabilizes with the new running * time. So we need to check that the usage stays into the range * [0..cpu_capacity_orig] and cap if necessary. @@ -4995,7 +4835,7 @@ done: */ static int get_cpu_usage(int cpu) { - unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; + unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); if (usage >= SCHED_LOAD_SCALE) @@ -5021,17 +4861,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); - int new_cpu = cpu; + int new_cpu = prev_cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) - continue; + break; /* * If both cpu and prev_cpu are part of this domain, @@ -5045,17 +4885,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (tmp->flags & sd_flag) sd = tmp; + else if (!want_affine) + break; } - if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; - - if (sd_flag & SD_BALANCE_WAKE) { - new_cpu = select_idle_sibling(p, prev_cpu); - goto unlock; + if (affine_sd) { + sd = NULL; /* Prefer wake_affine over balance flags */ + if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + new_cpu = cpu; } - while (sd) { + if (!sd) { + if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + new_cpu = select_idle_sibling(p, new_cpu); + + } else while (sd) { struct sched_group *group; int weight; @@ -5089,7 +4933,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } /* while loop will break here if sd == NULL */ } -unlock: rcu_read_unlock(); return new_cpu; @@ -5101,26 +4944,27 @@ unlock: * previous cpu. However, the caller only guarantees p->pi_lock is held; no * other assumptions, including the state of rq->lock, should be made. */ -static void -migrate_task_rq_fair(struct task_struct *p, int next_cpu) +static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - /* - * Load tracking: accumulate removed load so that it can be processed - * when we next update owning cfs_rq under rq->lock. Tasks contribute - * to blocked load iff they have a positive decay-count. It can never - * be negative here since on-rq tasks have decay-count == 0. + * We are supposed to update the task to "current" time, then its up to date + * and ready to go to new CPU/cfs_rq. But we have difficulty in getting + * what current time is, so simply throw away the out-of-date time. This + * will result in the wakee task is less decayed, but giving the wakee more + * load sounds not bad. */ - if (se->avg.decay_count) { - se->avg.decay_count = -__synchronize_entity_decay(se); - atomic_long_add(se->avg.load_avg_contrib, - &cfs_rq->removed_load); - } + remove_entity_load_avg(&p->se); + + /* Tell new CPU we are migrated */ + p->se.avg.last_update_time = 0; /* We have migrated, no longer consider this task hot */ - se->exec_start = 0; + p->se.exec_start = 0; +} + +static void task_dead_fair(struct task_struct *p) +{ + remove_entity_load_avg(&p->se); } #endif /* CONFIG_SMP */ @@ -5670,72 +5514,39 @@ static int task_hot(struct task_struct *p, struct lb_env *env) #ifdef CONFIG_NUMA_BALANCING /* - * Returns true if the destination node is the preferred node. - * Needs to match fbq_classify_rq(): if there is a runnable task - * that is not on its preferred node, we should identify it. + * Returns 1, if task migration degrades locality + * Returns 0, if task migration improves locality i.e migration preferred. + * Returns -1, if task migration is not affected by locality. */ -static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) +static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); unsigned long src_faults, dst_faults; int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || - !(env->sd->flags & SD_NUMA)) { - return false; - } - - src_nid = cpu_to_node(env->src_cpu); - dst_nid = cpu_to_node(env->dst_cpu); - - if (src_nid == dst_nid) - return false; - - /* Encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; - - /* Migrating away from the preferred node is bad. */ - if (src_nid == p->numa_preferred_nid) - return false; - - if (numa_group) { - src_faults = group_faults(p, src_nid); - dst_faults = group_faults(p, dst_nid); - } else { - src_faults = task_faults(p, src_nid); - dst_faults = task_faults(p, dst_nid); - } - - return dst_faults > src_faults; -} - - -static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) -{ - struct numa_group *numa_group = rcu_dereference(p->numa_group); - unsigned long src_faults, dst_faults; - int src_nid, dst_nid; - - if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) - return false; - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return false; + return -1; + + if (!sched_feat(NUMA)) + return -1; src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); if (src_nid == dst_nid) - return false; + return -1; - /* Migrating away from the preferred node is bad. */ - if (src_nid == p->numa_preferred_nid) - return true; + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) { + if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) + return 1; + else + return -1; + } /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) - return false; + return 0; if (numa_group) { src_faults = group_faults(p, src_nid); @@ -5749,16 +5560,10 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) } #else -static inline bool migrate_improves_locality(struct task_struct *p, +static inline int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - return false; -} - -static inline bool migrate_degrades_locality(struct task_struct *p, - struct lb_env *env) -{ - return false; + return -1; } #endif @@ -5768,7 +5573,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p, static int can_migrate_task(struct task_struct *p, struct lb_env *env) { - int tsk_cache_hot = 0; + int tsk_cache_hot; lockdep_assert_held(&env->src_rq->lock); @@ -5826,13 +5631,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env); - if (!tsk_cache_hot) - tsk_cache_hot = migrate_degrades_locality(p, env); + tsk_cache_hot = migrate_degrades_locality(p, env); + if (tsk_cache_hot == -1) + tsk_cache_hot = task_hot(p, env); - if (migrate_improves_locality(p, env) || !tsk_cache_hot || + if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot) { + if (tsk_cache_hot == 1) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } @@ -5906,6 +5711,13 @@ static int detach_tasks(struct lb_env *env) return 0; while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, + * which could at worst lead to a livelock crash. + */ + if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + break; + p = list_first_entry(tasks, struct task_struct, se.group_node); env->loop++; @@ -6015,39 +5827,6 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * update tg->load_weight by folding this cpu's load_avg - */ -static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) -{ - struct sched_entity *se = tg->se[cpu]; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - return; - - update_cfs_rq_blocked_load(cfs_rq, 1); - - if (se) { - update_entity_load_avg(se, 1); - /* - * We pivot on our runnable average having decayed to zero for - * list removal. This generally implies that all our children - * have also been removed (modulo rounding error or bandwidth - * control); however, such cases are rare and we can fix these - * at enqueue. - * - * TODO: fix up out-of-order children on enqueue. - */ - if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) - list_del_leaf_cfs_rq(cfs_rq); - } else { - struct rq *rq = rq_of(cfs_rq); - update_rq_runnable_avg(rq, rq->nr_running); - } -} - static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6056,19 +5835,19 @@ static void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); + /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { - /* - * Note: We may want to consider periodically releasing - * rq->lock about these updates so that creating many task - * groups does not result in continually extending hold time. - */ - __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); - } + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + continue; + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + update_tg_load_avg(cfs_rq, 0); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6096,14 +5875,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } if (!se) { - cfs_rq->h_load = cfs_rq->runnable_load_avg; + cfs_rq->h_load = cfs_rq_load_avg(cfs_rq); cfs_rq->last_h_load_update = now; } while ((se = cfs_rq->h_load_next) != NULL) { load = cfs_rq->h_load; - load = div64_ul(load * se->avg.load_avg_contrib, - cfs_rq->runnable_load_avg + 1); + load = div64_ul(load * se->avg.load_avg, + cfs_rq_load_avg(cfs_rq) + 1); cfs_rq = group_cfs_rq(se); cfs_rq->h_load = load; cfs_rq->last_h_load_update = now; @@ -6115,17 +5894,25 @@ static unsigned long task_h_load(struct task_struct *p) struct cfs_rq *cfs_rq = task_cfs_rq(p); update_cfs_rq_h_load(cfs_rq); - return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, - cfs_rq->runnable_load_avg + 1); + return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, + cfs_rq_load_avg(cfs_rq) + 1); } #else static inline void update_blocked_averages(int cpu) { + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq = &rq->cfs; + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static unsigned long task_h_load(struct task_struct *p) { - return p->se.avg.load_avg_contrib; + return p->se.avg.load_avg; } #endif @@ -8025,8 +7812,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (numabalancing_enabled) task_tick_numa(rq, curr); - - update_rq_runnable_avg(rq, 1); } /* @@ -8125,15 +7910,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) } #ifdef CONFIG_SMP - /* - * Remove our load from contribution when we leave sched_fair - * and ensure we don't carry in an old decay_count if we - * switch back. - */ - if (se->avg.decay_count) { - __synchronize_entity_decay(se); - subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); - } + /* Catch up with the cfs_rq and remove our load when we leave */ + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, + se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + + cfs_rq->avg.load_avg = + max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); + cfs_rq->avg.load_sum = + max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); + cfs_rq->avg.util_avg = + max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); + cfs_rq->avg.util_sum = + max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); #endif } @@ -8142,16 +7930,31 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { -#ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity *se = &p->se; + +#ifdef CONFIG_FAIR_GROUP_SCHED /* * Since the real-depth could have been changed (only FAIR * class maintain depth value), reset depth properly. */ se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!task_on_rq_queued(p)) + + if (!task_on_rq_queued(p)) { + + /* + * Ensure the task has a non-normalized vruntime when it is switched + * back to the fair class with !queued, so that enqueue_entity() at + * wake-up time will do the right thing. + * + * If it's queued, then the enqueue_entity(.flags=0) makes the task + * has non-normalized vruntime, if it's !queued, then it still has + * normalized vruntime. + */ + if (p->state != TASK_RUNNING) + se->vruntime += cfs_rq_of(se)->min_vruntime; return; + } /* * We were most likely switched from sched_rt, so @@ -8190,8 +7993,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP - atomic64_set(&cfs_rq->decay_counter, 1); - atomic_long_set(&cfs_rq->removed_load, 0); + atomic_long_set(&cfs_rq->removed_load_avg, 0); + atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif } @@ -8236,14 +8039,14 @@ static void task_move_group_fair(struct task_struct *p, int queued) if (!queued) { cfs_rq = cfs_rq_of(se); se->vruntime += cfs_rq->min_vruntime; + #ifdef CONFIG_SMP - /* - * migrate_task_rq_fair() will have removed our previous - * contribution, but we must synchronize for ongoing future - * decay. - */ - se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + /* Virtually synchronize task with its new cfs_rq */ + p->se.avg.last_update_time = cfs_rq->avg.last_update_time; + cfs_rq->avg.load_avg += p->se.avg.load_avg; + cfs_rq->avg.load_sum += p->se.avg.load_sum; + cfs_rq->avg.util_avg += p->se.avg.util_avg; + cfs_rq->avg.util_sum += p->se.avg.util_sum; #endif } } @@ -8257,8 +8060,11 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) + if (tg->se) { + if (tg->se[i]) + remove_entity_load_avg(tg->se[i]); kfree(tg->se[i]); + } } kfree(tg->cfs_rq); @@ -8295,6 +8101,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + init_entity_runnable_average(se); } return 1; @@ -8444,6 +8251,8 @@ const struct sched_class fair_sched_class = { .rq_offline = rq_offline_fair, .task_waking = task_waking_fair, + .task_dead = task_dead_fair, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 91e33cd485f6..83a50e7ca533 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -79,20 +79,12 @@ SCHED_FEAT(LB_MIN, false) * numa_balancing= */ #ifdef CONFIG_NUMA_BALANCING -SCHED_FEAT(NUMA, false) /* - * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a - * higher number of hinting faults are recorded during active load - * balancing. + * NUMA will favor moving tasks towards nodes where a higher number of + * hinting faults are recorded during active load balancing. It will + * resist moving tasks towards nodes where a lower number of hinting + * faults have been recorded. */ -SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) - -/* - * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a - * lower number of hinting faults have been recorded. As this has - * the potential to prevent a task ever migrating to a new node - * due to CPU overload it is disabled by default. - */ -SCHED_FEAT(NUMA_RESIST_LOWER, false) +SCHED_FEAT(NUMA, true) #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 594275ed2620..8f177c73ae19 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -83,10 +83,13 @@ void __weak arch_cpu_idle(void) */ void default_idle_call(void) { - if (current_clr_polling_and_test()) + if (current_clr_polling_and_test()) { local_irq_enable(); - else + } else { + stop_critical_timings(); arch_cpu_idle(); + start_critical_timings(); + } } static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, @@ -141,12 +144,6 @@ static void cpuidle_idle_call(void) } /* - * During the idle period, stop measuring the disabled irqs - * critical sections latencies - */ - stop_critical_timings(); - - /* * Tell the RCU framework we are entering an idle section, * so no more rcu read side critical sections and one more * step to the grace period @@ -198,7 +195,6 @@ exit_idle: local_irq_enable(); rcu_idle_exit(); - start_critical_timings(); } DEFINE_PER_CPU(bool, cpu_dead_idle); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c65dac8c97cd..c4ae0f1fdf9b 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -96,6 +96,7 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0d193a243e96..d2ea59364a1c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2069,7 +2069,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - has_pushable_tasks(rq) && p->nr_cpus_allowed > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && (rq->curr->nr_cpus_allowed < 2 || @@ -2077,45 +2076,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) push_rt_tasks(rq); } -static void set_cpus_allowed_rt(struct task_struct *p, - const struct cpumask *new_mask) -{ - struct rq *rq; - int weight; - - BUG_ON(!rt_task(p)); - - if (!task_on_rq_queued(p)) - return; - - weight = cpumask_weight(new_mask); - - /* - * Only update if the process changes its state from whether it - * can migrate or not. - */ - if ((p->nr_cpus_allowed > 1) == (weight > 1)) - return; - - rq = task_rq(p); - - /* - * The process used to be able to migrate OR it can now migrate - */ - if (weight <= 1) { - if (!task_current(rq, p)) - dequeue_pushable_task(rq, p); - BUG_ON(!rq->rt.rt_nr_migratory); - rq->rt.rt_nr_migratory--; - } else { - if (!task_current(rq, p)) - enqueue_pushable_task(rq, p); - rq->rt.rt_nr_migratory++; - } - - update_rt_migration(&rq->rt); -} - /* Assumes rq->lock is held */ static void rq_online_rt(struct rq *rq) { @@ -2324,7 +2284,7 @@ const struct sched_class rt_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_rt, - .set_cpus_allowed = set_cpus_allowed_rt, + .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, .task_woken = task_woken_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 84d48790bb6d..68cda117574c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -245,7 +245,6 @@ struct task_group { #ifdef CONFIG_SMP atomic_long_t load_avg; - atomic_t runnable_avg; #endif #endif @@ -366,27 +365,20 @@ struct cfs_rq { #ifdef CONFIG_SMP /* - * CFS Load tracking - * Under CFS, load is tracked on a per-entity basis and aggregated up. - * This allows for the description of both thread and group usage (in - * the FAIR_GROUP_SCHED case). - * runnable_load_avg is the sum of the load_avg_contrib of the - * sched_entities on the rq. - * blocked_load_avg is similar to runnable_load_avg except that its - * the blocked sched_entities on the rq. - * utilization_load_avg is the sum of the average running time of the - * sched_entities on the rq. + * CFS load tracking */ - unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; - atomic64_t decay_counter; - u64 last_decay; - atomic_long_t removed_load; - + struct sched_avg avg; + u64 runnable_load_sum; + unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED - /* Required to track per-cpu representation of a task_group */ - u32 tg_runnable_contrib; - unsigned long tg_load_contrib; + unsigned long tg_load_avg_contrib; +#endif + atomic_long_t removed_load_avg, removed_util_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED /* * h_load = weight * f(tg) * @@ -595,8 +587,6 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; - - struct sched_avg avg; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -1065,9 +1055,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif #ifndef finish_arch_post_lock_switch # define finish_arch_post_lock_switch() do { } while (0) #endif @@ -1268,6 +1255,8 @@ extern void trigger_load_balance(struct rq *rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); + #else static inline void idle_enter_fair(struct rq *rq) { } @@ -1319,7 +1308,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -extern void init_task_runnable_average(struct task_struct *p); +extern void init_entity_runnable_average(struct sched_entity *se); static inline void add_nr_running(struct rq *rq, unsigned count) { diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 79ffec45a6ac..cbc67da10954 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -123,6 +123,7 @@ const struct sched_class stop_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_stop, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_stop, |