diff options
author | Linus Torvalds | 2022-08-01 11:49:06 -0700 |
---|---|---|
committer | Linus Torvalds | 2022-08-01 11:49:06 -0700 |
commit | b167fdffe9e737007cbf7c691cde5fa489ca58d7 (patch) | |
tree | 05f36cf3d602cf7e69d22e2737dd4e8fb9849bfa /include | |
parent | 0dd1cabe8a4a568252ca70f7530c3ca10e728513 (diff) | |
parent | c17a6ff9321355487d7d5ccaa7d406a0ea06b6c4 (diff) |
Merge tag 'sched-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"Load-balancing improvements:
- Improve NUMA balancing on AMD Zen systems for affine workloads.
- Improve the handling of reduced-capacity CPUs in load-balancing.
- Energy Model improvements: fix & refine all the energy fairness
metrics (PELT), and remove the conservative threshold requiring 6%
energy savings to migrate a task. Doing this improves power
efficiency for most workloads, and also increases the reliability
of energy-efficiency scheduling.
- Optimize/tweak select_idle_cpu() to spend (much) less time
searching for an idle CPU on overloaded systems. There's reports of
several milliseconds spent there on large systems with large
workloads ...
[ Since the search logic changed, there might be behavioral side
effects. ]
- Improve NUMA imbalance behavior. On certain systems with spare
capacity, initial placement of tasks is non-deterministic, and such
an artificial placement imbalance can persist for a long time,
hurting (and sometimes helping) performance.
The fix is to make fork-time task placement consistent with runtime
NUMA balancing placement.
Note that some performance regressions were reported against this,
caused by workloads that are not memory bandwith limited, which
benefit from the artificial locality of the placement bug(s). Mel
Gorman's conclusion, with which we concur, was that consistency is
better than random workload benefits from non-deterministic bugs:
"Given there is no crystal ball and it's a tradeoff, I think
it's better to be consistent and use similar logic at both fork
time and runtime even if it doesn't have universal benefit."
- Improve core scheduling by fixing a bug in
sched_core_update_cookie() that caused unnecessary forced idling.
- Improve wakeup-balancing by allowing same-LLC wakeup of idle CPUs
for newly woken tasks.
- Fix a newidle balancing bug that introduced unnecessary wakeup
latencies.
ABI improvements/fixes:
- Do not check capabilities and do not issue capability check denial
messages when a scheduler syscall doesn't require privileges. (Such
as increasing niceness.)
- Add forced-idle accounting to cgroups too.
- Fix/improve the RSEQ ABI to not just silently accept unknown flags.
(No existing tooling is known to have learned to rely on the
previous behavior.)
- Depreciate the (unused) RSEQ_CS_FLAG_NO_RESTART_ON_* flags.
Optimizations:
- Optimize & simplify leaf_cfs_rq_list()
- Micro-optimize set_nr_{and_not,if}_polling() via try_cmpxchg().
Misc fixes & cleanups:
- Fix the RSEQ self-tests on RISC-V and Glibc 2.35 systems.
- Fix a full-NOHZ bug that can in some cases result in the tick not
being re-enabled when the last SCHED_RT task is gone from a
runqueue but there's still SCHED_OTHER tasks around.
- Various PREEMPT_RT related fixes.
- Misc cleanups & smaller fixes"
* tag 'sched-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
rseq: Kill process when unknown flags are encountered in ABI structures
rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_* flags
sched/core: Fix the bug that task won't enqueue into core tree when update cookie
nohz/full, sched/rt: Fix missed tick-reenabling bug in dequeue_task_rt()
sched/core: Always flush pending blk_plug
sched/fair: fix case with reduced capacity CPU
sched/core: Use try_cmpxchg in set_nr_{and_not,if}_polling
sched/core: add forced idle accounting for cgroups
sched/fair: Remove the energy margin in feec()
sched/fair: Remove task_util from effective utilization in feec()
sched/fair: Use the same cpumask per-PD throughout find_energy_efficient_cpu()
sched/fair: Rename select_idle_mask to select_rq_mask
sched, drivers: Remove max param from effective_cpu_util()/sched_cpu_util()
sched/fair: Decay task PELT values during wakeup migration
sched/fair: Provide u64 read for 32-bits arch helper
sched/fair: Introduce SIS_UTIL to search idle CPU based on sum of util_avg
sched: only perform capability check on privileged operation
sched: Remove unused function group_first_cpu()
sched/fair: Remove redundant word " *"
selftests/rseq: check if libc rseq support is registered
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/cgroup-defs.h | 4 | ||||
-rw-r--r-- | include/linux/kernel_stat.h | 7 | ||||
-rw-r--r-- | include/linux/sched.h | 2 | ||||
-rw-r--r-- | include/linux/sched/rt.h | 8 | ||||
-rw-r--r-- | include/linux/sched/topology.h | 1 |
5 files changed, 13 insertions, 9 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index d4427d0a0e18..187b54a9567e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -288,6 +288,10 @@ struct css_set { struct cgroup_base_stat { struct task_cputime cputime; + +#ifdef CONFIG_SCHED_CORE + u64 forceidle_sum; +#endif }; /* diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 69ae6b278464..ddb5a358fd82 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,6 +28,9 @@ enum cpu_usage_stat { CPUTIME_STEAL, CPUTIME_GUEST, CPUTIME_GUEST_NICE, +#ifdef CONFIG_SCHED_CORE + CPUTIME_FORCEIDLE, +#endif NR_STATS, }; @@ -115,4 +118,8 @@ extern void account_process_tick(struct task_struct *, int user); extern void account_idle_ticks(unsigned long ticks); +#ifdef CONFIG_SCHED_CORE +extern void __account_forceidle_time(struct task_struct *tsk, u64 delta); +#endif + #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index c46f3a63b758..88b8817b827d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2257,7 +2257,7 @@ static inline bool owner_on_cpu(struct task_struct *owner) } /* Returns effective CPU energy utilization, as seen by the scheduler */ -unsigned long sched_cpu_util(int cpu, unsigned long max); +unsigned long sched_cpu_util(int cpu); #endif /* CONFIG_SMP */ #ifdef CONFIG_RSEQ diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index e5af028c08b4..994c25640e15 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -39,20 +39,12 @@ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) } extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); extern void rt_mutex_adjust_pi(struct task_struct *p); -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) -{ - return tsk->pi_blocked_on != NULL; -} #else static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; } # define rt_mutex_adjust_pi(p) do { } while (0) -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) -{ - return false; -} #endif extern void normalize_rt_tasks(void); diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 56cffe42abbc..816df6cc444e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -81,6 +81,7 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; + int nr_idle_scan; }; struct sched_domain { |