diff options
author | Linus Torvalds | 2020-08-03 14:58:38 -0700 |
---|---|---|
committer | Linus Torvalds | 2020-08-03 14:58:38 -0700 |
commit | e4cbce4d131753eca271d9d67f58c6377f27ad21 (patch) | |
tree | e08e3c8836cd7b9f800e209131aed70897f5fe07 /include | |
parent | b34133fec882d2717f2d61a2a010edd3422368c8 (diff) | |
parent | 949bcb8135a96a6923e676646bd29cbe69e8350f (diff) |
Merge tag 'sched-core-2020-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Improve uclamp performance by using a static key for the fast path
- Add the "sched_util_clamp_min_rt_default" sysctl, to optimize for
better power efficiency of RT tasks on battery powered devices.
(The default is to maximize performance & reduce RT latencies.)
- Improve utime and stime tracking accuracy, which had a fixed boundary
of error, which created larger and larger relative errors as the
values become larger. This is now replaced with more precise
arithmetics, using the new mul_u64_u64_div_u64() helper in math64.h.
- Improve the deadline scheduler, such as making it capacity aware
- Improve frequency-invariant scheduling
- Misc cleanups in energy/power aware scheduling
- Add sched_update_nr_running tracepoint to track changes to nr_running
- Documentation additions and updates
- Misc cleanups and smaller fixes
* tag 'sched-core-2020-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
sched/doc: Factorize bits between sched-energy.rst & sched-capacity.rst
sched/doc: Document capacity aware scheduling
sched: Document arch_scale_*_capacity()
arm, arm64: Fix selection of CONFIG_SCHED_THERMAL_PRESSURE
Documentation/sysctl: Document uclamp sysctl knobs
sched/uclamp: Add a new sysctl to control RT default boost value
sched/uclamp: Fix a deadlock when enabling uclamp static key
sched: Remove duplicated tick_nohz_full_enabled() check
sched: Fix a typo in a comment
sched/uclamp: Remove unnecessary mutex_init()
arm, arm64: Select CONFIG_SCHED_THERMAL_PRESSURE
sched: Cleanup SCHED_THERMAL_PRESSURE kconfig entry
arch_topology, sched/core: Cleanup thermal pressure definition
trace/events/sched.h: fix duplicated word
linux/sched/mm.h: drop duplicated words in comments
smp: Fix a potential usage of stale nr_cpus
sched/fair: update_pick_idlest() Select group with lowest group_util when idle_cpus are equal
sched: nohz: stop passing around unused "ticks" parameter.
sched: Better document ttwu()
sched: Add a tracepoint to track rq->nr_running
...
Diffstat (limited to 'include')
-rw-r--r-- | include/asm-generic/vmlinux.lds.h | 24 | ||||
-rw-r--r-- | include/linux/arch_topology.h | 4 | ||||
-rw-r--r-- | include/linux/math64.h | 2 | ||||
-rw-r--r-- | include/linux/psi_types.h | 7 | ||||
-rw-r--r-- | include/linux/sched.h | 25 | ||||
-rw-r--r-- | include/linux/sched/isolation.h | 1 | ||||
-rw-r--r-- | include/linux/sched/loadavg.h | 2 | ||||
-rw-r--r-- | include/linux/sched/mm.h | 8 | ||||
-rw-r--r-- | include/linux/sched/sysctl.h | 4 | ||||
-rw-r--r-- | include/linux/sched/task.h | 1 | ||||
-rw-r--r-- | include/linux/sched/topology.h | 17 | ||||
-rw-r--r-- | include/trace/events/sched.h | 14 |
12 files changed, 86 insertions, 23 deletions
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 052e0f05a984..de8493cc3082 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -109,12 +109,31 @@ #endif /* - * Align to a 32 byte boundary equal to the - * alignment gcc 4.5 uses for a struct + * GCC 4.5 and later have a 32 bytes section alignment for structures. + * Except GCC 4.9, that feels the need to align on 64 bytes. */ +#if __GNUC__ == 4 && __GNUC_MINOR__ == 9 +#define STRUCT_ALIGNMENT 64 +#else #define STRUCT_ALIGNMENT 32 +#endif #define STRUCT_ALIGN() . = ALIGN(STRUCT_ALIGNMENT) +/* + * The order of the sched class addresses are important, as they are + * used to determine the order of the priority of each sched class in + * relation to each other. + */ +#define SCHED_DATA \ + STRUCT_ALIGN(); \ + __begin_sched_classes = .; \ + *(__idle_sched_class) \ + *(__fair_sched_class) \ + *(__rt_sched_class) \ + *(__dl_sched_class) \ + *(__stop_sched_class) \ + __end_sched_classes = .; + /* The actual configuration determine if the init/exit sections * are handled as text/data or they can be discarded (which * often happens at runtime) @@ -389,6 +408,7 @@ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ __start_rodata = .; \ *(.rodata) *(.rodata.*) \ + SCHED_DATA \ RO_AFTER_INIT_DATA /* Read only after init */ \ . = ALIGN(8); \ __start___tracepoints_ptrs = .; \ diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 0566cb3314ef..69b1dabe39dc 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -39,8 +39,8 @@ static inline unsigned long topology_get_thermal_pressure(int cpu) return per_cpu(thermal_pressure, cpu); } -void arch_set_thermal_pressure(struct cpumask *cpus, - unsigned long th_pressure); +void topology_set_thermal_pressure(const struct cpumask *cpus, + unsigned long th_pressure); struct cpu_topology { int thread_id; diff --git a/include/linux/math64.h b/include/linux/math64.h index 11a267413e8e..d097119419e6 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -263,6 +263,8 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) } #endif /* mul_u64_u32_div */ +u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); + #define DIV64_U64_ROUND_UP(ll, d) \ ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 4b7258495a04..b95f3211566a 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -153,9 +153,10 @@ struct psi_group { unsigned long avg[NR_PSI_STATES - 1][3]; /* Monitor work control */ - atomic_t poll_scheduled; - struct kthread_worker __rcu *poll_kworker; - struct kthread_delayed_work poll_work; + struct task_struct __rcu *poll_task; + struct timer_list poll_timer; + wait_queue_head_t poll_wait; + atomic_t poll_wakeup; /* Protects data used by the monitor */ struct mutex trigger_lock; diff --git a/include/linux/sched.h b/include/linux/sched.h index 060e9214c8b5..6d6683b48c2a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -155,24 +155,24 @@ struct task_group; * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); - * if (!need_sleep) - * break; + * if (CONDITION) + * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the - * condition test and condition change and wakeup are under the same lock) then + * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * - * need_sleep = false; + * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * - * where wake_up_state() executes a full memory barrier before accessing the - * task state. + * where wake_up_state()/try_to_wake_up() executes a full memory barrier before + * accessing p->state. * * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a @@ -375,7 +375,7 @@ struct util_est { * For cfs_rq, they are the aggregated values of all runnable and blocked * sched_entities. * - * The load/runnable/util_avg doesn't direcly factor frequency scaling and CPU + * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU * capacity scaling. The scaling is done through the rq_clock_pelt that is used * for computing those signals (see update_rq_clock_pelt()) * @@ -687,9 +687,15 @@ struct task_struct { struct sched_dl_entity dl; #ifdef CONFIG_UCLAMP_TASK - /* Clamp values requested for a scheduling entity */ + /* + * Clamp values requested for a scheduling entity. + * Must be updated with task_rq_lock() held. + */ struct uclamp_se uclamp_req[UCLAMP_CNT]; - /* Effective clamp values used for a scheduling entity */ + /* + * Effective clamp values used for a scheduling entity. + * Must be updated with task_rq_lock() held. + */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif @@ -2039,6 +2045,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq); const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq); int sched_trace_rq_cpu(struct rq *rq); +int sched_trace_rq_nr_running(struct rq *rq); const struct cpumask *sched_trace_rd_span(struct root_domain *rd); diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 0fbcbacd1b29..cc9f393e2a70 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -14,6 +14,7 @@ enum hk_flags { HK_FLAG_DOMAIN = (1 << 5), HK_FLAG_WQ = (1 << 6), HK_FLAG_MANAGED_IRQ = (1 << 7), + HK_FLAG_KTHREAD = (1 << 8), }; #ifdef CONFIG_CPU_ISOLATION diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h index 4859bea47a7b..83ec54b65e79 100644 --- a/include/linux/sched/loadavg.h +++ b/include/linux/sched/loadavg.h @@ -43,6 +43,6 @@ extern unsigned long calc_load_n(unsigned long load, unsigned long exp, #define LOAD_INT(x) ((x) >> FSHIFT) #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) -extern void calc_global_load(unsigned long ticks); +extern void calc_global_load(void); #endif /* _LINUX_SCHED_LOADAVG_H */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 480a4d1b7dd8..6be66f52a2ad 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -23,7 +23,7 @@ extern struct mm_struct *mm_alloc(void); * will still exist later on and mmget_not_zero() has to be used before * accessing it. * - * This is a preferred way to to pin @mm for a longer/unbounded amount + * This is a preferred way to pin @mm for a longer/unbounded amount * of time. * * Use mmdrop() to release the reference acquired by mmgrab(). @@ -49,8 +49,6 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } -void mmdrop(struct mm_struct *mm); - /* * This has to be called after a get_task_mm()/mmget_not_zero() * followed by taking the mmap_lock for writing before modifying the @@ -234,7 +232,7 @@ static inline unsigned int memalloc_noio_save(void) * @flags: Flags to restore. * * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function. - * Always make sure that that the given flags is the return value from the + * Always make sure that the given flags is the return value from the * pairing memalloc_noio_save call. */ static inline void memalloc_noio_restore(unsigned int flags) @@ -265,7 +263,7 @@ static inline unsigned int memalloc_nofs_save(void) * @flags: Flags to restore. * * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function. - * Always make sure that that the given flags is the return value from the + * Always make sure that the given flags is the return value from the * pairing memalloc_nofs_save call. */ static inline void memalloc_nofs_restore(unsigned int flags) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 660ac49f2b53..3c31ba88aca5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -61,9 +61,13 @@ int sched_proc_update_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; +extern unsigned int sysctl_sched_dl_period_max; +extern unsigned int sysctl_sched_dl_period_min; + #ifdef CONFIG_UCLAMP_TASK extern unsigned int sysctl_sched_uclamp_util_min; extern unsigned int sysctl_sched_uclamp_util_max; +extern unsigned int sysctl_sched_uclamp_util_min_rt_default; #endif #ifdef CONFIG_CFS_BANDWIDTH diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 1301077f9c24..27b4fa454c80 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -55,6 +55,7 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); +extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index fb11091129b3..820511289857 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -217,6 +217,16 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) #endif /* !CONFIG_SMP */ #ifndef arch_scale_cpu_capacity +/** + * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU. + * @cpu: the CPU in question. + * + * Return: the CPU scale factor normalized against SCHED_CAPACITY_SCALE, i.e. + * + * max_perf(cpu) + * ----------------------------- * SCHED_CAPACITY_SCALE + * max(max_perf(c) : c \in CPUs) + */ static __always_inline unsigned long arch_scale_cpu_capacity(int cpu) { @@ -232,6 +242,13 @@ unsigned long arch_scale_thermal_pressure(int cpu) } #endif +#ifndef arch_set_thermal_pressure +static __always_inline +void arch_set_thermal_pressure(const struct cpumask *cpus, + unsigned long th_pressure) +{ } +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ed168b0e2c53..fec25b9cfbaf 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -91,7 +91,7 @@ DEFINE_EVENT(sched_wakeup_template, sched_waking, /* * Tracepoint called when the task is actually woken; p->state == TASK_RUNNNG. - * It it not always called from the waking context. + * It is not always called from the waking context. */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup, TP_PROTO(struct task_struct *p), @@ -634,6 +634,18 @@ DECLARE_TRACE(sched_overutilized_tp, TP_PROTO(struct root_domain *rd, bool overutilized), TP_ARGS(rd, overutilized)); +DECLARE_TRACE(sched_util_est_cfs_tp, + TP_PROTO(struct cfs_rq *cfs_rq), + TP_ARGS(cfs_rq)); + +DECLARE_TRACE(sched_util_est_se_tp, + TP_PROTO(struct sched_entity *se), + TP_ARGS(se)); + +DECLARE_TRACE(sched_update_nr_running_tp, + TP_PROTO(struct rq *rq, int change), + TP_ARGS(rq, change)); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ |