aboutsummaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
author Linus Torvalds 2020-08-03 14:58:38 -0700
committer Linus Torvalds 2020-08-03 14:58:38 -0700
commit e4cbce4d131753eca271d9d67f58c6377f27ad21 (patch)
tree e08e3c8836cd7b9f800e209131aed70897f5fe07 /include
parent b34133fec882d2717f2d61a2a010edd3422368c8 (diff)
parent 949bcb8135a96a6923e676646bd29cbe69e8350f (diff)
Merge tag 'sched-core-2020-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - Improve uclamp performance by using a static key for the fast path

 - Add the "sched_util_clamp_min_rt_default" sysctl, to optimize for
   better power efficiency of RT tasks on battery powered devices.
   (The default is to maximize performance & reduce RT latencies.)

 - Improve utime and stime tracking accuracy, which had a fixed boundary
   of error, which created larger and larger relative errors as the
   values become larger. This is now replaced with more precise
   arithmetics, using the new mul_u64_u64_div_u64() helper in math64.h.

 - Improve the deadline scheduler, such as making it capacity aware

 - Improve frequency-invariant scheduling

 - Misc cleanups in energy/power aware scheduling

 - Add sched_update_nr_running tracepoint to track changes to nr_running

 - Documentation additions and updates

 - Misc cleanups and smaller fixes

* tag 'sched-core-2020-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
  sched/doc: Factorize bits between sched-energy.rst & sched-capacity.rst
  sched/doc: Document capacity aware scheduling
  sched: Document arch_scale_*_capacity()
  arm, arm64: Fix selection of CONFIG_SCHED_THERMAL_PRESSURE
  Documentation/sysctl: Document uclamp sysctl knobs
  sched/uclamp: Add a new sysctl to control RT default boost value
  sched/uclamp: Fix a deadlock when enabling uclamp static key
  sched: Remove duplicated tick_nohz_full_enabled() check
  sched: Fix a typo in a comment
  sched/uclamp: Remove unnecessary mutex_init()
  arm, arm64: Select CONFIG_SCHED_THERMAL_PRESSURE
  sched: Cleanup SCHED_THERMAL_PRESSURE kconfig entry
  arch_topology, sched/core: Cleanup thermal pressure definition
  trace/events/sched.h: fix duplicated word
  linux/sched/mm.h: drop duplicated words in comments
  smp: Fix a potential usage of stale nr_cpus
  sched/fair: update_pick_idlest() Select group with lowest group_util when idle_cpus are equal
  sched: nohz: stop passing around unused "ticks" parameter.
  sched: Better document ttwu()
  sched: Add a tracepoint to track rq->nr_running
  ...
Diffstat (limited to 'include')
-rw-r--r--  include/asm-generic/vmlinux.lds.h  24
-rw-r--r--  include/linux/arch_topology.h  4
-rw-r--r--  include/linux/math64.h  2
-rw-r--r--  include/linux/psi_types.h  7
-rw-r--r--  include/linux/sched.h  25
-rw-r--r--  include/linux/sched/isolation.h  1
-rw-r--r--  include/linux/sched/loadavg.h  2
-rw-r--r--  include/linux/sched/mm.h  8
-rw-r--r--  include/linux/sched/sysctl.h  4
-rw-r--r--  include/linux/sched/task.h  1
-rw-r--r--  include/linux/sched/topology.h  17
-rw-r--r--  include/trace/events/sched.h  14
12 files changed, 86 insertions, 23 deletions
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 052e0f05a984..de8493cc3082 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -109,12 +109,31 @@
#endif
/*
- * Align to a 32 byte boundary equal to the
- * alignment gcc 4.5 uses for a struct
+ * GCC 4.5 and later have a 32 bytes section alignment for structures.
+ * Except GCC 4.9, that feels the need to align on 64 bytes.
*/
+#if __GNUC__ == 4 && __GNUC_MINOR__ == 9
+#define STRUCT_ALIGNMENT 64
+#else
#define STRUCT_ALIGNMENT 32
+#endif
#define STRUCT_ALIGN() . = ALIGN(STRUCT_ALIGNMENT)
+/*
+ * The order of the sched class addresses are important, as they are
+ * used to determine the order of the priority of each sched class in
+ * relation to each other.
+ */
+#define SCHED_DATA \
+ STRUCT_ALIGN(); \
+ __begin_sched_classes = .; \
+ *(__idle_sched_class) \
+ *(__fair_sched_class) \
+ *(__rt_sched_class) \
+ *(__dl_sched_class) \
+ *(__stop_sched_class) \
+ __end_sched_classes = .;
+
/* The actual configuration determine if the init/exit sections
* are handled as text/data or they can be discarded (which
* often happens at runtime)
@@ -389,6 +408,7 @@
.rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \
__start_rodata = .; \
*(.rodata) *(.rodata.*) \
+ SCHED_DATA \
RO_AFTER_INIT_DATA /* Read only after init */ \
. = ALIGN(8); \
__start___tracepoints_ptrs = .; \
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 0566cb3314ef..69b1dabe39dc 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -39,8 +39,8 @@ static inline unsigned long topology_get_thermal_pressure(int cpu)
return per_cpu(thermal_pressure, cpu);
}
-void arch_set_thermal_pressure(struct cpumask *cpus,
- unsigned long th_pressure);
+void topology_set_thermal_pressure(const struct cpumask *cpus,
+ unsigned long th_pressure);
struct cpu_topology {
int thread_id;
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 11a267413e8e..d097119419e6 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -263,6 +263,8 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
}
#endif /* mul_u64_u32_div */
+u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
+
#define DIV64_U64_ROUND_UP(ll, d) \
({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); })
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 4b7258495a04..b95f3211566a 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -153,9 +153,10 @@ struct psi_group {
unsigned long avg[NR_PSI_STATES - 1][3];
/* Monitor work control */
- atomic_t poll_scheduled;
- struct kthread_worker __rcu *poll_kworker;
- struct kthread_delayed_work poll_work;
+ struct task_struct __rcu *poll_task;
+ struct timer_list poll_timer;
+ wait_queue_head_t poll_wait;
+ atomic_t poll_wakeup;
/* Protects data used by the monitor */
struct mutex trigger_lock;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 060e9214c8b5..6d6683b48c2a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -155,24 +155,24 @@ struct task_group;
*
* for (;;) {
* set_current_state(TASK_UNINTERRUPTIBLE);
- * if (!need_sleep)
- * break;
+ * if (CONDITION)
+ * break;
*
* schedule();
* }
* __set_current_state(TASK_RUNNING);
*
* If the caller does not need such serialisation (because, for instance, the
- * condition test and condition change and wakeup are under the same lock) then
+ * CONDITION test and condition change and wakeup are under the same lock) then
* use __set_current_state().
*
* The above is typically ordered against the wakeup, which does:
*
- * need_sleep = false;
+ * CONDITION = 1;
* wake_up_state(p, TASK_UNINTERRUPTIBLE);
*
- * where wake_up_state() executes a full memory barrier before accessing the
- * task state.
+ * where wake_up_state()/try_to_wake_up() executes a full memory barrier before
+ * accessing p->state.
*
* Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
* once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
@@ -375,7 +375,7 @@ struct util_est {
* For cfs_rq, they are the aggregated values of all runnable and blocked
* sched_entities.
*
- * The load/runnable/util_avg doesn't direcly factor frequency scaling and CPU
+ * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
* capacity scaling. The scaling is done through the rq_clock_pelt that is used
* for computing those signals (see update_rq_clock_pelt())
*
@@ -687,9 +687,15 @@ struct task_struct {
struct sched_dl_entity dl;
#ifdef CONFIG_UCLAMP_TASK
- /* Clamp values requested for a scheduling entity */
+ /*
+ * Clamp values requested for a scheduling entity.
+ * Must be updated with task_rq_lock() held.
+ */
struct uclamp_se uclamp_req[UCLAMP_CNT];
- /* Effective clamp values used for a scheduling entity */
+ /*
+ * Effective clamp values used for a scheduling entity.
+ * Must be updated with task_rq_lock() held.
+ */
struct uclamp_se uclamp[UCLAMP_CNT];
#endif
@@ -2039,6 +2045,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
int sched_trace_rq_cpu(struct rq *rq);
+int sched_trace_rq_nr_running(struct rq *rq);
const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 0fbcbacd1b29..cc9f393e2a70 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -14,6 +14,7 @@ enum hk_flags {
HK_FLAG_DOMAIN = (1 << 5),
HK_FLAG_WQ = (1 << 6),
HK_FLAG_MANAGED_IRQ = (1 << 7),
+ HK_FLAG_KTHREAD = (1 << 8),
};
#ifdef CONFIG_CPU_ISOLATION
diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h
index 4859bea47a7b..83ec54b65e79 100644
--- a/include/linux/sched/loadavg.h
+++ b/include/linux/sched/loadavg.h
@@ -43,6 +43,6 @@ extern unsigned long calc_load_n(unsigned long load, unsigned long exp,
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
-extern void calc_global_load(unsigned long ticks);
+extern void calc_global_load(void);
#endif /* _LINUX_SCHED_LOADAVG_H */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 480a4d1b7dd8..6be66f52a2ad 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -23,7 +23,7 @@ extern struct mm_struct *mm_alloc(void);
* will still exist later on and mmget_not_zero() has to be used before
* accessing it.
*
- * This is a preferred way to to pin @mm for a longer/unbounded amount
+ * This is a preferred way to pin @mm for a longer/unbounded amount
* of time.
*
* Use mmdrop() to release the reference acquired by mmgrab().
@@ -49,8 +49,6 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
}
-void mmdrop(struct mm_struct *mm);
-
/*
* This has to be called after a get_task_mm()/mmget_not_zero()
* followed by taking the mmap_lock for writing before modifying the
@@ -234,7 +232,7 @@ static inline unsigned int memalloc_noio_save(void)
* @flags: Flags to restore.
*
* Ends the implicit GFP_NOIO scope started by memalloc_noio_save function.
- * Always make sure that that the given flags is the return value from the
+ * Always make sure that the given flags is the return value from the
* pairing memalloc_noio_save call.
*/
static inline void memalloc_noio_restore(unsigned int flags)
@@ -265,7 +263,7 @@ static inline unsigned int memalloc_nofs_save(void)
* @flags: Flags to restore.
*
* Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function.
- * Always make sure that that the given flags is the return value from the
+ * Always make sure that the given flags is the return value from the
* pairing memalloc_nofs_save call.
*/
static inline void memalloc_nofs_restore(unsigned int flags)
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 660ac49f2b53..3c31ba88aca5 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -61,9 +61,13 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
extern unsigned int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
+extern unsigned int sysctl_sched_dl_period_max;
+extern unsigned int sysctl_sched_dl_period_min;
+
#ifdef CONFIG_UCLAMP_TASK
extern unsigned int sysctl_sched_uclamp_util_min;
extern unsigned int sysctl_sched_uclamp_util_max;
+extern unsigned int sysctl_sched_uclamp_util_min_rt_default;
#endif
#ifdef CONFIG_CFS_BANDWIDTH
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 1301077f9c24..27b4fa454c80 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -55,6 +55,7 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);
extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
+extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);
void __noreturn do_task_dead(void);
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index fb11091129b3..820511289857 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -217,6 +217,16 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
#endif /* !CONFIG_SMP */
#ifndef arch_scale_cpu_capacity
+/**
+ * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU.
+ * @cpu: the CPU in question.
+ *
+ * Return: the CPU scale factor normalized against SCHED_CAPACITY_SCALE, i.e.
+ *
+ * max_perf(cpu)
+ * ----------------------------- * SCHED_CAPACITY_SCALE
+ * max(max_perf(c) : c \in CPUs)
+ */
static __always_inline
unsigned long arch_scale_cpu_capacity(int cpu)
{
@@ -232,6 +242,13 @@ unsigned long arch_scale_thermal_pressure(int cpu)
}
#endif
+#ifndef arch_set_thermal_pressure
+static __always_inline
+void arch_set_thermal_pressure(const struct cpumask *cpus,
+ unsigned long th_pressure)
+{ }
+#endif
+
static inline int task_node(const struct task_struct *p)
{
return cpu_to_node(task_cpu(p));
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ed168b0e2c53..fec25b9cfbaf 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -91,7 +91,7 @@ DEFINE_EVENT(sched_wakeup_template, sched_waking,
/*
* Tracepoint called when the task is actually woken; p->state == TASK_RUNNNG.
- * It it not always called from the waking context.
+ * It is not always called from the waking context.
*/
DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
TP_PROTO(struct task_struct *p),
@@ -634,6 +634,18 @@ DECLARE_TRACE(sched_overutilized_tp,
TP_PROTO(struct root_domain *rd, bool overutilized),
TP_ARGS(rd, overutilized));
+DECLARE_TRACE(sched_util_est_cfs_tp,
+ TP_PROTO(struct cfs_rq *cfs_rq),
+ TP_ARGS(cfs_rq));
+
+DECLARE_TRACE(sched_util_est_se_tp,
+ TP_PROTO(struct sched_entity *se),
+ TP_ARGS(se));
+
+DECLARE_TRACE(sched_update_nr_running_tp,
+ TP_PROTO(struct rq *rq, int change),
+ TP_ARGS(rq, change));
+
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */