author		Linus Torvalds	2023-10-30 13:12:15 -1000
committer	Linus Torvalds	2023-10-30 13:12:15 -1000
commit		63ce50fff9240d66cf3b59663f458f55ba6dcfcc (patch)
tree		c2613289116d6f80a50f534e9c5e4afc26064574 /include
parent		3cf3fabccb9dc821ffaec3ad6bf0cd6b278bd012 (diff)
parent		984ffb6a4366752c949f7b39640aecdce222607f (diff)
Merge tag 'sched-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 "Fair scheduler (SCHED_OTHER) improvements:

   - Remove the old and now unused SIS_PROP code & option

   - Scan cluster before LLC in the wake-up path

   - Use candidate prev/recent_used CPU if scanning failed for cluster
     wakeup

  NUMA scheduling improvements:

   - Improve the VMA access-PID code to better skip/scan VMAs

   - Extend tracing to cover VMA-skipping decisions

   - Improve/fix the recently introduced sched_numa_find_nth_cpu() code

   - Generalize numa_map_to_online_node()

  Energy scheduling improvements:

   - Remove the EM_MAX_COMPLEXITY limit

   - Add tracepoints to track energy computation

   - Make the behavior of the 'sched_energy_aware' sysctl more
     consistent

   - Consolidate and clean up access to a CPU's max compute capacity

   - Fix uclamp code corner cases

  RT scheduling improvements:

   - Drive dl_rq->overloaded with dl_rq->pushable_dl_tasks updates

   - Drive the ->rto_mask with rt_rq->pushable_tasks updates

  Scheduler scalability improvements:

   - Rate-limit updates to tg->load_avg

   - On x86 disable IBRS when CPU is offline to improve single-threaded
     performance

   - Micro-optimize in_task() and in_interrupt() (see the sketch after
     this message)

   - Micro-optimize the PSI code

   - Avoid updating PSI triggers and ->rtpoll_total when there are no
     state changes

  Core scheduler infrastructure improvements:

   - Use saved_state to reduce some spurious freezer wakeups

   - Bring in a handful of fast-headers improvements to scheduler
     headers

   - Make the scheduler UAPI headers more widely usable by user-space

   - Simplify the control flow of scheduler syscalls by using lock
     guards

   - Fix sched_setaffinity() vs. CPU hotplug race

  Scheduler debuggability improvements:

   - Disallow writing invalid values to sched_rt_period_us

   - Fix a race in the rq-clock debugging code triggering warnings

   - Fix a warning in the bandwidth distribution code

   - Micro-optimize in_atomic_preempt_off() checks

   - Enforce that the tasklist_lock is held in for_each_thread()

   - Print the TGID in sched_show_task()

   - Remove the /proc/sys/kernel/sched_child_runs_first sysctl

  ... and misc cleanups & fixes"
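The "micro-optimize in_task()" item folds the separate NMI/hardirq/softirq tests into a single preempt_count() load and mask on !PREEMPT_RT kernels (see the preempt.h hunk in the diff below). A minimal, hypothetical sketch of the pattern that benefits; example_alloc() is illustrative only, not code from this pull:

#include <linux/preempt.h>
#include <linux/slab.h>

/* Choose an allocation mode from the execution context; on
 * !PREEMPT_RT this in_task() test is now one load plus one mask. */
static void *example_alloc(size_t size)
{
	return kmalloc(size, in_task() ? GFP_KERNEL : GFP_ATOMIC);
}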
* tag 'sched-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (82 commits)
sched/fair: Remove SIS_PROP
sched/fair: Use candidate prev/recent_used CPU if scanning failed for cluster wakeup
sched/fair: Scan cluster before scanning LLC in wake-up path
sched: Add cpus_share_resources API
sched/core: Fix RQCF_ACT_SKIP leak
sched/fair: Remove unused 'curr' argument from pick_next_entity()
sched/nohz: Update comments about NEWILB_KICK
sched/fair: Remove duplicate #include
sched/psi: Update poll => rtpoll in relevant comments
sched: Make PELT acronym definition searchable
sched: Fix stop_one_cpu_nowait() vs hotplug
sched/psi: Bail out early from irq time accounting
sched/topology: Rename 'DIE' domain to 'PKG'
sched/psi: Delete the 'update_total' function parameter from update_triggers()
sched/psi: Avoid updating PSI triggers and ->rtpoll_total when there are no state changes
sched/headers: Remove comment referring to rq::cpu_load, since this has been removed
sched/numa: Complete scanning of inactive VMAs when there is no alternative
sched/numa: Complete scanning of partial VMAs regardless of PID activity
sched/numa: Move up the access pid reset logic
sched/numa: Trace decisions related to skipping VMAs
...
Diffstat (limited to 'include')
-rw-r--r--  include/linux/cpu.h                    2
-rw-r--r--  include/linux/list.h                   8
-rw-r--r--  include/linux/mm.h                     4
-rw-r--r--  include/linux/mm_types.h              31
-rw-r--r--  include/linux/numa.h                   7
-rw-r--r--  include/linux/preempt.h               15
-rw-r--r--  include/linux/sched.h                  7
-rw-r--r--  include/linux/sched/deadline.h         4
-rw-r--r--  include/linux/sched/numa_balancing.h  10
-rw-r--r--  include/linux/sched/sd_flags.h         7
-rw-r--r--  include/linux/sched/signal.h           3
-rw-r--r--  include/linux/sched/smt.h              2
-rw-r--r--  include/linux/sched/topology.h        10
-rw-r--r--  include/linux/sched/types.h            2
-rw-r--r--  include/linux/sched/vhost_task.h       7
-rw-r--r--  include/linux/topology.h               2
-rw-r--r--  include/trace/events/sched.h          57
-rw-r--r--  include/uapi/linux/sched/types.h       4
18 files changed, 156 insertions, 26 deletions
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index eb768a866fe3..fc8094419084 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -155,6 +155,8 @@ static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
 static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
 #endif /* !CONFIG_HOTPLUG_CPU */
 
+DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock())
+
 #ifdef CONFIG_PM_SLEEP_SMP
 extern int freeze_secondary_cpus(int primary);
 extern void thaw_secondary_cpus(void);
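The DEFINE_LOCK_GUARD_0() addition above hooks cpus_read_lock()/cpus_read_unlock() into the <linux/cleanup.h> scope-guard machinery, which is what lets the scheduler syscalls drop their goto-unlock paths. A minimal sketch of the pattern it enables; example_query_cpus() is a hypothetical caller, not code from this merge:

#include <linux/cleanup.h>
#include <linux/cpu.h>

static unsigned int example_query_cpus(void)
{
	/* Takes cpus_read_lock() here; cpus_read_unlock() runs
	 * automatically on every path that leaves this scope. */
	guard(cpus_read_lock)();

	return num_online_cpus();
}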
diff --git a/include/linux/list.h b/include/linux/list.h
index 164b4d0e9d2a..1837caedf723 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -687,6 +687,14 @@ static inline void list_splice_tail_init(struct list_head *list,
 	for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next)
 
 /**
+ * list_for_each_reverse - iterate backwards over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each_reverse(pos, head) \
+	for (pos = (head)->prev; pos != (head); pos = pos->prev)
+
+/**
  * list_for_each_rcu - Iterate over a list in an RCU-safe fashion
  * @pos:	the &struct list_head to use as a loop cursor.
  * @head:	the head for your list.
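A short usage sketch for the new list_for_each_reverse() helper; struct my_item and example_print_reverse() are hypothetical:

#include <linux/list.h>
#include <linux/printk.h>

struct my_item {
	int value;
	struct list_head node;
};

/* Walk a list tail-to-head, the mirror image of list_for_each(). */
static void example_print_reverse(struct list_head *head)
{
	struct list_head *pos;

	list_for_each_reverse(pos, head) {
		struct my_item *item = list_entry(pos, struct my_item, node);

		pr_info("value=%d\n", item->value);
	}
}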
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..19fc73b02c9f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1726,8 +1726,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 	unsigned int pid_bit;
 
 	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
-	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
-		__set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
+		__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 36c5b43999e6..589f31ef2e84 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -551,9 +551,36 @@ struct vma_lock {
 };
 
 struct vma_numab_state {
+	/*
+	 * Initialised as time in 'jiffies' after which VMA
+	 * should be scanned. Delays first scan of new VMA by at
+	 * least sysctl_numa_balancing_scan_delay:
+	 */
 	unsigned long next_scan;
-	unsigned long next_pid_reset;
-	unsigned long access_pids[2];
+
+	/*
+	 * Time in jiffies when pids_active[] is reset to
+	 * detect phase change behaviour:
+	 */
+	unsigned long pids_active_reset;
+
+	/*
+	 * Approximate tracking of PIDs that trapped a NUMA hinting
+	 * fault. May produce false positives due to hash collisions.
+	 *
+	 *	[0] Previous PID tracking
+	 *	[1] Current PID tracking
+	 *
+	 * Window moves after next_pid_reset has expired approximately
+	 * every VMA_PID_RESET_PERIOD jiffies:
+	 */
+	unsigned long pids_active[2];
+
+	/*
+	 * MM scan sequence ID when the VMA was last completely scanned.
+	 * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
+	 */
+	int prev_scan_seq;
 };
 
 /*
diff --git a/include/linux/numa.h b/include/linux/numa.h
index 59df211d051f..fb30a42f0700 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -25,7 +25,7 @@
 #include <asm/sparsemem.h>
 
 /* Generic implementation available */
-int numa_map_to_online_node(int node);
+int numa_nearest_node(int node, unsigned int state);
 
 #ifndef memory_add_physaddr_to_nid
 static inline int memory_add_physaddr_to_nid(u64 start)
@@ -44,10 +44,11 @@ static inline int phys_to_target_node(u64 start)
 }
 #endif
 #else /* !CONFIG_NUMA */
-static inline int numa_map_to_online_node(int node)
+static inline int numa_nearest_node(int node, unsigned int state)
 {
 	return NUMA_NO_NODE;
 }
+
 static inline int memory_add_physaddr_to_nid(u64 start)
 {
 	return 0;
@@ -58,6 +59,8 @@ static inline int phys_to_target_node(u64 start)
 }
 #endif
 
+#define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE)
+
 #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
 extern const struct attribute_group arch_node_dev_group;
 #endif
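The rename above generalizes the old helper: numa_nearest_node() accepts any node_states[] class, and numa_map_to_online_node() survives as the N_ONLINE special case. A hypothetical caller that wants a node with memory rather than one that is merely online:

#include <linux/numa.h>
#include <linux/nodemask.h>

/* Fall back from a memoryless (or offline) node to the nearest
 * node that actually has memory. */
static int example_pick_memory_node(int nid)
{
	return numa_nearest_node(nid, N_MEMORY);
}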
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 1424670df161..9aa6358a1a16 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -99,14 +99,21 @@ static __always_inline unsigned char interrupt_context_level(void)
 	return level;
 }
 
+/*
+ * These macro definitions avoid redundant invocations of preempt_count()
+ * because such invocations would result in redundant loads given that
+ * preempt_count() is commonly implemented with READ_ONCE().
+ */
+
 #define nmi_count()	(preempt_count() & NMI_MASK)
 #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
 #ifdef CONFIG_PREEMPT_RT
 # define softirq_count()	(current->softirq_disable_cnt & SOFTIRQ_MASK)
+# define irq_count()		((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
 #else
 # define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
+# define irq_count()		(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
 #endif
-#define irq_count()	(nmi_count() | hardirq_count() | softirq_count())
 
 /*
  * Macros to retrieve the current execution context:
@@ -119,7 +126,11 @@ static __always_inline unsigned char interrupt_context_level(void)
 #define in_nmi()		(nmi_count())
 #define in_hardirq()		(hardirq_count())
 #define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
-#define in_task()		(!(in_nmi() | in_hardirq() | in_serving_softirq()))
+#ifdef CONFIG_PREEMPT_RT
+# define in_task()		(!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq()))
+#else
+# define in_task()		(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+#endif
 
 /*
  * The following macros are deprecated and should not be used in new code:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6af88873499a..12ec109ce8c9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -63,7 +63,6 @@ struct robust_list_head;
 struct root_domain;
 struct rq;
 struct sched_attr;
-struct sched_param;
 struct seq_file;
 struct sighand_struct;
 struct signal_struct;
@@ -370,6 +369,10 @@ extern struct root_domain def_root_domain;
 extern struct mutex sched_domains_mutex;
 #endif
 
+struct sched_param {
+	int sched_priority;
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
@@ -750,10 +753,8 @@ struct task_struct {
 #endif
 	unsigned int			__state;
 
-#ifdef CONFIG_PREEMPT_RT
 	/* saved state for "spinlock sleepers" */
 	unsigned int			saved_state;
-#endif
 
 	/*
	 * This begins the randomizable portion of task_struct. Only
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 7c83d4d5a971..df3aca89d4f5 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_DEADLINE_H
+#define _LINUX_SCHED_DEADLINE_H
 
 /*
  * SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -34,3 +36,5 @@ extern void dl_add_task_root_domain(struct task_struct *p);
 extern void dl_clear_root_domain(struct root_domain *rd);
 
 #endif /* CONFIG_SMP */
+
+#endif /* _LINUX_SCHED_DEADLINE_H */
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762efe15..b69afb8630db 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -15,6 +15,16 @@
 #define TNF_FAULT_LOCAL	0x08
 #define TNF_MIGRATE_FAIL 0x10
 
+enum numa_vmaskip_reason {
+	NUMAB_SKIP_UNSUITABLE,
+	NUMAB_SKIP_SHARED_RO,
+	NUMAB_SKIP_INACCESSIBLE,
+	NUMAB_SKIP_SCAN_DELAY,
+	NUMAB_SKIP_PID_INACTIVE,
+	NUMAB_SKIP_IGNORE_PID,
+	NUMAB_SKIP_SEQ_COMPLETED,
+};
+
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index fad77b5172e2..a8b28647aafc 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -110,6 +110,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
 SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
 /*
+ * Domain members share CPU cluster (LLC tags or L2 cache)
+ *
+ * NEEDS_GROUPS: Clusters are shared between groups.
+ */
+SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
+
+/*
  * Domain members share CPU package resources (i.e. caches)
  *
  * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0014d3adaf84..9610bad018a3 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -656,7 +656,8 @@ extern bool current_is_single_threaded(void);
 		while ((t = next_thread(t)) != g)
 
 #define __for_each_thread(signal, t)	\
-	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
+	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
+				lockdep_is_held(&tasklist_lock))
 
 #define for_each_thread(p, t)		\
 	__for_each_thread((p)->signal, t)
diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h
index 59d3736c454c..fb1e295e7e63 100644
--- a/include/linux/sched/smt.h
+++ b/include/linux/sched/smt.h
@@ -17,4 +17,4 @@ static inline bool sched_smt_active(void) { return false; }
 
 void arch_smt_update(void);
 
-#endif
+#endif /* _LINUX_SCHED_SMT_H */
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 67b573d5bf28..de545ba85218 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void)
 #ifdef CONFIG_SCHED_CLUSTER
 static inline int cpu_cluster_flags(void)
 {
-	return SD_SHARE_PKG_RESOURCES;
+	return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
 }
 #endif
 
@@ -109,8 +109,6 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
-	u64 avg_scan_cost;		/* select_idle_sibling */
-
 #ifdef CONFIG_SCHEDSTATS
 	/* load_balance() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@ -179,6 +177,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
 bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_resources(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
@@ -232,6 +231,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 	return true;
 }
 
+static inline bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+	return true;
+}
+
 #endif /* !CONFIG_SMP */
 
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
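cpus_share_resources() complements cpus_share_cache(): with CONFIG_SCHED_CLUSTER it is expected to be true only for CPUs in the same cluster (shared L2 or LLC tags), a smaller set than the whole LLC. A hypothetical wakeup-style check; example_is_cheap_wakeup() is illustrative only:

#include <linux/sched/topology.h>

/* Cheapest first: same cluster, then anywhere in the shared LLC. */
static bool example_is_cheap_wakeup(int this_cpu, int target_cpu)
{
	return cpus_share_resources(this_cpu, target_cpu) ||
	       cpus_share_cache(this_cpu, target_cpu);
}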
diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h
index 3c3e049224ae..969aaf5ef9d6 100644
--- a/include/linux/sched/types.h
+++ b/include/linux/sched/types.h
@@ -20,4 +20,4 @@ struct task_cputime {
 	unsigned long long		sum_exec_runtime;
 };
 
-#endif
+#endif /* _LINUX_SCHED_TYPES_H */
diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h
index 837a23624a66..bc60243d43b3 100644
--- a/include/linux/sched/vhost_task.h
+++ b/include/linux/sched/vhost_task.h
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_VHOST_TASK_H
-#define _LINUX_VHOST_TASK_H
-
+#ifndef _LINUX_SCHED_VHOST_TASK_H
+#define _LINUX_SCHED_VHOST_TASK_H
 
 struct vhost_task;
 
@@ -11,4 +10,4 @@ void vhost_task_start(struct vhost_task *vtsk);
 void vhost_task_stop(struct vhost_task *vtsk);
 void vhost_task_wake(struct vhost_task *vtsk);
 
-#endif
+#endif /* _LINUX_SCHED_VHOST_TASK_H */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index fea32377f7c7..52f5850730b3 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -251,7 +251,7 @@ extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int
 #else
 static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 {
-	return cpumask_nth(cpu, cpus);
+	return cpumask_nth_and(cpu, cpus, cpu_online_mask);
 }
 
 static inline const struct cpumask *
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..6188ad0d9e0d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -664,6 +664,58 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
 	TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
 );
 
+#ifdef CONFIG_NUMA_BALANCING
+#define NUMAB_SKIP_REASON					\
+	EM( NUMAB_SKIP_UNSUITABLE,		"unsuitable" )	\
+	EM( NUMAB_SKIP_SHARED_RO,		"shared_ro" )	\
+	EM( NUMAB_SKIP_INACCESSIBLE,		"inaccessible" )	\
+	EM( NUMAB_SKIP_SCAN_DELAY,		"scan_delay" )	\
+	EM( NUMAB_SKIP_PID_INACTIVE,		"pid_inactive" )	\
+	EM( NUMAB_SKIP_IGNORE_PID,		"ignore_pid_inactive" )	\
+	EMe(NUMAB_SKIP_SEQ_COMPLETED,		"seq_completed" )
+
+/* Redefine for export. */
+#undef EM
+#undef EMe
+#define EM(a, b)	TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
+
+NUMAB_SKIP_REASON
+
+/* Redefine for symbolic printing. */
+#undef EM
+#undef EMe
+#define EM(a, b)	{ a, b },
+#define EMe(a, b)	{ a, b }
+
+TRACE_EVENT(sched_skip_vma_numa,
+
+	TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma,
+		 enum numa_vmaskip_reason reason),
+
+	TP_ARGS(mm, vma, reason),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, numa_scan_offset)
+		__field(unsigned long, vm_start)
+		__field(unsigned long, vm_end)
+		__field(enum numa_vmaskip_reason, reason)
+	),
+
+	TP_fast_assign(
+		__entry->numa_scan_offset	= mm->numa_scan_offset;
+		__entry->vm_start		= vma->vm_start;
+		__entry->vm_end			= vma->vm_end;
+		__entry->reason			= reason;
+	),
+
+	TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s",
+		  __entry->numa_scan_offset,
+		  __entry->vm_start,
+		  __entry->vm_end,
+		  __print_symbolic(__entry->reason, NUMAB_SKIP_REASON))
+);
+#endif /* CONFIG_NUMA_BALANCING */
 
 /*
  * Tracepoint for waking a polling cpu without an IPI.
@@ -735,6 +787,11 @@ DECLARE_TRACE(sched_update_nr_running_tp,
 	TP_PROTO(struct rq *rq, int change),
 	TP_ARGS(rq, change));
 
+DECLARE_TRACE(sched_compute_energy_tp,
+	TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy,
+		 unsigned long max_util, unsigned long busy_time),
+	TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index f2c4589d4dbf..90662385689b 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -4,10 +4,6 @@
 
 #include <linux/types.h>
 
-struct sched_param {
-	int sched_priority;
-};
-
 #define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
 #define SCHED_ATTR_SIZE_VER1	56	/* add: util_{min,max} */
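The uapi removal above pairs with the kernel-internal struct sched_param now defined in <linux/sched.h>; userspace is expected to keep taking the definition from libc. A minimal userspace sketch showing the syscall API is unchanged:

#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	/* Needs appropriate privileges (e.g. CAP_SYS_NICE) to succeed. */
	if (sched_setscheduler(0, SCHED_FIFO, &sp))
		perror("sched_setscheduler");

	return 0;
}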