44 files changed, 1634 insertions, 553 deletions
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index cbc1b46cbf70..e89e36ec15a5 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -7,6 +7,8 @@ CONTENTS 0. WARNING 1. Overview 2. Scheduling algorithm + 2.1 Main algorithm + 2.2 Bandwidth reclaiming 3. Scheduling Real-Time Tasks 3.1 Definitions 3.2 Schedulability Analysis for Uniprocessor Systems @@ -44,6 +46,9 @@ CONTENTS 2. Scheduling algorithm ================== +2.1 Main algorithm +------------------ + SCHED_DEADLINE uses three parameters, named "runtime", "period", and "deadline", to schedule tasks. A SCHED_DEADLINE task should receive "runtime" microseconds of execution time every "period" microseconds, and @@ -113,6 +118,160 @@ CONTENTS remaining runtime = remaining runtime + runtime +2.2 Bandwidth reclaiming +------------------------ + + Bandwidth reclaiming for deadline tasks is based on the GRUB (Greedy + Reclamation of Unused Bandwidth) algorithm [15, 16, 17] and it is enabled + when flag SCHED_FLAG_RECLAIM is set. + + The following diagram illustrates the state names for tasks handled by GRUB: + + ------------ + (d) | Active | + ------------->| | + | | Contending | + | ------------ + | A | + ---------- | | + | | | | + | Inactive | |(b) | (a) + | | | | + ---------- | | + A | V + | ------------ + | | Active | + --------------| Non | + (c) | Contending | + ------------ + + A task can be in one of the following states: + + - ActiveContending: if it is ready for execution (or executing); + + - ActiveNonContending: if it just blocked and has not yet surpassed the 0-lag + time; + + - Inactive: if it is blocked and has surpassed the 0-lag time. + + State transitions: + + (a) When a task blocks, it does not become immediately inactive since its + bandwidth cannot be immediately reclaimed without breaking the + real-time guarantees. It therefore enters a transitional state called + ActiveNonContending. The scheduler arms the "inactive timer" to fire at + the 0-lag time, when the task's bandwidth can be reclaimed without + breaking the real-time guarantees. + + The 0-lag time for a task entering the ActiveNonContending state is + computed as + + (runtime * dl_period) + deadline - --------------------- + dl_runtime + + where runtime is the remaining runtime, while dl_runtime and dl_period + are the reservation parameters. + + (b) If the task wakes up before the inactive timer fires, the task re-enters + the ActiveContending state and the "inactive timer" is canceled. + In addition, if the task wakes up on a different runqueue, then + the task's utilization must be removed from the previous runqueue's active + utilization and must be added to the new runqueue's active utilization. + In order to avoid races between a task waking up on a runqueue while the + "inactive timer" is running on a different CPU, the "dl_non_contending" + flag is used to indicate that a task is not on a runqueue but is active + (so, the flag is set when the task blocks and is cleared when the + "inactive timer" fires or when the task wakes up). + + (c) When the "inactive timer" fires, the task enters the Inactive state and + its utilization is removed from the runqueue's active utilization. + + (d) When an inactive task wakes up, it enters the ActiveContending state and + its utilization is added to the active utilization of the runqueue where + it has been enqueued. 
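To make the 0-lag time from transition (a) concrete, here is a minimal C sketch of the computation (illustrative only; the names are hypothetical, while the kernel's task_non_contending() does the same arithmetic on nanosecond values using div64_long()):

	#include <stdint.h>

	/*
	 * 0-lag time = deadline - (remaining runtime * dl_period) / dl_runtime
	 * All arguments are assumed to be in the same time unit and
	 * dl_runtime is assumed to be non-zero.
	 */
	static inline int64_t zero_lag_time(int64_t deadline, uint64_t remaining_runtime,
					    uint64_t dl_period, uint64_t dl_runtime)
	{
		return deadline - (int64_t)((remaining_runtime * dl_period) / dl_runtime);
	}

	/*
	 * With remaining runtime 2, dl_runtime 4, dl_period 8 and an absolute
	 * deadline of 8, the 0-lag time is 8 - (2 * 8) / 4 = 4, which matches
	 * the two-task example shown later in this section.
	 */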
+ + For each runqueue, the GRUB algorithm keeps track of two different bandwidths: + + - Active bandwidth (running_bw): this is the sum of the bandwidths of all + tasks in active state (i.e., ActiveContending or ActiveNonContending); + + - Total bandwidth (this_bw): this is the sum of the bandwidths of all tasks + "belonging" to the runqueue, including the tasks in Inactive state. + + + The algorithm reclaims the bandwidth of the tasks in Inactive state. + It does so by decrementing the runtime of the executing task Ti at a pace equal + to + + dq = -max{ Ui, (1 - Uinact) } dt + + where Uinact is the inactive utilization, computed as (this_bw - running_bw), + and Ui is the bandwidth of task Ti. + + + Let's now see a trivial example of two deadline tasks with runtime equal + to 4 and period equal to 8 (i.e., bandwidth equal to 0.5): + + A Task T1 + | + | | + | | + |-------- |---- + | | V + |---|---|---|---|---|---|---|---|--------->t + 0 1 2 3 4 5 6 7 8 + + + A Task T2 + | + | | + | | + | ------------------------| + | | V + |---|---|---|---|---|---|---|---|--------->t + 0 1 2 3 4 5 6 7 8 + + + A running_bw + | + 1 ----------------- ------ + | | | + 0.5- ----------------- + | | + |---|---|---|---|---|---|---|---|--------->t + 0 1 2 3 4 5 6 7 8 + + + - Time t = 0: + + Both tasks are ready for execution and therefore in the ActiveContending state. + Suppose Task T1 is the first task to start execution. + Since there are no inactive tasks, its runtime is decreased as dq = -1 dt. + + - Time t = 2: + + Suppose that task T1 blocks. + Task T1 therefore enters the ActiveNonContending state. Since its remaining + runtime is equal to 2, its 0-lag time is equal to t = 4. + Task T2 starts execution, with runtime still decreased as dq = -1 dt since + there are no inactive tasks. + + - Time t = 4: + + This is the 0-lag time for Task T1. Since it did not wake up in the + meantime, it enters the Inactive state. Its bandwidth is removed from + running_bw. + Task T2 continues its execution. However, its runtime is now decreased as + dq = -0.5 dt because Uinact = 0.5. + Task T2 therefore reclaims the bandwidth unused by Task T1. + + - Time t = 8: + + Task T1 wakes up. It enters the ActiveContending state again, and + running_bw is incremented. + + 3. Scheduling Real-Time Tasks ============================= @@ -330,6 +489,15 @@ CONTENTS 14 - J. Erickson, U. Devi and S. Baruah. Improved tardiness bounds for Global EDF. Proceedings of the 22nd Euromicro Conference on Real-Time Systems, 2010. + 15 - G. Lipari, S. Baruah, Greedy reclamation of unused bandwidth in + constant-bandwidth servers, 12th IEEE Euromicro Conference on Real-Time + Systems, 2000. + 16 - L. Abeni, J. Lelli, C. Scordino, L. Palopoli, Greedy CPU reclaiming for + SCHED DEADLINE. In Proceedings of the Real-Time Linux Workshop (RTLWS), + Dusseldorf, Germany, 2014. + 17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel + or sequential?. In Proceedings of the 31st Annual ACM Symposium on Applied + Computing, 2016. 4.
Bandwidth management diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 572a8df1b766..c9a0a5299827 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -555,8 +555,7 @@ static DEFINE_RAW_SPINLOCK(stop_lock); */ static void ipi_cpu_stop(unsigned int cpu) { - if (system_state == SYSTEM_BOOTING || - system_state == SYSTEM_RUNNING) { + if (system_state <= SYSTEM_RUNNING) { raw_spin_lock(&stop_lock); pr_crit("CPU%u: stopping\n", cpu); dump_stack(); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 6e0e16a3a7d4..321119881abf 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -961,8 +961,7 @@ void smp_send_stop(void) cpumask_copy(&mask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &mask); - if (system_state == SYSTEM_BOOTING || - system_state == SYSTEM_RUNNING) + if (system_state <= SYSTEM_RUNNING) pr_crit("SMP: stopping secondary CPUs\n"); smp_cross_call(&mask, IPI_CPU_STOP); } diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c index 232a12bf3f99..2dbbb7c66043 100644 --- a/arch/metag/kernel/smp.c +++ b/arch/metag/kernel/smp.c @@ -567,8 +567,7 @@ static void stop_this_cpu(void *data) { unsigned int cpu = smp_processor_id(); - if (system_state == SYSTEM_BOOTING || - system_state == SYSTEM_RUNNING) { + if (system_state <= SYSTEM_RUNNING) { spin_lock(&stop_lock); pr_crit("CPU%u: stopping\n", cpu); dump_stack(); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index df2a41647d8e..1069f74fca47 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -97,7 +97,7 @@ int smp_generic_cpu_bootable(unsigned int nr) /* Special case - we inhibit secondary thread startup * during boot if the user requests it. */ - if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) { + if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) return 0; if (smt_enabled_at_boot diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 580b60f5ac83..d3990462582c 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2255,7 +2255,7 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { - struct cyc2ns_data *data; + struct cyc2ns_data data; u64 offset; userpg->cap_user_time = 0; @@ -2267,17 +2267,17 @@ void arch_perf_update_userpage(struct perf_event *event, if (!using_native_sched_clock() || !sched_clock_stable()) return; - data = cyc2ns_read_begin(); + cyc2ns_read_begin(&data); - offset = data->cyc2ns_offset + __sched_clock_offset; + offset = data.cyc2ns_offset + __sched_clock_offset; /* * Internal timekeeping for enabled/running/stopped times * is always in the local_clock domain. 
*/ userpg->cap_user_time = 1; - userpg->time_mult = data->cyc2ns_mul; - userpg->time_shift = data->cyc2ns_shift; + userpg->time_mult = data.cyc2ns_mul; + userpg->time_shift = data.cyc2ns_shift; userpg->time_offset = offset - now; /* @@ -2289,7 +2289,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->time_zero = offset; } - cyc2ns_read_end(data); + cyc2ns_read_end(); } void diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 27e9f9d769b8..2016962103df 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -29,11 +29,9 @@ struct cyc2ns_data { u32 cyc2ns_mul; u32 cyc2ns_shift; u64 cyc2ns_offset; - u32 __count; - /* u32 hole */ -}; /* 24 bytes -- do not grow */ +}; /* 16 bytes */ -extern struct cyc2ns_data *cyc2ns_read_begin(void); -extern void cyc2ns_read_end(struct cyc2ns_data *); +extern void cyc2ns_read_begin(struct cyc2ns_data *); +extern void cyc2ns_read_end(void); #endif /* _ASM_X86_TIMER_H */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f04479a8f74f..045e4f993bd2 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int apicid) if (cpu == 1) printk(KERN_INFO "x86: Booting SMP configuration:\n"); - if (system_state == SYSTEM_BOOTING) { + if (system_state < SYSTEM_RUNNING) { if (node != current_node) { if (current_node > (-1)) pr_cont("\n"); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 714dfba6a1e7..5270fc0c2df6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator; static u64 art_to_tsc_offset; struct clocksource *art_related_clocksource; -/* - * Use a ring-buffer like data structure, where a writer advances the head by - * writing a new data entry and a reader advances the tail when it observes a - * new entry. - * - * Writers are made to wait on readers until there's space to write a new - * entry. - * - * This means that we can always use an {offset, mul} pair to compute a ns - * value that is 'roughly' in the right direction, even if we're writing a new - * {offset, mul} pair during the clock read. - * - * The down-side is that we can no longer guarantee strict monotonicity anymore - * (assuming the TSC was that to begin with), because while we compute the - * intersection point of the two clock slopes and make sure the time is - * continuous at the point of switching; we can no longer guarantee a reader is - * strictly before or after the switch point. - * - * It does mean a reader no longer needs to disable IRQs in order to avoid - * CPU-Freq updates messing with his times, and similarly an NMI reader will - * no longer run the risk of hitting half-written state. - */ - struct cyc2ns { - struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ - struct cyc2ns_data *head; /* 48 + 8 = 56 */ - struct cyc2ns_data *tail; /* 56 + 8 = 64 */ -}; /* exactly fits one cacheline */ - -static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); - -struct cyc2ns_data *cyc2ns_read_begin(void) -{ - struct cyc2ns_data *head; - - preempt_disable(); - - head = this_cpu_read(cyc2ns.head); - /* - * Ensure we observe the entry when we observe the pointer to it. - * matches the wmb from cyc2ns_write_end(). 
- */ - smp_read_barrier_depends(); - head->__count++; - barrier(); + struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ + seqcount_t seq; /* 32 + 4 = 36 */ - return head; -} +}; /* fits one cacheline */ -void cyc2ns_read_end(struct cyc2ns_data *head) -{ - barrier(); - /* - * If we're the outer most nested read; update the tail pointer - * when we're done. This notifies possible pending writers - * that we've observed the head pointer and that the other - * entry is now free. - */ - if (!--head->__count) { - /* - * x86-TSO does not reorder writes with older reads; - * therefore once this write becomes visible to another - * cpu, we must be finished reading the cyc2ns_data. - * - * matches with cyc2ns_write_begin(). - */ - this_cpu_write(cyc2ns.tail, head); - } - preempt_enable(); -} +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); -/* - * Begin writing a new @data entry for @cpu. - * - * Assumes some sort of write side lock; currently 'provided' by the assumption - * that cpufreq will call its notifiers sequentially. - */ -static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +void cyc2ns_read_begin(struct cyc2ns_data *data) { - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - struct cyc2ns_data *data = c2n->data; + int seq, idx; - if (data == c2n->head) - data++; + preempt_disable_notrace(); - /* XXX send an IPI to @cpu in order to guarantee a read? */ + do { + seq = this_cpu_read(cyc2ns.seq.sequence); + idx = seq & 1; - /* - * When we observe the tail write from cyc2ns_read_end(), - * the cpu must be done with that entry and its safe - * to start writing to it. - */ - while (c2n->tail == data) - cpu_relax(); + data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); + data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); + data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); - return data; + } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); } -static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +void cyc2ns_read_end(void) { - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - - /* - * Ensure the @data writes are visible before we publish the - * entry. Matches the data-depencency in cyc2ns_read_begin(). - */ - smp_wmb(); - - ACCESS_ONCE(c2n->head) = data; + preempt_enable_notrace(); } /* @@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data) data->cyc2ns_mul = 0; data->cyc2ns_shift = 0; data->cyc2ns_offset = 0; - data->__count = 0; } static void cyc2ns_init(int cpu) @@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu) cyc2ns_data_init(&c2n->data[0]); cyc2ns_data_init(&c2n->data[1]); - c2n->head = c2n->data; - c2n->tail = c2n->data; + seqcount_init(&c2n->seq); } static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - struct cyc2ns_data *data, *tail; + struct cyc2ns_data data; unsigned long long ns; - /* - * See cyc2ns_read_*() for details; replicated in order to avoid - * an extra few instructions that came with the abstraction. - * Notable, it allows us to only do the __count and tail update - * dance when its actually needed. 
- */ - - preempt_disable_notrace(); - data = this_cpu_read(cyc2ns.head); - tail = this_cpu_read(cyc2ns.tail); - - if (likely(data == tail)) { - ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); - } else { - data->__count++; - - barrier(); - - ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + cyc2ns_read_begin(&data); - barrier(); + ns = data.cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); - if (!--data->__count) - this_cpu_write(cyc2ns.tail, data); - } - preempt_enable_notrace(); + cyc2ns_read_end(); return ns; } -static void set_cyc2ns_scale(unsigned long khz, int cpu) +static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { - unsigned long long tsc_now, ns_now; - struct cyc2ns_data *data; + unsigned long long ns_now; + struct cyc2ns_data data; + struct cyc2ns *c2n; unsigned long flags; local_irq_save(flags); @@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) if (!khz) goto done; - data = cyc2ns_write_begin(cpu); - - tsc_now = rdtsc(); ns_now = cycles_2_ns(tsc_now); /* @@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz, + clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* @@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) * conversion algorithm shifting a 32-bit value (now specifies a 64-bit * value) - refer perf_event_mmap_page documentation in perf_event.h. */ - if (data->cyc2ns_shift == 32) { - data->cyc2ns_shift = 31; - data->cyc2ns_mul >>= 1; + if (data.cyc2ns_shift == 32) { + data.cyc2ns_shift = 31; + data.cyc2ns_mul >>= 1; } - data->cyc2ns_offset = ns_now - - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift); + data.cyc2ns_offset = ns_now - + mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift); + + c2n = per_cpu_ptr(&cyc2ns, cpu); - cyc2ns_write_end(cpu, data); + raw_write_seqcount_latch(&c2n->seq); + c2n->data[0] = data; + raw_write_seqcount_latch(&c2n->seq); + c2n->data[1] = data; done: - sched_clock_idle_wakeup_event(0); + sched_clock_idle_wakeup_event(); local_irq_restore(flags); } + /* * Scheduler clock - returns current time in nanosec units. */ @@ -374,6 +273,8 @@ static int __init tsc_setup(char *str) tsc_clocksource_reliable = 1; if (!strncmp(str, "noirqtime", 9)) no_sched_irq_time = 1; + if (!strcmp(str, "unstable")) + mark_tsc_unstable("boot parameter"); return 1; } @@ -986,7 +887,6 @@ void tsc_restore_sched_clock_state(void) } #ifdef CONFIG_CPU_FREQ - /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency * changes. * @@ -1027,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); - set_cyc2ns_scale(tsc_khz, freq->cpu); + set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc()); } return 0; @@ -1127,6 +1027,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs) pr_info("Marking TSC unstable due to clocksource watchdog\n"); } +static void tsc_cs_tick_stable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + + if (using_native_sched_clock()) + sched_clock_tick_stable(); +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). 
See comment above read_tsc() */ @@ -1140,6 +1049,7 @@ static struct clocksource clocksource_tsc = { .archdata = { .vclock_mode = VCLOCK_TSC }, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, + .tick_stable = tsc_cs_tick_stable, }; void mark_tsc_unstable(char *reason) @@ -1255,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work) static int hpet; u64 tsc_stop, ref_stop, delta; unsigned long freq; + int cpu; /* Don't bother refining TSC on unstable systems */ if (check_tsc_unstable()) @@ -1305,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work) /* Inform the TSC deadline clockevent devices about the recalibration */ lapic_update_tsc_freq(); + /* Update the sched_clock() rate to match the clocksource one */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); + out: if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; @@ -1350,7 +1265,7 @@ device_initcall(init_tsc_clocksource); void __init tsc_init(void) { - u64 lpj; + u64 lpj, cyc; int cpu; if (!boot_cpu_has(X86_FEATURE_TSC)) { @@ -1390,9 +1305,10 @@ void __init tsc_init(void) * speed as the bootup CPU. (cpufreq notifiers will fix this * up if their speed diverges) */ + cyc = rdtsc(); for_each_possible_cpu(cpu) { cyc2ns_init(cpu); - set_cyc2ns_scale(tsc_khz, cpu); + set_cyc2ns_scale(tsc_khz, cpu, cyc); } if (tsc_disabled > 0) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 42e65fee5673..795671593528 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -456,12 +456,13 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) */ static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - struct cyc2ns_data *data = cyc2ns_read_begin(); + struct cyc2ns_data data; unsigned long long ns; - ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + cyc2ns_read_begin(&data); + ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); + cyc2ns_read_end(); - cyc2ns_read_end(data); return ns; } @@ -470,12 +471,13 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) */ static inline unsigned long long ns_2_cycles(unsigned long long ns) { - struct cyc2ns_data *data = cyc2ns_read_begin(); + struct cyc2ns_data data; unsigned long long cyc; - cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul; + cyc2ns_read_begin(&data); + cyc = (ns << data.cyc2ns_shift) / data.cyc2ns_mul; + cyc2ns_read_end(); - cyc2ns_read_end(data); return cyc; } diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 919be0aa2578..240544253ccd 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -523,7 +523,7 @@ static int acpi_pci_root_add(struct acpi_device *device, struct acpi_pci_root *root; acpi_handle handle = device->handle; int no_aspm = 0; - bool hotadd = system_state != SYSTEM_BOOTING; + bool hotadd = system_state == SYSTEM_RUNNING; root = kzalloc(sizeof(struct acpi_pci_root), GFP_KERNEL); if (!root) diff --git a/drivers/base/node.c b/drivers/base/node.c index 5548f9686016..0440d95c9b5b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -377,7 +377,7 @@ static int __ref get_nid_for_pfn(unsigned long pfn) if (!pfn_valid_within(pfn)) return -1; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - if (system_state == SYSTEM_BOOTING) + if (system_state < SYSTEM_RUNNING) return early_pfn_to_nid(pfn); #endif page = pfn_to_page(pfn); diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c 
index 35dd4d7ffee0..b257fc7d5204 100644 --- a/drivers/cpufreq/pasemi-cpufreq.c +++ b/drivers/cpufreq/pasemi-cpufreq.c @@ -226,7 +226,7 @@ static int pas_cpufreq_cpu_exit(struct cpufreq_policy *policy) * We don't support CPU hotplug. Don't unmap after the system * has already made it to a running state. */ - if (system_state != SYSTEM_BOOTING) + if (system_state >= SYSTEM_RUNNING) return 0; if (sdcasr_mapbase) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 2706be7ed334..60bb64f4329d 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -220,6 +220,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, entered_state = target_state->enter(dev, drv, index); start_critical_timings(); + sched_clock_idle_wakeup_event(); time_end = ns_to_ktime(local_clock()); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index fc2765ccdb57..8500deda9175 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4315,7 +4315,7 @@ int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) struct acpi_dmar_atsr *atsr; struct dmar_atsr_unit *atsru; - if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled) + if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) return 0; atsr = container_of(hdr, struct acpi_dmar_atsr, header); @@ -4565,7 +4565,7 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) struct acpi_dmar_atsr *atsr; struct acpi_dmar_reserved_memory *rmrr; - if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING) + if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) return 0; list_for_each_entry(rmrru, &dmar_rmrr_units, list) { diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 19779b88a479..8cb60829a7a1 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -103,7 +103,7 @@ static bool of_iommu_driver_present(struct device_node *np) * it never will be. We don't want to defer indefinitely, nor attempt * to dereference __iommu_of_table after it's been freed. 
*/ - if (system_state > SYSTEM_BOOTING) + if (system_state >= SYSTEM_RUNNING) return false; return of_match_node(&__iommu_of_table, np); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index c1ec8ee80924..9e35032351a0 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -190,6 +190,7 @@ static void do_poweroff(void) { switch (system_state) { case SYSTEM_BOOTING: + case SYSTEM_SCHEDULING: orderly_poweroff(true); break; case SYSTEM_RUNNING: diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index f2b10d9ebd04..81490456c242 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -96,6 +96,7 @@ struct clocksource { void (*suspend)(struct clocksource *cs); void (*resume)(struct clocksource *cs); void (*mark_unstable)(struct clocksource *cs); + void (*tick_stable)(struct clocksource *cs); /* private: */ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 2404ad238c0b..4bf4479a3a80 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -236,6 +236,23 @@ unsigned int cpumask_local_spread(unsigned int i, int node); (cpu) = cpumask_next_zero((cpu), (mask)), \ (cpu) < nr_cpu_ids;) +extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap); + +/** + * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location + * @cpu: the (optionally unsigned) integer iterator + * @mask: the cpumask pointer + * @start: the start location + * + * The implementation does not assume any bit in @mask is set (including @start). + * + * After the loop, cpu is >= nr_cpu_ids. + */ +#define for_each_cpu_wrap(cpu, mask, start) \ + for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \ + (cpu) < nr_cpumask_bits; \ + (cpu) = cpumask_next_wrap((cpu), (mask), (start), true)) + /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator @@ -276,6 +293,12 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } +static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) +{ + __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + + /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) @@ -286,6 +309,11 @@ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } +static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) +{ + __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 13bc08aba704..1c91f26e2996 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -490,9 +490,13 @@ extern int root_mountflags; extern bool early_boot_irqs_disabled; -/* Values used for system_state */ +/* + * Values used for system_state. Ordering of the states must not be changed + * as code checks for <, <=, >, >= STATE.
+ */ extern enum system_states { SYSTEM_BOOTING, + SYSTEM_SCHEDULING, SYSTEM_RUNNING, SYSTEM_HALT, SYSTEM_POWER_OFF, diff --git a/include/linux/llist.h b/include/linux/llist.h index 171baa90f6f6..d11738110a7a 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -110,6 +110,25 @@ static inline void init_llist_head(struct llist_head *list) for ((pos) = (node); pos; (pos) = (pos)->next) /** + * llist_for_each_safe - iterate over some deleted entries of a lock-less list + * safe against removal of list entry + * @pos: the &struct llist_node to use as a loop cursor + * @n: another &struct llist_node to use as temporary storage + * @node: the first entry of deleted list entries + * + * In general, some entries of the lock-less list can be traversed + * safely only after being deleted from list, so start with an entry + * instead of list head. + * + * If being used on entries deleted from lock-less list directly, the + * traverse order is from the newest to the oldest added entry. If + * you want to traverse from the oldest to the newest, you must + * reverse the order by yourself before traversing. + */ +#define llist_for_each_safe(pos, n, node) \ + for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) + +/** * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type * @pos: the type * to use as a loop cursor. * @node: the fist entry of deleted list entries. diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b69fc650201..1f0f427e0292 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -421,7 +421,8 @@ struct sched_dl_entity { u64 dl_runtime; /* Maximum runtime for each instance */ u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ - u64 dl_bw; /* dl_runtime / dl_deadline */ + u64 dl_bw; /* dl_runtime / dl_period */ + u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, @@ -445,16 +446,33 @@ struct sched_dl_entity { * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. + * + * @dl_non_contending tells if the task is inactive while still + * contributing to the active utilization. In other words, it + * indicates if the inactive timer has been armed and its handler + * has not been executed yet. This flag is useful to avoid race + * conditions between the inactive timer handler and the wakeup + * code. */ int dl_throttled; int dl_boosted; int dl_yielded; + int dl_non_contending; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; + + /* + * Inactive timer, responsible for decreasing the active utilization + * at the "0-lag time". When a -deadline task blocks, it contributes + * to GRUB's active utilization until the "0-lag time", hence a + * timer is needed to decrease the active utilization at the correct + * time. + */ + struct hrtimer inactive_timer; }; union rcu_special { @@ -1096,8 +1114,6 @@ static inline struct pid *task_session(struct task_struct *task) * current. 
* task_xid_nr_ns() : id seen from the ns specified; * - * set_task_vxid() : assigns a virtual id to a task; * - * see also pid_nr() etc in include/linux/pid.h */ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 34fe92ce1ebd..a55600ffdf4b 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -23,10 +23,6 @@ extern u64 sched_clock_cpu(int cpu); extern void sched_clock_init(void); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline void sched_clock_init_late(void) -{ -} - static inline void sched_clock_tick(void) { } @@ -39,7 +35,7 @@ static inline void sched_clock_idle_sleep_event(void) { } -static inline void sched_clock_idle_wakeup_event(u64 delta_ns) +static inline void sched_clock_idle_wakeup_event(void) { } @@ -53,7 +49,6 @@ static inline u64 local_clock(void) return sched_clock(); } #else -extern void sched_clock_init_late(void); extern int sched_clock_stable(void); extern void clear_sched_clock_stable(void); @@ -63,10 +58,10 @@ extern void clear_sched_clock_stable(void); */ extern u64 __sched_clock_offset; - extern void sched_clock_tick(void); +extern void sched_clock_tick_stable(void); extern void sched_clock_idle_sleep_event(void); -extern void sched_clock_idle_wakeup_event(u64 delta_ns); +extern void sched_clock_idle_wakeup_event(void); /* * As outlined in clock.c, provides a fast, high resolution, nanosecond diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 5f0fe019a720..e2a6c7b3510b 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -47,5 +47,6 @@ * For the sched_{set,get}attr() calls */ #define SCHED_FLAG_RESET_ON_FORK 0x01 +#define SCHED_FLAG_RECLAIM 0x02 #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/init/main.c b/init/main.c index f866510472d7..df58a416dd1d 100644 --- a/init/main.c +++ b/init/main.c @@ -389,6 +389,7 @@ static __initdata DECLARE_COMPLETION(kthreadd_done); static noinline void __ref rest_init(void) { + struct task_struct *tsk; int pid; rcu_scheduler_starting(); @@ -397,12 +398,32 @@ static noinline void __ref rest_init(void) * the init task will end up wanting to create kthreads, which, if * we schedule it before we create kthreadd, will OOPS. */ - kernel_thread(kernel_init, NULL, CLONE_FS); + pid = kernel_thread(kernel_init, NULL, CLONE_FS); + /* + * Pin init on the boot CPU. Task migration is not properly working + * until sched_init_smp() has been run. It will set the allowed + * CPUs for init to the non isolated CPUs. + */ + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id())); + rcu_read_unlock(); + numa_default_policy(); pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); rcu_read_lock(); kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); rcu_read_unlock(); + + /* + * Enable might_sleep() and smp_processor_id() checks. + * They cannot be enabled earlier because with CONFIG_PREEMPT=y + * kernel_thread() would trigger might_sleep() splats. With + * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled + * already, but it's stuck on the kthreadd_done completion. + */ + system_state = SYSTEM_SCHEDULING; + complete(&kthreadd_done); /* @@ -1015,10 +1036,6 @@ static noinline void __init kernel_init_freeable(void) * init can allocate pages on any node */ set_mems_allowed(node_states[N_MEMORY]); - /* - * init can run on any cpu.
- */ - set_cpus_allowed_ptr(current, cpu_all_mask); cad_pid = task_pid(current); diff --git a/kernel/async.c b/kernel/async.c index d2edd6efec56..2cbd3dd5940d 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -114,14 +114,14 @@ static void async_run_entry_fn(struct work_struct *work) ktime_t uninitialized_var(calltime), delta, rettime; /* 1) run (and print duration) */ - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); } entry->func(entry->data, entry->cookie); - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", @@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain { ktime_t uninitialized_var(starttime), delta, endtime; - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } wait_event(async_done, lowest_in_progress(domain) >= cookie); - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { endtime = ktime_get(); delta = ktime_sub(endtime, starttime); diff --git a/kernel/extable.c b/kernel/extable.c index 2676d7f8baf6..0fbdd8582f08 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -75,7 +75,7 @@ int core_kernel_text(unsigned long addr) addr < (unsigned long)_etext) return 1; - if (system_state == SYSTEM_BOOTING && + if (system_state < SYSTEM_RUNNING && init_kernel_text(addr)) return 1; return 0; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a1db38abac5b..bd53ea579dc8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1175,7 +1175,7 @@ static void boot_delay_msec(int level) unsigned long long k; unsigned long timeout; - if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) + if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress_message_printing(level)) { return; } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 16277e2ed8ee..53f0164ed362 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,9 +16,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o +obj-y += idle_task.o fair.o rt.o deadline.o obj-y += wait.o wait_bit.o swait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 00a45c45beca..ca0f8fc945c6 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -64,6 +64,7 @@ #include <linux/workqueue.h> #include <linux/compiler.h> #include <linux/tick.h> +#include <linux/init.h> /* * Scheduler clock - returns current time in nanosec units. 
@@ -124,14 +125,27 @@ int sched_clock_stable(void) return static_branch_likely(&__sched_clock_stable); } +static void __scd_stamp(struct sched_clock_data *scd) +{ + scd->tick_gtod = ktime_get_ns(); + scd->tick_raw = sched_clock(); +} + static void __set_sched_clock_stable(void) { - struct sched_clock_data *scd = this_scd(); + struct sched_clock_data *scd; /* + * Since we're still unstable and the tick is already running, we have + * to disable IRQs in order to get a consistent scd->tick* reading. + */ + local_irq_disable(); + scd = this_scd(); + /* * Attempt to make the (initial) unstable->stable transition continuous. */ __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); + local_irq_enable(); printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", scd->tick_gtod, __gtod_offset, @@ -141,8 +155,38 @@ static void __set_sched_clock_stable(void) tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } +/* + * If we ever get here, we're screwed, because we found out -- typically after + * the fact -- that TSC wasn't good. This means all our clocksources (including + * ktime) could have reported wrong values. + * + * What we do here is an attempt to fix up and continue sort of where we left + * off in a coherent manner. + * + * The only way to fully avoid random clock jumps is to boot with: + * "tsc=unstable". + */ static void __sched_clock_work(struct work_struct *work) { + struct sched_clock_data *scd; + int cpu; + + /* take a current timestamp and set 'now' */ + preempt_disable(); + scd = this_scd(); + __scd_stamp(scd); + scd->clock = scd->tick_gtod + __gtod_offset; + preempt_enable(); + + /* clone to all CPUs */ + for_each_possible_cpu(cpu) + per_cpu(sched_clock_data, cpu) = *scd; + + printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n"); + printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); + static_branch_disable(&__sched_clock_stable); } @@ -150,27 +194,11 @@ static DECLARE_WORK(sched_clock_work, __sched_clock_work); static void __clear_sched_clock_stable(void) { - struct sched_clock_data *scd = this_scd(); - - /* - * Attempt to make the stable->unstable transition continuous. - * - * Trouble is, this is typically called from the TSC watchdog - * timer, which is late per definition. This means the tick - * values can already be screwy. - * - * Still do what we can. - */ - __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod); - - printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", - scd->tick_gtod, __gtod_offset, - scd->tick_raw, __sched_clock_offset); + if (!sched_clock_stable()) + return; tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); - - if (sched_clock_stable()) - schedule_work(&sched_clock_work); + schedule_work(&sched_clock_work); } void clear_sched_clock_stable(void) @@ -183,7 +211,11 @@ void clear_sched_clock_stable(void) __clear_sched_clock_stable(); } -void sched_clock_init_late(void) +/* + * We run this as late_initcall() such that it runs after all built-in drivers, + * notably: acpi_processor and intel_idle, which can mark the TSC as unstable. 
+ */ +static int __init sched_clock_init_late(void) { sched_clock_running = 2; /* @@ -197,7 +229,10 @@ void sched_clock_init_late(void) if (__sched_clock_stable_early) __set_sched_clock_stable(); + + return 0; } +late_initcall(sched_clock_init_late); /* * min, max except they take wrapping into account @@ -347,21 +382,38 @@ void sched_clock_tick(void) { struct sched_clock_data *scd; + if (sched_clock_stable()) + return; + + if (unlikely(!sched_clock_running)) + return; + WARN_ON_ONCE(!irqs_disabled()); + scd = this_scd(); + __scd_stamp(scd); + sched_clock_local(scd); +} + +void sched_clock_tick_stable(void) +{ + u64 gtod, clock; + + if (!sched_clock_stable()) + return; + /* - * Update these values even if sched_clock_stable(), because it can - * become unstable at any point in time at which point we need some - * values to fall back on. + * Called under watchdog_lock. * - * XXX arguably we can skip this if we expose tsc_clocksource_reliable + * The watchdog just found this TSC to (still) be stable, so now is a + * good moment to update our __gtod_offset. Because once we find the + * TSC to be unstable, any computation will be computing crap. */ - scd = this_scd(); - scd->tick_raw = sched_clock(); - scd->tick_gtod = ktime_get_ns(); - - if (!sched_clock_stable() && likely(sched_clock_running)) - sched_clock_local(scd); + local_irq_disable(); + gtod = ktime_get_ns(); + clock = sched_clock(); + __gtod_offset = (clock + __sched_clock_offset) - gtod; + local_irq_enable(); } /* @@ -374,15 +426,21 @@ void sched_clock_idle_sleep_event(void) EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* - * We just idled delta nanoseconds (called with irqs disabled): + * We just idled; resync with ktime. */ -void sched_clock_idle_wakeup_event(u64 delta_ns) +void sched_clock_idle_wakeup_event(void) { - if (timekeeping_suspended) + unsigned long flags; + + if (sched_clock_stable()) + return; + + if (unlikely(timekeeping_suspended)) return; + local_irq_save(flags); sched_clock_tick(); - touch_softlockup_watchdog_sched(); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e7b9ef8df126..62166da1c359 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -789,36 +789,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -void sched_set_stop_task(int cpu, struct task_struct *stop) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; - struct task_struct *old_stop = cpu_rq(cpu)->stop; - - if (stop) { - /* - * Make it appear like a SCHED_FIFO task, its something - * userspace knows about and won't get confused about. - * - * Also, it will make PI more or less work without too - * much confusion -- but then, stop work should not - * rely on PI working anyway. - */ - sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); - - stop->sched_class = &stop_sched_class; - } - - cpu_rq(cpu)->stop = stop; - - if (old_stop) { - /* - * Reset it back to a normal scheduling class so that - * it can die in pieces. 
- */ - old_stop->sched_class = &rt_sched_class; - } -} - /* * __normal_prio - return the priority that is based on the static prio */ @@ -1589,6 +1559,36 @@ static void update_avg(u64 *avg, u64 sample) *avg += diff >> 3; } +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. + */ + old_stop->sched_class = &rt_sched_class; + } +} + #else static inline int __set_cpus_allowed_ptr(struct task_struct *p, @@ -1732,7 +1732,7 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); - struct task_struct *p; + struct task_struct *p, *t; struct rq_flags rf; if (!llist) @@ -1741,17 +1741,8 @@ void sched_ttwu_pending(void) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - while (llist) { - int wake_flags = 0; - - p = llist_entry(llist, struct task_struct, wake_entry); - llist = llist_next(llist); - - if (p->sched_remote_wakeup) - wake_flags = WF_MIGRATED; - - ttwu_do_activate(rq, p, wake_flags, &rf); - } + llist_for_each_entry_safe(p, t, llist, wake_entry) + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); rq_unlock_irqrestore(rq, &rf); } @@ -2160,9 +2151,11 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_period = 0; dl_se->flags = 0; dl_se->dl_bw = 0; + dl_se->dl_density = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; + dl_se->dl_non_contending = 0; } /* @@ -2194,6 +2187,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->dl.rb_node); init_dl_task_timer(&p->dl); + init_dl_inactive_task_timer(&p->dl); __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -2431,7 +2425,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 20; + return BW_UNIT; /* * Doing this here saves a lot of checks in all @@ -2441,7 +2435,7 @@ unsigned long to_ratio(u64 period, u64 runtime) if (period == 0) return 0; - return div64_u64(runtime << 20, period); + return div64_u64(runtime << BW_SHIFT, period); } #ifdef CONFIG_SMP @@ -2452,7 +2446,7 @@ inline struct dl_bw *dl_bw_of(int i) return &cpu_rq(i)->rd->dl_bw; } -static inline int dl_bw_cpus(int i) +inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; @@ -2470,7 +2464,7 @@ inline struct dl_bw *dl_bw_of(int i) return &cpu_rq(i)->dl.dl_bw; } -static inline int dl_bw_cpus(int i) +inline int dl_bw_cpus(int i) { return 1; } @@ -2483,9 +2477,6 @@ static inline int dl_bw_cpus(int i) * allocated bandwidth to reflect the new situation. * * This function is called while holding p's rq->lock. - * - * XXX we should delay bw change until the task's 0-lag point, see - * __setparam_dl(). 
*/ static int dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) @@ -2510,15 +2501,29 @@ static int dl_overflow(struct task_struct *p, int policy, cpus = dl_bw_cpus(task_cpu(p)); if (dl_policy(policy) && !task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, 0, new_bw)) { - __dl_add(dl_b, new_bw); + if (hrtimer_active(&p->dl.inactive_timer)) + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { - __dl_clear(dl_b, p->dl.dl_bw); - __dl_add(dl_b, new_bw); + /* + * XXX this is slightly incorrect: when the task + * utilization decreases, we should delay the total + * utilization change until the task's 0-lag point. + * But this would require to set the task's "inactive + * timer" when the task is not inactive. + */ + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + dl_change_utilization(p, new_bw); err = 0; } else if (!dl_policy(policy) && task_has_dl_policy(p)) { - __dl_clear(dl_b, p->dl.dl_bw); + /* + * Do not decrease the total deadline utilization here, + * switched_from_dl() will take care to do it at the correct + * (0-lag) time. + */ err = 0; } raw_spin_unlock(&dl_b->lock); @@ -4027,26 +4032,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); - - /* - * Changing the parameters of a task is 'tricky' and we're not doing - * the correct thing -- also see task_dead_dl() and switched_from_dl(). - * - * What we SHOULD do is delay the bandwidth release until the 0-lag - * point. This would include retaining the task_struct until that time - * and change dl_overflow() to not immediately decrement the current - * amount. - * - * Instead we retain the current runtime/deadline and let the new - * parameters take effect after the current reservation period lapses. - * This is safe (albeit pessimistic) because the 0-lag point is always - * before the current scheduling deadline. - * - * We can still have temporary overloads because we do not delay the - * change in bandwidth until that time; so admission control is - * not on the safe side. It does however guarantee tasks will never - * consume more than promised. - */ + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } /* @@ -4198,8 +4184,8 @@ static int __sched_setscheduler(struct task_struct *p, int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; - /* May grab non-irq protected spin_locks: */ - BUG_ON(in_interrupt()); + /* The pi code expects interrupts enabled */ + BUG_ON(pi && in_interrupt()); recheck: /* Double check policy once rq lock held: */ if (policy < 0) { @@ -4212,7 +4198,8 @@ recheck: return -EINVAL; } - if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) + if (attr->sched_flags & + ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM)) return -EINVAL; /* @@ -5531,7 +5518,7 @@ int task_can_attach(struct task_struct *p, * We will free resources in the source root_domain * later on (see set_cpus_allowed_dl()). 
*/ - __dl_add(dl_b, p->dl.dl_bw); + __dl_add(dl_b, p->dl.dl_bw, cpus); } raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); @@ -5959,7 +5946,6 @@ void __init sched_init_smp(void) cpumask_var_t non_isolated_cpus; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); sched_init_numa(); @@ -5969,7 +5955,7 @@ void __init sched_init_smp(void) * happen. */ mutex_lock(&sched_domains_mutex); - init_sched_domains(cpu_active_mask); + sched_init_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); @@ -5985,7 +5971,6 @@ void __init sched_init_smp(void) init_sched_dl_class(); sched_init_smt(); - sched_clock_init_late(); sched_smp_initialized = true; } @@ -6001,7 +5986,6 @@ early_initcall(migration_init); void __init sched_init_smp(void) { sched_init_granularity(); - sched_clock_init_late(); } #endif /* CONFIG_SMP */ @@ -6185,7 +6169,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP - zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); @@ -6237,8 +6220,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset) if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && !is_idle_task(current)) || - system_state != SYSTEM_RUNNING || oops_in_progress) + system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || + oops_in_progress) return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; @@ -6763,6 +6748,19 @@ static int sched_dl_global_validate(void) return ret; } +void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) +{ + if (global_rt_runtime() == RUNTIME_INF) { + dl_rq->bw_ratio = 1 << RATIO_SHIFT; + dl_rq->extra_bw = 1 << BW_SHIFT; + } else { + dl_rq->bw_ratio = to_ratio(global_rt_runtime(), + global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); + dl_rq->extra_bw = to_ratio(global_rt_period(), + global_rt_runtime()); + } +} + static void sched_dl_do_global(void) { u64 new_bw = -1; @@ -6788,6 +6786,7 @@ static void sched_dl_do_global(void) raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a2ce59015642..e12f85975857 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,222 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +static inline +void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->running_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->running_bw += dl_bw; + SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ + SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); +} + +static inline +void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->running_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->running_bw -= dl_bw; + SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ + if (dl_rq->running_bw > old) + dl_rq->running_bw = 0; +} + +static inline +void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->this_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->this_bw += dl_bw; + SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ +} + +static 
inline +void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->this_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->this_bw -= dl_bw; + SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ + if (dl_rq->this_bw > old) + dl_rq->this_bw = 0; + SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); +} + +void dl_change_utilization(struct task_struct *p, u64 new_bw) +{ + struct rq *rq; + + if (task_on_rq_queued(p)) + return; + + rq = task_rq(p); + if (p->dl.dl_non_contending) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + } + sub_rq_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(new_bw, &rq->dl); +} + +/* + * The utilization of a task cannot be immediately removed from + * the rq active utilization (running_bw) when the task blocks. + * Instead, we have to wait for the so called "0-lag time". + * + * If a task blocks before the "0-lag time", a timer (the inactive + * timer) is armed, and running_bw is decreased when the timer + * fires. + * + * If the task wakes up again before the inactive timer fires, + * the timer is cancelled, whereas if the task wakes up after the + * inactive timer fired (and running_bw has been decreased) the + * task's utilization has to be added to running_bw again. + * A flag in the deadline scheduling entity (dl_non_contending) + * is used to avoid race conditions between the inactive timer handler + * and task wakeups. + * + * The following diagram shows how running_bw is updated. A task is + * "ACTIVE" when its utilization contributes to running_bw; an + * "ACTIVE contending" task is in the TASK_RUNNING state, while an + * "ACTIVE non contending" task is a blocked task for which the "0-lag time" + * has not passed yet. An "INACTIVE" task is a task for which the "0-lag" + * time already passed, which does not contribute to running_bw anymore. + * +------------------+ + * wakeup | ACTIVE | + * +------------------>+ contending | + * | add_running_bw | | + * | +----+------+------+ + * | | ^ + * | dequeue | | + * +--------+-------+ | | + * | | t >= 0-lag | | wakeup + * | INACTIVE |<---------------+ | + * | | sub_running_bw | | + * +--------+-------+ | | + * ^ | | + * | t < 0-lag | | + * | | | + * | V | + * | +----+------+------+ + * | sub_running_bw | ACTIVE | + * +-------------------+ | + * inactive timer | non contending | + * fired +------------------+ + * + * The task_non_contending() function is invoked when a task + * blocks, and checks if the 0-lag time already passed or + * not (in the first case, it directly updates running_bw; + * in the second case, it arms the inactive timer). + * + * The task_contending() function is invoked when a task wakes + * up, and checks if the task is still in the "ACTIVE non contending" + * state or not (in the second case, it updates running_bw). 
+ */ +static void task_non_contending(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + struct hrtimer *timer = &dl_se->inactive_timer; + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + s64 zerolag_time; + + /* + * If this is a non-deadline task that has been boosted, + * do nothing + */ + if (dl_se->dl_runtime == 0) + return; + + WARN_ON(hrtimer_active(&dl_se->inactive_timer)); + WARN_ON(dl_se->dl_non_contending); + + zerolag_time = dl_se->deadline - + div64_long((dl_se->runtime * dl_se->dl_period), + dl_se->dl_runtime); + + /* + * Using relative times instead of the absolute "0-lag time" + * allows to simplify the code + */ + zerolag_time -= rq_clock(rq); + + /* + * If the "0-lag time" already passed, decrease the active + * utilization now, instead of starting a timer + */ + if (zerolag_time < 0) { + if (dl_task(p)) + sub_running_bw(dl_se->dl_bw, dl_rq); + if (!dl_task(p) || p->state == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + if (p->state == TASK_DEAD) + sub_rq_bw(p->dl.dl_bw, &rq->dl); + raw_spin_lock(&dl_b->lock); + __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_clear_params(p); + raw_spin_unlock(&dl_b->lock); + } + + return; + } + + dl_se->dl_non_contending = 1; + get_task_struct(p); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); +} + +static void task_contending(struct sched_dl_entity *dl_se, int flags) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + /* + * If this is a non-deadline task that has been boosted, + * do nothing + */ + if (dl_se->dl_runtime == 0) + return; + + if (flags & ENQUEUE_MIGRATED) + add_rq_bw(dl_se->dl_bw, dl_rq); + + if (dl_se->dl_non_contending) { + dl_se->dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) + put_task_struct(dl_task_of(dl_se)); + } else { + /* + * Since "dl_non_contending" is not set, the + * task's utilization has already been removed from + * active utilization (either when the task blocked, + * when the "inactive timer" fired). + * So, add it back. + */ + add_running_bw(dl_se->dl_bw, dl_rq); + } +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -83,6 +299,10 @@ void init_dl_rq(struct dl_rq *dl_rq) #else init_dl_bw(&dl_rq->dl_bw); #endif + + dl_rq->running_bw = 0; + dl_rq->this_bw = 0; + init_dl_rq_bw_ratio(dl_rq); } #ifdef CONFIG_SMP @@ -484,13 +704,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, } /* - * When a -deadline entity is queued back on the runqueue, its runtime and - * deadline might need updating. + * Revised wakeup rule [1]: For self-suspending tasks, rather then + * re-initializing task's runtime and deadline, the revised wakeup + * rule adjusts the task's runtime to avoid the task to overrun its + * density. + * + * Reasoning: a task may overrun the density if: + * runtime / (deadline - t) > dl_runtime / dl_deadline + * + * Therefore, runtime can be adjusted to: + * runtime = (dl_runtime / dl_deadline) * (deadline - t) + * + * In such way that runtime will be equal to the maximum density + * the task can use without breaking any rule. + * + * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant + * bandwidth server revisited. SIGBED Rev. 
11, 4 (January 2015), 19-24. + */ +static void +update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 laxity = dl_se->deadline - rq_clock(rq); + + /* + * If the task has deadline < period, and the deadline is in the past, + * it should already be throttled before this check. + * + * See update_dl_entity() comments for further details. + */ + WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq))); + + dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT; +} + +/* + * Regarding the deadline, a task with implicit deadline has a relative + * deadline == relative period. A task with constrained deadline has a + * relative deadline <= relative period. + * + * We support constrained deadline tasks. However, there are some restrictions + * applied only for tasks which do not have an implicit deadline. See + * update_dl_entity() to know more about such restrictions. + * + * The dl_is_implicit() returns true if the task has an implicit deadline. + */ +static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_deadline == dl_se->dl_period; +} + +/* + * When a deadline entity is placed in the runqueue, its runtime and deadline + * might need to be updated. This is done by a CBS wake up rule. There are two + * different rules: 1) the original CBS; and 2) the Revisited CBS. + * + * When the task is starting a new period, the Original CBS is used. In this + * case, the runtime is replenished and a new absolute deadline is set. + * + * When a task is queued before the begin of the next period, using the + * remaining runtime and deadline could make the entity to overflow, see + * dl_entity_overflow() to find more about runtime overflow. When such case + * is detected, the runtime and deadline need to be updated. * - * The policy here is that we update the deadline of the entity only if: - * - the current deadline is in the past, - * - using the remaining runtime with the current deadline would make - * the entity exceed its bandwidth. + * If the task has an implicit deadline, i.e., deadline == period, the Original + * CBS is applied. the runtime is replenished and a new absolute deadline is + * set, as in the previous cases. + * + * However, the Original CBS does not work properly for tasks with + * deadline < period, which are said to have a constrained deadline. By + * applying the Original CBS, a constrained deadline task would be able to run + * runtime/deadline in a period. With deadline < period, the task would + * overrun the runtime/period allowed bandwidth, breaking the admission test. + * + * In order to prevent this misbehave, the Revisited CBS is used for + * constrained deadline tasks when a runtime overflow is detected. In the + * Revisited CBS, rather than replenishing & setting a new absolute deadline, + * the remaining runtime of the task is reduced to avoid runtime overflow. + * Please refer to the comments update_dl_revised_wakeup() function to find + * more about the Revised CBS rule. 
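As a rough, non-authoritative illustration of the clamp performed by update_dl_revised_wakeup() above: with dl_runtime/dl_deadline = 0.5 and 3ms of laxity left, the remaining runtime is cut to 1.5ms. The userspace sketch below redoes that arithmetic; dl_density is assumed to hold dl_runtime/dl_deadline in the same BW_SHIFT fixed-point format used by the rest of this series, and all numbers are invented.

  #include <stdio.h>
  #include <stdint.h>

  #define BW_SHIFT 20

  int main(void)
  {
      uint64_t dl_runtime  = 4000000;             /* 4ms */
      uint64_t dl_deadline = 8000000;             /* 8ms */
      /* dl_density: dl_runtime / dl_deadline in fixed point */
      uint64_t dl_density  = (dl_runtime << BW_SHIFT) / dl_deadline;

      uint64_t laxity = 3000000;                  /* deadline - rq_clock(rq) */

      /* new runtime <= density * laxity = 0.5 * 3ms = 1.5ms */
      uint64_t runtime = (dl_density * laxity) >> BW_SHIFT;

      printf("clamped runtime = %llu ns\n", (unsigned long long)runtime);
      return 0;
  }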
*/ static void update_dl_entity(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se) @@ -500,6 +791,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + + if (unlikely(!dl_is_implicit(dl_se) && + !dl_time_before(dl_se->deadline, rq_clock(rq)) && + !dl_se->dl_boosted)){ + update_dl_revised_wakeup(dl_se, rq); + return; + } + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -593,10 +892,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have changed its scheduling policy to something * different than SCHED_DEADLINE (through switched_from_dl()). */ - if (!dl_task(p)) { - __dl_clear_params(p); + if (!dl_task(p)) goto unlock; - } /* * The task might have been boosted by someone else and might be in the @@ -723,6 +1020,8 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; + if (dl_se->runtime > 0) + dl_se->runtime = 0; } } @@ -735,6 +1034,47 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); /* + * This function implements the GRUB accounting rule: + * according to the GRUB reclaiming algorithm, the runtime is + * not decreased as "dq = -dt", but as + * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt", + * where u is the utilization of the task, Umax is the maximum reclaimable + * utilization, Uinact is the (per-runqueue) inactive utilization, computed + * as the difference between the "total runqueue utilization" and the + * runqueue active utilization, and Uextra is the (per runqueue) extra + * reclaimable utilization. + * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations + * multiplied by 2^BW_SHIFT, the result has to be shifted right by + * BW_SHIFT. + * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT, + * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. + * Since delta is a 64 bit variable, to have an overflow its value + * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. + * So, overflow is not an issue here. + */ +u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) +{ + u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ + u64 u_act; + u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT; + + /* + * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)}, + * we compare u_inact + rq->dl.extra_bw with + * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because + * u_inact + rq->dl.extra_bw can be larger than + * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative + * leading to wrong results) + */ + if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min) + u_act = u_act_min; + else + u_act = BW_UNIT - u_inact - rq->dl.extra_bw; + + return (delta * u_act) >> BW_SHIFT; +} + +/* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). 
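The standalone sketch below mirrors the fixed-point scaling performed by grub_reclaim() above, so the numbers can be checked outside the kernel. The BW_SHIFT/RATIO_SHIFT constants follow the definitions this series adds to sched.h; the sample utilizations (two 0.5 reservations assigned to the runqueue, only one of them active) are invented.

  #include <stdio.h>
  #include <stdint.h>

  #define BW_SHIFT    20
  #define BW_UNIT     (1 << BW_SHIFT)
  #define RATIO_SHIFT 8

  /* same decision logic as grub_reclaim(), redone outside the kernel */
  static uint64_t grub_scale(uint64_t delta, uint64_t this_bw,
                             uint64_t running_bw, uint64_t extra_bw,
                             uint64_t dl_bw, uint64_t bw_ratio)
  {
      uint64_t u_inact = this_bw - running_bw;    /* Utot - Uact */
      uint64_t u_act_min = (dl_bw * bw_ratio) >> RATIO_SHIFT;
      uint64_t u_act;

      if (u_inact + extra_bw > BW_UNIT - u_act_min)
          u_act = u_act_min;
      else
          u_act = BW_UNIT - u_inact - extra_bw;

      return (delta * u_act) >> BW_SHIFT;
  }

  int main(void)
  {
      uint64_t this_bw    = BW_UNIT;              /* Utot   = 1.0  */
      uint64_t running_bw = BW_UNIT / 2;          /* Uact   = 0.5  */
      uint64_t extra_bw   = 0;                    /* Uextra = 0    */
      uint64_t dl_bw      = BW_UNIT / 2;          /* u      = 0.5  */
      uint64_t bw_ratio   = 1 << RATIO_SHIFT;     /* 1/Umax = 1.0  */

      /* 1ms of measured execution ends up charged as 0.5ms of runtime */
      printf("%llu\n", (unsigned long long)
             grub_scale(1000000, this_bw, running_bw, extra_bw,
                        dl_bw, bw_ratio));
      return 0;
  }

With these inputs the running task is charged only half of the elapsed time, i.e. it reclaims the bandwidth left unused by the inactive reservation.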
*/ @@ -776,6 +1116,8 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); + if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) + delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); dl_se->runtime -= delta_exec; throttle: @@ -815,6 +1157,56 @@ throttle: } } +static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) +{ + struct sched_dl_entity *dl_se = container_of(timer, + struct sched_dl_entity, + inactive_timer); + struct task_struct *p = dl_task_of(dl_se); + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + + if (!dl_task(p) || p->state == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + if (p->state == TASK_DEAD && dl_se->dl_non_contending) { + sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + dl_se->dl_non_contending = 0; + } + + raw_spin_lock(&dl_b->lock); + __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + raw_spin_unlock(&dl_b->lock); + __dl_clear_params(p); + + goto unlock; + } + if (dl_se->dl_non_contending == 0) + goto unlock; + + sched_clock_tick(); + update_rq_clock(rq); + + sub_running_bw(dl_se->dl_bw, &rq->dl); + dl_se->dl_non_contending = 0; +unlock: + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + + return HRTIMER_NORESTART; +} + +void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) +{ + struct hrtimer *timer = &dl_se->inactive_timer; + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = inactive_task_timer; +} + #ifdef CONFIG_SMP static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) @@ -946,10 +1338,12 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (flags & ENQUEUE_WAKEUP) + if (flags & ENQUEUE_WAKEUP) { + task_contending(dl_se, flags); update_dl_entity(dl_se, pi_se); - else if (flags & ENQUEUE_REPLENISH) + } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); + } __enqueue_dl_entity(dl_se); } @@ -959,11 +1353,6 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) __dequeue_dl_entity(dl_se); } -static inline bool dl_is_constrained(struct sched_dl_entity *dl_se) -{ - return dl_se->dl_deadline < dl_se->dl_period; -} - static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *pi_task = rt_mutex_get_top_task(p); @@ -995,17 +1384,32 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * If that is the case, the task will be throttled and * the replenishment timer will be set to the next period. */ - if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) + if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) dl_check_constrained_dl(&p->dl); + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { + add_rq_bw(p->dl.dl_bw, &rq->dl); + add_running_bw(p->dl.dl_bw, &rq->dl); + } + /* - * If p is throttled, we do nothing. In fact, if it exhausted + * If p is throttled, we do not enqueue it. In fact, if it exhausted * its budget it needs a replenishment and, since it now is on * its rq, the bandwidth timer callback (which clearly has not * run yet) will take care of this. + * However, the active utilization does not depend on the fact + * that the task is on the runqueue or not (but depends on the + * task's state - in GRUB parlance, "inactive" vs "active contending"). 
+ * In other words, even if a task is throttled its utilization must + * be counted in the active utilization; hence, we need to call + * add_running_bw(). */ - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) + if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (flags & ENQUEUE_WAKEUP) + task_contending(&p->dl, flags); + return; + } enqueue_dl_entity(&p->dl, pi_se, flags); @@ -1023,6 +1427,23 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); + + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(p->dl.dl_bw, &rq->dl); + } + + /* + * This check allows to start the inactive timer (or to immediately + * decrease the active utilization, if needed) in two cases: + * when the task blocks and when it is terminating + * (p->state == TASK_DEAD). We can handle the two cases in the same + * way, because from GRUB's point of view the same thing is happening + * (the task moves from "active contending" to "active non contending" + * or "inactive") + */ + if (flags & DEQUEUE_SLEEP) + task_non_contending(p); } /* @@ -1100,6 +1521,37 @@ out: return cpu; } +static void migrate_task_rq_dl(struct task_struct *p) +{ + struct rq *rq; + + if (p->state != TASK_WAKING) + return; + + rq = task_rq(p); + /* + * Since p->state == TASK_WAKING, set_task_cpu() has been called + * from try_to_wake_up(). Hence, p->pi_lock is locked, but + * rq->lock is not... So, lock it + */ + raw_spin_lock(&rq->lock); + if (p->dl.dl_non_contending) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + } + sub_rq_bw(p->dl.dl_bw, &rq->dl); + raw_spin_unlock(&rq->lock); +} + static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) { /* @@ -1255,19 +1707,6 @@ static void task_fork_dl(struct task_struct *p) */ } -static void task_dead_dl(struct task_struct *p) -{ - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - - /* - * Since we are TASK_DEAD we won't slip out of the domain! - */ - raw_spin_lock_irq(&dl_b->lock); - /* XXX we should retain the bw until 0-lag */ - dl_b->total_bw -= p->dl.dl_bw; - raw_spin_unlock_irq(&dl_b->lock); -} - static void set_curr_task_dl(struct rq *rq) { struct task_struct *p = rq->curr; @@ -1533,7 +1972,7 @@ retry: * then possible that next_task has migrated. */ task = pick_next_pushable_dl_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { + if (task == next_task) { /* * The task is still there. We don't try * again, some other cpu will pull it when ready. 
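For reference, the running_bw and this_bw fields manipulated in the enqueue/dequeue paths above are plain sums of per-task utilizations, kept in the same runtime/period fixed-point format as dl_bw. The toy model below shows only that bookkeeping (no locking, no 0-lag delay); the task parameters are invented.

  #include <stdio.h>
  #include <stdint.h>

  #define BW_SHIFT 20

  /* utilization = runtime / period, scaled by 2^BW_SHIFT (as to_ratio() does) */
  static uint64_t task_bw(uint64_t runtime_ns, uint64_t period_ns)
  {
      return (runtime_ns << BW_SHIFT) / period_ns;
  }

  int main(void)
  {
      uint64_t running_bw = 0, this_bw = 0;
      uint64_t t1 = task_bw(4000000, 8000000);    /* 0.50 */
      uint64_t t2 = task_bw(2000000, 8000000);    /* 0.25 */

      /* both tasks assigned to this rq and runnable */
      this_bw    += t1 + t2;
      running_bw += t1 + t2;

      /* t2 blocks and its 0-lag time passes: only running_bw drops */
      running_bw -= t2;

      printf("running_bw=%.2f this_bw=%.2f\n",
             (double)running_bw / (1 << BW_SHIFT),
             (double)this_bw / (1 << BW_SHIFT));
      return 0;
  }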
@@ -1551,7 +1990,11 @@ retry: } deactivate_task(rq, next_task, 0); + sub_running_bw(next_task->dl.dl_bw, &rq->dl); + sub_rq_bw(next_task->dl.dl_bw, &rq->dl); set_task_cpu(next_task, later_rq->cpu); + add_rq_bw(next_task->dl.dl_bw, &later_rq->dl); + add_running_bw(next_task->dl.dl_bw, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -1639,7 +2082,11 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); + sub_running_bw(p->dl.dl_bw, &src_rq->dl); + sub_rq_bw(p->dl.dl_bw, &src_rq->dl); set_task_cpu(p, this_cpu); + add_rq_bw(p->dl.dl_bw, &this_rq->dl); + add_running_bw(p->dl.dl_bw, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -1695,7 +2142,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * until we complete the update. */ raw_spin_lock(&src_dl_b->lock); - __dl_clear(src_dl_b, p->dl.dl_bw); + __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&src_dl_b->lock); } @@ -1737,13 +2184,26 @@ void __init init_sched_dl_class(void) static void switched_from_dl(struct rq *rq, struct task_struct *p) { /* - * Start the deadline timer; if we switch back to dl before this we'll - * continue consuming our current CBS slice. If we stay outside of - * SCHED_DEADLINE until the deadline passes, the timer will reset the - * task. + * task_non_contending() can start the "inactive timer" (if the 0-lag + * time is in the future). If the task switches back to dl before + * the "inactive timer" fires, it can continue to consume its current + * runtime using its current deadline. If it stays outside of + * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer() + * will reset the task parameters. */ - if (!start_dl_timer(p)) - __dl_clear_params(p); + if (task_on_rq_queued(p) && p->dl.dl_runtime) + task_non_contending(p); + + if (!task_on_rq_queued(p)) + sub_rq_bw(p->dl.dl_bw, &rq->dl); + + /* + * We cannot use inactive_task_timer() to invoke sub_running_bw() + * at the 0-lag time, because the task could have been migrated + * while SCHED_OTHER in the meanwhile. + */ + if (p->dl.dl_non_contending) + p->dl.dl_non_contending = 0; /* * Since this might be the only -deadline task on the rq, @@ -1762,11 +2222,15 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); /* If p is not queued we will update its parameters at next wakeup. 
*/ - if (!task_on_rq_queued(p)) - return; + if (!task_on_rq_queued(p)) { + add_rq_bw(p->dl.dl_bw, &rq->dl); + return; + } /* * If p is boosted we already updated its params in * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), @@ -1836,6 +2300,7 @@ const struct sched_class dl_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_dl, + .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, @@ -1845,7 +2310,6 @@ const struct sched_class dl_sched_class = { .set_curr_task = set_curr_task_dl, .task_tick = task_tick_dl, .task_fork = task_fork_dl, - .task_dead = task_dead_dl, .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c77e4b1d51c0..a24661ac3d23 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) } /* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ + leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ static inline struct cfs_rq * @@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) static inline struct sched_entity *parent_entity(struct sched_entity *se) { @@ -2469,7 +2470,8 @@ void task_numa_work(struct callback_head *work) return; - down_read(&mm->mmap_sem); + if (!down_read_trylock(&mm->mmap_sem)) + return; vma = find_vma(mm, start); if (!vma) { reset_ptenuma_scan(p); @@ -2916,12 +2918,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, /* * Step 2: update *_avg. */ - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); if (cfs_rq) { cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; + sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); return 1; } @@ -4642,24 +4644,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } +/* + * Both these cpu hotplug callbacks race against unregister_fair_sched_group() + * + * The race is harmless, since modifying bandwidth settings of unhooked group + * bits doesn't do much. 
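A side note on the ___update_load_avg() hunk above, which changes the divisor from LOAD_AVG_MAX to LOAD_AVG_MAX - 1024 + sa->period_contrib: the running sums can only ever contain period_contrib microseconds of the current 1024us period, so dividing by the full LOAD_AVG_MAX systematically under-reports an always-running entity. The standalone check below illustrates the effect; the 47742 value matches the kernel's LOAD_AVG_MAX at the time of this series, and the sample period_contrib is invented.

  #include <stdio.h>

  #define LOAD_AVG_MAX 47742

  int main(void)
  {
      unsigned long period_contrib = 200;  /* e.g. 200us into the current period */
      /* largest util_sum an always-running entity can have reached by now */
      unsigned long util_sum = LOAD_AVG_MAX - 1024 + period_contrib;

      printf("old divisor: %lu%%  new divisor: %lu%%\n",
             util_sum * 100 / LOAD_AVG_MAX,                            /* ~98% */
             util_sum * 100 / (LOAD_AVG_MAX - 1024 + period_contrib)); /* 100% */
      return 0;
  }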
+ */ + +/* cpu online calback */ static void __maybe_unused update_runtime_enabled(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; - for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + lockdep_assert_held(&rq->lock); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; raw_spin_lock(&cfs_b->lock); cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; raw_spin_unlock(&cfs_b->lock); } + rcu_read_unlock(); } +/* cpu offline callback */ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; + + lockdep_assert_held(&rq->lock); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - for_each_leaf_cfs_rq(rq, cfs_rq) { if (!cfs_rq->runtime_enabled) continue; @@ -4677,6 +4698,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } + rcu_read_unlock(); } #else /* CONFIG_CFS_BANDWIDTH */ @@ -5484,12 +5506,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), + if (!cpumask_intersects(sched_group_span(group), &p->cpus_allowed)) continue; local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + sched_group_span(group)); /* * Tally up the load of all CPUs in the group and find @@ -5499,7 +5521,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, runnable_load = 0; max_spare_cap = 0; - for_each_cpu(i, sched_group_cpus(group)) { + for_each_cpu(i, sched_group_span(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -5602,10 +5624,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Check if we have any choice: */ if (group->group_weight == 1) - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5640,43 +5662,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } -/* - * Implement a for_each_cpu() variant that starts the scan at a given cpu - * (@start), and wraps around. - * - * This is used to scan for idle CPUs; such that not all CPUs looking for an - * idle CPU find the same CPU. The down-side is that tasks tend to cycle - * through the LLC domain. - * - * Especially tbench is found sensitive to this. 
- */ - -static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) -{ - int next; - -again: - next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); - - if (*wrapped) { - if (next >= start) - return nr_cpumask_bits; - } else { - if (next >= nr_cpumask_bits) { - *wrapped = 1; - n = -1; - goto again; - } - } - - return next; -} - -#define for_each_cpu_wrap(cpu, mask, start, wrap) \ - for ((wrap) = 0, (cpu) = (start)-1; \ - (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ - (cpu) < nr_cpumask_bits; ) - #ifdef CONFIG_SCHED_SMT static inline void set_idle_cores(int cpu, int val) @@ -5736,7 +5721,7 @@ unlock: static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu, wrap; + int core, cpu; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5746,7 +5731,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); - for_each_cpu_wrap(core, cpus, target, wrap) { + for_each_cpu_wrap(core, cpus, target) { bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -5809,27 +5794,38 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { struct sched_domain *this_sd; - u64 avg_cost, avg_idle = this_rq()->avg_idle; + u64 avg_cost, avg_idle; u64 time, cost; s64 delta; - int cpu, wrap; + int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) return -1; - avg_cost = this_sd->avg_scan_cost; - /* * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. 
*/ - if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost) + avg_idle = this_rq()->avg_idle / 512; + avg_cost = this_sd->avg_scan_cost + 1; + + if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) return -1; + if (sched_feat(SIS_PROP)) { + u64 span_avg = sd->span_weight * avg_idle; + if (span_avg > 4*avg_cost) + nr = div_u64(span_avg, avg_cost); + else + nr = 4; + } + time = local_clock(); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { + for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { + if (!--nr) + return -1; if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) @@ -6168,8 +6164,11 @@ static void set_last_buddy(struct sched_entity *se) if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) return; - for_each_sched_entity(se) + for_each_sched_entity(se) { + if (SCHED_WARN_ON(!se->on_rq)) + return; cfs_rq_of(se)->last = se; + } } static void set_next_buddy(struct sched_entity *se) @@ -6177,8 +6176,11 @@ static void set_next_buddy(struct sched_entity *se) if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) return; - for_each_sched_entity(se) + for_each_sched_entity(se) { + if (SCHED_WARN_ON(!se->on_rq)) + return; cfs_rq_of(se)->next = se; + } } static void set_skip_buddy(struct sched_entity *se) @@ -6970,10 +6972,28 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_FAIR_GROUP_SCHED + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (cfs_rq->avg.load_sum) + return false; + + if (cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->runnable_load_sum) + return false; + + return true; +} + static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; struct rq_flags rf; rq_lock_irqsave(rq, &rf); @@ -6983,7 +7003,7 @@ static void update_blocked_averages(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) { + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; /* throttled entities do not contribute to load */ @@ -6997,6 +7017,13 @@ static void update_blocked_averages(int cpu) se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) update_load_avg(se, 0); + + /* + * There can be a lot of idle CPU cgroups. Don't let fully + * decayed cfs_rqs linger on the list. + */ + if (cfs_rq_is_decayed(cfs_rq)) + list_del_leaf_cfs_rq(cfs_rq); } rq_unlock_irqrestore(rq, &rf); } @@ -7229,7 +7256,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * span the current group. 
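To see what the SIS_PROP feature added to select_idle_cpu() above does with the scan budget, here is a small userspace model of the same arithmetic; the avg_idle, avg_scan_cost and span_weight samples are invented.

  #include <stdio.h>
  #include <stdint.h>

  /* proportional number of CPUs select_idle_cpu() is allowed to visit */
  static int sis_prop_nr(uint64_t avg_idle_ns, uint64_t avg_scan_cost_ns,
                         unsigned int span_weight)
  {
      uint64_t avg_idle = avg_idle_ns / 512;      /* large fuzz factor */
      uint64_t avg_cost = avg_scan_cost_ns + 1;   /* avoid division by zero */
      uint64_t span_avg = (uint64_t)span_weight * avg_idle;

      if (span_avg > 4 * avg_cost)
          return (int)(span_avg / avg_cost);
      return 4;                                   /* always try a few CPUs */
  }

  int main(void)
  {
      /* mostly idle rq: a generous budget (38 CPUs with these samples) */
      printf("%d\n", sis_prop_nr(500000, 400, 16));
      /* little idle time, costly scans: clamped to the minimum of 4 */
      printf("%d\n", sis_prop_nr(3000, 400, 16));
      return 0;
  }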
*/ - for_each_cpu(cpu, sched_group_cpus(sdg)) { + for_each_cpu(cpu, sched_group_span(sdg)) { struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); @@ -7408,7 +7435,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, memset(sgs, 0, sizeof(*sgs)); - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); /* Bias balancing toward cpus of our domain */ @@ -7572,7 +7599,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sg_lb_stats *sgs = &tmp_sgs; int local_group; - local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg)); if (local_group) { sds->local = sg; sgs = local; @@ -7927,7 +7954,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_capacity = 1; int i; - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { unsigned long capacity, wl; enum fbq_type rt; @@ -8033,7 +8060,6 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - struct cpumask *sg_cpus, *sg_mask; int cpu, balance_cpu = -1; /* @@ -8043,11 +8069,9 @@ static int should_we_balance(struct lb_env *env) if (env->idle == CPU_NEWLY_IDLE) return 1; - sg_cpus = sched_group_cpus(sg); - sg_mask = sched_group_mask(sg); /* Try to find first idle cpu */ - for_each_cpu_and(cpu, sg_cpus, env->cpus) { - if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { + if (!idle_cpu(cpu)) continue; balance_cpu = cpu; @@ -8083,7 +8107,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, - .dst_grpmask = sched_group_cpus(sd->groups), + .dst_grpmask = sched_group_span(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, @@ -9523,10 +9547,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; rcu_read_lock(); - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 11192e0cb122..d3fb15555291 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -55,6 +55,7 @@ SCHED_FEAT(TTWU_QUEUE, true) * When doing wakeups, attempt to limit superfluous scans of the LLC domain. 
*/ SCHED_FEAT(SIS_AVG_CPU, false) +SCHED_FEAT(SIS_PROP, true) /* * Issue a WARN when we do multiple update_rq_clock() calls @@ -76,7 +77,6 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) SCHED_FEAT(RT_PUSH_IPI, true) #endif -SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ef63adce0c9c..6c23e30c0e5c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -219,6 +219,7 @@ static void do_idle(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 979b7341008a..581d5c7a5264 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -840,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); + int skip; + + /* + * When span == cpu_online_mask, taking each rq->lock + * can be time-consuming. Try to avoid it when possible. + */ + raw_spin_lock(&rt_rq->rt_runtime_lock); + skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (skip) + continue; raw_spin_lock(&rq->lock); if (rt_rq->rt_time) { @@ -1819,7 +1830,7 @@ retry: * pushing. */ task = pick_next_pushable_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { + if (task == next_task) { /* * The task hasn't migrated, and is still the next * eligible task, but we failed to find a run-queue diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6dda2aab731e..e0329d10bdb8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -39,9 +39,9 @@ #include "cpuacct.h" #ifdef CONFIG_SCHED_DEBUG -#define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else -#define SCHED_WARN_ON(x) ((void)(x)) +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif struct rq; @@ -219,22 +219,27 @@ static inline int dl_bandwidth_enabled(void) } extern struct dl_bw *dl_bw_of(int i); +extern int dl_bw_cpus(int i); struct dl_bw { raw_spinlock_t lock; u64 bw, total_bw; }; +static inline void __dl_update(struct dl_bw *dl_b, s64 bw); + static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); } static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); } static inline @@ -244,6 +249,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } +void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); #ifdef CONFIG_CGROUP_SCHED @@ -558,6 +564,30 @@ struct dl_rq { #else struct dl_bw dl_bw; #endif + /* + * "Active utilization" for this runqueue: increased when a + * task wakes up (becomes TASK_RUNNING) and decreased when a + * task blocks + */ + u64 running_bw; + + /* + * Utilization of the tasks "assigned" to this runqueue (including + * the tasks that are in runqueue and the tasks that executed on this + * CPU and blocked). Increased when a task moves to this runqueue, and + * decreased when the task moves away (migrates, changes scheduling + * policy, or terminates). 
+ * This is needed to compute the "inactive utilization" for the + * runqueue (inactive utilization = this_bw - running_bw). + */ + u64 this_bw; + u64 extra_bw; + + /* + * Inverse of the fraction of CPU utilization that can be reclaimed + * by the GRUB algorithm. + */ + u64 bw_ratio; }; #ifdef CONFIG_SMP @@ -606,11 +636,9 @@ struct root_domain { extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; -extern cpumask_var_t fallback_doms; -extern cpumask_var_t sched_domains_tmpmask; extern void init_defrootdomain(void); -extern int init_sched_domains(const struct cpumask *cpu_map); +extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); #endif /* CONFIG_SMP */ @@ -1025,7 +1053,11 @@ struct sched_group_capacity { unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ - unsigned long cpumask[0]; /* iteration mask */ +#ifdef CONFIG_SCHED_DEBUG + int id; +#endif + + unsigned long cpumask[0]; /* balance mask */ }; struct sched_group { @@ -1046,16 +1078,15 @@ struct sched_group { unsigned long cpumask[0]; }; -static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +static inline struct cpumask *sched_group_span(struct sched_group *sg) { return to_cpumask(sg->cpumask); } /* - * cpumask masking which cpus in the group are allowed to iterate up the domain - * tree. + * See build_balance_mask(). */ -static inline struct cpumask *sched_group_mask(struct sched_group *sg) +static inline struct cpumask *group_balance_mask(struct sched_group *sg) { return to_cpumask(sg->sgc->cpumask); } @@ -1066,7 +1097,7 @@ static inline struct cpumask *sched_group_mask(struct sched_group *sg) */ static inline unsigned int group_first_cpu(struct sched_group *group) { - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); } extern int group_balance_cpu(struct sched_group *sg); @@ -1422,7 +1453,11 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr) curr->sched_class->set_curr_task(rq); } +#ifdef CONFIG_SMP #define sched_class_highest (&stop_sched_class) +#else +#define sched_class_highest (&dl_sched_class) +#endif #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1486,7 +1521,12 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); +#define BW_SHIFT 20 +#define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); @@ -1928,6 +1968,33 @@ extern void nohz_balance_exit_idle(unsigned int cpu); static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif + +#ifdef CONFIG_SMP +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} +#else +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = 
container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} +#endif + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { u64 total; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1b0b4fb12837..79895aec281e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -10,6 +10,7 @@ DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ cpumask_var_t sched_domains_tmpmask; +cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG @@ -35,7 +36,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_clear(groupmask); - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); @@ -45,14 +46,14 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %*pbl level %s\n", + printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { + if (!cpumask_test_cpu(cpu, sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@ -65,29 +66,47 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpumask_weight(sched_group_cpus(group))) { + if (!cpumask_weight(sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } if (!(sd->flags & SD_OVERLAP) && - cpumask_intersects(groupmask, sched_group_cpus(group))) { + cpumask_intersects(groupmask, sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + cpumask_or(groupmask, groupmask, sched_group_span(group)); - printk(KERN_CONT " %*pbl", - cpumask_pr_args(sched_group_cpus(group))); - if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %lu)", - group->sgc->capacity); + printk(KERN_CONT " %d:{ span=%*pbl", + group->sgc->id, + cpumask_pr_args(sched_group_span(group))); + + if ((sd->flags & SD_OVERLAP) && + !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { + printk(KERN_CONT " mask=%*pbl", + cpumask_pr_args(group_balance_mask(group))); + } + + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) + printk(KERN_CONT " cap=%lu", group->sgc->capacity); + + if (group == sd->groups && sd->child && + !cpumask_equal(sched_domain_span(sd->child), + sched_group_span(group))) { + printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); } + printk(KERN_CONT " }"); + group = group->next; + + if (group != sd->groups) + printk(KERN_CONT ","); + } while (group != sd->groups); printk(KERN_CONT "\n"); @@ -113,7 +132,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) return; } - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu); for (;;) { if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) @@ -477,46 +496,214 @@ enum s_alloc { }; /* - * Build an iteration mask that can exclude certain CPUs from the upwards - * domain traversal. + * Return the canonical balance CPU for this group, this is the first CPU + * of this group that's also in the balance mask. 
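A toy model of what the new cpus argument to __dl_add()/__dl_clear() achieves through __dl_update() above: admitting a task spreads -dl_bw/cpus across the extra_bw of every runqueue in the root domain, and removing the task gives that share back. The starting extra_bw of 1.0 corresponds to the RUNTIME_INF branch of init_dl_rq_bw_ratio(); the CPU count and task bandwidth are invented.

  #include <stdio.h>
  #include <stdint.h>

  #define BW_SHIFT 20
  #define BW_UNIT  (1 << BW_SHIFT)

  int main(void)
  {
      int cpus = 4;
      int64_t extra_bw[4] = { BW_UNIT, BW_UNIT, BW_UNIT, BW_UNIT };
      int64_t tsk_bw = BW_UNIT / 2;               /* a 0.5 reservation */

      /* admission (__dl_add): each rq loses tsk_bw / cpus of extra_bw */
      for (int i = 0; i < cpus; i++)
          extra_bw[i] -= tsk_bw / cpus;

      printf("extra_bw per CPU after admission: %.3f\n",
             (double)extra_bw[0] / BW_UNIT);      /* 1 - 0.5/4 = 0.875 */
      return 0;
  }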
* - * Asymmetric node setups can result in situations where the domain tree is of - * unequal depth, make sure to skip domains that already cover the entire - * range. + * The balance mask are all those CPUs that could actually end up at this + * group. See build_balance_mask(). * - * In that case build_sched_domains() will have terminated the iteration early - * and our sibling sd spans will be empty. Domains should always include the - * CPU they're built on, so check that. + * Also see should_we_balance(). */ -static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +int group_balance_cpu(struct sched_group *sg) { - const struct cpumask *span = sched_domain_span(sd); + return cpumask_first(group_balance_mask(sg)); +} + + +/* + * NUMA topology (first read the regular topology blurb below) + * + * Given a node-distance table, for example: + * + * node 0 1 2 3 + * 0: 10 20 30 20 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 20 30 20 10 + * + * which represents a 4 node ring topology like: + * + * 0 ----- 1 + * | | + * | | + * | | + * 3 ----- 2 + * + * We want to construct domains and groups to represent this. The way we go + * about doing this is to build the domains on 'hops'. For each NUMA level we + * construct the mask of all nodes reachable in @level hops. + * + * For the above NUMA topology that gives 3 levels: + * + * NUMA-2 0-3 0-3 0-3 0-3 + * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2} + * + * NUMA-1 0-1,3 0-2 1-3 0,2-3 + * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3} + * + * NUMA-0 0 1 2 3 + * + * + * As can be seen; things don't nicely line up as with the regular topology. + * When we iterate a domain in child domain chunks some nodes can be + * represented multiple times -- hence the "overlap" naming for this part of + * the topology. + * + * In order to minimize this overlap, we only build enough groups to cover the + * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3. + * + * Because: + * + * - the first group of each domain is its child domain; this + * gets us the first 0-1,3 + * - the only uncovered node is 2, who's child domain is 1-3. + * + * However, because of the overlap, computing a unique CPU for each group is + * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both + * groups include the CPUs of Node-0, while those CPUs would not in fact ever + * end up at those groups (they would end up in group: 0-1,3). + * + * To correct this we have to introduce the group balance mask. This mask + * will contain those CPUs in the group that can reach this group given the + * (child) domain tree. + * + * With this we can once again compute balance_cpu and sched_group_capacity + * relations. + * + * XXX include words on how balance_cpu is unique and therefore can be + * used for sched_group_capacity links. + * + * + * Another 'interesting' topology is: + * + * node 0 1 2 3 + * 0: 10 20 20 30 + * 1: 20 10 20 20 + * 2: 20 20 10 20 + * 3: 30 20 20 10 + * + * Which looks a little like: + * + * 0 ----- 1 + * | / | + * | / | + * | / | + * 2 ----- 3 + * + * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3 + * are not. + * + * This leads to a few particularly weird cases where the sched_domain's are + * not of the same number for each cpu. 
Consider: + * + * NUMA-2 0-3 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-1 0-2 0-3 0-3 1-3 + * + * NUMA-0 0 1 2 3 + * + */ + + +/* + * Build the balance mask; it contains only those CPUs that can arrive at this + * group and should be considered to continue balancing. + * + * We do this during the group creation pass, therefore the group information + * isn't complete yet, however since each group represents a (child) domain we + * can fully construct this using the sched_domain bits (which are already + * complete). + */ +static void +build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) +{ + const struct cpumask *sg_span = sched_group_span(sg); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; - for_each_cpu(i, span) { + cpumask_clear(mask); + + for_each_cpu(i, sg_span) { sibling = *per_cpu_ptr(sdd->sd, i); - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + + /* + * Can happen in the asymmetric case, where these siblings are + * unused. The mask will not be empty because those CPUs that + * do have the top domain _should_ span the domain. + */ + if (!sibling->child) continue; - cpumask_set_cpu(i, sched_group_mask(sg)); + /* If we would not end up here, we can't continue from here */ + if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) + continue; + + cpumask_set_cpu(i, mask); } + + /* We must not have empty masks here */ + WARN_ON_ONCE(cpumask_empty(mask)); } /* - * Return the canonical balance CPU for this group, this is the first CPU - * of this group that's also in the iteration mask. + * XXX: This creates per-node group entries; since the load-balancer will + * immediately access remote memory to construct this group's load-balance + * statistics having the groups node local is of dubious benefit. */ -int group_balance_cpu(struct sched_group *sg) +static struct sched_group * +build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) { - return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); + struct sched_group *sg; + struct cpumask *sg_span; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(cpu)); + + if (!sg) + return NULL; + + sg_span = sched_group_span(sg); + if (sd->child) + cpumask_copy(sg_span, sched_domain_span(sd->child)); + else + cpumask_copy(sg_span, sched_domain_span(sd)); + + return sg; +} + +static void init_overlap_sched_group(struct sched_domain *sd, + struct sched_group *sg) +{ + struct cpumask *mask = sched_domains_tmpmask2; + struct sd_data *sdd = sd->private; + struct cpumask *sg_span; + int cpu; + + build_balance_mask(sd, sg, mask); + cpu = cpumask_first_and(sched_group_span(sg), mask); + + sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); + if (atomic_inc_return(&sg->sgc->ref) == 1) + cpumask_copy(group_balance_mask(sg), mask); + else + WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask)); + + /* + * Initialize sgc->capacity such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. 
+ */ + sg_span = sched_group_span(sg); + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; } static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { - struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + struct sched_group *first = NULL, *last = NULL, *sg; const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; @@ -525,7 +712,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_clear(covered); - for_each_cpu(i, span) { + for_each_cpu_wrap(i, span, cpu) { struct cpumask *sg_span; if (cpumask_test_cpu(i, covered)) @@ -533,44 +720,27 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) sibling = *per_cpu_ptr(sdd->sd, i); - /* See the comment near build_group_mask(). */ + /* + * Asymmetric node setups can result in situations where the + * domain tree is of unequal depth, make sure to skip domains + * that already cover the entire range. + * + * In that case build_sched_domains() will have terminated the + * iteration early and our sibling sd spans will be empty. + * Domains should always include the CPU they're built on, so + * check that. + */ if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(cpu)); - + sg = build_group_from_child_sched_domain(sibling, cpu); if (!sg) goto fail; - sg_span = sched_group_cpus(sg); - if (sibling->child) - cpumask_copy(sg_span, sched_domain_span(sibling->child)); - else - cpumask_set_cpu(i, sg_span); - + sg_span = sched_group_span(sg); cpumask_or(covered, covered, sg_span); - sg->sgc = *per_cpu_ptr(sdd->sgc, i); - if (atomic_inc_return(&sg->sgc->ref) == 1) - build_group_mask(sd, sg); - - /* - * Initialize sgc->capacity such that even if we mess up the - * domains and no possible iteration will get us here, we won't - * die on a /0 trap. - */ - sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; - - /* - * Make sure the first group of this domain contains the - * canonical balance CPU. Otherwise the sched_domain iteration - * breaks. See update_sg_lb_stats(). - */ - if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - group_balance_cpu(sg) == cpu) - groups = sg; + init_overlap_sched_group(sd, sg); if (!first) first = sg; @@ -579,7 +749,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) last = sg; last->next = first; } - sd->groups = groups; + sd->groups = first; return 0; @@ -589,23 +759,106 @@ fail: return -ENOMEM; } -static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) + +/* + * Package topology (also see the load-balance blurb in fair.c) + * + * The scheduler builds a tree structure to represent a number of important + * topology features. By default (default_topology[]) these include: + * + * - Simultaneous multithreading (SMT) + * - Multi-Core Cache (MC) + * - Package (DIE) + * + * Where the last one more or less denotes everything up to a NUMA node. + * + * The tree consists of 3 primary data structures: + * + * sched_domain -> sched_group -> sched_group_capacity + * ^ ^ ^ ^ + * `-' `-' + * + * The sched_domains are per-cpu and have a two way link (parent & child) and + * denote the ever growing mask of CPUs belonging to that level of topology. 
+ * + * Each sched_domain has a circular (double) linked list of sched_group's, each + * denoting the domains of the level below (or individual CPUs in case of the + * first domain level). The sched_group linked by a sched_domain includes the + * CPU of that sched_domain [*]. + * + * Take for instance a 2 threaded, 2 core, 2 cache cluster part: + * + * CPU 0 1 2 3 4 5 6 7 + * + * DIE [ ] + * MC [ ] [ ] + * SMT [ ] [ ] [ ] [ ] + * + * - or - + * + * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 + * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 + * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 + * + * CPU 0 1 2 3 4 5 6 7 + * + * One way to think about it is: sched_domain moves you up and down among these + * topology levels, while sched_group moves you sideways through it, at child + * domain granularity. + * + * sched_group_capacity ensures each unique sched_group has shared storage. + * + * There are two related construction problems, both require a CPU that + * uniquely identify each group (for a given domain): + * + * - The first is the balance_cpu (see should_we_balance() and the + * load-balance blub in fair.c); for each group we only want 1 CPU to + * continue balancing at a higher domain. + * + * - The second is the sched_group_capacity; we want all identical groups + * to share a single sched_group_capacity. + * + * Since these topologies are exclusive by construction. That is, its + * impossible for an SMT thread to belong to multiple cores, and cores to + * be part of multiple caches. There is a very clear and unique location + * for each CPU in the hierarchy. + * + * Therefore computing a unique CPU for each group is trivial (the iteration + * mask is redundant and set all 1s; all CPUs in a group will end up at _that_ + * group), we can simply pick the first CPU in each group. + * + * + * [*] in other words, the first group of each domain is its child domain. 
+ */ + +static struct sched_group *get_group(int cpu, struct sd_data *sdd) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); struct sched_domain *child = sd->child; + struct sched_group *sg; if (child) cpu = cpumask_first(sched_domain_span(child)); - if (sg) { - *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + sg = *per_cpu_ptr(sdd->sg, cpu); + sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); + + /* For claim_allocations: */ + atomic_inc(&sg->ref); + atomic_inc(&sg->sgc->ref); - /* For claim_allocations: */ - atomic_set(&(*sg)->sgc->ref, 1); + if (child) { + cpumask_copy(sched_group_span(sg), sched_domain_span(child)); + cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); + } else { + cpumask_set_cpu(cpu, sched_group_span(sg)); + cpumask_set_cpu(cpu, group_balance_mask(sg)); } - return cpu; + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + + return sg; } /* @@ -624,34 +877,20 @@ build_sched_groups(struct sched_domain *sd, int cpu) struct cpumask *covered; int i; - get_group(cpu, sdd, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (cpu != cpumask_first(span)) - return 0; - lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; cpumask_clear(covered); - for_each_cpu(i, span) { + for_each_cpu_wrap(i, span, cpu) { struct sched_group *sg; - int group, j; if (cpumask_test_cpu(i, covered)) continue; - group = get_group(i, sdd, &sg); - cpumask_setall(sched_group_mask(sg)); + sg = get_group(i, sdd); - for_each_cpu(j, span) { - if (get_group(j, sdd, NULL) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } + cpumask_or(covered, covered, sched_group_span(sg)); if (!first) first = sg; @@ -660,6 +899,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) last = sg; } last->next = first; + sd->groups = first; return 0; } @@ -683,12 +923,12 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) do { int cpu, max_cpu = -1; - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + sg->group_weight = cpumask_weight(sched_group_span(sg)); if (!(sd->flags & SD_ASYM_PACKING)) goto next; - for_each_cpu(cpu, sched_group_cpus(sg)) { + for_each_cpu(cpu, sched_group_span(sg)) { if (max_cpu < 0) max_cpu = cpu; else if (sched_asym_prefer(cpu, max_cpu)) @@ -1308,6 +1548,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sgc) return -ENOMEM; +#ifdef CONFIG_SCHED_DEBUG + sgc->id = j; +#endif + *per_cpu_ptr(sdd->sgc, j) = sgc; } } @@ -1407,7 +1651,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = build_sched_domain(tl, cpu_map, attr, sd, i); if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + if (tl->flags & SDTL_OVERLAP) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; @@ -1478,7 +1722,7 @@ static struct sched_domain_attr *dattr_cur; * cpumask) fails, then fallback to a single sched domain, * as determined by the single cpumask fallback_doms. */ -cpumask_var_t fallback_doms; +static cpumask_var_t fallback_doms; /* * arch_update_cpu_topology lets virtualized architectures update the @@ -1520,10 +1764,14 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * For now this just excludes isolated CPUs, but could be used to * exclude other special cases in the future. 
@@ -1478,7 +1722,7 @@ static struct sched_domain_attr *dattr_cur;
  * cpumask) fails, then fallback to a single sched domain,
  * as determined by the single cpumask fallback_doms.
  */
-cpumask_var_t fallback_doms;
+static cpumask_var_t fallback_doms;

 /*
  * arch_update_cpu_topology lets virtualized architectures update the
@@ -1520,10 +1764,14 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
  * For now this just excludes isolated CPUs, but could be used to
  * exclude other special cases in the future.
  */
-int init_sched_domains(const struct cpumask *cpu_map)
+int sched_init_domains(const struct cpumask *cpu_map)
 {
        int err;

+       zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
+       zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
+       zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
        arch_update_cpu_topology();
        ndoms_cur = 1;
        doms_cur = alloc_sched_domains(ndoms_cur);
diff --git a/kernel/smp.c b/kernel/smp.c
index a817769b53c0..3061483cb3ad 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -30,6 +30,7 @@ enum {
 struct call_function_data {
        struct call_single_data __percpu *csd;
        cpumask_var_t           cpumask;
+       cpumask_var_t           cpumask_ipi;
 };

 static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu)
        if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
                                     cpu_to_node(cpu)))
                return -ENOMEM;
+       if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
+                                    cpu_to_node(cpu))) {
+               free_cpumask_var(cfd->cpumask);
+               return -ENOMEM;
+       }
        cfd->csd = alloc_percpu(struct call_single_data);
        if (!cfd->csd) {
                free_cpumask_var(cfd->cpumask);
+               free_cpumask_var(cfd->cpumask_ipi);
                return -ENOMEM;
        }

@@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu)
        struct call_function_data *cfd = &per_cpu(cfd_data, cpu);

        free_cpumask_var(cfd->cpumask);
+       free_cpumask_var(cfd->cpumask_ipi);
        free_percpu(cfd->csd);
        return 0;
 }
@@ -428,12 +436,13 @@ void smp_call_function_many(const struct cpumask *mask,
        cfd = this_cpu_ptr(&cfd_data);

        cpumask_and(cfd->cpumask, mask, cpu_online_mask);
-       cpumask_clear_cpu(this_cpu, cfd->cpumask);
+       __cpumask_clear_cpu(this_cpu, cfd->cpumask);

        /* Some callers race with other cpus changing the passed mask */
        if (unlikely(!cpumask_weight(cfd->cpumask)))
                return;

+       cpumask_clear(cfd->cpumask_ipi);
        for_each_cpu(cpu, cfd->cpumask) {
                struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);

@@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask,
                        csd->flags |= CSD_FLAG_SYNCHRONOUS;
                csd->func = func;
                csd->info = info;
-               llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
+               if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
+                       __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
        }

        /* Send a message to all CPUs in the map */
-       arch_send_call_function_ipi_mask(cfd->cpumask);
+       arch_send_call_function_ipi_mask(cfd->cpumask_ipi);

        if (wait) {
                for_each_cpu(cpu, cfd->cpumask) {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 93621ae718d3..03918a19cf2d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data)
                        continue;
                }

+               if (cs == curr_clocksource && cs->tick_stable)
+                       cs->tick_stable(cs);
+
                if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
                    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
                    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 64c97fc130c4..9c2dc64e31d8 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -554,7 +554,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
        update_ts_time_stats(smp_processor_id(), ts, now, NULL);
        ts->idle_active = 0;

-       sched_clock_idle_wakeup_event(0);
+       sched_clock_idle_wakeup_event();
 }

 static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
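
 The smp_call_function_many() change above keys the IPI on llist_add()'s
 return value: llist_add() returns true only when the destination list was
 empty beforehand, so a CPU that already has entries queued (and therefore an
 IPI on the way) is not sent another one. A minimal sketch of that property,
 assuming a kernel-module context; the demo_* names are made up for
 illustration and are not part of the patch.

    #include <linux/llist.h>
    #include <linux/bug.h>

    struct demo_entry {
            struct llist_node node;
    };

    static void demo_llist_add_return(void)
    {
            LLIST_HEAD(queue);
            struct demo_entry a, b;

            /* First add: the list was empty, so llist_add() returns true. */
            WARN_ON(!llist_add(&a.node, &queue));

            /* Second add: an entry is already pending, so it returns false. */
            WARN_ON(llist_add(&b.node, &queue));
    }
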
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 81dedaab36cc..4731a0895760 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -43,6 +43,38 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
 }
 EXPORT_SYMBOL(cpumask_any_but);

+/**
+ * cpumask_next_wrap - helper to implement for_each_cpu_wrap
+ * @n: the cpu prior to the place to search
+ * @mask: the cpumask pointer
+ * @start: the start point of the iteration
+ * @wrap: assume @n crossing @start terminates the iteration
+ *
+ * Returns >= nr_cpu_ids on completion
+ *
+ * Note: the @wrap argument is required for the start condition when
+ * we cannot assume @start is set in @mask.
+ */
+int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
+{
+       int next;
+
+again:
+       next = cpumask_next(n, mask);
+
+       if (wrap && n < start && next >= start) {
+               return nr_cpumask_bits;
+
+       } else if (next >= nr_cpumask_bits) {
+               wrap = true;
+               n = -1;
+               goto again;
+       }
+
+       return next;
+}
+EXPORT_SYMBOL(cpumask_next_wrap);
+
 /* These are not inline because of header tangles. */
 #ifdef CONFIG_CPUMASK_OFFSTACK
 /**
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 690d75b132fa..2fb007be0212 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -28,7 +28,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
        /*
         * It is valid to assume CPU-locality during early bootup:
         */
-       if (system_state != SYSTEM_RUNNING)
+       if (system_state < SYSTEM_SCHEDULING)
                goto out;

        /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ad39bbc79e6..c3c1c6ac62da 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3652,7 +3652,7 @@ int kswapd_run(int nid)
        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (IS_ERR(pgdat->kswapd)) {
                /* failure at boot is fatal */
-               BUG_ON(system_state == SYSTEM_BOOTING);
+               BUG_ON(system_state < SYSTEM_RUNNING);
                pr_err("Failed to start kswapd on node %d\n", nid);
                ret = PTR_ERR(pgdat->kswapd);
                pgdat->kswapd = NULL;
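
 Usage sketch for the cpumask_next_wrap()/for_each_cpu_wrap() pair added in
 lib/cpumask.c above: the iteration visits every CPU in the mask exactly once,
 starting at an arbitrary CPU and wrapping around, which is what the reworked
 build_sched_groups() relies on so that each CPU's domain begins its group
 list with its own group. The visit_from() helper below is a made-up
 illustration, not part of the patch.

    #include <linux/cpumask.h>
    #include <linux/printk.h>

    /*
     * Made-up illustration: for mask = 0-7 and start = 5 the loop visits
     * CPUs 5, 6, 7, 0, 1, 2, 3, 4 and then terminates.
     */
    static void visit_from(const struct cpumask *mask, int start)
    {
            int cpu;

            for_each_cpu_wrap(cpu, mask, start)
                    pr_info("visiting CPU%d\n", cpu);
    }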