From 3fc1f1e27a5b807791d72e5d992aa33b668a6626 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 6 May 2010 18:49:20 +0200
Subject: stop_machine: reimplement using cpu_stop

Reimplement stop_machine using cpu_stop. As cpu stoppers are guaranteed
to be available for all online cpus, stop_machine_create/destroy() are
no longer necessary and have been removed.

With resource management and synchronization handled by cpu_stop, the
new implementation is much simpler. Asking cpu_stop to execute the
stop_cpu() state machine on all online cpus with cpu hotplug disabled
is enough.

stop_machine itself doesn't need to manage any global resources
anymore, so all per-instance information is rolled into struct
stop_machine_data, and the mutex and all static data variables are
removed.

The previous implementation created and destroyed RT workqueues as
necessary, which made stop_machine() calls highly expensive on very
large machines. According to Dimitri Sivanich, avoiding the dynamic
creation/destruction makes booting more than twice as fast on very
large machines. cpu_stop resources are preallocated for all online
cpus and should have the same effect.

Signed-off-by: Tejun Heo
Acked-by: Rusty Russell
Acked-by: Peter Zijlstra
Cc: Oleg Nesterov
Cc: Dimitri Sivanich
---
 drivers/xen/manage.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'drivers')

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 2ac4440e7b08..8943b8ccee1a 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -80,12 +80,6 @@ static void do_suspend(void)
 
 	shutting_down = SHUTDOWN_SUSPEND;
 
-	err = stop_machine_create();
-	if (err) {
-		printk(KERN_ERR "xen suspend: failed to setup stop_machine %d\n", err);
-		goto out;
-	}
-
 #ifdef CONFIG_PREEMPT
 	/* If the kernel is preemptible, we need to freeze all the processes
 	   to prevent them from being in the middle of a pagetable update
@@ -93,7 +87,7 @@ static void do_suspend(void)
 	err = freeze_processes();
 	if (err) {
 		printk(KERN_ERR "xen suspend: freeze failed %d\n", err);
-		goto out_destroy_sm;
+		goto out;
 	}
 #endif
 
@@ -136,12 +130,8 @@ out_resume:
 out_thaw:
 #ifdef CONFIG_PREEMPT
 	thaw_processes();
-
-out_destroy_sm:
-#endif
-	stop_machine_destroy();
-
 out:
+#endif
 	shutting_down = SHUTDOWN_INVALID;
 }
 #endif /* CONFIG_PM_SLEEP */
--
cgit v1.2.3

From 6b8fcd9029f217a9ecce822db645e19111c11080 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sun, 9 May 2010 08:26:06 -0700
Subject: ondemand: Solve a big performance issue by counting IOWAIT time as busy

The ondemand cpufreq governor uses CPU busy time (i.e. non-idle time)
as the measure for scaling the CPU frequency up or down. If the CPU is
busy, the CPU frequency scales up; if it's idle, the CPU frequency
scales down. Effectively, it uses the CPU busy time as a proxy variable
for the more nebulous "how critical is performance right now" question.

This algorithm falls flat on its face for workloads that alternate
between being disk-bound and CPU-bound, such as the ever popular
"git grep", but also things like program startup and maildir-based
email clients... much to the chagrin of Andrew Morton.

This patch changes the ondemand algorithm to count iowait time as busy,
not idle, time. As the cases above show, iowait is often performance
critical, and by counting iowait, the proxy variable becomes a more
accurate representation of the "how critical is performance" question.

The problem and fix are both verified with the "perf timechart" tool.
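To make the accounting change concrete, here is a minimal sketch of the
per-sample load calculation with iowait folded into busy time. The
function and parameter names are illustrative only, not the driver's
API; the caller is assumed to guarantee wall_time > 0 and
wall_time >= idle_time, as the governor's own checks do:

	/*
	 * Sketch: CPU load (percent) over one sampling window, with
	 * time spent waiting for IO treated as busy rather than idle.
	 */
	static unsigned int effective_load(unsigned int wall_time,
					   unsigned int idle_time,
					   unsigned int iowait_time)
	{
		if (idle_time >= iowait_time)	/* iowait counts as busy */
			idle_time -= iowait_time;

		return 100 * (wall_time - idle_time) / wall_time;
	}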
Signed-off-by: Arjan van de Ven
Signed-off-by: Andrew Morton
Signed-off-by: Dave Jones
Reviewed-by: Rik van Riel
Acked-by: Peter Zijlstra
LKML-Reference: <20100509082606.3d9f00d0@infradead.org>
Signed-off-by: Ingo Molnar
---
 drivers/cpufreq/cpufreq_ondemand.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index bd444dc93cf2..ed472f8dfb72 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -73,6 +73,7 @@ enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};
 struct cpu_dbs_info_s {
 	cputime64_t prev_cpu_idle;
+	cputime64_t prev_cpu_iowait;
 	cputime64_t prev_cpu_wall;
 	cputime64_t prev_cpu_nice;
 	struct cpufreq_policy *cur_policy;
@@ -148,6 +149,16 @@ static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
 	return idle_time;
 }
 
+static inline cputime64_t get_cpu_iowait_time(unsigned int cpu, cputime64_t *wall)
+{
+	u64 iowait_time = get_cpu_iowait_time_us(cpu, wall);
+
+	if (iowait_time == -1ULL)
+		return 0;
+
+	return iowait_time;
+}
+
 /*
  * Find right freq to be set now with powersave_bias on.
  * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
@@ -470,14 +481,15 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 
 	for_each_cpu(j, policy->cpus) {
 		struct cpu_dbs_info_s *j_dbs_info;
-		cputime64_t cur_wall_time, cur_idle_time;
-		unsigned int idle_time, wall_time;
+		cputime64_t cur_wall_time, cur_idle_time, cur_iowait_time;
+		unsigned int idle_time, wall_time, iowait_time;
 		unsigned int load, load_freq;
 		int freq_avg;
 
 		j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
 
 		cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
+		cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time);
 
 		wall_time = (unsigned int) cputime64_sub(cur_wall_time,
 				j_dbs_info->prev_cpu_wall);
@@ -487,6 +499,10 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 				j_dbs_info->prev_cpu_idle);
 		j_dbs_info->prev_cpu_idle = cur_idle_time;
 
+		iowait_time = (unsigned int) cputime64_sub(cur_iowait_time,
+				j_dbs_info->prev_cpu_iowait);
+		j_dbs_info->prev_cpu_iowait = cur_iowait_time;
+
 		if (dbs_tuners_ins.ignore_nice) {
 			cputime64_t cur_nice;
 			unsigned long cur_nice_jiffies;
@@ -504,6 +520,16 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 			idle_time += jiffies_to_usecs(cur_nice_jiffies);
 		}
 
+		/*
+		 * For the purpose of ondemand, waiting for disk IO is an
+		 * indication that you're performance critical, and not that
+		 * the system is actually idle. So subtract the iowait time
+		 * from the cpu idle time.
+		 */
+
+		if (idle_time >= iowait_time)
+			idle_time -= iowait_time;
+
 		if (unlikely(!wall_time || wall_time < idle_time))
 			continue;
--
cgit v1.2.3

From 19379b11819efc1fc3b602e64f7e7531050aaddb Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Sun, 9 May 2010 08:26:51 -0700
Subject: ondemand: Make the iowait-is-busy time a sysfs tunable

Pavel Machek pointed out that not all CPUs have an efficient idle at
high frequency. Specifically, older Intel and various AMD cpus would
get higher power usage when copying files from USB. Mike Chan pointed
out that the same is true for various ARM chips as well. Thomas
Renninger suggested making this a sysfs tunable with a reasonable
default.

This patch adds a sysfs tunable for the new behavior, and uses a very
simple function to determine a reasonable default, depending on the
CPU vendor/type.
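As a usage sketch, the tunable can be toggled from userspace by writing
to its sysfs attribute. The path below assumes the governor's global
tunable directory; the patch itself only defines the attribute name, so
verify the location on a given kernel:

	#include <stdio.h>

	int main(void)
	{
		/* Assumed sysfs location of the new tunable. */
		FILE *f = fopen("/sys/devices/system/cpu/cpufreq/"
				"ondemand/io_is_busy", "w");

		if (!f) {
			perror("io_is_busy");
			return 1;
		}
		fputs("1\n", f);	/* any nonzero value enables it */
		return fclose(f) ? 1 : 0;
	}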
Signed-off-by: Arjan van de Ven
Acked-by: Rik van Riel
Acked-by: Pavel Machek
Acked-by: Peter Zijlstra
Cc: davej@redhat.com
LKML-Reference: <20100509082651.46914d04@infradead.org>
[ minor tidyup ]
Signed-off-by: Ingo Molnar
---
 drivers/cpufreq/cpufreq_ondemand.c | 47 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index ed472f8dfb72..8e9dbdc6c700 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -109,6 +109,7 @@ static struct dbs_tuners {
 	unsigned int down_differential;
 	unsigned int ignore_nice;
 	unsigned int powersave_bias;
+	unsigned int io_is_busy;
 } dbs_tuners_ins = {
 	.up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
 	.down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
@@ -260,6 +261,7 @@ static ssize_t show_##file_name \
 	return sprintf(buf, "%u\n", dbs_tuners_ins.object); \
 }
 show_one(sampling_rate, sampling_rate);
+show_one(io_is_busy, io_is_busy);
 show_one(up_threshold, up_threshold);
 show_one(ignore_nice_load, ignore_nice);
 show_one(powersave_bias, powersave_bias);
@@ -310,6 +312,23 @@ static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b,
 	return count;
 }
 
+static ssize_t store_io_is_busy(struct kobject *a, struct attribute *b,
+				const char *buf, size_t count)
+{
+	unsigned int input;
+	int ret;
+
+	ret = sscanf(buf, "%u", &input);
+	if (ret != 1)
+		return -EINVAL;
+
+	mutex_lock(&dbs_mutex);
+	dbs_tuners_ins.io_is_busy = !!input;
+	mutex_unlock(&dbs_mutex);
+
+	return count;
+}
+
 static ssize_t store_up_threshold(struct kobject *a, struct attribute *b,
 				  const char *buf, size_t count)
 {
@@ -392,6 +411,7 @@ static struct global_attr _name = \
 __ATTR(_name, 0644, show_##_name, store_##_name)
 
 define_one_rw(sampling_rate);
+define_one_rw(io_is_busy);
 define_one_rw(up_threshold);
 define_one_rw(ignore_nice_load);
 define_one_rw(powersave_bias);
@@ -403,6 +423,7 @@ static struct attribute *dbs_attributes[] = {
 	&up_threshold.attr,
 	&ignore_nice_load.attr,
 	&powersave_bias.attr,
+	&io_is_busy.attr,
 	NULL
 };
 
@@ -527,7 +548,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 		 * from the cpu idle time.
 		 */
 
-		if (idle_time >= iowait_time)
+		if (dbs_tuners_ins.io_is_busy && idle_time >= iowait_time)
 			idle_time -= iowait_time;
 
 		if (unlikely(!wall_time || wall_time < idle_time))
@@ -643,6 +664,29 @@ static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
 	cancel_delayed_work_sync(&dbs_info->work);
}
 
+/*
+ * Not all CPUs want IO time to be accounted as busy; this depends on how
+ * efficient idling at a higher frequency/voltage is.
+ * Pavel Machek says this is not so for various generations of AMD and old
+ * Intel systems.
+ * Mike Chan (android.com) claims this is also not true for ARM.
+ * Because of this, whitelist specific known (series) of CPUs by default, and
+ * leave all others up to the user.
+ */
+static int should_io_be_busy(void)
+{
+#if defined(CONFIG_X86)
+	/*
+	 * For Intel, Core 2 (model 15) and later have an efficient idle.
+	 */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+	    boot_cpu_data.x86 == 6 &&
+	    boot_cpu_data.x86_model >= 15)
+		return 1;
+#endif
+	return 0;
+}
+
 static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 				unsigned int event)
 {
@@ -705,6 +749,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 		dbs_tuners_ins.sampling_rate =
 			max(min_sampling_rate,
 			    latency * LATENCY_MULTIPLIER);
+		dbs_tuners_ins.io_is_busy = should_io_be_busy();
 	}
 	mutex_unlock(&dbs_mutex);
--
cgit v1.2.3
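For reference, a hypothetical userspace mirror of the default heuristic
in should_io_be_busy(), handy for predicting which default a given x86
machine would get. The cpuid plumbing is our illustration and is not
part of the patch:

	#include <stdio.h>
	#include <string.h>
	#include <cpuid.h>	/* GCC/Clang helper, x86 only */

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx, family, model;
		char vendor[13];

		if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
			return 1;
		/* The vendor string is stored in EBX, EDX, ECX order. */
		memcpy(vendor, &ebx, 4);
		memcpy(vendor + 4, &edx, 4);
		memcpy(vendor + 8, &ecx, 4);
		vendor[12] = '\0';

		if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
			return 1;
		family = (eax >> 8) & 0xf;
		/* Extended model bits matter for family 6, as here. */
		model = ((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0);

		/* Same test as the patch: Intel, family 6, model >= 15. */
		printf("default io_is_busy: %d\n",
		       !strcmp(vendor, "GenuineIntel") &&
		       family == 6 && model >= 15);
		return 0;
	}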