From 84d56e66b9b4a646f04ec30696ca1aeea5e654d5 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:43:55 -0700 Subject: watchdog: new definitions and variables, initialization The hardlockup and softockup had always been tied together. Due to the request of KVM folks, they had a need to have one enabled but not the other. Internally rework the code to split things apart more cleanly. There is a bunch of churn here, but the end result should be code that should be easier to maintain and fix without knowing the internals of what is going on. This patch (of 9): Introduce new definitions and variables to separate the user interface in /proc/sys/kernel from the internal run state of the lockup detectors. The internal run state is represented by two bits in a new variable that is named 'watchdog_enabled'. This helps simplify the code, for example: - In order to check if any of the two lockup detectors is enabled, it is sufficient to check if 'watchdog_enabled' is not zero. - In order to enable/disable one or both lockup detectors, it is sufficient to set/clear one or both bits in 'watchdog_enabled'. - Concurrent updates of 'watchdog_enabled' need not be synchronized via a spinlock or a mutex. Updates can either be atomic or concurrency can be detected by using 'cmpxchg'. Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 9b2022ab4d85..3885a7d12bd2 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -68,6 +68,8 @@ static inline bool trigger_allbutself_cpu_backtrace(void) #ifdef CONFIG_LOCKUP_DETECTOR int hw_nmi_is_cpu_stuck(struct pt_regs *); u64 hw_nmi_get_sample_period(int watchdog_thresh); +extern int nmi_watchdog_enabled; +extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; extern int sysctl_softlockup_all_cpu_backtrace; -- cgit v1.2.3 From 83a80a39075a9ded23df1e26a4b617c289077630 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:44:08 -0700 Subject: watchdog: introduce separate handlers for parameters in /proc/sys/kernel Separate handlers for each watchdog parameter in /proc/sys/kernel replace the proc_dowatchdog() function. Three of those handlers merely call proc_watchdog_common() with one different argument. Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 8 ++++++++ kernel/watchdog.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) (limited to 'include') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 3885a7d12bd2..5b5450585b8a 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -74,6 +74,14 @@ extern int watchdog_user_enabled; extern int watchdog_thresh; extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; +extern int proc_watchdog(struct ctl_table *, int , + void __user *, size_t *, loff_t *); +extern int proc_nmi_watchdog(struct ctl_table *, int , + void __user *, size_t *, loff_t *); +extern int proc_soft_watchdog(struct ctl_table *, int , + void __user *, size_t *, loff_t *); +extern int proc_watchdog_thresh(struct ctl_table *, int , + void __user *, size_t *, loff_t *); extern int proc_dowatchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3600a01c97a9..26002ed4c16e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -768,6 +768,65 @@ out: return err; } +/* + * /proc/sys/kernel/watchdog + */ +int proc_watchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, + table, write, buffer, lenp, ppos); +} + +/* + * /proc/sys/kernel/nmi_watchdog + */ +int proc_nmi_watchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_watchdog_common(NMI_WATCHDOG_ENABLED, + table, write, buffer, lenp, ppos); +} + +/* + * /proc/sys/kernel/soft_watchdog + */ +int proc_soft_watchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, + table, write, buffer, lenp, ppos); +} + +/* + * /proc/sys/kernel/watchdog_thresh + */ +int proc_watchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err, old; + + mutex_lock(&watchdog_proc_mutex); + + old = ACCESS_ONCE(watchdog_thresh); + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (err || !write) + goto out; + + /* + * Update the sample period. + * Restore 'watchdog_thresh' on failure. + */ + set_sample_period(); + err = proc_watchdog_update(); + if (err) + watchdog_thresh = old; +out: + mutex_unlock(&watchdog_proc_mutex); + return err; +} + /* * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ -- cgit v1.2.3 From 195daf665a6299de98a4da3843fed2dd9de19d3a Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:44:13 -0700 Subject: watchdog: enable the new user interface of the watchdog mechanism With the current user interface of the watchdog mechanism it is only possible to disable or enable both lockup detectors at the same time. This series introduces new kernel parameters and changes the semantics of some existing kernel parameters, so that the hard lockup detector and the soft lockup detector can be disabled or enabled individually. With this series applied, the user interface is as follows. - parameters in /proc/sys/kernel . soft_watchdog This is a new parameter to control and examine the run state of the soft lockup detector. . nmi_watchdog The semantics of this parameter have changed. It can now be used to control and examine the run state of the hard lockup detector. . watchdog This parameter is still available to control the run state of both lockup detectors at the same time. If this parameter is examined, it shows the logical OR of soft_watchdog and nmi_watchdog. . watchdog_thresh The semantics of this parameter are not affected by the patch. - kernel command line parameters . nosoftlockup The semantics of this parameter have changed. It can now be used to disable the soft lockup detector at boot time. . nmi_watchdog=0 or nmi_watchdog=1 Disable or enable the hard lockup detector at boot time. The patch introduces '=1' as a new option. . nowatchdog The semantics of this parameter are not affected by the patch. It is still available to disable both lockup detectors at boot time. Also, remove the proc_dowatchdog() function which is no longer needed. [dzickus@redhat.com: wrote changelog] [dzickus@redhat.com: update documentation for kernel params and sysctl] Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 6 ++- Documentation/sysctl/kernel.txt | 62 +++++++++++++++++++++++----- include/linux/nmi.h | 2 - kernel/sysctl.c | 35 +++++++++++----- kernel/watchdog.c | 81 ++++++++----------------------------- 5 files changed, 97 insertions(+), 89 deletions(-) (limited to 'include') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 01aa47d3b6ab..71eecb263250 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2236,8 +2236,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels Format: [panic,][nopanic,][num] - Valid num: 0 + Valid num: 0 or 1 0 - turn nmi_watchdog off + 1 - turn nmi_watchdog on When panic is specified, panic when an NMI watchdog timeout occurs (or 'nopanic' to override the opposite default). @@ -2464,7 +2465,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nousb [USB] Disable the USB subsystem - nowatchdog [KNL] Disable the lockup detector (NMI watchdog). + nowatchdog [KNL] Disable both lockup detectors, i.e. + soft-lockup and NMI watchdog (hard-lockup). nowb [ARM] diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 83ab25660fc9..99d7eb3a1416 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -77,12 +77,14 @@ show up in /proc/sys/kernel: - shmmax [ sysv ipc ] - shmmni - softlockup_all_cpu_backtrace +- soft_watchdog - stop-a [ SPARC only ] - sysrq ==> Documentation/sysrq.txt - sysctl_writes_strict - tainted - threads-max - unknown_nmi_panic +- watchdog - watchdog_thresh - version @@ -417,16 +419,23 @@ successful IPC object allocation. nmi_watchdog: -Enables/Disables the NMI watchdog on x86 systems. When the value is -non-zero the NMI watchdog is enabled and will continuously test all -online cpus to determine whether or not they are still functioning -properly. Currently, passing "nmi_watchdog=" parameter at boot time is -required for this function to work. +This parameter can be used to control the NMI watchdog +(i.e. the hard lockup detector) on x86 systems. -If LAPIC NMI watchdog method is in use (nmi_watchdog=2 kernel -parameter), the NMI watchdog shares registers with oprofile. By -disabling the NMI watchdog, oprofile may have more registers to -utilize. + 0 - disable the hard lockup detector + 1 - enable the hard lockup detector + +The hard lockup detector monitors each CPU for its ability to respond to +timer interrupts. The mechanism utilizes CPU performance counter registers +that are programmed to generate Non-Maskable Interrupts (NMIs) periodically +while a CPU is busy. Hence, the alternative name 'NMI watchdog'. + +The NMI watchdog is disabled by default if the kernel is running as a guest +in a KVM virtual machine. This default can be overridden by adding + + nmi_watchdog=1 + +to the guest kernel command line (see Documentation/kernel-parameters.txt). ============================================================== @@ -816,6 +825,22 @@ NMI. ============================================================== +soft_watchdog + +This parameter can be used to control the soft lockup detector. + + 0 - disable the soft lockup detector + 1 - enable the soft lockup detector + +The soft lockup detector monitors CPUs for threads that are hogging the CPUs +without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads +from running. The mechanism depends on the CPUs ability to respond to timer +interrupts which are needed for the 'watchdog/N' threads to be woken up by +the watchdog timer function, otherwise the NMI watchdog - if enabled - can +detect a hard lockup condition. + +============================================================== + tainted: Non-zero if the kernel has been tainted. Numeric values, which @@ -858,6 +883,25 @@ example. If a system hangs up, try pressing the NMI switch. ============================================================== +watchdog: + +This parameter can be used to disable or enable the soft lockup detector +_and_ the NMI watchdog (i.e. the hard lockup detector) at the same time. + + 0 - disable both lockup detectors + 1 - enable both lockup detectors + +The soft lockup detector and the NMI watchdog can also be disabled or +enabled individually, using the soft_watchdog and nmi_watchdog parameters. +If the watchdog parameter is read, for example by executing + + cat /proc/sys/kernel/watchdog + +the output of this command (0 or 1) shows the logical OR of soft_watchdog +and nmi_watchdog. + +============================================================== + watchdog_thresh: This value can be used to control the frequency of hrtimer and NMI diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 5b5450585b8a..0426357297d5 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -82,8 +82,6 @@ extern int proc_soft_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); -extern int proc_dowatchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); #endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ce410bb9f2e1..245e7dcc3741 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -846,7 +846,7 @@ static struct ctl_table kern_table[] = { .data = &watchdog_user_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dowatchdog, + .proc_handler = proc_watchdog, .extra1 = &zero, .extra2 = &one, }, @@ -855,10 +855,32 @@ static struct ctl_table kern_table[] = { .data = &watchdog_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dowatchdog, + .proc_handler = proc_watchdog_thresh, .extra1 = &zero, .extra2 = &sixty, }, + { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_nmi_watchdog, + .extra1 = &zero, +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) + .extra2 = &one, +#else + .extra2 = &zero, +#endif + }, + { + .procname = "soft_watchdog", + .data = &soft_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_soft_watchdog, + .extra1 = &zero, + .extra2 = &one, + }, { .procname = "softlockup_panic", .data = &softlockup_panic, @@ -879,15 +901,6 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif /* CONFIG_SMP */ - { - .procname = "nmi_watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dowatchdog, - .extra1 = &zero, - .extra2 = &one, - }, #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { diff --git a/kernel/watchdog.c b/kernel/watchdog.c index fd2b6dc14486..63d702885686 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -110,15 +110,9 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - watchdog_user_enabled = 0; - else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) { - /* - * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option) - * has the same effect. - */ - watchdog_user_enabled = 1; - watchdog_enable_hardlockup_detector(true); - } + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -137,19 +131,18 @@ __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { - watchdog_user_enabled = 0; + watchdog_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); -/* deprecated */ static int __init nosoftlockup_setup(char *str) { - watchdog_user_enabled = 0; + watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; return 1; } __setup("nosoftlockup", nosoftlockup_setup); -/* */ + #ifdef CONFIG_SMP static int __init softlockup_all_cpu_backtrace_setup(char *str) { @@ -264,10 +257,11 @@ static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); - /* Warn about unreasonable delays: */ - if (time_after(now, touch_ts + get_softlockup_thresh())) - return now - touch_ts; - + if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { + /* Warn about unreasonable delays. */ + if (time_after(now, touch_ts + get_softlockup_thresh())) + return now - touch_ts; + } return 0; } @@ -532,6 +526,10 @@ static int watchdog_nmi_enable(unsigned int cpu) struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); + /* nothing to do if the hard lockup detector is disabled */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + goto out; + /* * Some kernels need to default hard lockup detection to * 'disabled', for example a guest on a hypervisor. @@ -856,59 +854,12 @@ out: mutex_unlock(&watchdog_proc_mutex); return err; } - -/* - * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh - */ - -int proc_dowatchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int err, old_thresh, old_enabled; - bool old_hardlockup; - - mutex_lock(&watchdog_proc_mutex); - old_thresh = ACCESS_ONCE(watchdog_thresh); - old_enabled = ACCESS_ONCE(watchdog_user_enabled); - old_hardlockup = watchdog_hardlockup_detector_is_enabled(); - - err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (err || !write) - goto out; - - set_sample_period(); - /* - * Watchdog threads shouldn't be enabled if they are - * disabled. The 'watchdog_running' variable check in - * watchdog_*_all_cpus() function takes care of this. - */ - if (watchdog_user_enabled && watchdog_thresh) { - /* - * Prevent a change in watchdog_thresh accidentally overriding - * the enablement of the hardlockup detector. - */ - if (watchdog_user_enabled != old_enabled) - watchdog_enable_hardlockup_detector(true); - err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); - } else - watchdog_disable_all_cpus(); - - /* Restore old values on failure */ - if (err) { - watchdog_thresh = old_thresh; - watchdog_user_enabled = old_enabled; - watchdog_enable_hardlockup_detector(old_hardlockup); - } -out: - mutex_unlock(&watchdog_proc_mutex); - return err; -} #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) { set_sample_period(); - if (watchdog_user_enabled) + if (watchdog_enabled) watchdog_enable_all_cpus(false); } -- cgit v1.2.3 From 692297d8f96887f836d9049a653ed05a71cf48fb Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Tue, 14 Apr 2015 15:44:19 -0700 Subject: watchdog: introduce the hardlockup_detector_disable() function Have kvm_guest_init() use hardlockup_detector_disable() instead of watchdog_enable_hardlockup_detector(false). Remove the watchdog_hardlockup_detector_is_enabled() and the watchdog_enable_hardlockup_detector() function which are no longer needed. Signed-off-by: Ulrich Obergfell Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/kvm.c | 2 +- include/linux/nmi.h | 9 ++------- kernel/watchdog.c | 21 ++------------------- 3 files changed, 5 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e354cc6446ab..9435620062df 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -513,7 +513,7 @@ void __init kvm_guest_init(void) * can get false positives too easily, for example if the host is * overcommitted. */ - watchdog_enable_hardlockup_detector(false); + hardlockup_detector_disable(); } static noinline uint32_t __kvm_cpuid_base(void) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 0426357297d5..3d46fb4708e0 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -25,16 +25,11 @@ static inline void touch_nmi_watchdog(void) #endif #if defined(CONFIG_HARDLOCKUP_DETECTOR) -extern void watchdog_enable_hardlockup_detector(bool val); -extern bool watchdog_hardlockup_detector_is_enabled(void); +extern void hardlockup_detector_disable(void); #else -static inline void watchdog_enable_hardlockup_detector(bool val) +static inline void hardlockup_detector_disable(void) { } -static inline bool watchdog_hardlockup_detector_is_enabled(void) -{ - return true; -} #endif /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 49d02250aaac..f2be11ab7e08 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -83,8 +83,6 @@ static unsigned long soft_lockup_nmi_warn; #ifdef CONFIG_HARDLOCKUP_DETECTOR static int hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; - -static bool hardlockup_detector_enabled = true; /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -93,14 +91,9 @@ static bool hardlockup_detector_enabled = true; * kernel command line parameters are parsed, because otherwise it is not * possible to override this in hardlockup_panic_setup(). */ -void watchdog_enable_hardlockup_detector(bool val) -{ - hardlockup_detector_enabled = val; -} - -bool watchdog_hardlockup_detector_is_enabled(void) +void hardlockup_detector_disable(void) { - return hardlockup_detector_enabled; + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; } static int __init hardlockup_panic_setup(char *str) @@ -530,15 +523,6 @@ static int watchdog_nmi_enable(unsigned int cpu) if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) goto out; - /* - * Some kernels need to default hard lockup detection to - * 'disabled', for example a guest on a hypervisor. - */ - if (!watchdog_hardlockup_detector_is_enabled()) { - event = ERR_PTR(-ENOENT); - goto handle_err; - } - /* is it already setup and enabled? */ if (event && event->state > PERF_EVENT_STATE_OFF) goto out; @@ -553,7 +537,6 @@ static int watchdog_nmi_enable(unsigned int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); -handle_err: /* save cpu0 error for future comparision */ if (cpu == 0 && IS_ERR(event)) cpu0_err = PTR_ERR(event); -- cgit v1.2.3 From 124dee09f0669b92cc0073b00984d53541ca0884 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:44:28 -0700 Subject: mm, slab: correct config option in comment CONFIG_SLAB_DEBUG doesn't exist, CONFIG_DEBUG_SLAB does. Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 76f1feeabd38..ffd24c830151 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -18,7 +18,7 @@ /* * Flags to pass to kmem_cache_create(). - * The ones marked DEBUG are only valid if CONFIG_SLAB_DEBUG is set. + * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. */ #define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */ #define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ -- cgit v1.2.3 From 84d33df279e0380995b0e03fb8aad04cef2bc29f Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Tue, 14 Apr 2015 15:44:37 -0700 Subject: mm: rename FOLL_MLOCK to FOLL_POPULATE After commit a1fde08c74e9 ("VM: skip the stack guard page lookup in get_user_pages only for mlock") FOLL_MLOCK has lost its original meaning: we don't necessarily mlock the page if the flags is set -- we also take VM_LOCKED into consideration. Since we use the same codepath for __mm_populate(), let's rename FOLL_MLOCK to FOLL_POPULATE. Signed-off-by: Kirill A. Shutemov Acked-by: Linus Torvalds Acked-by: David Rientjes Cc: Michel Lespinasse Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/gup.c | 6 +++--- mm/huge_memory.c | 2 +- mm/mlock.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 47a93928b90f..cccbbba12b9d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2109,7 +2109,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma, #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO * and return without waiting upon it */ -#define FOLL_MLOCK 0x40 /* mark page as mlocked */ +#define FOLL_POPULATE 0x40 /* fault in page */ #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ diff --git a/mm/gup.c b/mm/gup.c index a6e24e246f86..1b114ba9aebf 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -92,7 +92,7 @@ retry: */ mark_page_accessed(page); } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -265,8 +265,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned int fault_flags = 0; int ret; - /* For mlock, just skip the stack guard page. */ - if ((*flags & FOLL_MLOCK) && + /* For mm_populate(), just skip the stack guard page. */ + if ((*flags & FOLL_POPULATE) && (stack_guard_page_start(vma, address) || stack_guard_page_end(vma, address + PAGE_SIZE))) return -ENOENT; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6817b0350c71..10a4b6cea0d1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1231,7 +1231,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, pmd, _pmd, 1)) update_mmu_cache_pmd(vma, addr, pmd); } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) diff --git a/mm/mlock.c b/mm/mlock.c index 8a54cd214925..f756e28b33fc 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -237,7 +237,7 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON_VMA(end > vma->vm_end, vma); VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - gup_flags = FOLL_TOUCH | FOLL_MLOCK; + gup_flags = FOLL_TOUCH | FOLL_POPULATE; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW -- cgit v1.2.3 From 30467e0b3be83c286d60039f8267dd421128ca74 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:45:11 -0700 Subject: mm, hotplug: fix concurrent memory hot-add deadlock There's a deadlock when concurrently hot-adding memory through the probe interface and switching a memory block from offline to online. When hot-adding memory via the probe interface, add_memory() first takes mem_hotplug_begin() and then device_lock() is later taken when registering the newly initialized memory block. This creates a lock dependency of (1) mem_hotplug.lock (2) dev->mutex. When switching a memory block from offline to online, dev->mutex is first grabbed in device_online() when the write(2) transitions an existing memory block from offline to online, and then online_pages() will take mem_hotplug_begin(). This creates a lock inversion between mem_hotplug.lock and dev->mutex. Vitaly reports that this deadlock can happen when kworker handling a probe event races with systemd-udevd switching a memory block's state. This patch requires the state transition to take mem_hotplug_begin() before dev->mutex. Hot-adding memory via the probe interface creates a memory block while holding mem_hotplug_begin(), there is no way to take dev->mutex first in this case. online_pages() and offline_pages() are only called when transitioning memory block state. We now require that mem_hotplug_begin() is taken before calling them -- this requires exporting the mem_hotplug_begin() and mem_hotplug_done() to generic code. In all hot-add and hot-remove cases, mem_hotplug_begin() is done prior to device_online(). This is all that is needed to avoid the deadlock. Signed-off-by: David Rientjes Reported-by: Vitaly Kuznetsov Tested-by: Vitaly Kuznetsov Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: "K. Y. Srinivasan" Cc: Yasuaki Ishimatsu Cc: Tang Chen Cc: Vlastimil Babka Cc: Zhang Zhen Cc: Vladimir Davydov Cc: Wang Nan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 19 ++++++++++++------- include/linux/memory_hotplug.h | 6 ++++++ mm/memory_hotplug.c | 33 ++++++++++++--------------------- 3 files changed, 30 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index ab5c9a7dfeca..2804aed3f416 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -219,6 +219,7 @@ static bool pages_correctly_reserved(unsigned long start_pfn) /* * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is * OK to have direct references to sparsemem variables in here. + * Must already be protected by mem_hotplug_begin(). */ static int memory_block_action(unsigned long phys_index, unsigned long action, int online_type) @@ -286,6 +287,7 @@ static int memory_subsys_online(struct device *dev) if (mem->online_type < 0) mem->online_type = MMOP_ONLINE_KEEP; + /* Already under protection of mem_hotplug_begin() */ ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); /* clear online_type */ @@ -328,17 +330,19 @@ store_mem_state(struct device *dev, goto err; } + /* + * Memory hotplug needs to hold mem_hotplug_begin() for probe to find + * the correct memory block to online before doing device_online(dev), + * which will take dev->mutex. Take the lock early to prevent an + * inversion, memory_subsys_online() callbacks will be implemented by + * assuming it's already protected. + */ + mem_hotplug_begin(); + switch (online_type) { case MMOP_ONLINE_KERNEL: case MMOP_ONLINE_MOVABLE: case MMOP_ONLINE_KEEP: - /* - * mem->online_type is not protected so there can be a - * race here. However, when racing online, the first - * will succeed and the second will just return as the - * block will already be online. The online type - * could be either one, but that is expected. - */ mem->online_type = online_type; ret = device_online(&mem->dev); break; @@ -349,6 +353,7 @@ store_mem_state(struct device *dev, ret = -EINVAL; /* should never happen */ } + mem_hotplug_done(); err: unlock_device_hotplug(); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8f1a41951df9..6ffa0ac7f7d6 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -192,6 +192,9 @@ extern void get_page_bootmem(unsigned long ingo, struct page *page, void get_online_mems(void); void put_online_mems(void); +void mem_hotplug_begin(void); +void mem_hotplug_done(void); + #else /* ! CONFIG_MEMORY_HOTPLUG */ /* * Stub functions for when hotplug is off @@ -231,6 +234,9 @@ static inline int try_online_node(int nid) static inline void get_online_mems(void) {} static inline void put_online_mems(void) {} +static inline void mem_hotplug_begin(void) {} +static inline void mem_hotplug_done(void) {} + #endif /* ! CONFIG_MEMORY_HOTPLUG */ #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aaec7758eec3..e2e8014fb755 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -104,7 +104,7 @@ void put_online_mems(void) } -static void mem_hotplug_begin(void) +void mem_hotplug_begin(void) { mem_hotplug.active_writer = current; @@ -119,7 +119,7 @@ static void mem_hotplug_begin(void) } } -static void mem_hotplug_done(void) +void mem_hotplug_done(void) { mem_hotplug.active_writer = NULL; mutex_unlock(&mem_hotplug.lock); @@ -959,6 +959,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) } +/* Must be protected by mem_hotplug_begin() */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -969,7 +970,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int ret; struct memory_notify arg; - mem_hotplug_begin(); /* * This doesn't need a lock to do pfn_to_page(). * The section can't be removed here because of the @@ -977,21 +977,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ zone = page_zone(pfn_to_page(pfn)); - ret = -EINVAL; if ((zone_idx(zone) > ZONE_NORMAL || online_type == MMOP_ONLINE_MOVABLE) && !can_online_high_movable(zone)) - goto out; + return -EINVAL; if (online_type == MMOP_ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) - goto out; + return -EINVAL; } if (online_type == MMOP_ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) - goto out; + return -EINVAL; } /* Previous code may changed the zone of the pfn range */ @@ -1007,7 +1006,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ ret = notifier_to_errno(ret); if (ret) { memory_notify(MEM_CANCEL_ONLINE, &arg); - goto out; + return ret; } /* * If this zone is not populated, then it is not in zonelist. @@ -1031,7 +1030,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); - goto out; + return ret; } zone->present_pages += onlined_pages; @@ -1061,9 +1060,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); -out: - mem_hotplug_done(); - return ret; + return 0; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1688,21 +1685,18 @@ static int __ref __offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn)) return -EINVAL; - mem_hotplug_begin(); - zone = page_zone(pfn_to_page(start_pfn)); node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; - ret = -EINVAL; if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) - goto out; + return -EINVAL; /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); if (ret) - goto out; + return ret; arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; @@ -1795,7 +1789,6 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); - mem_hotplug_done(); return 0; failed_removal: @@ -1805,12 +1798,10 @@ failed_removal: memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); - -out: - mem_hotplug_done(); return ret; } +/* Must be protected by mem_hotplug_begin() */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); -- cgit v1.2.3 From b9ea25152e56365ce149b9a39637cd7a16eec556 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 14 Apr 2015 15:45:27 -0700 Subject: page_writeback: clean up mess around cancel_dirty_page() This patch replaces cancel_dirty_page() with a helper function account_page_cleaned() which only updates counters. It's called from truncate_complete_page() and from try_to_free_buffers() (hack for ext3). Page is locked in both cases, page-lock protects against concurrent dirtiers: see commit 2d6d7f982846 ("mm: protect set_page_dirty() from ongoing truncation"). Delete_from_page_cache() shouldn't be called for dirty pages, they must be handled by caller (either written or truncated). This patch treats final dirty accounting fixup at the end of __delete_from_page_cache() as a debug check and adds WARN_ON_ONCE() around it. If something removes dirty pages without proper handling that might be a bug and unwritten data might be lost. Hugetlbfs has no dirty pages accounting, ClearPageDirty() is enough here. cancel_dirty_page() in nfs_wb_page_cancel() is redundant. This is helper for nfs_invalidate_page() and it's called only in case complete invalidation. The mess was started in v2.6.20 after commits 46d2277c796f ("Clean up and make try_to_free_buffers() not race with dirty pages") and 3e67c0987d75 ("truncate: clear page dirtiness before running try_to_free_buffers()") first was reverted right in v2.6.20 in commit ecdfc9787fe5 ("Resurrect 'try_to_free_buffers()' VM hackery"), second in v2.6.25 commit a2b345642f53 ("Fix dirty page accounting leak with ext3 data=journal"). Custom fixes were introduced between these points. NFS in v2.6.23, commit 1b3b4a1a2deb ("NFS: Fix a write request leak in nfs_invalidate_page()"). Kludge in __delete_from_page_cache() in v2.6.24, commit 3a6927906f1b ("Do dirty page accounting when removing a page from the page cache"). Since v2.6.25 all of them are redundant. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Konstantin Khlebnikov Cc: Tejun Heo Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../lustre/include/linux/lustre_patchless_compat.h | 4 ++- fs/buffer.c | 4 +-- fs/hugetlbfs/inode.c | 2 +- fs/nfs/write.c | 5 --- include/linux/mm.h | 2 ++ include/linux/page-flags.h | 2 -- mm/filemap.c | 15 ++++----- mm/page-writeback.c | 19 +++++++++++ mm/truncate.c | 37 ++++------------------ 9 files changed, 41 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h index a260e99a4447..d72605864b0a 100644 --- a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h +++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h @@ -55,7 +55,9 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (PagePrivate(page)) page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); - cancel_dirty_page(page, PAGE_SIZE); + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping); + ClearPageMappedToDisk(page); ll_delete_from_page_cache(page); } diff --git a/fs/buffer.c b/fs/buffer.c index 20805db2c987..c7a5602d01ee 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3243,8 +3243,8 @@ int try_to_free_buffers(struct page *page) * to synchronise against __set_page_dirty_buffers and prevent the * dirty bit from being lost. */ - if (ret) - cancel_dirty_page(page, PAGE_CACHE_SIZE); + if (ret && TestClearPageDirty(page)) + account_page_cleaned(page, mapping); spin_unlock(&mapping->private_lock); out: if (buffers_to_free) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c274aca8e8dc..db76cec3ce21 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -319,7 +319,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, static void truncate_huge_page(struct page *page) { - cancel_dirty_page(page, /* No IO accounting for huge pages? */0); + ClearPageDirty(page); ClearPageUptodate(page); delete_from_page_cache(page); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 849ed784d6ac..759931088094 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1876,11 +1876,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) * request from the inode / page_private pointer and * release it */ nfs_inode_remove_request(req); - /* - * In case nfs_inode_remove_request has marked the - * page as being dirty - */ - cancel_dirty_page(page, PAGE_CACHE_SIZE); nfs_unlock_and_release_request(req); } diff --git a/include/linux/mm.h b/include/linux/mm.h index cccbbba12b9d..6571dd78e984 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1294,9 +1294,11 @@ int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); void account_page_dirtied(struct page *page, struct address_space *mapping); +void account_page_cleaned(struct page *page, struct address_space *mapping); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); + int get_cmdline(struct task_struct *task, char *buffer, int buflen); /* Is the vma a continuation of the stack vma above it? */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5ed7bdaf22d5..c851ff92d5b3 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -328,8 +328,6 @@ static inline void SetPageUptodate(struct page *page) CLEARPAGEFLAG(Uptodate, uptodate) -extern void cancel_dirty_page(struct page *page, unsigned int account_size); - int test_clear_page_writeback(struct page *page); int __test_set_page_writeback(struct page *page, bool keep_write); diff --git a/mm/filemap.c b/mm/filemap.c index ad7242043bdb..434dba317400 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -203,16 +203,15 @@ void __delete_from_page_cache(struct page *page, void *shadow) BUG_ON(page_mapped(page)); /* - * Some filesystems seem to re-dirty the page even after - * the VM has canceled the dirty bit (eg ext3 journaling). + * At this point page must be either written or cleaned by truncate. + * Dirty page here signals a bug and loss of unwritten data. * - * Fix it up by doing a final dirty accounting check after - * having removed the page entirely. + * This fixes dirty accounting after removing the page entirely but + * leaves PageDirty set: it has no effect for truncated page and + * anyway will be cleared before returning page into buddy allocator. */ - if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { - dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); - } + if (WARN_ON_ONCE(PageDirty(page))) + account_page_cleaned(page, mapping); } /** diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 644bcb665773..0372411f38fc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2110,6 +2110,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) } EXPORT_SYMBOL(account_page_dirtied); +/* + * Helper function for deaccounting dirty page without writeback. + * + * Doing this should *normally* only ever be done when a page + * is truncated, and is not actually mapped anywhere at all. However, + * fs/buffer.c does this when it notices that somebody has cleaned + * out all the buffers on a page without actually doing it through + * the VM. Can you say "ext3 is horribly ugly"? Thought you could. + */ +void account_page_cleaned(struct page *page, struct address_space *mapping) +{ + if (mapping_cap_account_dirty(mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); + task_io_account_cancelled_write(PAGE_CACHE_SIZE); + } +} +EXPORT_SYMBOL(account_page_cleaned); + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. diff --git a/mm/truncate.c b/mm/truncate.c index ddec5a5966d7..7a9d8a3cb143 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -92,35 +92,6 @@ void do_invalidatepage(struct page *page, unsigned int offset, (*invalidatepage)(page, offset, length); } -/* - * This cancels just the dirty bit on the kernel page itself, it - * does NOT actually remove dirty bits on any mmap's that may be - * around. It also leaves the page tagged dirty, so any sync - * activity will still find it on the dirty lists, and in particular, - * clear_page_dirty_for_io() will still look at the dirty bits in - * the VM. - * - * Doing this should *normally* only ever be done when a page - * is truncated, and is not actually mapped anywhere at all. However, - * fs/buffer.c does this when it notices that somebody has cleaned - * out all the buffers on a page without actually doing it through - * the VM. Can you say "ext3 is horribly ugly"? Tought you could. - */ -void cancel_dirty_page(struct page *page, unsigned int account_size) -{ - if (TestClearPageDirty(page)) { - struct address_space *mapping = page->mapping; - if (mapping && mapping_cap_account_dirty(mapping)) { - dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), - BDI_RECLAIMABLE); - if (account_size) - task_io_account_cancelled_write(account_size); - } - } -} -EXPORT_SYMBOL(cancel_dirty_page); - /* * If truncate cannot remove the fs-private metadata from the page, the page * becomes orphaned. It will be left on the LRU and may even be mapped into @@ -140,7 +111,13 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page)) do_invalidatepage(page, 0, PAGE_CACHE_SIZE); - cancel_dirty_page(page, PAGE_CACHE_SIZE); + /* + * Some filesystems seem to re-dirty the page even after + * the VM has canceled the dirty bit (eg ext3 journaling). + * Hence dirty accounting check is placed after invalidation. + */ + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping); ClearPageMappedToDisk(page); delete_from_page_cache(page); -- cgit v1.2.3 From d1bfcdb8ce0ea6eb6034daa7ff02548e0bc9c21b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 14 Apr 2015 15:45:30 -0700 Subject: mm: hide per-cpu lists in output of show_mem() This makes show_mem() much less verbose on huge machines. Instead of huge and almost useless dump of counters for each per-zone per-cpu lists this patch prints the sum of these counters for each zone (free_pcp) and size of per-cpu list for current cpu (local_pcp). The filter flag SHOW_MEM_PERCPU_LISTS reverts to the old verbose mode. [akpm@linux-foundation.org: update show_free_areas comment] Signed-off-by: Konstantin Khlebnikov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/page_alloc.c | 39 ++++++++++++++++++++++++++++++--------- 2 files changed, 31 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6571dd78e984..9c21b42d07bf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1126,6 +1126,7 @@ extern void pagefault_out_of_memory(void); * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ +#define SHOW_MEM_PERCPU_LISTS (0x0002u) /* per-zone per-cpu */ extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6dfa5b24cc79..eab8e2018a46 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3251,25 +3251,37 @@ static void show_migration_types(unsigned char type) * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the * memory on each free list with the exception of the first item on the list. - * Suppresses nodes that are not allowed by current's cpuset if - * SHOW_MEM_FILTER_NODES is passed. + * + * Bits in @filter: + * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's + * cpuset. + * SHOW_MEM_PERCPU_LISTS: display full per-node per-cpu pcp lists */ void show_free_areas(unsigned int filter) { + unsigned long free_pcp = 0; int cpu; struct zone *zone; for_each_populated_zone(zone) { if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; - show_node(zone); - printk("%s per-cpu:\n", zone->name); + + if (filter & SHOW_MEM_PERCPU_LISTS) { + show_node(zone); + printk("%s per-cpu:\n", zone->name); + } for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; pageset = per_cpu_ptr(zone->pageset, cpu); + free_pcp += pageset->pcp.count; + + if (!(filter & SHOW_MEM_PERCPU_LISTS)) + continue; + printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", cpu, pageset->pcp.high, pageset->pcp.batch, pageset->pcp.count); @@ -3278,11 +3290,10 @@ void show_free_areas(unsigned int filter) printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu" - " dirty:%lu writeback:%lu unstable:%lu\n" - " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" + " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" - " free_cma:%lu\n", + " free:%lu free_pcp:%lu free_cma:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_ISOLATED_ANON), @@ -3293,13 +3304,14 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), - global_page_state(NR_FREE_PAGES), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), + global_page_state(NR_FREE_PAGES), + free_pcp, global_page_state(NR_FREE_CMA_PAGES)); for_each_populated_zone(zone) { @@ -3307,6 +3319,11 @@ void show_free_areas(unsigned int filter) if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; + + free_pcp = 0; + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; + show_node(zone); printk("%s" " free:%lukB" @@ -3333,6 +3350,8 @@ void show_free_areas(unsigned int filter) " pagetables:%lukB" " unstable:%lukB" " bounce:%lukB" + " free_pcp:%lukB" + " local_pcp:%ukB" " free_cma:%lukB" " writeback_tmp:%lukB" " pages_scanned:%lu" @@ -3364,6 +3383,8 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), + K(free_pcp), + K(this_cpu_read(zone->pageset->pcp.count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), K(zone_page_state(zone, NR_PAGES_SCANNED)), -- cgit v1.2.3 From 761b06771adeeb734e9eebc6f70f916cb9e2f643 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 14 Apr 2015 15:45:32 -0700 Subject: mm: completely remove dumping per-cpu lists from show_mem() It seems nobody needs this. Signed-off-by: Konstantin Khlebnikov Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/page_alloc.c | 22 ++-------------------- 2 files changed, 2 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c21b42d07bf..6571dd78e984 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1126,7 +1126,6 @@ extern void pagefault_out_of_memory(void); * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -#define SHOW_MEM_PERCPU_LISTS (0x0002u) /* per-zone per-cpu */ extern void show_free_areas(unsigned int flags); extern bool skip_free_areas_node(unsigned int flags, int nid); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eab8e2018a46..84466a4b1b36 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3255,7 +3255,6 @@ static void show_migration_types(unsigned char type) * Bits in @filter: * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. - * SHOW_MEM_PERCPU_LISTS: display full per-node per-cpu pcp lists */ void show_free_areas(unsigned int filter) { @@ -3267,25 +3266,8 @@ void show_free_areas(unsigned int filter) if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; - if (filter & SHOW_MEM_PERCPU_LISTS) { - show_node(zone); - printk("%s per-cpu:\n", zone->name); - } - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pageset; - - pageset = per_cpu_ptr(zone->pageset, cpu); - - free_pcp += pageset->pcp.count; - - if (!(filter & SHOW_MEM_PERCPU_LISTS)) - continue; - - printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", - cpu, pageset->pcp.high, - pageset->pcp.batch, pageset->pcp.count); - } + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; } printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" -- cgit v1.2.3 From 982333683385343d8d2db9a1df69c98406f42687 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Tue, 14 Apr 2015 15:46:14 -0700 Subject: x86: expose number of page table levels on Kconfig level We would want to use number of page table level to define mm_struct. Let's expose it as CONFIG_PGTABLE_LEVELS. Signed-off-by: Kirill A. Shutemov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Tested-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 6 ++++++ arch/x86/include/asm/paravirt.h | 8 ++++---- arch/x86/include/asm/paravirt_types.h | 8 ++++---- arch/x86/include/asm/pgalloc.h | 8 ++++---- arch/x86/include/asm/pgtable-2level_types.h | 1 - arch/x86/include/asm/pgtable-3level_types.h | 2 -- arch/x86/include/asm/pgtable.h | 8 ++++---- arch/x86/include/asm/pgtable_64_types.h | 1 - arch/x86/include/asm/pgtable_types.h | 4 ++-- arch/x86/kernel/paravirt.c | 6 +++--- arch/x86/mm/pgtable.c | 14 +++++++------- arch/x86/xen/mmu.c | 14 +++++++------- include/trace/events/xen.h | 2 +- 13 files changed, 42 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index faff6934c05a..3e3aaad23414 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -277,6 +277,12 @@ config ARCH_SUPPORTS_UPROBES config FIX_EARLYCON_MEM def_bool y +config PGTABLE_LEVELS + int + default 4 if X86_64 + default 3 if X86_PAE + default 2 + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 5f6051d5d139..8957810ad7d1 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -545,7 +545,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val); } -#if PAGETABLE_LEVELS >= 3 +#if CONFIG_PGTABLE_LEVELS >= 3 static inline pmd_t __pmd(pmdval_t val) { pmdval_t ret; @@ -585,7 +585,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud) PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, val); } -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 static inline pud_t __pud(pudval_t val) { pudval_t ret; @@ -636,9 +636,9 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); } -#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ -#endif /* PAGETABLE_LEVELS >= 3 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ #ifdef CONFIG_X86_PAE /* Special-case pte-setting operations for PAE, which can't update a diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7549b8b369e4..f7b0b5c112f2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -294,7 +294,7 @@ struct pv_mmu_ops { struct paravirt_callee_save pgd_val; struct paravirt_callee_save make_pgd; -#if PAGETABLE_LEVELS >= 3 +#if CONFIG_PGTABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); void (*pte_clear)(struct mm_struct *mm, unsigned long addr, @@ -308,13 +308,13 @@ struct pv_mmu_ops { struct paravirt_callee_save pmd_val; struct paravirt_callee_save make_pmd; -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 struct paravirt_callee_save pud_val; struct paravirt_callee_save make_pud; void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); -#endif /* PAGETABLE_LEVELS == 4 */ -#endif /* PAGETABLE_LEVELS >= 3 */ +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ struct pv_lazy_ops lazy_mode; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index c4412e972bbd..bf7f8b55b0f9 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -77,7 +77,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #define pmd_pgtable(pmd) pmd_page(pmd) -#if PAGETABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { struct page *page; @@ -116,7 +116,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) } #endif /* CONFIG_X86_PAE */ -#if PAGETABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) { paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); @@ -142,7 +142,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, ___pud_free_tlb(tlb, pud); } -#endif /* PAGETABLE_LEVELS > 3 */ -#endif /* PAGETABLE_LEVELS > 2 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* _ASM_X86_PGALLOC_H */ diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index daacc23e3fb9..392576433e77 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -17,7 +17,6 @@ typedef union { #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 -#define PAGETABLE_LEVELS 2 /* * traditional i386 two-level paging structure: diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 1bd5876c8649..bcc89625ebe5 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -24,8 +24,6 @@ typedef union { #define SHARED_KERNEL_PMD 1 #endif -#define PAGETABLE_LEVELS 3 - /* * PGDIR_SHIFT determines what a top-level page table entry can map */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a0c35bf6cb92..fe57e7a98839 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -551,7 +551,7 @@ static inline unsigned long pages_to_mb(unsigned long npg) return npg >> (20 - PAGE_SHIFT); } -#if PAGETABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 static inline int pud_none(pud_t pud) { return native_pud_val(pud) == 0; @@ -594,9 +594,9 @@ static inline int pud_large(pud_t pud) { return 0; } -#endif /* PAGETABLE_LEVELS > 2 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ -#if PAGETABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 static inline int pgd_present(pgd_t pgd) { return pgd_flags(pgd) & _PAGE_PRESENT; @@ -633,7 +633,7 @@ static inline int pgd_none(pgd_t pgd) { return !native_pgd_val(pgd); } -#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 602b6028c5b6..e6844dfb4471 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -20,7 +20,6 @@ typedef struct { pteval_t pte; } pte_t; #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 -#define PAGETABLE_LEVELS 4 /* * PGDIR_SHIFT determines what a top-level page table entry can map diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 8c7c10802e9c..78f0c8cbe316 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -234,7 +234,7 @@ static inline pgdval_t pgd_flags(pgd_t pgd) return native_pgd_val(pgd) & PTE_FLAGS_MASK; } -#if PAGETABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; static inline pud_t native_make_pud(pmdval_t val) @@ -255,7 +255,7 @@ static inline pudval_t native_pud_val(pud_t pud) } #endif -#if PAGETABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 typedef struct { pmdval_t pmd; } pmd_t; static inline pmd_t native_make_pmd(pmdval_t val) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 548d25f00c90..c614dd492f5f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = { .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, -#if PAGETABLE_LEVELS >= 3 +#if CONFIG_PGTABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, .pte_clear = native_pte_clear, @@ -454,13 +454,13 @@ struct pv_mmu_ops pv_mmu_ops = { .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 .pud_val = PTE_IDENT, .make_pud = PTE_IDENT, .set_pgd = native_set_pgd, #endif -#endif /* PAGETABLE_LEVELS >= 3 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ .pte_val = PTE_IDENT, .pgd_val = PTE_IDENT, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5a7e5252c878..b28edfecbdfe 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -58,7 +58,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) tlb_remove_page(tlb, pte); } -#if PAGETABLE_LEVELS > 2 +#if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { struct page *page = virt_to_page(pmd); @@ -74,14 +74,14 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) tlb_remove_page(tlb, page); } -#if PAGETABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pud)); } -#endif /* PAGETABLE_LEVELS > 3 */ -#endif /* PAGETABLE_LEVELS > 2 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ static inline void pgd_list_add(pgd_t *pgd) { @@ -117,9 +117,9 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the references from swapper_pg_dir. */ - if (PAGETABLE_LEVELS == 2 || - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || - PAGETABLE_LEVELS == 4) { + if (CONFIG_PGTABLE_LEVELS == 2 || + (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || + CONFIG_PGTABLE_LEVELS == 4) { clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index adca9e2b6553..65083ad63b6f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -502,7 +502,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 __visible pudval_t xen_pud_val(pud_t pud) { return pte_mfn_to_pfn(pud.pud); @@ -589,7 +589,7 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) xen_mc_issue(PARAVIRT_LAZY_MMU); } -#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ /* * (Yet another) pagetable walker. This one is intended for pinning a @@ -1628,7 +1628,7 @@ static void xen_release_pmd(unsigned long pfn) xen_release_ptpage(pfn, PT_PMD); } -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PUD); @@ -2046,7 +2046,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 pv_mmu_ops.set_pgd = xen_set_pgd; #endif @@ -2056,7 +2056,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.alloc_pmd = xen_alloc_pmd; pv_mmu_ops.release_pte = xen_release_pte; pv_mmu_ops.release_pmd = xen_release_pmd; -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 pv_mmu_ops.alloc_pud = xen_alloc_pud; pv_mmu_ops.release_pud = xen_release_pud; #endif @@ -2122,14 +2122,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), -#if PAGETABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS == 4 .pud_val = PV_CALLEE_SAVE(xen_pud_val), .make_pud = PV_CALLEE_SAVE(xen_make_pud), .set_pgd = xen_set_pgd_hyper, .alloc_pud = xen_alloc_pmd_init, .release_pud = xen_release_pmd_init, -#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* CONFIG_PGTABLE_LEVELS == 4 */ .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index d06b6da5c1e3..bce990f5a35d 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h @@ -224,7 +224,7 @@ TRACE_EVENT(xen_mmu_pmd_clear, TP_printk("pmdp %p", __entry->pmdp) ); -#if PAGETABLE_LEVELS >= 4 +#if CONFIG_PGTABLE_LEVELS >= 4 TRACE_EVENT(xen_mmu_set_pud, TP_PROTO(pud_t *pudp, pud_t pudval), -- cgit v1.2.3 From 235a8f0286d3de5754322e9b4abd472ed4d57209 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Tue, 14 Apr 2015 15:46:17 -0700 Subject: mm: define default PGTABLE_LEVELS to two By this time all architectures which support more than two page table levels should be covered. This patch add default definiton of PGTABLE_LEVELS equal 2. We also add assert to detect inconsistence between CONFIG_PGTABLE_LEVELS and __PAGETABLE_PMD_FOLDED/__PAGETABLE_PUD_FOLDED. Signed-off-by: Kirill A. Shutemov Tested-by: Guenter Roeck Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Jeff Dike Cc: Kirill A. Shutemov Cc: Koichi Yasutake Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 4 ++++ include/asm-generic/pgtable.h | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 05d7a8a458d5..a9c95d36ba70 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -484,6 +484,10 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK This spares a stack switch and improves cache usage on softirq processing. +config PGTABLE_LEVELS + int + default 2 + # # ABI hall of shame # diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 4d46085c1b90..1f9f5da6828f 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -7,6 +7,11 @@ #include #include +#if 4 - defined(__PAGETABLE_PUD_FOLDED) - defined(__PAGETABLE_PMD_FOLDED) != \ + CONFIG_PGTABLE_LEVELS +#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{PUD,PMD}_FOLDED +#endif + /* * On almost all architectures and configurations, 0 can be used as the * upper ceiling to free_pgtables(): on many architectures it has the same -- cgit v1.2.3 From 5a3fbef325e872f831cf13171c7032915bb1916d Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Tue, 14 Apr 2015 15:46:21 -0700 Subject: mm: do not add nr_pmds into mm_struct if PMD is folded CONFIG_PGTABLE_LEVELS is now available on every architecture and we can use it to check if we need to add nr_pmds into mm_struct. Signed-off-by: Kirill A. Shutemov Tested-by: Guenter Roeck Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Metcalf Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Jeff Dike Cc: Kirill A. Shutemov Cc: Koichi Yasutake Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Weinberger Cc: Russell King Cc: Thomas Gleixner Cc: Tony Luck Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 199a03aab8dc..590630eb59ba 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -364,7 +364,9 @@ struct mm_struct { atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ atomic_long_t nr_ptes; /* PTE page table pages */ +#if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ +#endif int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some counters */ -- cgit v1.2.3 From 9de1626290eaa7d921413ddc83544bc3bae27283 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 14 Apr 2015 15:46:42 -0700 Subject: cleancache: zap uuid arg of cleancache_init_shared_fs Use super_block->s_uuid instead. Every shared filesystem using cleancache must now initialize super_block->s_uuid before calling cleancache_init_shared_fs. The only one on the tree, ocfs2, already meets this requirement. Signed-off-by: Vladimir Davydov Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Cc: Mark Fasheh Cc: Joel Becker Cc: Stefan Hengelein Cc: Florian Schmaus Cc: Andor Daam Cc: Dan Magenheimer Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 2 +- include/linux/cleancache.h | 6 +++--- mm/cleancache.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a811a95cfd5f..837ddce4b659 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2336,7 +2336,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } - cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); + cleancache_init_shared_fs(sb); bail: return status; diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index 4ce9056b31a8..29657d1c83fb 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -36,7 +36,7 @@ struct cleancache_ops { extern struct cleancache_ops * cleancache_register_ops(struct cleancache_ops *ops); extern void __cleancache_init_fs(struct super_block *); -extern void __cleancache_init_shared_fs(char *, struct super_block *); +extern void __cleancache_init_shared_fs(struct super_block *); extern int __cleancache_get_page(struct page *); extern void __cleancache_put_page(struct page *); extern void __cleancache_invalidate_page(struct address_space *, struct page *); @@ -78,10 +78,10 @@ static inline void cleancache_init_fs(struct super_block *sb) __cleancache_init_fs(sb); } -static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb) +static inline void cleancache_init_shared_fs(struct super_block *sb) { if (cleancache_enabled) - __cleancache_init_shared_fs(uuid, sb); + __cleancache_init_shared_fs(sb); } static inline int cleancache_get_page(struct page *page) diff --git a/mm/cleancache.c b/mm/cleancache.c index 053bcd8f12fb..532495f2e4f4 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -155,7 +155,7 @@ void __cleancache_init_fs(struct super_block *sb) EXPORT_SYMBOL(__cleancache_init_fs); /* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) +void __cleancache_init_shared_fs(struct super_block *sb) { int i; @@ -163,10 +163,10 @@ void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { if (shared_fs_poolid_map[i] == FS_UNKNOWN) { sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; - uuids[i] = uuid; + uuids[i] = sb->s_uuid; if (cleancache_ops) shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs - (uuid, PAGE_SIZE); + (sb->s_uuid, PAGE_SIZE); else shared_fs_poolid_map[i] = FS_NO_BACKEND; break; -- cgit v1.2.3 From 53d85c98566535d7bc69470f008d7dcb09a0fec9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 14 Apr 2015 15:46:45 -0700 Subject: cleancache: forbid overriding cleancache_ops Currently, cleancache_register_ops returns the previous value of cleancache_ops to allow chaining. However, chaining, as it is implemented now, is extremely dangerous due to possible pool id collisions. Suppose, a new cleancache driver is registered after the previous one assigned an id to a super block. If the new driver assigns the same id to another super block, which is perfectly possible, we will have two different filesystems using the same id. No matter if the new driver implements chaining or not, we are likely to get data corruption with such a configuration eventually. This patch therefore disables the ability to override cleancache_ops altogether as potentially dangerous. If there is already cleancache driver registered, all further calls to cleancache_register_ops will return EBUSY. Since no user of cleancache implements chaining, we only need to make minor changes to the code outside the cleancache core. Signed-off-by: Vladimir Davydov Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Cc: Mark Fasheh Cc: Joel Becker Cc: Stefan Hengelein Cc: Florian Schmaus Cc: Andor Daam Cc: Dan Magenheimer Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/cleancache.txt | 4 +--- drivers/xen/tmem.c | 16 +++++++++------- include/linux/cleancache.h | 3 +-- mm/cleancache.c | 12 +++++++----- 4 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt index 01d76282444e..e4b49df7a048 100644 --- a/Documentation/vm/cleancache.txt +++ b/Documentation/vm/cleancache.txt @@ -28,9 +28,7 @@ IMPLEMENTATION OVERVIEW A cleancache "backend" that provides transcendent memory registers itself to the kernel's cleancache "frontend" by calling cleancache_register_ops, passing a pointer to a cleancache_ops structure with funcs set appropriately. -Note that cleancache_register_ops returns the previous settings so that -chaining can be performed if desired. The functions provided must conform to -certain semantics as follows: +The functions provided must conform to certain semantics as follows: Most important, cleancache is "ephemeral". Pages which are copied into cleancache have an indefinite lifetime which is completely unknowable diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index 8a65423bc696..c4211a31612d 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c @@ -397,13 +397,15 @@ static int __init xen_tmem_init(void) #ifdef CONFIG_CLEANCACHE BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); if (tmem_enabled && cleancache) { - char *s = ""; - struct cleancache_ops *old_ops = - cleancache_register_ops(&tmem_cleancache_ops); - if (old_ops) - s = " (WARNING: cleancache_ops overridden)"; - pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n", - s); + int err; + + err = cleancache_register_ops(&tmem_cleancache_ops); + if (err) + pr_warn("xen-tmem: failed to enable cleancache: %d\n", + err); + else + pr_info("cleancache enabled, RAM provided by " + "Xen Transcendent Memory\n"); } #endif #ifdef CONFIG_XEN_SELFBALLOONING diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index 29657d1c83fb..b23611f43cfb 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -33,8 +33,7 @@ struct cleancache_ops { void (*invalidate_fs)(int); }; -extern struct cleancache_ops * - cleancache_register_ops(struct cleancache_ops *ops); +extern int cleancache_register_ops(struct cleancache_ops *ops); extern void __cleancache_init_fs(struct super_block *); extern void __cleancache_init_shared_fs(struct super_block *); extern int __cleancache_get_page(struct page *); diff --git a/mm/cleancache.c b/mm/cleancache.c index 532495f2e4f4..aa10f9a3bc88 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -106,15 +106,17 @@ static DEFINE_MUTEX(poolid_mutex); */ /* - * Register operations for cleancache, returning previous thus allowing - * detection of multiple backends and possible nesting. + * Register operations for cleancache. Returns 0 on success. */ -struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) +int cleancache_register_ops(struct cleancache_ops *ops) { - struct cleancache_ops *old = cleancache_ops; int i; mutex_lock(&poolid_mutex); + if (cleancache_ops) { + mutex_unlock(&poolid_mutex); + return -EBUSY; + } for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { if (fs_poolid_map[i] == FS_NO_BACKEND) fs_poolid_map[i] = ops->init_fs(PAGE_SIZE); @@ -130,7 +132,7 @@ struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) barrier(); cleancache_ops = ops; mutex_unlock(&poolid_mutex); - return old; + return 0; } EXPORT_SYMBOL(cleancache_register_ops); -- cgit v1.2.3 From 3cb29d11174f29b76addcba4374884b14f8ea4b1 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 14 Apr 2015 15:46:48 -0700 Subject: cleancache: remove limit on the number of cleancache enabled filesystems The limit equals 32 and is imposed by the number of entries in the fs_poolid_map and shared_fs_poolid_map. Nowadays it is insufficient, because with containers on board a Linux host can have hundreds of active fs mounts. These maps were introduced by commit 49a9ab815acb8 ("mm: cleancache: lazy initialization to allow tmem backends to build/run as modules") in order to allow compiling cleancache drivers as modules. Real pool ids are stored in these maps while super_block->cleancache_poolid points to an entry in the map, so that on cleancache registration we can walk over all (if there are <= 32 of them, of course) cleancache-enabled super blocks and assign real pool ids. Actually, there is absolutely no need in these maps, because we can iterate over all super blocks immediately using iterate_supers. This is not racy, because cleancache_init_ops is called from mount_fs with super_block->s_umount held for writing, while iterate_supers takes this semaphore for reading, so if we call iterate_supers after setting cleancache_ops, all super blocks that had been created before cleancache_register_ops was called will be assigned pool ids by the action function of iterate_supers while all newer super blocks will receive it in cleancache_init_fs. This patch therefore removes the maps and hence the artificial limit on the number of cleancache enabled filesystems. Signed-off-by: Vladimir Davydov Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Cc: Mark Fasheh Cc: Joel Becker Cc: Stefan Hengelein Cc: Florian Schmaus Cc: Andor Daam Cc: Dan Magenheimer Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 2 +- include/linux/cleancache.h | 4 + mm/cleancache.c | 270 +++++++++++++++------------------------------ 3 files changed, 94 insertions(+), 182 deletions(-) (limited to 'include') diff --git a/fs/super.c b/fs/super.c index 2b7dc90ccdbb..928c20f47af9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -224,7 +224,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; - s->cleancache_poolid = -1; + s->cleancache_poolid = CLEANCACHE_NO_POOL; s->s_shrink.seeks = DEFAULT_SEEKS; s->s_shrink.scan_objects = super_cache_scan; diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index b23611f43cfb..bda5ec0b4b4d 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -5,6 +5,10 @@ #include #include +#define CLEANCACHE_NO_POOL -1 +#define CLEANCACHE_NO_BACKEND -2 +#define CLEANCACHE_NO_BACKEND_SHARED -3 + #define CLEANCACHE_KEY_MAX 6 /* diff --git a/mm/cleancache.c b/mm/cleancache.c index aa10f9a3bc88..8fc50811119b 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -19,7 +19,7 @@ #include /* - * cleancache_ops is set by cleancache_ops_register to contain the pointers + * cleancache_ops is set by cleancache_register_ops to contain the pointers * to the cleancache "backend" implementation functions. */ static struct cleancache_ops *cleancache_ops __read_mostly; @@ -34,104 +34,78 @@ static u64 cleancache_failed_gets; static u64 cleancache_puts; static u64 cleancache_invalidates; -/* - * When no backend is registered all calls to init_fs and init_shared_fs - * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or - * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array - * [shared_|]fs_poolid_map) are given to the respective super block - * (sb->cleancache_poolid) and no tmem_pools are created. When a backend - * registers with cleancache the previous calls to init_fs and init_shared_fs - * are executed to create tmem_pools and set the respective poolids. While no - * backend is registered all "puts", "gets" and "flushes" are ignored or failed. - */ -#define MAX_INITIALIZABLE_FS 32 -#define FAKE_FS_POOLID_OFFSET 1000 -#define FAKE_SHARED_FS_POOLID_OFFSET 2000 - -#define FS_NO_BACKEND (-1) -#define FS_UNKNOWN (-2) -static int fs_poolid_map[MAX_INITIALIZABLE_FS]; -static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS]; -static char *uuids[MAX_INITIALIZABLE_FS]; -/* - * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads - * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple - * threads calling mount (and ending up in __cleancache_init_[shared|]fs). - */ -static DEFINE_MUTEX(poolid_mutex); -/* - * When set to false (default) all calls to the cleancache functions, except - * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded - * by the if (!cleancache_ops) return. This means multiple threads (from - * different filesystems) will be checking cleancache_ops. The usage of a - * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are - * OK if the time between the backend's have been initialized (and - * cleancache_ops has been set to not NULL) and when the filesystems start - * actually calling the backends. The inverse (when unloading) is obviously - * not good - but this shim does not do that (yet). - */ - -/* - * The backends and filesystems work all asynchronously. This is b/c the - * backends can be built as modules. - * The usual sequence of events is: - * a) mount / -> __cleancache_init_fs is called. We set the - * [shared_|]fs_poolid_map and uuids for. - * - * b). user does I/Os -> we call the rest of __cleancache_* functions - * which return immediately as cleancache_ops is false. - * - * c). modprobe zcache -> cleancache_register_ops. We init the backend - * and set cleancache_ops to true, and for any fs_poolid_map - * (which is set by __cleancache_init_fs) we initialize the poolid. - * - * d). user does I/Os -> now that cleancache_ops is true all the - * __cleancache_* functions can call the backend. They all check - * that fs_poolid_map is valid and if so invoke the backend. - * - * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is - * reset (which is the second check in the __cleancache_* ops - * to call the backend). - * - * The sequence of event could also be c), followed by a), and d). and e). The - * c) would not happen anymore. There is also the chance of c), and one thread - * doing a) + d), and another doing e). For that case we depend on the - * filesystem calling __cleancache_invalidate_fs in the proper sequence (so - * that it handles all I/Os before it invalidates the fs (which is last part - * of unmounting process). - * - * Note: The acute reader will notice that there is no "rmmod zcache" case. - * This is b/c the functionality for that is not yet implemented and when - * done, will require some extra locking not yet devised. - */ +static void cleancache_register_ops_sb(struct super_block *sb, void *unused) +{ + switch (sb->cleancache_poolid) { + case CLEANCACHE_NO_BACKEND: + __cleancache_init_fs(sb); + break; + case CLEANCACHE_NO_BACKEND_SHARED: + __cleancache_init_shared_fs(sb); + break; + } +} /* * Register operations for cleancache. Returns 0 on success. */ int cleancache_register_ops(struct cleancache_ops *ops) { - int i; - - mutex_lock(&poolid_mutex); - if (cleancache_ops) { - mutex_unlock(&poolid_mutex); + if (cmpxchg(&cleancache_ops, NULL, ops)) return -EBUSY; - } - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (fs_poolid_map[i] == FS_NO_BACKEND) - fs_poolid_map[i] = ops->init_fs(PAGE_SIZE); - if (shared_fs_poolid_map[i] == FS_NO_BACKEND) - shared_fs_poolid_map[i] = ops->init_shared_fs - (uuids[i], PAGE_SIZE); - } + /* - * We MUST set cleancache_ops _after_ we have called the backends - * init_fs or init_shared_fs functions. Otherwise the compiler might - * re-order where cleancache_ops is set in this function. + * A cleancache backend can be built as a module and hence loaded after + * a cleancache enabled filesystem has called cleancache_init_fs. To + * handle such a scenario, here we call ->init_fs or ->init_shared_fs + * for each active super block. To differentiate between local and + * shared filesystems, we temporarily initialize sb->cleancache_poolid + * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED + * respectively in case there is no backend registered at the time + * cleancache_init_fs or cleancache_init_shared_fs is called. + * + * Since filesystems can be mounted concurrently with cleancache + * backend registration, we have to be careful to guarantee that all + * cleancache enabled filesystems that has been mounted by the time + * cleancache_register_ops is called has got and all mounted later will + * get cleancache_poolid. This is assured by the following statements + * tied together: + * + * a) iterate_supers skips only those super blocks that has started + * ->kill_sb + * + * b) if iterate_supers encounters a super block that has not finished + * ->mount yet, it waits until it is finished + * + * c) cleancache_init_fs is called from ->mount and + * cleancache_invalidate_fs is called from ->kill_sb + * + * d) we call iterate_supers after cleancache_ops has been set + * + * From a) it follows that if iterate_supers skips a super block, then + * either the super block is already dead, in which case we do not need + * to bother initializing cleancache for it, or it was mounted after we + * initiated iterate_supers. In the latter case, it must have seen + * cleancache_ops set according to d) and initialized cleancache from + * ->mount by itself according to c). This proves that we call + * ->init_fs at least once for each active super block. + * + * From b) and c) it follows that if iterate_supers encounters a super + * block that has already started ->init_fs, it will wait until ->mount + * and hence ->init_fs has finished, then check cleancache_poolid, see + * that it has already been set and therefore do nothing. This proves + * that we call ->init_fs no more than once for each super block. + * + * Combined together, the last two paragraphs prove the function + * correctness. + * + * Note that various cleancache callbacks may proceed before this + * function is called or even concurrently with it, but since + * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop + * until the corresponding ->init_fs has been actually called and + * cleancache_ops has been set. */ - barrier(); - cleancache_ops = ops; - mutex_unlock(&poolid_mutex); + iterate_supers(cleancache_register_ops_sb, NULL); return 0; } EXPORT_SYMBOL(cleancache_register_ops); @@ -139,42 +113,28 @@ EXPORT_SYMBOL(cleancache_register_ops); /* Called by a cleancache-enabled filesystem at time of mount */ void __cleancache_init_fs(struct super_block *sb) { - int i; + int pool_id = CLEANCACHE_NO_BACKEND; - mutex_lock(&poolid_mutex); - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (fs_poolid_map[i] == FS_UNKNOWN) { - sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET; - if (cleancache_ops) - fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE); - else - fs_poolid_map[i] = FS_NO_BACKEND; - break; - } + if (cleancache_ops) { + pool_id = cleancache_ops->init_fs(PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; } - mutex_unlock(&poolid_mutex); + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_fs); /* Called by a cleancache-enabled clustered filesystem at time of mount */ void __cleancache_init_shared_fs(struct super_block *sb) { - int i; + int pool_id = CLEANCACHE_NO_BACKEND_SHARED; - mutex_lock(&poolid_mutex); - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (shared_fs_poolid_map[i] == FS_UNKNOWN) { - sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; - uuids[i] = sb->s_uuid; - if (cleancache_ops) - shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs - (sb->s_uuid, PAGE_SIZE); - else - shared_fs_poolid_map[i] = FS_NO_BACKEND; - break; - } + if (cleancache_ops) { + pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; } - mutex_unlock(&poolid_mutex); + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_shared_fs); @@ -203,19 +163,6 @@ static int cleancache_get_key(struct inode *inode, return 0; } -/* - * Returns a pool_id that is associated with a given fake poolid. - */ -static int get_poolid_from_fake(int fake_pool_id) -{ - if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) - return shared_fs_poolid_map[fake_pool_id - - FAKE_SHARED_FS_POOLID_OFFSET]; - else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) - return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET]; - return FS_NO_BACKEND; -} - /* * "Get" data from cleancache associated with the poolid/inode/index * that were specified when the data was put to cleanache and, if @@ -231,7 +178,6 @@ int __cleancache_get_page(struct page *page) { int ret = -1; int pool_id; - int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) { @@ -240,17 +186,14 @@ int __cleancache_get_page(struct page *page) } VM_BUG_ON_PAGE(!PageLocked(page), page); - fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (fake_pool_id < 0) + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id < 0) goto out; - pool_id = get_poolid_from_fake(fake_pool_id); if (cleancache_get_key(page->mapping->host, &key) < 0) goto out; - if (pool_id >= 0) - ret = cleancache_ops->get_page(pool_id, - key, page->index, page); + ret = cleancache_ops->get_page(pool_id, key, page->index, page); if (ret == 0) cleancache_succ_gets++; else @@ -273,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page); void __cleancache_put_page(struct page *page) { int pool_id; - int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) { @@ -282,12 +224,7 @@ void __cleancache_put_page(struct page *page) } VM_BUG_ON_PAGE(!PageLocked(page), page); - fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (fake_pool_id < 0) - return; - - pool_id = get_poolid_from_fake(fake_pool_id); - + pool_id = page->mapping->host->i_sb->cleancache_poolid; if (pool_id >= 0 && cleancache_get_key(page->mapping->host, &key) >= 0) { cleancache_ops->put_page(pool_id, key, page->index, page); @@ -308,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping, struct page *page) { /* careful... page->mapping is NULL sometimes when this is called */ - int pool_id; - int fake_pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) return; - if (fake_pool_id >= 0) { - pool_id = get_poolid_from_fake(fake_pool_id); - if (pool_id < 0) - return; - + if (pool_id >= 0) { VM_BUG_ON_PAGE(!PageLocked(page), page); if (cleancache_get_key(mapping->host, &key) >= 0) { cleancache_ops->invalidate_page(pool_id, @@ -341,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page); */ void __cleancache_invalidate_inode(struct address_space *mapping) { - int pool_id; - int fake_pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) return; - if (fake_pool_id < 0) - return; - - pool_id = get_poolid_from_fake(fake_pool_id); - if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) cleancache_ops->invalidate_inode(pool_id, key); } @@ -365,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode); */ void __cleancache_invalidate_fs(struct super_block *sb) { - int index; - int fake_pool_id = sb->cleancache_poolid; - int old_poolid = fake_pool_id; + int pool_id; - mutex_lock(&poolid_mutex); - if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) { - index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET; - old_poolid = shared_fs_poolid_map[index]; - shared_fs_poolid_map[index] = FS_UNKNOWN; - uuids[index] = NULL; - } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) { - index = fake_pool_id - FAKE_FS_POOLID_OFFSET; - old_poolid = fs_poolid_map[index]; - fs_poolid_map[index] = FS_UNKNOWN; - } - sb->cleancache_poolid = -1; - if (cleancache_ops) - cleancache_ops->invalidate_fs(old_poolid); - mutex_unlock(&poolid_mutex); + pool_id = sb->cleancache_poolid; + sb->cleancache_poolid = CLEANCACHE_NO_POOL; + + if (cleancache_ops && pool_id >= 0) + cleancache_ops->invalidate_fs(pool_id); } EXPORT_SYMBOL(__cleancache_invalidate_fs); static int __init init_cleancache(void) { - int i; - #ifdef CONFIG_DEBUG_FS struct dentry *root = debugfs_create_dir("cleancache", NULL); if (root == NULL) @@ -402,10 +314,6 @@ static int __init init_cleancache(void) debugfs_create_u64("invalidates", S_IRUGO, root, &cleancache_invalidates); #endif - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - fs_poolid_map[i] = FS_UNKNOWN; - shared_fs_poolid_map[i] = FS_UNKNOWN; - } return 0; } module_init(init_cleancache) -- cgit v1.2.3 From 4167e9b2cf10f8a4bcda0c713ddc8bb0a18e8187 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:46:55 -0700 Subject: mm: remove GFP_THISNODE NOTE: this is not about __GFP_THISNODE, this is only about GFP_THISNODE. GFP_THISNODE is a secret combination of gfp bits that have different behavior than expected. It is a combination of __GFP_THISNODE, __GFP_NORETRY, and __GFP_NOWARN and is special-cased in the page allocator slowpath to fail without trying reclaim even though it may be used in combination with __GFP_WAIT. An example of the problem this creates: commit e97ca8e5b864 ("mm: fix GFP_THISNODE callers and clarify") fixed up many users of GFP_THISNODE that really just wanted __GFP_THISNODE. The problem doesn't end there, however, because even it was a no-op for alloc_misplaced_dst_page(), which also sets __GFP_NORETRY and __GFP_NOWARN, and migrate_misplaced_transhuge_page(), where __GFP_NORETRY and __GFP_NOWAIT is set in GFP_TRANSHUGE. Converting GFP_THISNODE to __GFP_THISNODE is a no-op in these cases since the page allocator special-cases __GFP_THISNODE && __GFP_NORETRY && __GFP_NOWARN. It's time to just remove GFP_THISNODE entirely. We leave __GFP_THISNODE to restrict an allocation to a local node, but remove GFP_THISNODE and its obscurity. Instead, we require that a caller clear __GFP_WAIT if it wants to avoid reclaim. This allows the aforementioned functions to actually reclaim as they should. It also enables any future callers that want to do __GFP_THISNODE but also __GFP_NORETRY && __GFP_NOWARN to reclaim. The rule is simple: if you don't want to reclaim, then don't set __GFP_WAIT. Aside: ovs_flow_stats_update() really wants to avoid reclaim as well, so it is unchanged. Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Cc: Christoph Lameter Acked-by: Pekka Enberg Cc: Joonsoo Kim Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Pravin Shelar Cc: Jarno Rajahalme Cc: Li Zefan Cc: Greg Thelen Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 10 ---------- mm/page_alloc.c | 22 ++++++---------------- mm/slab.c | 22 ++++++++++++++++++---- net/openvswitch/flow.c | 4 +++- 4 files changed, 27 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 51bd1e72a917..4423a0f8eabe 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -117,16 +117,6 @@ struct vm_area_struct; __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ __GFP_NO_KSWAPD) -/* - * GFP_THISNODE does not perform any reclaim, you most likely want to - * use __GFP_THISNODE to allocate from a given node without fallback! - */ -#ifdef CONFIG_NUMA -#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) -#else -#define GFP_THISNODE ((__force gfp_t)0) -#endif - /* This mask makes up all the page movable related flags */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 84466a4b1b36..86af1a96a6dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2412,13 +2412,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, *did_some_progress = 1; goto out; } - /* - * GFP_THISNODE contains __GFP_NORETRY and we never hit this. - * Sanity check for bare calls of __GFP_THISNODE, not real OOM. - * The caller should handle page allocation failure by itself if - * it specifies __GFP_THISNODE. - * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. - */ + /* The OOM killer may not free memory on a specific node */ if (gfp_mask & __GFP_THISNODE) goto out; } @@ -2673,15 +2667,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, } /* - * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and - * __GFP_NOWARN set) should not cause reclaim since the subsystem - * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim - * using a larger set of nodes after it has established that the - * allowed per node queues are empty and that nodes are - * over allocated. + * If this allocation cannot block and it is for a specific node, then + * fail early. There's no need to wakeup kswapd or retry for a + * speculative node-specific allocation. */ - if (IS_ENABLED(CONFIG_NUMA) && - (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) goto nopage; retry: @@ -2874,7 +2864,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, /* * Check the zones suitable for the gfp_mask contain at least one * valid zone. It's possible to have an empty zonelist as a result - * of GFP_THISNODE and a memoryless node + * of __GFP_THISNODE and a memoryless node */ if (unlikely(!zonelist->_zonerefs->zone)) return NULL; diff --git a/mm/slab.c b/mm/slab.c index c4b89eaf4c96..7eb38dd1cefa 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -857,6 +857,11 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, return NULL; } +static inline gfp_t gfp_exact_node(gfp_t flags) +{ + return flags; +} + #else /* CONFIG_NUMA */ static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); @@ -1023,6 +1028,15 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) return __cache_free_alien(cachep, objp, node, page_node); } + +/* + * Construct gfp mask to allocate from a specific node but do not invoke reclaim + * or warn about failures. + */ +static inline gfp_t gfp_exact_node(gfp_t flags) +{ + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT; +} #endif /* @@ -2825,7 +2839,7 @@ alloc_done: if (unlikely(!ac->avail)) { int x; force_grow: - x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); + x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); @@ -3019,7 +3033,7 @@ retry: get_node(cache, nid) && get_node(cache, nid)->free_objects) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + gfp_exact_node(flags), nid); if (obj) break; } @@ -3047,7 +3061,7 @@ retry: nid = page_to_nid(page); if (cache_grow(cache, flags, nid, page)) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + gfp_exact_node(flags), nid); if (!obj) /* * Another processor may allocate the @@ -3118,7 +3132,7 @@ retry: must_grow: spin_unlock(&n->list_lock); - x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); + x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL); if (x) goto retry; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 50ec42f170a0..2dacc7b5af23 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -100,7 +100,9 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, new_stats = kmem_cache_alloc_node(flow_stats_cache, - GFP_THISNODE | + GFP_NOWAIT | + __GFP_THISNODE | + __GFP_NOWARN | __GFP_NOMEMALLOC, node); if (likely(new_stats)) { -- cgit v1.2.3 From ac173824959adeb489f9fcf88858774c4535a241 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 14 Apr 2015 15:47:04 -0700 Subject: mm: cma: constify and use correct signness in mm/cma.c Constify function parameters and use correct signness where needed. Signed-off-by: Sasha Levin Cc: Michal Nazarewicz Cc: Marek Szyprowski Cc: Joonsoo Kim Cc: Laurent Pinchart Acked-by: Gregory Fong Cc: Pintu Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cma.h | 12 ++++++------ mm/cma.c | 24 ++++++++++++++---------- 2 files changed, 20 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/cma.h b/include/linux/cma.h index 9384ba66e975..f7ef093ec49a 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -16,16 +16,16 @@ struct cma; extern unsigned long totalcma_pages; -extern phys_addr_t cma_get_base(struct cma *cma); -extern unsigned long cma_get_size(struct cma *cma); +extern phys_addr_t cma_get_base(const struct cma *cma); +extern unsigned long cma_get_size(const struct cma *cma); extern int __init cma_declare_contiguous(phys_addr_t base, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, bool fixed, struct cma **res_cma); -extern int cma_init_reserved_mem(phys_addr_t base, - phys_addr_t size, int order_per_bit, +extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, + unsigned int order_per_bit, struct cma **res_cma); -extern struct page *cma_alloc(struct cma *cma, int count, unsigned int align); -extern bool cma_release(struct cma *cma, struct page *pages, int count); +extern struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align); +extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); #endif diff --git a/mm/cma.c b/mm/cma.c index 7f6045420925..47203faaf65e 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -41,17 +41,18 @@ struct cma cma_areas[MAX_CMA_AREAS]; unsigned cma_area_count; static DEFINE_MUTEX(cma_mutex); -phys_addr_t cma_get_base(struct cma *cma) +phys_addr_t cma_get_base(const struct cma *cma) { return PFN_PHYS(cma->base_pfn); } -unsigned long cma_get_size(struct cma *cma) +unsigned long cma_get_size(const struct cma *cma) { return cma->count << PAGE_SHIFT; } -static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) +static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, + int align_order) { if (align_order <= cma->order_per_bit) return 0; @@ -62,7 +63,8 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) * Find a PFN aligned to the specified order and return an offset represented in * order_per_bits. */ -static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) +static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, + int align_order) { if (align_order <= cma->order_per_bit) return 0; @@ -71,13 +73,14 @@ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) - cma->base_pfn) >> cma->order_per_bit; } -static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, - unsigned long pages) +static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, + unsigned long pages) { return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; } -static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) +static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, + unsigned int count) { unsigned long bitmap_no, bitmap_count; @@ -162,7 +165,8 @@ core_initcall(cma_init_reserved_areas); * This function creates custom contiguous area from already reserved memory. */ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, - int order_per_bit, struct cma **res_cma) + unsigned int order_per_bit, + struct cma **res_cma) { struct cma *cma; phys_addr_t alignment; @@ -353,7 +357,7 @@ err: * This function allocates part of contiguous memory on specific * contiguous memory area. */ -struct page *cma_alloc(struct cma *cma, int count, unsigned int align) +struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align) { unsigned long mask, offset, pfn, start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; @@ -424,7 +428,7 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) * It returns false when provided pages do not belong to contiguous area and * true otherwise. */ -bool cma_release(struct cma *cma, struct page *pages, int count) +bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) { unsigned long pfn; -- cgit v1.2.3 From 647757197cd34fae041e21af39ded00f5c346fc4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 14 Apr 2015 15:47:07 -0700 Subject: mm: clarify __GFP_NOFAIL deprecation status __GFP_NOFAIL is documented as a deprecated flag since commit 478352e789f5 ("mm: add comment about deprecation of __GFP_NOFAIL"). This has discouraged people from using it but in some cases an opencoded endless loop around allocator has been used instead. So the allocator is not aware of the de facto __GFP_NOFAIL allocation because this information was not communicated properly. Let's make clear that if the allocation context really cannot afford failure because there is no good failure policy then using __GFP_NOFAIL is preferable to opencoding the loop outside of the allocator. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Michal Hocko Acked-by: David Rientjes Cc: Johannes Weiner Cc: Dave Chinner Cc: "Theodore Ts'o" Cc: Mel Gorman Cc: Tetsuo Handa Cc: "David S. Miller" Cc: Vipul Pandya Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4423a0f8eabe..97a9373e61e8 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -57,8 +57,10 @@ struct vm_area_struct; * _might_ fail. This depends upon the particular VM implementation. * * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller - * cannot handle allocation failures. This modifier is deprecated and no new - * users should be added. + * cannot handle allocation failures. New users should be evaluated carefully + * (and the flag should be used only when there is no reasonable failure policy) + * but it is definitely preferable to use the flag rather than opencode endless + * loop around allocator. * * __GFP_NORETRY: The VM implementation must not retry indefinitely. * -- cgit v1.2.3 From 0ddab1d2ed664c85c95488eef569786a84aedf37 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 14 Apr 2015 15:47:20 -0700 Subject: lib/ioremap.c: add huge I/O map capability interfaces Add ioremap_pud_enabled() and ioremap_pmd_enabled(), which return 1 when I/O mappings with pud/pmd are enabled on the kernel. ioremap_huge_init() calls arch_ioremap_pud_supported() and arch_ioremap_pmd_supported() to initialize the capabilities at boot-time. A new kernel option "nohugeiomap" is also added, so that user can disable the huge I/O map capabilities when necessary. Signed-off-by: Toshi Kani Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Dave Hansen Cc: Robert Elliott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 2 ++ arch/Kconfig | 3 +++ include/linux/io.h | 8 ++++++++ init/main.c | 2 ++ lib/ioremap.c | 37 +++++++++++++++++++++++++++++++++++++ 5 files changed, 52 insertions(+) (limited to 'include') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 71eecb263250..b1fa70907ccf 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2323,6 +2323,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. register save and restore. The kernel will only save legacy floating-point registers on task switch. + nohugeiomap [KNL,x86] Disable kernel huge I/O mappings. + noxsave [BUGS=X86] Disables x86 extended register state save and restore using xsave. The kernel will fallback to enabling legacy floating-point and sse state. diff --git a/arch/Kconfig b/arch/Kconfig index a9c95d36ba70..c88c23f0a1da 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -446,6 +446,9 @@ config HAVE_IRQ_TIME_ACCOUNTING config HAVE_ARCH_TRANSPARENT_HUGEPAGE bool +config HAVE_ARCH_HUGE_VMAP + bool + config HAVE_ARCH_SOFT_DIRTY bool diff --git a/include/linux/io.h b/include/linux/io.h index fa02e55e5a2e..4cc299c598e0 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -38,6 +38,14 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end, } #endif +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +void __init ioremap_huge_init(void); +int arch_ioremap_pud_supported(void); +int arch_ioremap_pmd_supported(void); +#else +static inline void ioremap_huge_init(void) { } +#endif + /* * Managed iomap interface */ diff --git a/init/main.c b/init/main.c index 4a6974e67839..f6dd8fe1f22c 100644 --- a/init/main.c +++ b/init/main.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -484,6 +485,7 @@ static void __init mm_init(void) percpu_init_late(); pgtable_init(); vmalloc_init(); + ioremap_huge_init(); } asmlinkage __visible void __init start_kernel(void) diff --git a/lib/ioremap.c b/lib/ioremap.c index 0c9216c48762..2008652c9a1f 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -13,6 +13,43 @@ #include #include +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +int __read_mostly ioremap_pud_capable; +int __read_mostly ioremap_pmd_capable; +int __read_mostly ioremap_huge_disabled; + +static int __init set_nohugeiomap(char *str) +{ + ioremap_huge_disabled = 1; + return 0; +} +early_param("nohugeiomap", set_nohugeiomap); + +void __init ioremap_huge_init(void) +{ + if (!ioremap_huge_disabled) { + if (arch_ioremap_pud_supported()) + ioremap_pud_capable = 1; + if (arch_ioremap_pmd_supported()) + ioremap_pmd_capable = 1; + } +} + +static inline int ioremap_pud_enabled(void) +{ + return ioremap_pud_capable; +} + +static inline int ioremap_pmd_enabled(void) +{ + return ioremap_pmd_capable; +} + +#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ +static inline int ioremap_pud_enabled(void) { return 0; } +static inline int ioremap_pmd_enabled(void) { return 0; } +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ + static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { -- cgit v1.2.3 From e61ce6ade404ef17bd63d62bec78e29047a47b47 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 14 Apr 2015 15:47:23 -0700 Subject: mm: change ioremap to set up huge I/O mappings ioremap_pud_range() and ioremap_pmd_range() are changed to create huge I/O mappings when their capability is enabled, and a request meets required conditions -- both virtual & physical addresses are aligned by their huge page size, and a requested range fufills their huge page size. When pud_set_huge() or pmd_set_huge() returns zero, i.e. no-operation is performed, the code simply falls back to the next level. The changes are only enabled when CONFIG_HAVE_ARCH_HUGE_VMAP is defined on the architecture. Signed-off-by: Toshi Kani Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Dave Hansen Cc: Robert Elliott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 15 +++++++++++++++ lib/ioremap.c | 16 ++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 1f9f5da6828f..9fb7dc7ca7f4 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -6,6 +6,7 @@ #include #include +#include #if 4 - defined(__PAGETABLE_PUD_FOLDED) - defined(__PAGETABLE_PMD_FOLDED) != \ CONFIG_PGTABLE_LEVELS @@ -696,6 +697,20 @@ static inline int pmd_protnone(pmd_t pmd) #endif /* CONFIG_MMU */ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); +int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); +#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ +static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} +static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ + #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range diff --git a/lib/ioremap.c b/lib/ioremap.c index 2008652c9a1f..be249062ba6e 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -80,6 +80,14 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, return -ENOMEM; do { next = pmd_addr_end(addr, end); + + if (ioremap_pmd_enabled() && + ((next - addr) == PMD_SIZE) && + IS_ALIGNED(phys_addr + addr, PMD_SIZE)) { + if (pmd_set_huge(pmd, phys_addr + addr, prot)) + continue; + } + if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot)) return -ENOMEM; } while (pmd++, addr = next, addr != end); @@ -98,6 +106,14 @@ static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, return -ENOMEM; do { next = pud_addr_end(addr, end); + + if (ioremap_pud_enabled() && + ((next - addr) == PUD_SIZE) && + IS_ALIGNED(phys_addr + addr, PUD_SIZE)) { + if (pud_set_huge(pud, phys_addr + addr, prot)) + continue; + } + if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot)) return -ENOMEM; } while (pud++, addr = next, addr != end); -- cgit v1.2.3 From b9820d8f39f816b67112eb7ec5cdc4c1655ff060 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 14 Apr 2015 15:47:26 -0700 Subject: mm: change vunmap to tear down huge KVA mappings Change vunmap_pmd_range() and vunmap_pud_range() to tear down huge KVA mappings when they are set. pud_clear_huge() and pmd_clear_huge() return zero when no-operation is performed, i.e. huge page mapping was not used. These changes are only enabled when CONFIG_HAVE_ARCH_HUGE_VMAP is defined on the architecture. [akpm@linux-foundation.org: use consistent code layout] Signed-off-by: Toshi Kani Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Dave Hansen Cc: Robert Elliott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 10 ++++++++++ mm/vmalloc.c | 4 ++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 9fb7dc7ca7f4..39f1d6a2b04d 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -700,6 +700,8 @@ static inline int pmd_protnone(pmd_t pmd) #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); +int pud_clear_huge(pud_t *pud); +int pmd_clear_huge(pmd_t *pmd); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { @@ -709,6 +711,14 @@ static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } +static inline int pud_clear_huge(pud_t *pud) +{ + return 0; +} +static inline int pmd_clear_huge(pmd_t *pmd) +{ + return 0; +} #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #endif /* !__ASSEMBLY__ */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a48cd061f16f..a5bbdd3b5d67 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -75,6 +75,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_clear_huge(pmd)) + continue; if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next); @@ -89,6 +91,8 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_clear_huge(pud)) + continue; if (pud_none_or_clear_bad(pud)) continue; vunmap_pmd_range(pud, addr, next); -- cgit v1.2.3 From 2b68f6caeac271620cd2f9362aeaed360e317df0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:48:00 -0700 Subject: mm: expose arch_mmap_rnd when available When an architecture fully supports randomizing the ELF load location, a per-arch mmap_rnd() function is used to find a randomized mmap base. In preparation for randomizing the location of ET_DYN binaries separately from mmap, this renames and exports these functions as arch_mmap_rnd(). Additionally introduces CONFIG_ARCH_HAS_ELF_RANDOMIZE for describing this feature on architectures that support it (which is a superset of ARCH_BINFMT_ELF_RANDOMIZE_PIE, since s390 already supports a separated ET_DYN ASLR from mmap ASLR without the ARCH_BINFMT_ELF_RANDOMIZE_PIE logic). Signed-off-by: Kees Cook Cc: Hector Marco-Gisbert Cc: Russell King Reviewed-by: Ingo Molnar Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Alexander Viro Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: "David A. Long" Cc: Andrey Ryabinin Cc: Arun Chandran Cc: Yann Droneaud Cc: Min-Hua Chen Cc: Paul Burton Cc: Alex Smith Cc: Markos Chandras Cc: Vineeth Vijayan Cc: Jeff Bailey Cc: Michael Holzheu Cc: Ben Hutchings Cc: Behan Webster Cc: Ismael Ripoll Cc: Jan-Simon Mller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 7 +++++++ arch/arm/Kconfig | 1 + arch/arm/mm/mmap.c | 4 ++-- arch/arm64/Kconfig | 1 + arch/arm64/mm/mmap.c | 4 ++-- arch/mips/Kconfig | 1 + arch/mips/mm/mmap.c | 4 ++-- arch/powerpc/Kconfig | 1 + arch/powerpc/mm/mmap.c | 4 ++-- arch/s390/Kconfig | 1 + arch/s390/mm/mmap.c | 8 ++++---- arch/x86/Kconfig | 1 + arch/x86/mm/mmap.c | 4 ++-- include/linux/elf-randomize.h | 10 ++++++++++ 14 files changed, 37 insertions(+), 14 deletions(-) create mode 100644 include/linux/elf-randomize.h (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index c88c23f0a1da..474904a8e540 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -491,6 +491,13 @@ config PGTABLE_LEVELS int default 2 +config ARCH_HAS_ELF_RANDOMIZE + bool + help + An architecture supports choosing randomized locations for + stack, mmap, brk, and ET_DYN. Defined functions: + - arch_mmap_rnd() + # # ABI hall of shame # diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 696cf3c61e0f..f85200a63a8b 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -3,6 +3,7 @@ config ARM default y select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE + select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index 15a8160096b3..407dc786583a 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -169,7 +169,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { unsigned long rnd; @@ -184,7 +184,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3f2fba996bc2..7c1dbeb73e8d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2,6 +2,7 @@ config ARM64 def_bool y select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE + select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index ba776c01b552..ed177475dd8c 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -47,7 +47,7 @@ static int mmap_is_legacy(void) return sysctl_legacy_va_layout; } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { unsigned long rnd; @@ -77,7 +77,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); /* * Fall back to the standard layout if the personality bit is set, or diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index a9d112d2a135..688ce274f59d 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -24,6 +24,7 @@ config MIPS select HAVE_DEBUG_KMEMLEAK select HAVE_SYSCALL_TRACEPOINTS select ARCH_BINFMT_ELF_RANDOMIZE_PIE + select ARCH_HAS_ELF_RANDOMIZE select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT select RTC_LIB if !MACH_LOONGSON select GENERIC_ATOMIC64 if !64BIT diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index 9a4f1f5c1f0e..5c81fdd032c3 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -142,7 +142,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, addr0, len, pgoff, flags, DOWN); } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { unsigned long rnd; @@ -161,7 +161,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); if (mmap_is_legacy()) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 91ad76f30d18..fc5fffbb331b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -89,6 +89,7 @@ config PPC select ARCH_MIGHT_HAVE_PC_SERIO select BINFMT_ELF select ARCH_BINFMT_ELF_RANDOMIZE_PIE + select ARCH_HAS_ELF_RANDOMIZE select OF select OF_EARLY_FLATTREE select OF_RESERVED_MEM diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 1ad2299d795d..0f0502e12f6c 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -53,7 +53,7 @@ static inline int mmap_is_legacy(void) return sysctl_legacy_va_layout; } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { unsigned long rnd; @@ -87,7 +87,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); /* * Fall back to the standard layout if the personality diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f6aebcb7a0f8..ac2b75d74cd2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -65,6 +65,7 @@ config S390 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SG_CHAIN select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index db57078075c5..a94504d99c47 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -60,7 +60,7 @@ static inline int mmap_is_legacy(void) return sysctl_legacy_va_layout; } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { if (is_32bit_task()) return (get_random_int() & 0x7ff) << PAGE_SHIFT; @@ -187,7 +187,7 @@ unsigned long randomize_et_dyn(void) base &= ~((1UL << 32) - 1); if (current->flags & PF_RANDOMIZE) - base += mmap_rnd(); + base += arch_mmap_rnd(); return base; } @@ -203,7 +203,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); /* * Fall back to the standard layout if the personality @@ -283,7 +283,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); /* * Fall back to the standard layout if the personality diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0f948cefaeb1..782ddbbc1c9a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -88,6 +88,7 @@ config X86 select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE + select ARCH_HAS_ELF_RANDOMIZE select HAVE_ARCH_JUMP_LABEL select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select SPARSE_IRQ diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index ebfa52030d5c..9d518d693b4b 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -65,7 +65,7 @@ static int mmap_is_legacy(void) return sysctl_legacy_va_layout; } -static unsigned long mmap_rnd(void) +unsigned long arch_mmap_rnd(void) { unsigned long rnd; @@ -114,7 +114,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) - random_factor = mmap_rnd(); + random_factor = arch_mmap_rnd(); mm->mmap_legacy_base = mmap_legacy_base(random_factor); diff --git a/include/linux/elf-randomize.h b/include/linux/elf-randomize.h new file mode 100644 index 000000000000..7a4eda02d2b1 --- /dev/null +++ b/include/linux/elf-randomize.h @@ -0,0 +1,10 @@ +#ifndef _ELF_RANDOMIZE_H +#define _ELF_RANDOMIZE_H + +#ifndef CONFIG_ARCH_HAS_ELF_RANDOMIZE +static inline unsigned long arch_mmap_rnd(void) { return 0; } +#else +extern unsigned long arch_mmap_rnd(void); +#endif + +#endif -- cgit v1.2.3 From 204db6ed17743000691d930368a5abd6ea541c58 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 14 Apr 2015 15:48:12 -0700 Subject: mm: fold arch_randomize_brk into ARCH_HAS_ELF_RANDOMIZE The arch_randomize_brk() function is used on several architectures, even those that don't support ET_DYN ASLR. To avoid bulky extern/#define tricks, consolidate the support under CONFIG_ARCH_HAS_ELF_RANDOMIZE for the architectures that support it, while still handling CONFIG_COMPAT_BRK. Signed-off-by: Kees Cook Cc: Hector Marco-Gisbert Cc: Russell King Reviewed-by: Ingo Molnar Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Alexander Viro Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: "David A. Long" Cc: Andrey Ryabinin Cc: Arun Chandran Cc: Yann Droneaud Cc: Min-Hua Chen Cc: Paul Burton Cc: Alex Smith Cc: Markos Chandras Cc: Vineeth Vijayan Cc: Jeff Bailey Cc: Michael Holzheu Cc: Ben Hutchings Cc: Behan Webster Cc: Ismael Ripoll Cc: Jan-Simon Mller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 1 + arch/arm/include/asm/elf.h | 4 ---- arch/arm64/include/asm/elf.h | 4 ---- arch/mips/include/asm/elf.h | 4 ---- arch/powerpc/include/asm/elf.h | 4 ---- arch/s390/include/asm/elf.h | 3 --- arch/x86/include/asm/elf.h | 3 --- fs/binfmt_elf.c | 4 +--- include/linux/elf-randomize.h | 12 ++++++++++++ 9 files changed, 14 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 474904a8e540..e1068987bad1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -497,6 +497,7 @@ config ARCH_HAS_ELF_RANDOMIZE An architecture supports choosing randomized locations for stack, mmap, brk, and ET_DYN. Defined functions: - arch_mmap_rnd() + - arch_randomize_brk() # # ABI hall of shame diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index afb9cafd3786..c1ff8ab12914 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -125,10 +125,6 @@ int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs); extern void elf_set_personality(const struct elf32_hdr *); #define SET_PERSONALITY(ex) elf_set_personality(&(ex)) -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - #ifdef CONFIG_MMU #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 struct linux_binprm; diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index f724db00b235..faad6df49e5b 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -156,10 +156,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, #define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) #endif -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - #ifdef CONFIG_COMPAT #ifdef __AARCH64EB__ diff --git a/arch/mips/include/asm/elf.h b/arch/mips/include/asm/elf.h index 535f196ffe02..31d747d46a23 100644 --- a/arch/mips/include/asm/elf.h +++ b/arch/mips/include/asm/elf.h @@ -410,10 +410,6 @@ struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - struct arch_elf_state { int fp_abi; int interp_fp_abi; diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h index 57d289acb803..ee46ffef608e 100644 --- a/arch/powerpc/include/asm/elf.h +++ b/arch/powerpc/include/asm/elf.h @@ -128,10 +128,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, (0x7ff >> (PAGE_SHIFT - 12)) : \ (0x3ffff >> (PAGE_SHIFT - 12))) -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - - #ifdef CONFIG_SPU_BASE /* Notes used in ET_CORE. Note name is "SPU//". */ #define NT_SPU 1 diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index ff662155b2c4..a5c4978462c1 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -226,9 +226,6 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 int arch_setup_additional_pages(struct linux_binprm *, int); -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - void *fill_cpu_elf_notes(void *ptr, struct save_area *sa, __vector128 *vxrs); #endif diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 935588d95c82..f161c189c27b 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -339,9 +339,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - /* * True on X86_32 or when emulating IA32 on X86_64 */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b20c05477e90..241ef68d2893 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1050,15 +1050,13 @@ static int load_elf_binary(struct linux_binprm *bprm) current->mm->end_data = end_data; current->mm->start_stack = bprm->p; -#ifdef arch_randomize_brk if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { current->mm->brk = current->mm->start_brk = arch_randomize_brk(current->mm); -#ifdef CONFIG_COMPAT_BRK +#ifdef compat_brk_randomized current->brk_randomized = 1; #endif } -#endif if (current->personality & MMAP_PAGE_ZERO) { /* Why this, you ask??? Well SVr4 maps page 0 as read-only, diff --git a/include/linux/elf-randomize.h b/include/linux/elf-randomize.h index 7a4eda02d2b1..b5f0bda9472e 100644 --- a/include/linux/elf-randomize.h +++ b/include/linux/elf-randomize.h @@ -1,10 +1,22 @@ #ifndef _ELF_RANDOMIZE_H #define _ELF_RANDOMIZE_H +struct mm_struct; + #ifndef CONFIG_ARCH_HAS_ELF_RANDOMIZE static inline unsigned long arch_mmap_rnd(void) { return 0; } +# if defined(arch_randomize_brk) && defined(CONFIG_COMPAT_BRK) +# define compat_brk_randomized +# endif +# ifndef arch_randomize_brk +# define arch_randomize_brk(mm) (mm->brk) +# endif #else extern unsigned long arch_mmap_rnd(void); +extern unsigned long arch_randomize_brk(struct mm_struct *mm); +# ifdef CONFIG_COMPAT_BRK +# define compat_brk_randomized +# endif #endif #endif -- cgit v1.2.3 From 2a8e70026435ad97570a1e0a0c4c941e0f700a3e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 14 Apr 2015 15:48:15 -0700 Subject: mm: numa: remove migrate_ratelimited This code is dead since commit 9e645ab6d089 ("sched/numa: Continue PTE scanning even if migrate rate limited") so remove it. Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 ----- mm/migrate.c | 20 -------------------- 2 files changed, 25 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 78baed5f2952..cac1c0904d5f 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -69,7 +69,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, extern bool pmd_trans_migrating(pmd_t pmd); extern int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node); -extern bool migrate_ratelimited(int node); #else static inline bool pmd_trans_migrating(pmd_t pmd) { @@ -80,10 +79,6 @@ static inline int migrate_misplaced_page(struct page *page, { return -EAGAIN; /* can't migrate now */ } -static inline bool migrate_ratelimited(int node) -{ - return false; -} #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) diff --git a/mm/migrate.c b/mm/migrate.c index ec1802d85f05..a65ff72ab739 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1565,30 +1565,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page, * page migration rate limiting control. * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs * window of time. Default here says do not migrate more than 1280M per second. - * If a node is rate-limited then PTE NUMA updates are also rate-limited. However - * as it is faults that reset the window, pte updates will happen unconditionally - * if there has not been a fault since @pteupdate_interval_millisecs after the - * throttle window closed. */ static unsigned int migrate_interval_millisecs __read_mostly = 100; -static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); -/* Returns true if NUMA migration is currently rate limited */ -bool migrate_ratelimited(int node) -{ - pg_data_t *pgdat = NODE_DATA(node); - - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + - msecs_to_jiffies(pteupdate_interval_millisecs))) - return false; - - if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) - return false; - - return true; -} - /* Returns true if the node is migrate rate-limited after the update */ static bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) -- cgit v1.2.3 From 2415b9f5cb048a803b30b790af994ba71ff0bd4c Mon Sep 17 00:00:00 2001 From: Balasubramani Vivekanandan Date: Tue, 14 Apr 2015 15:48:18 -0700 Subject: memcg: print cgroup information when system panics due to panic_on_oom If kernel panics due to oom, caused by a cgroup reaching its limit, when 'compulsory panic_on_oom' is enabled, then we will only see that the OOM happened because of "compulsory panic_on_oom is enabled" but this doesn't tell the difference between mempolicy and memcg. And dumping system wide information is plain wrong and more confusing. This patch provides the information of the cgroup whose limit triggerred panic Signed-off-by: Balasubramani Vivekanandan Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 3 ++- mm/memcontrol.c | 16 +++++++++------- mm/oom_kill.c | 7 ++++--- 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/oom.h b/include/linux/oom.h index d5771bed59c9..44b2f6f7bbd8 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -66,7 +66,8 @@ extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags); extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags); extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, - int order, const nodemask_t *nodemask); + int order, const nodemask_t *nodemask, + struct mem_cgroup *memcg); extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f227786e73db..c3f09b2dda5f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1442,15 +1442,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) struct mem_cgroup *iter; unsigned int i; - if (!p) - return; - mutex_lock(&oom_info_lock); rcu_read_lock(); - pr_info("Task in "); - pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); - pr_cont(" killed as a result of limit of "); + if (p) { + pr_info("Task in "); + pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); + pr_cont(" killed as a result of limit of "); + } else { + pr_info("Memory limit reached of cgroup "); + } + pr_cont_cgroup_path(memcg->css.cgroup); pr_cont("\n"); @@ -1537,7 +1539,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, return; } - check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 642f38cb175a..52628c819bf7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -612,7 +612,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, - int order, const nodemask_t *nodemask) + int order, const nodemask_t *nodemask, + struct mem_cgroup *memcg) { if (likely(!sysctl_panic_on_oom)) return; @@ -625,7 +626,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, if (constraint != CONSTRAINT_NONE) return; } - dump_header(NULL, gfp_mask, order, NULL, nodemask); + dump_header(NULL, gfp_mask, order, memcg, nodemask); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } @@ -740,7 +741,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, constraint = constrained_alloc(zonelist, gfp_mask, nodemask, &totalpages); mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; - check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); + check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL); if (sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, nodemask) && -- cgit v1.2.3 From 11d83360452ea2a95e699da01f8e1bcc4676a5de Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 14 Apr 2015 15:48:21 -0700 Subject: mm, mempool: do not allow atomic resizing Allocating a large number of elements in atomic context could quickly deplete memory reserves, so just disallow atomic resizing entirely. Nothing currently uses mempool_resize() with anything other than GFP_KERNEL, so convert existing callers to drop the gfp_mask. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: David Rientjes Acked-by: Steffen Maier [zfcp] Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Steve French Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/s390/scsi/zfcp_erp.c | 4 ++-- fs/cifs/connect.c | 6 ++---- include/linux/mempool.h | 2 +- mm/mempool.c | 10 ++++++---- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c index 2c5d4567d1da..acde3f5d6e9e 100644 --- a/drivers/s390/scsi/zfcp_erp.c +++ b/drivers/s390/scsi/zfcp_erp.c @@ -738,11 +738,11 @@ static int zfcp_erp_adapter_strategy_open_fsf(struct zfcp_erp_action *act) return ZFCP_ERP_FAILED; if (mempool_resize(act->adapter->pool.sr_data, - act->adapter->stat_read_buf_num, GFP_KERNEL)) + act->adapter->stat_read_buf_num)) return ZFCP_ERP_FAILED; if (mempool_resize(act->adapter->pool.status_read_req, - act->adapter->stat_read_buf_num, GFP_KERNEL)) + act->adapter->stat_read_buf_num)) return ZFCP_ERP_FAILED; atomic_set(&act->adapter->stat_miss, act->adapter->stat_read_buf_num); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 480cf9c81d50..f3bfe08e177b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -773,8 +773,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) length = atomic_dec_return(&tcpSesAllocCount); if (length > 0) - mempool_resize(cifs_req_poolp, length + cifs_min_rcv, - GFP_KERNEL); + mempool_resize(cifs_req_poolp, length + cifs_min_rcv); } static int @@ -848,8 +847,7 @@ cifs_demultiplex_thread(void *p) length = atomic_inc_return(&tcpSesAllocCount); if (length > 1) - mempool_resize(cifs_req_poolp, length + cifs_min_rcv, - GFP_KERNEL); + mempool_resize(cifs_req_poolp, length + cifs_min_rcv); set_freezable(); while (server->tcpStatus != CifsExiting) { diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 39ed62ab5b8a..b19b3023c880 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -29,7 +29,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int nid); -extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask); +extern int mempool_resize(mempool_t *pool, int new_min_nr); extern void mempool_destroy(mempool_t *pool); extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask); extern void mempool_free(void *element, mempool_t *pool); diff --git a/mm/mempool.c b/mm/mempool.c index e209c98c7203..949970db2874 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -113,23 +113,24 @@ EXPORT_SYMBOL(mempool_create_node); * mempool_create(). * @new_min_nr: the new minimum number of elements guaranteed to be * allocated for this pool. - * @gfp_mask: the usual allocation bitmask. * * This function shrinks/grows the pool. In the case of growing, * it cannot be guaranteed that the pool will be grown to the new * size immediately, but new mempool_free() calls will refill it. + * This function may sleep. * * Note, the caller must guarantee that no mempool_destroy is called * while this function is running. mempool_alloc() & mempool_free() * might be called (eg. from IRQ contexts) while this function executes. */ -int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) +int mempool_resize(mempool_t *pool, int new_min_nr) { void *element; void **new_elements; unsigned long flags; BUG_ON(new_min_nr <= 0); + might_sleep(); spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { @@ -145,7 +146,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) spin_unlock_irqrestore(&pool->lock, flags); /* Grow the pool */ - new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); + new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements), + GFP_KERNEL); if (!new_elements) return -ENOMEM; @@ -164,7 +166,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) while (pool->curr_nr < pool->min_nr) { spin_unlock_irqrestore(&pool->lock, flags); - element = pool->alloc(gfp_mask, pool->pool_data); + element = pool->alloc(GFP_KERNEL, pool->pool_data); if (!element) goto out; spin_lock_irqsave(&pool->lock, flags); -- cgit v1.2.3 From 4a20799d11f64e6b8725cacc7619b1ae1dbf9acd Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Tue, 14 Apr 2015 15:48:27 -0700 Subject: mm: move memtest under mm Memtest is a simple feature which fills the memory with a given set of patterns and validates memory contents, if bad memory regions is detected it reserves them via memblock API. Since memblock API is widely used by other architectures this feature can be enabled outside of x86 world. This patch set promotes memtest to live under generic mm umbrella and enables memtest feature for arm/arm64. It was reported that this patch set was useful for tracking down an issue with some errant DMA on an arm64 platform. This patch (of 6): There is nothing platform dependent in the core memtest code, so other platforms might benefit from this feature too. [linux@roeck-us.net: MEMTEST depends on MEMBLOCK] Signed-off-by: Vladimir Murzin Acked-by: Will Deacon Tested-by: Mark Rutland Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Catalin Marinas Cc: Russell King Cc: Paul Bolle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 11 ----- arch/x86/include/asm/e820.h | 8 --- arch/x86/mm/Makefile | 2 - arch/x86/mm/memtest.c | 118 -------------------------------------------- include/linux/memblock.h | 8 +++ lib/Kconfig.debug | 12 +++++ mm/Makefile | 1 + mm/memtest.c | 118 ++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 139 insertions(+), 139 deletions(-) delete mode 100644 arch/x86/mm/memtest.c create mode 100644 mm/memtest.c (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1f7f185934a5..d43e7e1c784b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -721,17 +721,6 @@ endif #HYPERVISOR_GUEST config NO_BOOTMEM def_bool y -config MEMTEST - bool "Memtest" - ---help--- - This option adds a kernel parameter 'memtest', which allows memtest - to be set. - memtest=0, mean disabled; -- default - memtest=1, mean do 1 test pattern; - ... - memtest=4, mean do 4 test patterns. - If you are unsure how to answer this question, answer N. - source "arch/x86/Kconfig.cpu" config HPET_TIMER diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 779c2efe2e97..3ab0537872fb 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -40,14 +40,6 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn) } #endif -#ifdef CONFIG_MEMTEST -extern void early_memtest(unsigned long start, unsigned long end); -#else -static inline void early_memtest(unsigned long start, unsigned long end) -{ -} -#endif - extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); extern u64 early_reserve_e820(u64 sizet, u64 align); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index c4cc74006c61..a482d105172b 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -32,6 +32,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_MEMTEST) += memtest.o - obj-$(CONFIG_X86_INTEL_MPX) += mpx.o diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c deleted file mode 100644 index 1e9da795767a..000000000000 --- a/arch/x86/mm/memtest.c +++ /dev/null @@ -1,118 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static u64 patterns[] __initdata = { - /* The first entry has to be 0 to leave memtest with zeroed memory */ - 0, - 0xffffffffffffffffULL, - 0x5555555555555555ULL, - 0xaaaaaaaaaaaaaaaaULL, - 0x1111111111111111ULL, - 0x2222222222222222ULL, - 0x4444444444444444ULL, - 0x8888888888888888ULL, - 0x3333333333333333ULL, - 0x6666666666666666ULL, - 0x9999999999999999ULL, - 0xccccccccccccccccULL, - 0x7777777777777777ULL, - 0xbbbbbbbbbbbbbbbbULL, - 0xddddddddddddddddULL, - 0xeeeeeeeeeeeeeeeeULL, - 0x7a6c7258554e494cULL, /* yeah ;-) */ -}; - -static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) -{ - printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", - (unsigned long long) pattern, - (unsigned long long) start_bad, - (unsigned long long) end_bad); - memblock_reserve(start_bad, end_bad - start_bad); -} - -static void __init memtest(u64 pattern, u64 start_phys, u64 size) -{ - u64 *p, *start, *end; - u64 start_bad, last_bad; - u64 start_phys_aligned; - const size_t incr = sizeof(pattern); - - start_phys_aligned = ALIGN(start_phys, incr); - start = __va(start_phys_aligned); - end = start + (size - (start_phys_aligned - start_phys)) / incr; - start_bad = 0; - last_bad = 0; - - for (p = start; p < end; p++) - *p = pattern; - - for (p = start; p < end; p++, start_phys_aligned += incr) { - if (*p == pattern) - continue; - if (start_phys_aligned == last_bad + incr) { - last_bad += incr; - continue; - } - if (start_bad) - reserve_bad_mem(pattern, start_bad, last_bad + incr); - start_bad = last_bad = start_phys_aligned; - } - if (start_bad) - reserve_bad_mem(pattern, start_bad, last_bad + incr); -} - -static void __init do_one_pass(u64 pattern, u64 start, u64 end) -{ - u64 i; - phys_addr_t this_start, this_end; - - for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { - this_start = clamp_t(phys_addr_t, this_start, start, end); - this_end = clamp_t(phys_addr_t, this_end, start, end); - if (this_start < this_end) { - printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", - (unsigned long long)this_start, - (unsigned long long)this_end, - (unsigned long long)cpu_to_be64(pattern)); - memtest(pattern, this_start, this_end - this_start); - } - } -} - -/* default is disabled */ -static int memtest_pattern __initdata; - -static int __init parse_memtest(char *arg) -{ - if (arg) - memtest_pattern = simple_strtoul(arg, NULL, 0); - else - memtest_pattern = ARRAY_SIZE(patterns); - - return 0; -} - -early_param("memtest", parse_memtest); - -void __init early_memtest(unsigned long start, unsigned long end) -{ - unsigned int i; - unsigned int idx = 0; - - if (!memtest_pattern) - return; - - printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); - for (i = memtest_pattern-1; i < UINT_MAX; --i) { - idx = i % ARRAY_SIZE(patterns); - do_one_pass(patterns[idx], start, end); - } -} diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e8cc45307f8f..6724cb020f5e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -365,6 +365,14 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo #define __initdata_memblock #endif +#ifdef CONFIG_MEMTEST +extern void early_memtest(unsigned long start, unsigned long end); +#else +static inline void early_memtest(unsigned long start, unsigned long end) +{ +} +#endif + #else static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 36b6fa88ce5b..5c7a3183423b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1745,6 +1745,18 @@ config TEST_UDELAY If unsure, say N. +config MEMTEST + bool "Memtest" + depends on HAVE_MEMBLOCK + ---help--- + This option adds a kernel parameter 'memtest', which allows memtest + to be set. + memtest=0, mean disabled; -- default + memtest=1, mean do 1 test pattern; + ... + memtest=4, mean do 4 test patterns. + If you are unsure how to answer this question, answer N. + source "samples/Kconfig" source "lib/Kconfig.kgdb" diff --git a/mm/Makefile b/mm/Makefile index 668a9bb82be4..98c4eaeabdcb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o +obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o diff --git a/mm/memtest.c b/mm/memtest.c new file mode 100644 index 000000000000..1e9da795767a --- /dev/null +++ b/mm/memtest.c @@ -0,0 +1,118 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static u64 patterns[] __initdata = { + /* The first entry has to be 0 to leave memtest with zeroed memory */ + 0, + 0xffffffffffffffffULL, + 0x5555555555555555ULL, + 0xaaaaaaaaaaaaaaaaULL, + 0x1111111111111111ULL, + 0x2222222222222222ULL, + 0x4444444444444444ULL, + 0x8888888888888888ULL, + 0x3333333333333333ULL, + 0x6666666666666666ULL, + 0x9999999999999999ULL, + 0xccccccccccccccccULL, + 0x7777777777777777ULL, + 0xbbbbbbbbbbbbbbbbULL, + 0xddddddddddddddddULL, + 0xeeeeeeeeeeeeeeeeULL, + 0x7a6c7258554e494cULL, /* yeah ;-) */ +}; + +static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) +{ + printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", + (unsigned long long) pattern, + (unsigned long long) start_bad, + (unsigned long long) end_bad); + memblock_reserve(start_bad, end_bad - start_bad); +} + +static void __init memtest(u64 pattern, u64 start_phys, u64 size) +{ + u64 *p, *start, *end; + u64 start_bad, last_bad; + u64 start_phys_aligned; + const size_t incr = sizeof(pattern); + + start_phys_aligned = ALIGN(start_phys, incr); + start = __va(start_phys_aligned); + end = start + (size - (start_phys_aligned - start_phys)) / incr; + start_bad = 0; + last_bad = 0; + + for (p = start; p < end; p++) + *p = pattern; + + for (p = start; p < end; p++, start_phys_aligned += incr) { + if (*p == pattern) + continue; + if (start_phys_aligned == last_bad + incr) { + last_bad += incr; + continue; + } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); + start_bad = last_bad = start_phys_aligned; + } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); +} + +static void __init do_one_pass(u64 pattern, u64 start, u64 end) +{ + u64 i; + phys_addr_t this_start, this_end; + + for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { + this_start = clamp_t(phys_addr_t, this_start, start, end); + this_end = clamp_t(phys_addr_t, this_end, start, end); + if (this_start < this_end) { + printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", + (unsigned long long)this_start, + (unsigned long long)this_end, + (unsigned long long)cpu_to_be64(pattern)); + memtest(pattern, this_start, this_end - this_start); + } + } +} + +/* default is disabled */ +static int memtest_pattern __initdata; + +static int __init parse_memtest(char *arg) +{ + if (arg) + memtest_pattern = simple_strtoul(arg, NULL, 0); + else + memtest_pattern = ARRAY_SIZE(patterns); + + return 0; +} + +early_param("memtest", parse_memtest); + +void __init early_memtest(unsigned long start, unsigned long end) +{ + unsigned int i; + unsigned int idx = 0; + + if (!memtest_pattern) + return; + + printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); + for (i = memtest_pattern-1; i < UINT_MAX; --i) { + idx = i % ARRAY_SIZE(patterns); + do_one_pass(patterns[idx], start, end); + } +} -- cgit v1.2.3 From 7f70baeeb9e2d2b2a37a4bd3727d709547c4ae41 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Tue, 14 Apr 2015 15:48:30 -0700 Subject: memtest: use phys_addr_t for physical addresses Since memtest might be used by other architectures pass input parameters as phys_addr_t instead of long to prevent overflow. Signed-off-by: Vladimir Murzin Acked-by: Will Deacon Tested-by: Mark Rutland Cc: "H. Peter Anvin" Cc: Catalin Marinas Cc: Ingo Molnar Cc: Russell King Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 4 ++-- mm/memtest.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6724cb020f5e..9497ec7c77ea 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -366,9 +366,9 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo #endif #ifdef CONFIG_MEMTEST -extern void early_memtest(unsigned long start, unsigned long end); +extern void early_memtest(phys_addr_t start, phys_addr_t end); #else -static inline void early_memtest(unsigned long start, unsigned long end) +static inline void early_memtest(phys_addr_t start, phys_addr_t end) { } #endif diff --git a/mm/memtest.c b/mm/memtest.c index 1e9da795767a..1997d934b13b 100644 --- a/mm/memtest.c +++ b/mm/memtest.c @@ -29,7 +29,7 @@ static u64 patterns[] __initdata = { 0x7a6c7258554e494cULL, /* yeah ;-) */ }; -static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) +static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) { printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", (unsigned long long) pattern, @@ -38,11 +38,11 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) memblock_reserve(start_bad, end_bad - start_bad); } -static void __init memtest(u64 pattern, u64 start_phys, u64 size) +static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size) { u64 *p, *start, *end; - u64 start_bad, last_bad; - u64 start_phys_aligned; + phys_addr_t start_bad, last_bad; + phys_addr_t start_phys_aligned; const size_t incr = sizeof(pattern); start_phys_aligned = ALIGN(start_phys, incr); @@ -69,14 +69,14 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size) reserve_bad_mem(pattern, start_bad, last_bad + incr); } -static void __init do_one_pass(u64 pattern, u64 start, u64 end) +static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end) { u64 i; phys_addr_t this_start, this_end; for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { - this_start = clamp_t(phys_addr_t, this_start, start, end); - this_end = clamp_t(phys_addr_t, this_end, start, end); + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); if (this_start < this_end) { printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", (unsigned long long)this_start, @@ -102,7 +102,7 @@ static int __init parse_memtest(char *arg) early_param("memtest", parse_memtest); -void __init early_memtest(unsigned long start, unsigned long end) +void __init early_memtest(phys_addr_t start, phys_addr_t end) { unsigned int i; unsigned int idx = 0; -- cgit v1.2.3