From 5866a75b50b381fa05028e7ff3f2272ed006e321 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 25 Jun 2020 15:08:35 +0200 Subject: arm64: Remove dev->archdata.iommu pointer There are no users left, all drivers have been converted to use the per-device private pointer offered by IOMMU core. Signed-off-by: Joerg Roedel Reviewed-by: Jerry Snitselaar Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200625130836.1916-13-joro@8bytes.org --- arch/arm64/include/asm/device.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/device.h b/arch/arm64/include/asm/device.h index 12b778d55342..996498751318 100644 --- a/arch/arm64/include/asm/device.h +++ b/arch/arm64/include/asm/device.h @@ -6,9 +6,6 @@ #define __ASM_DEVICE_H struct dev_archdata { -#ifdef CONFIG_IOMMU_API - void *iommu; /* private IOMMU data */ -#endif }; struct pdev_archdata { -- cgit v1.2.3 From 4c5a116ada953b86125ab7c70a57c57463a55a55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Aug 2020 22:37:48 +0200 Subject: vdso/treewide: Add vdso_data pointer argument to __arch_get_hw_counter() MIPS already uses and S390 will need the vdso data pointer in __arch_get_hw_counter(). This works nicely as long as the architecture does not support time namespaces in the VDSO. With time namespaces enabled the regular accessor to the vdso data pointer __arch_get_vdso_data() will return the namespace specific VDSO data page for tasks which are part of a non-root time namespace. This would cause the architectures which need the vdso data pointer in __arch_get_hw_counter() to access the wrong vdso data page. Add a vdso_data pointer argument to __arch_get_hw_counter() and hand it in from the call sites in the core code. For architectures which do not need the data pointer in their counter accessor function the compiler will just optimize it out. Fix up all existing architecture implementations and make MIPS utilize the pointer instead of invoking the accessor function. No functional change and no change in the resulting object code (except MIPS). Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/draft-87wo2ekuzn.fsf@nanos.tec.linutronix.de --- arch/arm/include/asm/vdso/gettimeofday.h | 3 ++- arch/arm64/include/asm/vdso/compat_gettimeofday.h | 3 ++- arch/arm64/include/asm/vdso/gettimeofday.h | 3 ++- arch/mips/include/asm/vdso/gettimeofday.h | 5 +++-- arch/riscv/include/asm/vdso/gettimeofday.h | 3 ++- arch/x86/include/asm/vdso/gettimeofday.h | 3 ++- lib/vdso/gettimeofday.c | 4 ++-- 7 files changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm/include/asm/vdso/gettimeofday.h b/arch/arm/include/asm/vdso/gettimeofday.h index 1b207cf07697..2134cbd5469f 100644 --- a/arch/arm/include/asm/vdso/gettimeofday.h +++ b/arch/arm/include/asm/vdso/gettimeofday.h @@ -113,7 +113,8 @@ static inline bool arm_vdso_hres_capable(void) } #define __arch_vdso_hres_capable arm_vdso_hres_capable -static __always_inline u64 __arch_get_hw_counter(int clock_mode) +static __always_inline u64 __arch_get_hw_counter(int clock_mode, + const struct vdso_data *vd) { #ifdef CONFIG_ARM_ARCH_TIMER u64 cycle_now; diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index 75cbae60455b..7508b0ac1d21 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -103,7 +103,8 @@ int clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts) return ret; } -static __always_inline u64 __arch_get_hw_counter(s32 clock_mode) +static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, + const struct vdso_data *vd) { u64 res; diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index 9c29ad3049f8..631ab1281633 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -64,7 +64,8 @@ int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) return ret; } -static __always_inline u64 __arch_get_hw_counter(s32 clock_mode) +static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, + const struct vdso_data *vd) { u64 res; diff --git a/arch/mips/include/asm/vdso/gettimeofday.h b/arch/mips/include/asm/vdso/gettimeofday.h index c63ddcaea54c..2203e2d0ae2a 100644 --- a/arch/mips/include/asm/vdso/gettimeofday.h +++ b/arch/mips/include/asm/vdso/gettimeofday.h @@ -167,7 +167,8 @@ static __always_inline u64 read_gic_count(const struct vdso_data *data) #endif -static __always_inline u64 __arch_get_hw_counter(s32 clock_mode) +static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, + const struct vdso_data *vd) { #ifdef CONFIG_CSRC_R4K if (clock_mode == VDSO_CLOCKMODE_R4K) @@ -175,7 +176,7 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode) #endif #ifdef CONFIG_CLKSRC_MIPS_GIC if (clock_mode == VDSO_CLOCKMODE_GIC) - return read_gic_count(get_vdso_data()); + return read_gic_count(vd); #endif /* * Core checks mode already. So this raced against a concurrent diff --git a/arch/riscv/include/asm/vdso/gettimeofday.h b/arch/riscv/include/asm/vdso/gettimeofday.h index 3099362d9f26..f839f16e0d2a 100644 --- a/arch/riscv/include/asm/vdso/gettimeofday.h +++ b/arch/riscv/include/asm/vdso/gettimeofday.h @@ -60,7 +60,8 @@ int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) return ret; } -static __always_inline u64 __arch_get_hw_counter(s32 clock_mode) +static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, + const struct vdso_data *vd) { /* * The purpose of csr_read(CSR_TIME) is to trap the system into diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index fb81fea99093..df01d7349d79 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -241,7 +241,8 @@ static u64 vread_hvclock(void) } #endif -static inline u64 __arch_get_hw_counter(s32 clock_mode) +static inline u64 __arch_get_hw_counter(s32 clock_mode, + const struct vdso_data *vd) { if (likely(clock_mode == VDSO_CLOCKMODE_TSC)) return (u64)rdtsc_ordered(); diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index bcc9a98a0524..2919f1698140 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -68,7 +68,7 @@ static int do_hres_timens(const struct vdso_data *vdns, clockid_t clk, if (unlikely(!vdso_clocksource_ok(vd))) return -1; - cycles = __arch_get_hw_counter(vd->clock_mode); + cycles = __arch_get_hw_counter(vd->clock_mode, vd); if (unlikely(!vdso_cycles_ok(cycles))) return -1; ns = vdso_ts->nsec; @@ -138,7 +138,7 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, if (unlikely(!vdso_clocksource_ok(vd))) return -1; - cycles = __arch_get_hw_counter(vd->clock_mode); + cycles = __arch_get_hw_counter(vd->clock_mode, vd); if (unlikely(!vdso_cycles_ok(cycles))) return -1; ns = vdso_ts->nsec; -- cgit v1.2.3 From d622ecec5f57ee7e27fbc6db222d8cbaf3c59478 Mon Sep 17 00:00:00 2001 From: Jia He Date: Tue, 11 Aug 2020 18:32:16 -0700 Subject: mm/memory_hotplug: introduce default dummy memory_add_physaddr_to_nid() This is to introduce a general dummy helper. memory_add_physaddr_to_nid() is a fallback option to get the nid in case NUMA_NO_NID is detected. After this patch, arm64/sh/s390 can simply use the general dummy version. PowerPC/x86/ia64 will still use their specific version. This is the preparation to set a fallback value for dev_dax->target_node. Signed-off-by: Jia He Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Cc: Dan Williams Cc: Michal Hocko Cc: Catalin Marinas Cc: Will Deacon Cc: Tony Luck Cc: Fenghua Yu Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Vishal Verma Cc: Dave Jiang Cc: Baoquan He Cc: Chuhong Yuan Cc: Mike Rapoport Cc: Logan Gunthorpe Cc: Masahiro Yamada Cc: Jonathan Cameron Cc: Kaly Xin Link: http://lkml.kernel.org/r/20200710031619.18762-2-justin.he@arm.com Signed-off-by: Linus Torvalds --- arch/arm64/mm/numa.c | 10 ---------- arch/ia64/mm/numa.c | 2 -- arch/sh/mm/init.c | 9 --------- arch/x86/mm/numa.c | 1 - mm/memory_hotplug.c | 10 ++++++++++ 5 files changed, 10 insertions(+), 22 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index aafcee3e3f7e..73f8b49d485c 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -461,13 +461,3 @@ void __init arm64_numa_init(void) numa_init(dummy_numa_init); } - -/* - * We hope that we will be hotplugging memory on nodes we already know about, - * such that acpi_get_node() succeeds and we never fall back to this... - */ -int memory_add_physaddr_to_nid(u64 addr) -{ - pr_warn("Unknown node for memory at 0x%llx, assuming node 0\n", addr); - return 0; -} diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c index 5e1015eb6d0d..f34964271101 100644 --- a/arch/ia64/mm/numa.c +++ b/arch/ia64/mm/numa.c @@ -106,7 +106,5 @@ int memory_add_physaddr_to_nid(u64 addr) return 0; return nid; } - -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif #endif diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 613de8096335..cd1379360f08 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -425,15 +425,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -#ifdef CONFIG_NUMA -int memory_add_physaddr_to_nid(u64 addr) -{ - /* Node 0 for now.. */ - return 0; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index b05f45e5e8e2..aa76ec2d359b 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -929,5 +929,4 @@ int memory_add_physaddr_to_nid(u64 start) nid = numa_meminfo.blk[0].nid; return nid; } -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ac6961abaa10..6289a909ea36 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -350,6 +350,16 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, return err; } +#ifdef CONFIG_NUMA +int __weak memory_add_physaddr_to_nid(u64 start) +{ + pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif + /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, -- cgit v1.2.3 From 428e2976a5bf7e7f5554286d7a5a33b8147b106a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Aug 2020 18:33:44 -0700 Subject: uaccess: remove segment_eq segment_eq is only used to implement uaccess_kernel. Just open code uaccess_kernel in the arch uaccess headers and remove one layer of indirection. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Acked-by: Greentime Hu Acked-by: Geert Uytterhoeven Cc: Nick Hu Cc: Vincent Chen Cc: Paul Walmsley Cc: Palmer Dabbelt Link: http://lkml.kernel.org/r/20200710135706.537715-5-hch@lst.de Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/uaccess.h | 2 +- arch/arc/include/asm/segment.h | 3 +-- arch/arm/include/asm/uaccess.h | 4 ++-- arch/arm64/include/asm/uaccess.h | 2 +- arch/csky/include/asm/segment.h | 2 +- arch/h8300/include/asm/segment.h | 2 +- arch/ia64/include/asm/uaccess.h | 2 +- arch/m68k/include/asm/segment.h | 2 +- arch/microblaze/include/asm/uaccess.h | 2 +- arch/mips/include/asm/uaccess.h | 2 +- arch/nds32/include/asm/uaccess.h | 2 +- arch/nios2/include/asm/uaccess.h | 2 +- arch/openrisc/include/asm/uaccess.h | 2 +- arch/parisc/include/asm/uaccess.h | 2 +- arch/powerpc/include/asm/uaccess.h | 3 +-- arch/riscv/include/asm/uaccess.h | 4 +--- arch/s390/include/asm/uaccess.h | 2 +- arch/sh/include/asm/segment.h | 3 +-- arch/sparc/include/asm/uaccess_32.h | 2 +- arch/sparc/include/asm/uaccess_64.h | 2 +- arch/x86/include/asm/uaccess.h | 2 +- arch/xtensa/include/asm/uaccess.h | 2 +- include/asm-generic/uaccess.h | 4 ++-- include/linux/uaccess.h | 2 -- 24 files changed, 25 insertions(+), 32 deletions(-) (limited to 'arch/arm64') diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h index 1fe2b56cb861..1b6f25efa247 100644 --- a/arch/alpha/include/asm/uaccess.h +++ b/arch/alpha/include/asm/uaccess.h @@ -20,7 +20,7 @@ #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) /* * Is a address valid? This does a straightforward calculation rather diff --git a/arch/arc/include/asm/segment.h b/arch/arc/include/asm/segment.h index 6a2a5be5026d..871f8ab11bfd 100644 --- a/arch/arc/include/asm/segment.h +++ b/arch/arc/include/asm/segment.h @@ -14,8 +14,7 @@ typedef unsigned long mm_segment_t; #define KERNEL_DS MAKE_MM_SEG(0) #define USER_DS MAKE_MM_SEG(TASK_SIZE) - -#define segment_eq(a, b) ((a) == (b)) +#define uaccess_kernel() (get_fs() == KERNEL_DS) #endif /* __ASSEMBLY__ */ #endif /* __ASMARC_SEGMENT_H */ diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index b5fdd30252f8..a13d90206472 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -76,7 +76,7 @@ static inline void set_fs(mm_segment_t fs) modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER); } -#define segment_eq(a, b) ((a) == (b)) +#define uaccess_kernel() (get_fs() == KERNEL_DS) /* * We use 33-bit arithmetic here. Success returns zero, failure returns @@ -267,7 +267,7 @@ extern int __put_user_8(void *, unsigned long long); */ #define USER_DS KERNEL_DS -#define segment_eq(a, b) (1) +#define uaccess_kernel() (true) #define __addr_ok(addr) ((void)(addr), 1) #define __range_ok(addr, size) ((void)(addr), 0) #define get_fs() (KERNEL_DS) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 8d7c466f809b..991dd5f031e4 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -50,7 +50,7 @@ static inline void set_fs(mm_segment_t fs) CONFIG_ARM64_UAO)); } -#define segment_eq(a, b) ((a) == (b)) +#define uaccess_kernel() (get_fs() == KERNEL_DS) /* * Test whether a block of memory is a valid user space address. diff --git a/arch/csky/include/asm/segment.h b/arch/csky/include/asm/segment.h index db2640d5f575..79ede9b1a646 100644 --- a/arch/csky/include/asm/segment.h +++ b/arch/csky/include/asm/segment.h @@ -13,6 +13,6 @@ typedef struct { #define USER_DS ((mm_segment_t) { 0x80000000UL }) #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #endif /* __ASM_CSKY_SEGMENT_H */ diff --git a/arch/h8300/include/asm/segment.h b/arch/h8300/include/asm/segment.h index a407978f9f9f..37950725d9b9 100644 --- a/arch/h8300/include/asm/segment.h +++ b/arch/h8300/include/asm/segment.h @@ -33,7 +33,7 @@ static inline mm_segment_t get_fs(void) return USER_DS; } -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #endif /* __ASSEMBLY__ */ diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h index 8aa473a4b0f4..179243c3dfc7 100644 --- a/arch/ia64/include/asm/uaccess.h +++ b/arch/ia64/include/asm/uaccess.h @@ -50,7 +50,7 @@ #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) /* * When accessing user memory, we need to make sure the entire area really is in diff --git a/arch/m68k/include/asm/segment.h b/arch/m68k/include/asm/segment.h index c6686559e9b7..2b5e68a71ef7 100644 --- a/arch/m68k/include/asm/segment.h +++ b/arch/m68k/include/asm/segment.h @@ -52,7 +52,7 @@ static inline void set_fs(mm_segment_t val) #define set_fs(x) (current_thread_info()->addr_limit = (x)) #endif -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #endif /* __ASSEMBLY__ */ diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h index 6723c56ec378..304b04ffea2f 100644 --- a/arch/microblaze/include/asm/uaccess.h +++ b/arch/microblaze/include/asm/uaccess.h @@ -41,7 +41,7 @@ # define get_fs() (current_thread_info()->addr_limit) # define set_fs(val) (current_thread_info()->addr_limit = (val)) -# define segment_eq(a, b) ((a).seg == (b).seg) +# define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #ifndef CONFIG_MMU diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index 62b298c50905..61fc01f177a6 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -72,7 +72,7 @@ extern u64 __ua_limit; #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) /* * eva_kernel_access() - determine whether kernel memory access on an EVA system diff --git a/arch/nds32/include/asm/uaccess.h b/arch/nds32/include/asm/uaccess.h index 3a9219f53ee0..010ba5f1d7dd 100644 --- a/arch/nds32/include/asm/uaccess.h +++ b/arch/nds32/include/asm/uaccess.h @@ -44,7 +44,7 @@ static inline void set_fs(mm_segment_t fs) current_thread_info()->addr_limit = fs; } -#define segment_eq(a, b) ((a) == (b)) +#define uaccess_kernel() (get_fs() == KERNEL_DS) #define __range_ok(addr, size) (size <= get_fs() && addr <= (get_fs() -size)) diff --git a/arch/nios2/include/asm/uaccess.h b/arch/nios2/include/asm/uaccess.h index e83f831a76f9..a741abbed6fb 100644 --- a/arch/nios2/include/asm/uaccess.h +++ b/arch/nios2/include/asm/uaccess.h @@ -30,7 +30,7 @@ #define get_fs() (current_thread_info()->addr_limit) #define set_fs(seg) (current_thread_info()->addr_limit = (seg)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define __access_ok(addr, len) \ (((signed long)(((long)get_fs().seg) & \ diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h index 17c24f14615f..48b691530d3e 100644 --- a/arch/openrisc/include/asm/uaccess.h +++ b/arch/openrisc/include/asm/uaccess.h @@ -43,7 +43,7 @@ #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define segment_eq(a, b) ((a) == (b)) +#define uaccess_kernel() (get_fs() == KERNEL_DS) /* Ensure that the range from addr to addr+size is all within the process' * address space diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index ebbb9ffe038c..ed2cd4fb479b 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -14,7 +14,7 @@ #define KERNEL_DS ((mm_segment_t){0}) #define USER_DS ((mm_segment_t){1}) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 64c04ab09112..00699903f1ef 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -38,8 +38,7 @@ static inline void set_fs(mm_segment_t fs) set_thread_flag(TIF_FSCHECK); } -#define segment_eq(a, b) ((a).seg == (b).seg) - +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define user_addr_max() (get_fs().seg) #ifdef __powerpc64__ diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 22de922d6ecb..f56c66b3f5fe 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -64,11 +64,9 @@ static inline void set_fs(mm_segment_t fs) current_thread_info()->addr_limit = fs; } -#define segment_eq(a, b) ((a).seg == (b).seg) - +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define user_addr_max() (get_fs().seg) - /** * access_ok: - Checks if a user space pointer is valid * @addr: User space pointer to start of block to check diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 324438889fe1..f09444d6aeab 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -32,7 +32,7 @@ #define USER_DS_SACF (3) #define get_fs() (current->thread.mm_segment) -#define segment_eq(a,b) (((a) & 2) == ((b) & 2)) +#define uaccess_kernel() ((get_fs() & 2) == KERNEL_DS) void set_fs(mm_segment_t fs); diff --git a/arch/sh/include/asm/segment.h b/arch/sh/include/asm/segment.h index 33d1d28057cb..02e54a3335d6 100644 --- a/arch/sh/include/asm/segment.h +++ b/arch/sh/include/asm/segment.h @@ -24,8 +24,7 @@ typedef struct { #define USER_DS KERNEL_DS #endif -#define segment_eq(a, b) ((a).seg == (b).seg) - +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h index d6d8413eca83..0a2d3ebc4bb8 100644 --- a/arch/sparc/include/asm/uaccess_32.h +++ b/arch/sparc/include/asm/uaccess_32.h @@ -28,7 +28,7 @@ #define get_fs() (current->thread.current_ds) #define set_fs(val) ((current->thread.current_ds) = (val)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) /* We have there a nice not-mapped page at PAGE_OFFSET - PAGE_SIZE, so that this test * can be fairly lightweight. diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h index bf9d330073b2..698cf69f74e9 100644 --- a/arch/sparc/include/asm/uaccess_64.h +++ b/arch/sparc/include/asm/uaccess_64.h @@ -32,7 +32,7 @@ #define get_fs() ((mm_segment_t){(current_thread_info()->current_ds)}) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define set_fs(val) \ do { \ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 2f3e8f2a958f..ecefaffd15d4 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -33,7 +33,7 @@ static inline void set_fs(mm_segment_t fs) set_thread_flag(TIF_FSCHECK); } -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define user_addr_max() (current->thread.addr_limit.seg) /* diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h index e57f0d0a88d8..b9758119feca 100644 --- a/arch/xtensa/include/asm/uaccess.h +++ b/arch/xtensa/include/asm/uaccess.h @@ -35,7 +35,7 @@ #define get_fs() (current->thread.current_ds) #define set_fs(val) (current->thread.current_ds = (val)) -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define __kernel_ok (uaccess_kernel()) #define __user_ok(addr, size) \ diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index e935318804f8..ba68ee4dabfa 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -86,8 +86,8 @@ static inline void set_fs(mm_segment_t fs) } #endif -#ifndef segment_eq -#define segment_eq(a, b) ((a).seg == (b).seg) +#ifndef uaccess_kernel +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #endif #define access_ok(addr, size) __access_ok((unsigned long)(addr),(size)) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 0a76ddc07d59..5c62d0c6f15b 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -6,8 +6,6 @@ #include #include -#define uaccess_kernel() segment_eq(get_fs(), KERNEL_DS) - #include /* -- cgit v1.2.3 From 3d13f313ce4c34c524ccc37986fe77172f601ff3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Aug 2020 18:33:47 -0700 Subject: uaccess: add force_uaccess_{begin,end} helpers Add helpers to wrap the get_fs/set_fs magic for undoing any damange done by set_fs(KERNEL_DS). There is no real functional benefit, but this documents the intent of these calls better, and will allow stubbing the functions out easily for kernels builds that do not allow address space overrides in the future. [hch@lst.de: drop two incorrect hunks, fix a commit log typo] Link: http://lkml.kernel.org/r/20200714105505.935079-6-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Acked-by: Mark Rutland Acked-by: Greentime Hu Acked-by: Geert Uytterhoeven Cc: Nick Hu Cc: Vincent Chen Cc: Paul Walmsley Cc: Palmer Dabbelt Link: http://lkml.kernel.org/r/20200710135706.537715-6-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm64/kernel/sdei.c | 2 +- arch/m68k/include/asm/tlbflush.h | 6 +++--- arch/mips/kernel/unaligned.c | 27 +++++++++++++-------------- arch/nds32/mm/alignment.c | 7 +++---- arch/sh/kernel/traps_32.c | 12 +++++------- drivers/firmware/arm_sdei.c | 5 ++--- include/linux/uaccess.h | 18 ++++++++++++++++++ kernel/events/callchain.c | 5 ++--- kernel/events/core.c | 5 ++--- kernel/kthread.c | 5 ++--- kernel/stacktrace.c | 5 ++--- mm/maccess.c | 22 ++++++++++------------ 12 files changed, 63 insertions(+), 56 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index dab88260b137..7689f2031c0c 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -180,7 +180,7 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, /* * We didn't take an exception to get here, set PAN. UAO will be cleared - * by sdei_event_handler()s set_fs(USER_DS) call. + * by sdei_event_handler()s force_uaccess_begin() call. */ __uaccess_enable_hw_pan(); diff --git a/arch/m68k/include/asm/tlbflush.h b/arch/m68k/include/asm/tlbflush.h index 191e75a6bb24..5337bc2c262f 100644 --- a/arch/m68k/include/asm/tlbflush.h +++ b/arch/m68k/include/asm/tlbflush.h @@ -85,10 +85,10 @@ static inline void flush_tlb_mm(struct mm_struct *mm) static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { if (vma->vm_mm == current->active_mm) { - mm_segment_t old_fs = get_fs(); - set_fs(USER_DS); + mm_segment_t old_fs = force_uaccess_begin(); + __flush_tlb_one(addr); - set_fs(old_fs); + force_uaccess_end(old_fs); } } diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index 0adce604fa44..126a5f3f4e4c 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -191,17 +191,16 @@ static void emulate_load_store_insn(struct pt_regs *regs, * memory, so we need to "switch" the address limit to * user space, so that address check can work properly. */ - seg = get_fs(); - set_fs(USER_DS); + seg = force_uaccess_begin(); switch (insn.spec3_format.func) { case lhe_op: if (!access_ok(addr, 2)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } LoadHWE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } compute_return_epc(regs); @@ -209,12 +208,12 @@ static void emulate_load_store_insn(struct pt_regs *regs, break; case lwe_op: if (!access_ok(addr, 4)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } LoadWE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } compute_return_epc(regs); @@ -222,12 +221,12 @@ static void emulate_load_store_insn(struct pt_regs *regs, break; case lhue_op: if (!access_ok(addr, 2)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } LoadHWUE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } compute_return_epc(regs); @@ -235,35 +234,35 @@ static void emulate_load_store_insn(struct pt_regs *regs, break; case she_op: if (!access_ok(addr, 2)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } compute_return_epc(regs); value = regs->regs[insn.spec3_format.rt]; StoreHWE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } break; case swe_op: if (!access_ok(addr, 4)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } compute_return_epc(regs); value = regs->regs[insn.spec3_format.rt]; StoreWE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } break; default: - set_fs(seg); + force_uaccess_end(seg); goto sigill; } - set_fs(seg); + force_uaccess_end(seg); } #endif break; diff --git a/arch/nds32/mm/alignment.c b/arch/nds32/mm/alignment.c index c8b9061a2ee3..1eb7ded6992b 100644 --- a/arch/nds32/mm/alignment.c +++ b/arch/nds32/mm/alignment.c @@ -512,7 +512,7 @@ int do_unaligned_access(unsigned long addr, struct pt_regs *regs) { unsigned long inst; int ret = -EFAULT; - mm_segment_t seg = get_fs(); + mm_segment_t seg; inst = get_inst(regs->ipc); @@ -520,13 +520,12 @@ int do_unaligned_access(unsigned long addr, struct pt_regs *regs) "Faulting addr: 0x%08lx, pc: 0x%08lx [inst: 0x%08lx ]\n", addr, regs->ipc, inst); - set_fs(USER_DS); - + seg = force_uaccess_begin(); if (inst & NDS32_16BIT_INSTRUCTION) ret = do_16((inst >> 16) & 0xffff, regs); else ret = do_32(inst, regs); - set_fs(seg); + force_uaccess_end(seg); return ret; } diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index 058c6181bb30..b62ad0ba2395 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -482,8 +482,6 @@ asmlinkage void do_address_error(struct pt_regs *regs, error_code = lookup_exception_vector(); #endif - oldfs = get_fs(); - if (user_mode(regs)) { int si_code = BUS_ADRERR; unsigned int user_action; @@ -491,13 +489,13 @@ asmlinkage void do_address_error(struct pt_regs *regs, local_irq_enable(); inc_unaligned_user_access(); - set_fs(USER_DS); + oldfs = force_uaccess_begin(); if (copy_from_user(&instruction, (insn_size_t *)(regs->pc & ~1), sizeof(instruction))) { - set_fs(oldfs); + force_uaccess_end(oldfs); goto uspace_segv; } - set_fs(oldfs); + force_uaccess_end(oldfs); /* shout about userspace fixups */ unaligned_fixups_notify(current, instruction, regs); @@ -520,11 +518,11 @@ fixup: goto uspace_segv; } - set_fs(USER_DS); + oldfs = force_uaccess_begin(); tmp = handle_unaligned_access(instruction, regs, &user_mem_access, 0, address); - set_fs(oldfs); + force_uaccess_end(oldfs); if (tmp == 0) return; /* sorted */ diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index e7e36aab2386..b4b9ce97f415 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -1136,15 +1136,14 @@ int sdei_event_handler(struct pt_regs *regs, * access kernel memory. * Do the same here because this doesn't come via the same entry code. */ - orig_addr_limit = get_fs(); - set_fs(USER_DS); + orig_addr_limit = force_uaccess_begin(); err = arg->callback(event_num, regs, arg->callback_arg); if (err) pr_err_ratelimited("event %u on CPU %u failed with error: %d\n", event_num, smp_processor_id(), err); - set_fs(orig_addr_limit); + force_uaccess_end(orig_addr_limit); return err; } diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5c62d0c6f15b..94b285411659 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -8,6 +8,24 @@ #include +/* + * Force the uaccess routines to be wired up for actual userspace access, + * overriding any possible set_fs(KERNEL_DS) still lingering around. Undone + * using force_uaccess_end below. + */ +static inline mm_segment_t force_uaccess_begin(void) +{ + mm_segment_t fs = get_fs(); + + set_fs(USER_DS); + return fs; +} + +static inline void force_uaccess_end(mm_segment_t oldfs) +{ + set_fs(oldfs); +} + /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c6ce894e4ce9..58cbe357fb2b 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -217,10 +217,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); perf_callchain_user(&ctx, regs); - set_fs(fs); + force_uaccess_end(fs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index d1f0a7e5b182..6961333ebad5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6453,10 +6453,9 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); rem = __output_copy_user(handle, (void *) sp, dump_size); - set_fs(fs); + force_uaccess_end(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); diff --git a/kernel/kthread.c b/kernel/kthread.c index b2807e7be772..3edaa380dc7b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1258,8 +1258,7 @@ void kthread_use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); - to_kthread(tsk)->oldfs = get_fs(); - set_fs(USER_DS); + to_kthread(tsk)->oldfs = force_uaccess_begin(); } EXPORT_SYMBOL_GPL(kthread_use_mm); @@ -1274,7 +1273,7 @@ void kthread_unuse_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); - set_fs(to_kthread(tsk)->oldfs); + force_uaccess_end(to_kthread(tsk)->oldfs); task_lock(tsk); sync_mm_rss(mm); diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 2af66e449aa6..946f44a9e86a 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -233,10 +233,9 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) if (current->flags & PF_KTHREAD) return 0; - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); - set_fs(fs); + force_uaccess_end(fs); return c.len; } diff --git a/mm/maccess.c b/mm/maccess.c index f98ff91e32c6..3bd70405f2d8 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -205,15 +205,14 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { long ret = -EFAULT; - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs = force_uaccess_begin(); - set_fs(USER_DS); if (access_ok(src, size)) { pagefault_disable(); ret = __copy_from_user_inatomic(dst, src, size); pagefault_enable(); } - set_fs(old_fs); + force_uaccess_end(old_fs); if (ret) return -EFAULT; @@ -233,15 +232,14 @@ EXPORT_SYMBOL_GPL(copy_from_user_nofault); long copy_to_user_nofault(void __user *dst, const void *src, size_t size) { long ret = -EFAULT; - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs = force_uaccess_begin(); - set_fs(USER_DS); if (access_ok(dst, size)) { pagefault_disable(); ret = __copy_to_user_inatomic(dst, src, size); pagefault_enable(); } - set_fs(old_fs); + force_uaccess_end(old_fs); if (ret) return -EFAULT; @@ -270,17 +268,17 @@ EXPORT_SYMBOL_GPL(copy_to_user_nofault); long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count) { - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs; long ret; if (unlikely(count <= 0)) return 0; - set_fs(USER_DS); + old_fs = force_uaccess_begin(); pagefault_disable(); ret = strncpy_from_user(dst, unsafe_addr, count); pagefault_enable(); - set_fs(old_fs); + force_uaccess_end(old_fs); if (ret >= count) { ret = count; @@ -310,14 +308,14 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, */ long strnlen_user_nofault(const void __user *unsafe_addr, long count) { - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs; int ret; - set_fs(USER_DS); + old_fs = force_uaccess_begin(); pagefault_disable(); ret = strnlen_user(unsafe_addr, count); pagefault_enable(); - set_fs(old_fs); + force_uaccess_end(old_fs); return ret; } -- cgit v1.2.3 From bce617edecada007aee8610fbe2c14d10b8de2f6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:37:44 -0700 Subject: mm: do page fault accounting in handle_mm_fault Patch series "mm: Page fault accounting cleanups", v5. This is v5 of the pf accounting cleanup series. It originates from Gerald Schaefer's report on an issue a week ago regarding to incorrect page fault accountings for retried page fault after commit 4064b9827063 ("mm: allow VM_FAULT_RETRY for multiple times"): https://lore.kernel.org/lkml/20200610174811.44b94525@thinkpad/ What this series did: - Correct page fault accounting: we do accounting for a page fault (no matter whether it's from #PF handling, or gup, or anything else) only with the one that completed the fault. For example, page fault retries should not be counted in page fault counters. Same to the perf events. - Unify definition of PERF_COUNT_SW_PAGE_FAULTS: currently this perf event is used in an adhoc way across different archs. Case (1): for many archs it's done at the entry of a page fault handler, so that it will also cover e.g. errornous faults. Case (2): for some other archs, it is only accounted when the page fault is resolved successfully. Case (3): there're still quite some archs that have not enabled this perf event. Since this series will touch merely all the archs, we unify this perf event to always follow case (1), which is the one that makes most sense. And since we moved the accounting into handle_mm_fault, the other two MAJ/MIN perf events are well taken care of naturally. - Unify definition of "major faults": the definition of "major fault" is slightly changed when used in accounting (not VM_FAULT_MAJOR). More information in patch 1. - Always account the page fault onto the one that triggered the page fault. This does not matter much for #PF handlings, but mostly for gup. More information on this in patch 25. Patchset layout: Patch 1: Introduced the accounting in handle_mm_fault(), not enabled. Patch 2-23: Enable the new accounting for arch #PF handlers one by one. Patch 24: Enable the new accounting for the rest outliers (gup, iommu, etc.) Patch 25: Cleanup GUP task_struct pointer since it's not needed any more This patch (of 25): This is a preparation patch to move page fault accountings into the general code in handle_mm_fault(). This includes both the per task flt_maj/flt_min counters, and the major/minor page fault perf events. To do this, the pt_regs pointer is passed into handle_mm_fault(). PERF_COUNT_SW_PAGE_FAULTS should still be kept in per-arch page fault handlers. So far, all the pt_regs pointer that passed into handle_mm_fault() is NULL, which means this patch should have no intented functional change. Suggested-by: Linus Torvalds Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Cc: Albert Ou Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: Dave Hansen Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Greentime Hu Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James E.J. Bottomley Cc: John Hubbard Cc: Jonas Bonn Cc: Ley Foon Tan Cc: "Luck, Tony" Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Nick Hu Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200707225021.200906-1-peterx@redhat.com Link: http://lkml.kernel.org/r/20200707225021.200906-2-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/alpha/mm/fault.c | 2 +- arch/arc/mm/fault.c | 2 +- arch/arm/mm/fault.c | 2 +- arch/arm64/mm/fault.c | 2 +- arch/csky/mm/fault.c | 3 +- arch/hexagon/mm/vm_fault.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/microblaze/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/nds32/mm/fault.c | 2 +- arch/nios2/mm/fault.c | 2 +- arch/openrisc/mm/fault.c | 2 +- arch/parisc/mm/fault.c | 2 +- arch/powerpc/mm/copro_fault.c | 2 +- arch/powerpc/mm/fault.c | 2 +- arch/riscv/mm/fault.c | 2 +- arch/s390/mm/fault.c | 2 +- arch/sh/mm/fault.c | 2 +- arch/sparc/mm/fault_32.c | 4 +-- arch/sparc/mm/fault_64.c | 2 +- arch/um/kernel/trap.c | 2 +- arch/x86/mm/fault.c | 2 +- arch/xtensa/mm/fault.c | 2 +- drivers/iommu/amd/iommu_v2.c | 2 +- drivers/iommu/intel/svm.c | 3 +- include/linux/mm.h | 7 +++-- mm/gup.c | 4 +-- mm/hmm.c | 3 +- mm/ksm.c | 3 +- mm/memory.c | 64 ++++++++++++++++++++++++++++++++++++++++++- 31 files changed, 103 insertions(+), 34 deletions(-) (limited to 'arch/arm64') diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index c2303a8c2b9f..1983e43a5e2f 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -148,7 +148,7 @@ retry: /* If for any reason at all we couldn't handle the fault, make sure we exit gracefully rather than endlessly redo the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 7287c793d1c9..587dea524e6b 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -130,7 +130,7 @@ retry: goto bad_area; } - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index c6550eddfce1..01a8e0f8fef7 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -224,7 +224,7 @@ good_area: goto out; } - return handle_mm_fault(vma, addr & PAGE_MASK, flags); + return handle_mm_fault(vma, addr & PAGE_MASK, flags, NULL); check_stack: /* Don't allow expansion below FIRST_USER_ADDRESS */ diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 8afb238ff335..be29f4076fe3 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -428,7 +428,7 @@ static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, */ if (!(vma->vm_flags & vm_flags)) return VM_FAULT_BADACCESS; - return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags); + return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, NULL); } static bool is_el0_instruction_abort(unsigned int esr) diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index b1dce9f2f04d..b252e6e4d32f 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -150,7 +150,8 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(vma, address, write ? FAULT_FLAG_WRITE : 0, + NULL); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index cd3808f96b93..f12f330e7946 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -88,7 +88,7 @@ good_area: break; } - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 3a4dec334cc5..abf2808f9b4b 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -143,7 +143,7 @@ retry: * sure we exit gracefully rather than endlessly redo the * fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 508abb63da67..08b35a318ebe 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -134,7 +134,7 @@ good_area: * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); pr_debug("handle_mm_fault returns %x\n", fault); if (fault_signal_pending(fault, regs)) diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index a2bfe587b491..1a3d4c4ca28b 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -214,7 +214,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 01b168a90434..b1db39784db9 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -152,7 +152,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c index 8fb73f6401a0..d0ecc8fb5b23 100644 --- a/arch/nds32/mm/fault.c +++ b/arch/nds32/mm/fault.c @@ -206,7 +206,7 @@ good_area: * the fault. */ - fault = handle_mm_fault(vma, addr, flags); + fault = handle_mm_fault(vma, addr, flags, NULL); /* * If we need to retry but a fatal signal is pending, handle the diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index 4112ef0e247e..86beb9a2698e 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -131,7 +131,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index d2224ccca294..3daa491d1edb 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -159,7 +159,7 @@ good_area: * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 66ac0719bd49..e32d06928c24 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -302,7 +302,7 @@ good_area: * fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index b83abbead4a2..2d0276abe0a6 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -64,7 +64,7 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, } ret = 0; - *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0); + *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL); if (unlikely(*flt & VM_FAULT_ERROR)) { if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 925a7231abb3..c6a5225a3521 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -511,7 +511,7 @@ retry: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); major |= fault & VM_FAULT_MAJOR; diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 5873835a3e6b..30c1124d0fb6 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -109,7 +109,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, addr, flags); + fault = handle_mm_fault(vma, addr, flags, NULL); /* * If we need to retry but a fatal signal is pending, handle the diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index aebf9183bedd..ad783aaaf649 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -476,7 +476,7 @@ retry: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) { fault = VM_FAULT_SIGNAL; if (flags & FAULT_FLAG_RETRY_NOWAIT) diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index fbe1f2fe9a8c..3c0a11827f7e 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -482,7 +482,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (unlikely(fault & (VM_FAULT_RETRY | VM_FAULT_ERROR))) if (mm_fault_error(regs, error_code, address, fault)) diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index cfef656eda0f..06af03db4417 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -234,7 +234,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; @@ -410,7 +410,7 @@ good_area: if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } - switch (handle_mm_fault(vma, address, flags)) { + switch (handle_mm_fault(vma, address, flags, NULL)) { case VM_FAULT_SIGBUS: case VM_FAULT_OOM: goto do_sigbus; diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index a3806614e4dc..9ebee14ee893 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -422,7 +422,7 @@ good_area: goto bad_area; } - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) goto exit_exception; diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 2b3afa354a90..8d9870d76da1 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -71,7 +71,7 @@ good_area: do { vm_fault_t fault; - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) goto out_nosemaphore; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 0c7643d9f7cb..e1bf5555d80a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1291,7 +1291,7 @@ good_area: * userland). The return to userland is identified whenever * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); major |= fault & VM_FAULT_MAJOR; /* Quick path to respond to signals */ diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index c128dcc7c85b..e72c8c1359a6 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -107,7 +107,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); + fault = handle_mm_fault(vma, address, flags, NULL); if (fault_signal_pending(fault, regs)) return; diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c index e4b025c5637c..c259108ab6dd 100644 --- a/drivers/iommu/amd/iommu_v2.c +++ b/drivers/iommu/amd/iommu_v2.c @@ -495,7 +495,7 @@ static void do_fault(struct work_struct *work) if (access_error(vma, fault)) goto out; - ret = handle_mm_fault(vma, address, flags); + ret = handle_mm_fault(vma, address, flags, NULL); out: mmap_read_unlock(mm); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 6c87c807a0ab..5ae59a6ad681 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -872,7 +872,8 @@ static irqreturn_t prq_event_thread(int irq, void *d) goto invalid; ret = handle_mm_fault(vma, address, - req->wr_req ? FAULT_FLAG_WRITE : 0); + req->wr_req ? FAULT_FLAG_WRITE : 0, + NULL); if (ret & VM_FAULT_ERROR) goto invalid; diff --git a/include/linux/mm.h b/include/linux/mm.h index f97b10117d44..ec0ffb423769 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -38,6 +38,7 @@ struct file_ra_state; struct user_struct; struct writeback_control; struct bdi_writeback; +struct pt_regs; void init_mm_internals(void); @@ -1658,7 +1659,8 @@ int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, - unsigned long address, unsigned int flags); + unsigned long address, unsigned int flags, + struct pt_regs *regs); extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); @@ -1668,7 +1670,8 @@ void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, - unsigned long address, unsigned int flags) + unsigned long address, unsigned int flags, + struct pt_regs *regs) { /* should never happen if there's no MMU */ BUG(); diff --git a/mm/gup.c b/mm/gup.c index e9d1d0cc18f0..ae7121d729fa 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -884,7 +884,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_TRIED; } - ret = handle_mm_fault(vma, address, fault_flags); + ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, *flags); @@ -1238,7 +1238,7 @@ retry: fatal_signal_pending(current)) return -EINTR; - ret = handle_mm_fault(vma, address, fault_flags); + ret = handle_mm_fault(vma, address, fault_flags, NULL); major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); diff --git a/mm/hmm.c b/mm/hmm.c index bb279319bf40..943cb2ba4442 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -75,7 +75,8 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end, } for (; addr < end; addr += PAGE_SIZE) - if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR) + if (handle_mm_fault(vma, addr, fault_flags, NULL) & + VM_FAULT_ERROR) return -EFAULT; return -EBUSY; } diff --git a/mm/ksm.c b/mm/ksm.c index 217842a66912..0aa2247bddd7 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -480,7 +480,8 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) break; if (PageKsm(page)) ret = handle_mm_fault(vma, addr, - FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, + NULL); else ret = VM_FAULT_WRITE; put_page(page); diff --git a/mm/memory.c b/mm/memory.c index 325bb575e7ec..9b7d35734caa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -71,6 +71,8 @@ #include #include #include +#include +#include #include @@ -4356,6 +4358,64 @@ retry_pud: return handle_pte_fault(&vmf); } +/** + * mm_account_fault - Do page fault accountings + * + * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting + * of perf event counters, but we'll still do the per-task accounting to + * the task who triggered this page fault. + * @address: the faulted address. + * @flags: the fault flags. + * @ret: the fault retcode. + * + * This will take care of most of the page fault accountings. Meanwhile, it + * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter + * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should + * still be in per-arch page fault handlers at the entry of page fault. + */ +static inline void mm_account_fault(struct pt_regs *regs, + unsigned long address, unsigned int flags, + vm_fault_t ret) +{ + bool major; + + /* + * We don't do accounting for some specific faults: + * + * - Unsuccessful faults (e.g. when the address wasn't valid). That + * includes arch_vma_access_permitted() failing before reaching here. + * So this is not a "this many hardware page faults" counter. We + * should use the hw profiling for that. + * + * - Incomplete faults (VM_FAULT_RETRY). They will only be counted + * once they're completed. + */ + if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY)) + return; + + /* + * We define the fault as a major fault when the final successful fault + * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't + * handle it immediately previously). + */ + major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED); + + /* + * If the fault is done for GUP, regs will be NULL, and we will skip + * the fault accounting. + */ + if (!regs) + return; + + if (major) { + current->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); + } else { + current->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); + } +} + /* * By the time we get here, we already hold the mm semaphore * @@ -4363,7 +4423,7 @@ retry_pud: * return value. See filemap_fault() and __lock_page_or_retry(). */ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags) + unsigned int flags, struct pt_regs *regs) { vm_fault_t ret; @@ -4404,6 +4464,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, mem_cgroup_oom_synchronize(false); } + mm_account_fault(regs, address, flags, ret); + return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); -- cgit v1.2.3 From 6a1bb025d28e1026fead73b7b33e2dfccba3f4d2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:37:57 -0700 Subject: mm/arm64: use general page fault accounting Use the general page fault accounting by passing regs into handle_mm_fault(). It naturally solve the issue of multiple page fault accounting when page fault retry happened. To do this, we pass pt_regs pointer into __do_page_fault(). Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Acked-by: Will Deacon Cc: Catalin Marinas Link: http://lkml.kernel.org/r/20200707225021.200906-6-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/arm64/mm/fault.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index be29f4076fe3..f07333e86c2f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -404,7 +404,8 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re #define VM_FAULT_BADACCESS 0x020000 static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, - unsigned int mm_flags, unsigned long vm_flags) + unsigned int mm_flags, unsigned long vm_flags, + struct pt_regs *regs) { struct vm_area_struct *vma = find_vma(mm, addr); @@ -428,7 +429,7 @@ static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, */ if (!(vma->vm_flags & vm_flags)) return VM_FAULT_BADACCESS; - return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, NULL); + return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, regs); } static bool is_el0_instruction_abort(unsigned int esr) @@ -450,7 +451,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, { const struct fault_info *inf; struct mm_struct *mm = current->mm; - vm_fault_t fault, major = 0; + vm_fault_t fault; unsigned long vm_flags = VM_ACCESS_FLAGS; unsigned int mm_flags = FAULT_FLAG_DEFAULT; @@ -516,8 +517,7 @@ retry: #endif } - fault = __do_page_fault(mm, addr, mm_flags, vm_flags); - major |= fault & VM_FAULT_MAJOR; + fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs); /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { @@ -538,25 +538,8 @@ retry: * Handle the "normal" (no error) case first. */ if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | - VM_FAULT_BADACCESS)))) { - /* - * Major/minor page fault accounting is only done - * once. If we go through a retry, it is extremely - * likely that the page will be found in page cache at - * that point. - */ - if (major) { - current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, - addr); - } else { - current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, - addr); - } - + VM_FAULT_BADACCESS)))) return 0; - } /* * If we are in kernel mode at this point, we have no context to -- cgit v1.2.3 From 88db0aa2421666d2f73486d15b239a4521983d55 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 14 Aug 2020 17:31:07 -0700 Subject: all arch: remove system call sys_sysctl Since commit 61a47c1ad3a4dc ("sysctl: Remove the sysctl system call"), sys_sysctl is actually unavailable: any input can only return an error. We have been warning about people using the sysctl system call for years and believe there are no more users. Even if there are users of this interface if they have not complained or fixed their code by now they probably are not going to, so there is no point in warning them any longer. So completely remove sys_sysctl on all architectures. [nixiaoming@huawei.com: s390: fix build error for sys_call_table_emu] Link: http://lkml.kernel.org/r/20200618141426.16884-1-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Signed-off-by: Andrew Morton Acked-by: Will Deacon [arm/arm64] Acked-by: "Eric W. Biederman" Cc: Aleksa Sarai Cc: Alexander Shishkin Cc: Al Viro Cc: Andi Kleen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bin Meng Cc: Borislav Petkov Cc: Brian Gerst Cc: Catalin Marinas Cc: chenzefeng Cc: Christian Borntraeger Cc: Christian Brauner Cc: Chris Zankel Cc: David Howells Cc: David S. Miller Cc: Diego Elio Pettenò Cc: Dmitry Vyukov Cc: Dominik Brodowski Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Iurii Zaikin Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: Jiri Olsa Cc: Kars de Jong Cc: Kees Cook Cc: Krzysztof Kozlowski Cc: Luis Chamberlain Cc: Marco Elver Cc: Mark Rutland Cc: Martin K. Petersen Cc: Masahiro Yamada Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Miklos Szeredi Cc: Minchan Kim Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Nick Piggin Cc: Oleg Nesterov Cc: Olof Johansson Cc: Paul Burton Cc: "Paul E. McKenney" Cc: Paul Mackerras Cc: Peter Zijlstra (Intel) Cc: Randy Dunlap Cc: Ravi Bangoria Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sami Tolvanen Cc: Sargun Dhillon Cc: Stephen Rothwell Cc: Sudeep Holla Cc: Sven Schnelle Cc: Thiago Jung Bauermann Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Yoshinori Sato Cc: Zhou Yanjie Link: http://lkml.kernel.org/r/20200616030734.87257-1-nixiaoming@huawei.com Signed-off-by: Linus Torvalds --- arch/alpha/kernel/syscalls/syscall.tbl | 2 +- arch/arm/configs/am200epdkit_defconfig | 1 - arch/arm/tools/syscall.tbl | 2 +- arch/arm64/include/asm/unistd32.h | 4 +- arch/ia64/kernel/syscalls/syscall.tbl | 2 +- arch/m68k/kernel/syscalls/syscall.tbl | 2 +- arch/microblaze/kernel/syscalls/syscall.tbl | 2 +- arch/mips/configs/cu1000-neo_defconfig | 1 - arch/mips/kernel/syscalls/syscall_n32.tbl | 2 +- arch/mips/kernel/syscalls/syscall_n64.tbl | 2 +- arch/mips/kernel/syscalls/syscall_o32.tbl | 2 +- arch/parisc/kernel/syscalls/syscall.tbl | 2 +- arch/powerpc/kernel/syscalls/syscall.tbl | 2 +- arch/s390/kernel/syscalls/syscall.tbl | 2 +- arch/sh/configs/dreamcast_defconfig | 1 - arch/sh/configs/espt_defconfig | 1 - arch/sh/configs/hp6xx_defconfig | 1 - arch/sh/configs/landisk_defconfig | 1 - arch/sh/configs/lboxre2_defconfig | 1 - arch/sh/configs/microdev_defconfig | 1 - arch/sh/configs/migor_defconfig | 1 - arch/sh/configs/r7780mp_defconfig | 1 - arch/sh/configs/r7785rp_defconfig | 1 - arch/sh/configs/rts7751r2d1_defconfig | 1 - arch/sh/configs/rts7751r2dplus_defconfig | 1 - arch/sh/configs/se7206_defconfig | 1 - arch/sh/configs/se7343_defconfig | 1 - arch/sh/configs/se7619_defconfig | 1 - arch/sh/configs/se7705_defconfig | 1 - arch/sh/configs/se7750_defconfig | 1 - arch/sh/configs/se7751_defconfig | 1 - arch/sh/configs/secureedge5410_defconfig | 1 - arch/sh/configs/sh03_defconfig | 1 - arch/sh/configs/sh7710voipgw_defconfig | 1 - arch/sh/configs/sh7757lcr_defconfig | 1 - arch/sh/configs/sh7763rdp_defconfig | 1 - arch/sh/configs/shmin_defconfig | 1 - arch/sh/configs/titan_defconfig | 1 - arch/sh/kernel/syscalls/syscall.tbl | 2 +- arch/sparc/kernel/syscalls/syscall.tbl | 2 +- arch/x86/entry/syscalls/syscall_32.tbl | 2 +- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- arch/xtensa/kernel/syscalls/syscall.tbl | 2 +- include/linux/compat.h | 1 - include/linux/syscalls.h | 2 - include/linux/sysctl.h | 6 +- kernel/Makefile | 2 +- kernel/sys_ni.c | 1 - kernel/sysctl_binary.c | 171 --------------------- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 2 +- tools/perf/arch/s390/entry/syscalls/syscall.tbl | 2 +- tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 2 +- 52 files changed, 24 insertions(+), 227 deletions(-) delete mode 100644 kernel/sysctl_binary.c (limited to 'arch/arm64') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index a28fb211881d..ec8bed9e7b75 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -249,7 +249,7 @@ 316 common mlockall sys_mlockall 317 common munlockall sys_munlockall 318 common sysinfo sys_sysinfo -319 common _sysctl sys_sysctl +319 common _sysctl sys_ni_syscall # 320 was sys_idle 321 common oldumount sys_oldumount 322 common swapon sys_swapon diff --git a/arch/arm/configs/am200epdkit_defconfig b/arch/arm/configs/am200epdkit_defconfig index f56ac394caf1..4e49d6cb2f62 100644 --- a/arch/arm/configs/am200epdkit_defconfig +++ b/arch/arm/configs/am200epdkit_defconfig @@ -3,7 +3,6 @@ CONFIG_LOCALVERSION="gum" CONFIG_SYSVIPC=y CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_EXPERT=y -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_EPOLL is not set # CONFIG_SHMEM is not set # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 7e8ee4adf269..171077cbf419 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -162,7 +162,7 @@ 146 common writev sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 common _sysctl sys_sysctl +149 common _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 17e81bd9a2d3..734860ac7cf9 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -308,8 +308,8 @@ __SYSCALL(__NR_writev, compat_sys_writev) __SYSCALL(__NR_getsid, sys_getsid) #define __NR_fdatasync 148 __SYSCALL(__NR_fdatasync, sys_fdatasync) -#define __NR__sysctl 149 -__SYSCALL(__NR__sysctl, compat_sys_sysctl) + /* 149 was sys_sysctl */ +__SYSCALL(149, sys_ni_syscall) #define __NR_mlock 150 __SYSCALL(__NR_mlock, sys_mlock) #define __NR_munlock 151 diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index ced9c83e47c9..f52a41f4c340 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -135,7 +135,7 @@ 123 common writev sys_writev 124 common pread64 sys_pread64 125 common pwrite64 sys_pwrite64 -126 common _sysctl sys_sysctl +126 common _sysctl sys_ni_syscall 127 common mmap sys_mmap 128 common munmap sys_munmap 129 common mlock sys_mlock diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 1a4822de7292..81fc799d8392 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -156,7 +156,7 @@ 146 common writev sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 common _sysctl sys_sysctl +149 common _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index a3f4be8e7238..b4e263916f41 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -156,7 +156,7 @@ 146 common writev sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 common _sysctl sys_sysctl +149 common _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/arch/mips/configs/cu1000-neo_defconfig b/arch/mips/configs/cu1000-neo_defconfig index 6b471cdb16cf..e924c817f73d 100644 --- a/arch/mips/configs/cu1000-neo_defconfig +++ b/arch/mips/configs/cu1000-neo_defconfig @@ -17,7 +17,6 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 6b4ee92e3aed..f9df9edb67a4 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -159,7 +159,7 @@ 149 n32 munlockall sys_munlockall 150 n32 vhangup sys_vhangup 151 n32 pivot_root sys_pivot_root -152 n32 _sysctl compat_sys_sysctl +152 n32 _sysctl sys_ni_syscall 153 n32 prctl sys_prctl 154 n32 adjtimex sys_adjtimex_time32 155 n32 setrlimit compat_sys_setrlimit diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 391acbf425a0..557f9954a2b9 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -159,7 +159,7 @@ 149 n64 munlockall sys_munlockall 150 n64 vhangup sys_vhangup 151 n64 pivot_root sys_pivot_root -152 n64 _sysctl sys_sysctl +152 n64 _sysctl sys_ni_syscall 153 n64 prctl sys_prctl 154 n64 adjtimex sys_adjtimex 155 n64 setrlimit sys_setrlimit diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 5727c5187508..195b43cf27c8 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -164,7 +164,7 @@ 150 o32 unused150 sys_ni_syscall 151 o32 getsid sys_getsid 152 o32 fdatasync sys_fdatasync -153 o32 _sysctl sys_sysctl compat_sys_sysctl +153 o32 _sysctl sys_ni_syscall 154 o32 mlock sys_mlock 155 o32 munlock sys_munlock 156 o32 mlockall sys_mlockall diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 292baabefade..def64d221cd4 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -163,7 +163,7 @@ 146 common writev sys_writev compat_sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 common _sysctl sys_sysctl compat_sys_sysctl +149 common _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index be9f74546068..c2d737ff2e7b 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -197,7 +197,7 @@ 146 common writev sys_writev compat_sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 nospu _sysctl sys_sysctl compat_sys_sysctl +149 nospu _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index f1fda4375526..10456bc936fb 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -138,7 +138,7 @@ 146 common writev sys_writev compat_sys_writev 147 common getsid sys_getsid sys_getsid 148 common fdatasync sys_fdatasync sys_fdatasync -149 common _sysctl sys_sysctl compat_sys_sysctl +149 common _sysctl - - 150 common mlock sys_mlock sys_mlock 151 common munlock sys_munlock sys_munlock 152 common mlockall sys_mlockall sys_mlockall diff --git a/arch/sh/configs/dreamcast_defconfig b/arch/sh/configs/dreamcast_defconfig index ae067e0b15e3..6a82c7b8ff32 100644 --- a/arch/sh/configs/dreamcast_defconfig +++ b/arch/sh/configs/dreamcast_defconfig @@ -1,7 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_MODULES=y diff --git a/arch/sh/configs/espt_defconfig b/arch/sh/configs/espt_defconfig index a5b865a75d22..9a988c347e9d 100644 --- a/arch/sh/configs/espt_defconfig +++ b/arch/sh/configs/espt_defconfig @@ -5,7 +5,6 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_NAMESPACES=y CONFIG_UTS_NS=y CONFIG_IPC_NS=y -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/sh/configs/hp6xx_defconfig b/arch/sh/configs/hp6xx_defconfig index a92db6694ce2..70e6605d7f7e 100644 --- a/arch/sh/configs/hp6xx_defconfig +++ b/arch/sh/configs/hp6xx_defconfig @@ -3,7 +3,6 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y # CONFIG_BLK_DEV_BSG is not set CONFIG_CPU_SUBTYPE_SH7709=y diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig index 567af752b1bb..ba6ec042606f 100644 --- a/arch/sh/configs/landisk_defconfig +++ b/arch/sh/configs/landisk_defconfig @@ -1,6 +1,5 @@ CONFIG_SYSVIPC=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS_EXTRA_PASS=y CONFIG_SLAB=y CONFIG_MODULES=y diff --git a/arch/sh/configs/lboxre2_defconfig b/arch/sh/configs/lboxre2_defconfig index 10f6d371ce2c..05e4ac6fed5f 100644 --- a/arch/sh/configs/lboxre2_defconfig +++ b/arch/sh/configs/lboxre2_defconfig @@ -1,6 +1,5 @@ CONFIG_SYSVIPC=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS_EXTRA_PASS=y CONFIG_SLAB=y CONFIG_MODULES=y diff --git a/arch/sh/configs/microdev_defconfig b/arch/sh/configs/microdev_defconfig index ed84d1303acf..c65667d00313 100644 --- a/arch/sh/configs/microdev_defconfig +++ b/arch/sh/configs/microdev_defconfig @@ -2,7 +2,6 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y # CONFIG_BLK_DEV_BSG is not set CONFIG_CPU_SUBTYPE_SH4_202=y diff --git a/arch/sh/configs/migor_defconfig b/arch/sh/configs/migor_defconfig index 37e9521a99e5..a24cf8cd2cea 100644 --- a/arch/sh/configs/migor_defconfig +++ b/arch/sh/configs/migor_defconfig @@ -4,7 +4,6 @@ CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig index c97ec60cff27..e922659fdadb 100644 --- a/arch/sh/configs/r7780mp_defconfig +++ b/arch/sh/configs/r7780mp_defconfig @@ -3,7 +3,6 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set CONFIG_SLAB=y diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig index 55fce65eb454..5978866358ec 100644 --- a/arch/sh/configs/r7785rp_defconfig +++ b/arch/sh/configs/r7785rp_defconfig @@ -7,7 +7,6 @@ CONFIG_RCU_TRACE=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/sh/configs/rts7751r2d1_defconfig b/arch/sh/configs/rts7751r2d1_defconfig index 6a3cfe08295f..fc9c22152b08 100644 --- a/arch/sh/configs/rts7751r2d1_defconfig +++ b/arch/sh/configs/rts7751r2d1_defconfig @@ -1,7 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_LOG_BUF_SHIFT=14 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/sh/configs/rts7751r2dplus_defconfig b/arch/sh/configs/rts7751r2dplus_defconfig index 2b3d7d280672..ff3fd6787fd6 100644 --- a/arch/sh/configs/rts7751r2dplus_defconfig +++ b/arch/sh/configs/rts7751r2dplus_defconfig @@ -1,7 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_LOG_BUF_SHIFT=14 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig index 21a43f14ffac..ff5bb4489922 100644 --- a/arch/sh/configs/se7206_defconfig +++ b/arch/sh/configs/se7206_defconfig @@ -18,7 +18,6 @@ CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_BLK_DEV_INITRD=y # CONFIG_UID16 is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS_ALL=y # CONFIG_ELF_CORE is not set # CONFIG_COMPAT_BRK is not set diff --git a/arch/sh/configs/se7343_defconfig b/arch/sh/configs/se7343_defconfig index 4e794e719a28..5d6c19338ebf 100644 --- a/arch/sh/configs/se7343_defconfig +++ b/arch/sh/configs/se7343_defconfig @@ -2,7 +2,6 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SHMEM is not set diff --git a/arch/sh/configs/se7619_defconfig b/arch/sh/configs/se7619_defconfig index 3264415a5931..71a672c30716 100644 --- a/arch/sh/configs/se7619_defconfig +++ b/arch/sh/configs/se7619_defconfig @@ -1,7 +1,6 @@ # CONFIG_LOCALVERSION_AUTO is not set CONFIG_LOG_BUF_SHIFT=14 # CONFIG_UID16 is not set -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_KALLSYMS is not set # CONFIG_HOTPLUG is not set # CONFIG_ELF_CORE is not set diff --git a/arch/sh/configs/se7705_defconfig b/arch/sh/configs/se7705_defconfig index 4496b94b7d88..ed00a6eeadf5 100644 --- a/arch/sh/configs/se7705_defconfig +++ b/arch/sh/configs/se7705_defconfig @@ -2,7 +2,6 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_KALLSYMS is not set # CONFIG_HOTPLUG is not set CONFIG_SLAB=y diff --git a/arch/sh/configs/se7750_defconfig b/arch/sh/configs/se7750_defconfig index b23f67542728..3f1c13799d79 100644 --- a/arch/sh/configs/se7750_defconfig +++ b/arch/sh/configs/se7750_defconfig @@ -5,7 +5,6 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_HOTPLUG is not set CONFIG_SLAB=y CONFIG_MODULES=y diff --git a/arch/sh/configs/se7751_defconfig b/arch/sh/configs/se7751_defconfig index 162343683937..4a024065bb75 100644 --- a/arch/sh/configs/se7751_defconfig +++ b/arch/sh/configs/se7751_defconfig @@ -3,7 +3,6 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_HOTPLUG is not set CONFIG_SLAB=y CONFIG_MODULES=y diff --git a/arch/sh/configs/secureedge5410_defconfig b/arch/sh/configs/secureedge5410_defconfig index 360592d63a2f..8422599cfb04 100644 --- a/arch/sh/configs/secureedge5410_defconfig +++ b/arch/sh/configs/secureedge5410_defconfig @@ -1,7 +1,6 @@ # CONFIG_SWAP is not set CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_HOTPLUG is not set CONFIG_SLAB=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/sh/configs/sh03_defconfig b/arch/sh/configs/sh03_defconfig index 87db9a84b5ec..f0073ed39947 100644 --- a/arch/sh/configs/sh03_defconfig +++ b/arch/sh/configs/sh03_defconfig @@ -3,7 +3,6 @@ CONFIG_POSIX_MQUEUE=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=m diff --git a/arch/sh/configs/sh7710voipgw_defconfig b/arch/sh/configs/sh7710voipgw_defconfig index 08426913c0e3..0d814770b07f 100644 --- a/arch/sh/configs/sh7710voipgw_defconfig +++ b/arch/sh/configs/sh7710voipgw_defconfig @@ -2,7 +2,6 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SHMEM is not set diff --git a/arch/sh/configs/sh7757lcr_defconfig b/arch/sh/configs/sh7757lcr_defconfig index d0933a9b9799..a2700ab165af 100644 --- a/arch/sh/configs/sh7757lcr_defconfig +++ b/arch/sh/configs/sh7757lcr_defconfig @@ -8,7 +8,6 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS_ALL=y CONFIG_SLAB=y CONFIG_MODULES=y diff --git a/arch/sh/configs/sh7763rdp_defconfig b/arch/sh/configs/sh7763rdp_defconfig index d0a0aa74cecf..26c5fd02c87a 100644 --- a/arch/sh/configs/sh7763rdp_defconfig +++ b/arch/sh/configs/sh7763rdp_defconfig @@ -5,7 +5,6 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_NAMESPACES=y CONFIG_UTS_NS=y CONFIG_IPC_NS=y -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/sh/configs/shmin_defconfig b/arch/sh/configs/shmin_defconfig index a27b129b93c5..c0b6f40d01cc 100644 --- a/arch/sh/configs/shmin_defconfig +++ b/arch/sh/configs/shmin_defconfig @@ -1,7 +1,6 @@ # CONFIG_SWAP is not set CONFIG_LOG_BUF_SHIFT=14 # CONFIG_UID16 is not set -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_KALLSYMS is not set # CONFIG_HOTPLUG is not set # CONFIG_BUG is not set diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig index 4ec961ace688..ba887f1351be 100644 --- a/arch/sh/configs/titan_defconfig +++ b/arch/sh/configs/titan_defconfig @@ -6,7 +6,6 @@ CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=16 CONFIG_BLK_DEV_INITRD=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 96848db9659e..ae0a00beea5f 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -156,7 +156,7 @@ 146 common writev sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 common _sysctl sys_sysctl +149 common _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 46024e80ee86..4af114e84f20 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -300,7 +300,7 @@ 249 64 nanosleep sys_nanosleep 250 32 mremap sys_mremap 250 64 mremap sys_64_mremap -251 common _sysctl sys_sysctl compat_sys_sysctl +251 common _sysctl sys_ni_syscall 252 common getsid sys_getsid 253 common fdatasync sys_fdatasync 254 32 nfsservctl sys_ni_syscall sys_nis_syscall diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index e31a75262c9c..9d1102873666 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -160,7 +160,7 @@ 146 i386 writev sys_writev compat_sys_writev 147 i386 getsid sys_getsid 148 i386 fdatasync sys_fdatasync -149 i386 _sysctl sys_sysctl compat_sys_sysctl +149 i386 _sysctl sys_ni_syscall 150 i386 mlock sys_mlock 151 i386 munlock sys_munlock 152 i386 mlockall sys_mlockall diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 9d82078c949a..f30d6ae9a688 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -164,7 +164,7 @@ 153 common vhangup sys_vhangup 154 common modify_ldt sys_modify_ldt 155 common pivot_root sys_pivot_root -156 64 _sysctl sys_sysctl +156 64 _sysctl sys_ni_syscall 157 common prctl sys_prctl 158 common arch_prctl sys_arch_prctl 159 common adjtimex sys_adjtimex diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index d216ccba42f7..6276e3c2d3fc 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -222,7 +222,7 @@ 204 common quotactl sys_quotactl # 205 was old nfsservctl 205 common nfsservctl sys_ni_syscall -206 common _sysctl sys_sysctl +206 common _sysctl sys_ni_syscall 207 common bdflush sys_bdflush 208 common uname sys_newuname 209 common sysinfo sys_sysinfo diff --git a/include/linux/compat.h b/include/linux/compat.h index c4255d8a4a8a..d38c4d7e83bd 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -851,7 +851,6 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u32); asmlinkage long compat_sys_recv(int fd, void __user *buf, compat_size_t len, unsigned flags); -asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args); /* obsolete: fs/readdir.c */ asmlinkage long compat_sys_old_readdir(unsigned int fd, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index dc2b827c81e5..75ac7f8ae93c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -47,7 +47,6 @@ struct stat64; struct statfs; struct statfs64; struct statx; -struct __sysctl_args; struct sysinfo; struct timespec; struct __kernel_old_timeval; @@ -1117,7 +1116,6 @@ asmlinkage long sys_send(int, void __user *, size_t, unsigned); asmlinkage long sys_bdflush(int func, long data); asmlinkage long sys_oldumount(char __user *name); asmlinkage long sys_uselib(const char __user *library); -asmlinkage long sys_sysctl(struct __sysctl_args __user *args); asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2); asmlinkage long sys_fork(void); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 50bb7f383a1b..51298a4f4623 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -74,15 +74,13 @@ int proc_do_static_key(struct ctl_table *table, int write, void *buffer, * sysctl names can be mirrored automatically under /proc/sys. The * procname supplied controls /proc naming. * - * The table's mode will be honoured both for sys_sysctl(2) and - * proc-fs access. + * The table's mode will be honoured for proc-fs access. * * Leaf nodes in the sysctl tree will be represented by a single file * under /proc; non-leaf nodes will be represented by directories. A * null procname disables /proc mirroring at this node. * - * sysctl(2) can automatically manage read and write requests through - * the sysctl table. The data and maxlen fields of the ctl_table + * The data and maxlen fields of the ctl_table * struct enable minimal validation of the values being written to be * performed, and the mode field allows minimal authentication. * diff --git a/kernel/Makefile b/kernel/Makefile index b3da548691c9..9a20016d4900 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,7 +5,7 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ - sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ + sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 3b69a560a7ac..4d59775ea79c 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -364,7 +364,6 @@ COND_SYSCALL(socketcall); COND_SYSCALL_COMPAT(socketcall); /* compat syscalls for arm64, x86, ... */ -COND_SYSCALL_COMPAT(sysctl); COND_SYSCALL_COMPAT(fanotify_mark); /* x86 */ diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c deleted file mode 100644 index 7d550cc76a3b..000000000000 --- a/kernel/sysctl_binary.c +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include "../fs/xfs/xfs_sysctl.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static ssize_t binary_sysctl(const int *name, int nlen, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -static void deprecated_sysctl_warning(const int *name, int nlen) -{ - int i; - - /* - * CTL_KERN/KERN_VERSION is used by older glibc and cannot - * ever go away. - */ - if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) - return; - - if (printk_ratelimit()) { - printk(KERN_INFO - "warning: process `%s' used the deprecated sysctl " - "system call with ", current->comm); - for (i = 0; i < nlen; i++) - printk(KERN_CONT "%d.", name[i]); - printk(KERN_CONT "\n"); - } - return; -} - -#define WARN_ONCE_HASH_BITS 8 -#define WARN_ONCE_HASH_SIZE (1<nlen. */ - if (nlen < 0 || nlen > CTL_MAXNAME) - return -ENOTDIR; - /* Read in the sysctl name for simplicity */ - for (i = 0; i < nlen; i++) - if (get_user(name[i], args_name + i)) - return -EFAULT; - - warn_on_bintable(name, nlen); - - return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); -} - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen, - tmp.newval, tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - return result; -} - - -#ifdef CONFIG_COMPAT - -struct compat_sysctl_args { - compat_uptr_t name; - int nlen; - compat_uptr_t oldval; - compat_uptr_t oldlenp; - compat_uptr_t newval; - compat_size_t newlen; - compat_ulong_t __unused[4]; -}; - -COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args) -{ - struct compat_sysctl_args tmp; - compat_size_t __user *compat_oldlenp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - compat_oldlenp = compat_ptr(tmp.oldlenp); - if (compat_oldlenp && get_user(oldlen, compat_oldlenp)) - return -EFAULT; - - result = do_sysctl(compat_ptr(tmp.name), tmp.nlen, - compat_ptr(tmp.oldval), oldlen, - compat_ptr(tmp.newval), tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (compat_oldlenp && put_user(oldlen, compat_oldlenp)) - return -EFAULT; - - return result; -} - -#endif /* CONFIG_COMPAT */ diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index b190f2eb2611..3ca6fe057a0b 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -193,7 +193,7 @@ 146 common writev sys_writev compat_sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 nospu _sysctl sys_sysctl compat_sys_sysctl +149 nospu _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index 56ae24b6e4be..6a0bbea225db 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -138,7 +138,7 @@ 146 common writev sys_writev compat_sys_writev 147 common getsid sys_getsid sys_getsid 148 common fdatasync sys_fdatasync sys_fdatasync -149 common _sysctl sys_sysctl compat_sys_sysctl +149 common _sysctl - - 150 common mlock sys_mlock compat_sys_mlock 151 common munlock sys_munlock compat_sys_munlock 152 common mlockall sys_mlockall sys_mlockall diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 9d82078c949a..f30d6ae9a688 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -164,7 +164,7 @@ 153 common vhangup sys_vhangup 154 common modify_ldt sys_modify_ldt 155 common pivot_root sys_pivot_root -156 64 _sysctl sys_sysctl +156 64 _sysctl sys_ni_syscall 157 common prctl sys_prctl 158 common arch_prctl sys_arch_prctl 159 common adjtimex sys_adjtimex -- cgit v1.2.3 From 38480df564cc68f081bb38998927d164b9010995 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:05:59 +0200 Subject: KVM: arm64: pvtime: steal-time is only supported when configured Don't confuse the guest by saying steal-time is supported when it hasn't been configured by userspace and won't work. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200804170604.42662-2-drjones@redhat.com --- arch/arm64/kvm/pvtime.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/arm64') diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index f7b52ce1557e..c3ef4ebd6846 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -43,7 +43,8 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu) switch (feature) { case ARM_SMCCC_HV_PV_TIME_FEATURES: case ARM_SMCCC_HV_PV_TIME_ST: - val = SMCCC_RET_SUCCESS; + if (vcpu->arch.steal.base != GPA_INVALID) + val = SMCCC_RET_SUCCESS; break; } -- cgit v1.2.3 From 2dbd780e34ac53e79c6c359ce12b89ed665ef562 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:06:00 +0200 Subject: KVM: arm64: pvtime: Fix potential loss of stolen time We should only check current->sched_info.run_delay once when updating stolen time. Otherwise there's a chance there could be a change between checks that we miss (preemption disabling comes after vcpu request checks). Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200804170604.42662-3-drjones@redhat.com --- arch/arm64/kvm/pvtime.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index c3ef4ebd6846..95f9580275b1 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -13,6 +13,7 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; + u64 last_steal = vcpu->arch.steal.last_steal; u64 steal; __le64 steal_le; u64 offset; @@ -24,8 +25,8 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu) /* Let's do the local bookkeeping */ steal = vcpu->arch.steal.steal; - steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal; - vcpu->arch.steal.last_steal = current->sched_info.run_delay; + vcpu->arch.steal.last_steal = READ_ONCE(current->sched_info.run_delay); + steal += vcpu->arch.steal.last_steal - last_steal; vcpu->arch.steal.steal = steal; steal_le = cpu_to_le64(steal); -- cgit v1.2.3 From 4d2d4ce001f283ed8127173543b4cfb65641e357 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:06:01 +0200 Subject: KVM: arm64: Drop type input from kvm_put_guest We can use typeof() to avoid the need for the type input. Suggested-by: Marc Zyngier Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200804170604.42662-4-drjones@redhat.com --- arch/arm64/kvm/pvtime.c | 2 +- include/linux/kvm_host.h | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 95f9580275b1..241ded7ee0ad 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -32,7 +32,7 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu) steal_le = cpu_to_le64(steal); idx = srcu_read_lock(&kvm->srcu); offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); - kvm_put_guest(kvm, base + offset, steal_le, u64); + kvm_put_guest(kvm, base + offset, steal_le); srcu_read_unlock(&kvm->srcu, idx); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a23076765b4c..84371fb06209 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -749,25 +749,26 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len); -#define __kvm_put_guest(kvm, gfn, offset, value, type) \ +#define __kvm_put_guest(kvm, gfn, offset, v) \ ({ \ unsigned long __addr = gfn_to_hva(kvm, gfn); \ - type __user *__uaddr = (type __user *)(__addr + offset); \ + typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \ int __ret = -EFAULT; \ \ if (!kvm_is_error_hva(__addr)) \ - __ret = put_user(value, __uaddr); \ + __ret = put_user(v, __uaddr); \ if (!__ret) \ mark_page_dirty(kvm, gfn); \ __ret; \ }) -#define kvm_put_guest(kvm, gpa, value, type) \ +#define kvm_put_guest(kvm, gpa, v) \ ({ \ gpa_t __gpa = gpa; \ struct kvm *__kvm = kvm; \ + \ __kvm_put_guest(__kvm, __gpa >> PAGE_SHIFT, \ - offset_in_page(__gpa), (value), type); \ + offset_in_page(__gpa), v); \ }) int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); -- cgit v1.2.3 From 53f985584e3c2ebe5f2455530fbf87a001528db8 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:06:02 +0200 Subject: KVM: arm64: pvtime: Fix stolen time accounting across migration When updating the stolen time we should always read the current stolen time from the user provided memory, not from a kernel cache. If we use a cache then we'll end up resetting stolen time to zero on the first update after migration. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200804170604.42662-5-drjones@redhat.com --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/pvtime.c | 23 +++++++++-------------- include/linux/kvm_host.h | 20 ++++++++++++++++++++ 3 files changed, 29 insertions(+), 15 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 65568b23868a..dd9c3b25aa1e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -368,7 +368,6 @@ struct kvm_vcpu_arch { /* Guest PV state */ struct { - u64 steal; u64 last_steal; gpa_t base; } steal; diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 241ded7ee0ad..75234321d896 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -13,26 +13,22 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; + u64 base = vcpu->arch.steal.base; u64 last_steal = vcpu->arch.steal.last_steal; - u64 steal; - __le64 steal_le; - u64 offset; + u64 offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); + u64 steal = 0; int idx; - u64 base = vcpu->arch.steal.base; if (base == GPA_INVALID) return; - /* Let's do the local bookkeeping */ - steal = vcpu->arch.steal.steal; - vcpu->arch.steal.last_steal = READ_ONCE(current->sched_info.run_delay); - steal += vcpu->arch.steal.last_steal - last_steal; - vcpu->arch.steal.steal = steal; - - steal_le = cpu_to_le64(steal); idx = srcu_read_lock(&kvm->srcu); - offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time); - kvm_put_guest(kvm, base + offset, steal_le); + if (!kvm_get_guest(kvm, base + offset, steal)) { + steal = le64_to_cpu(steal); + vcpu->arch.steal.last_steal = READ_ONCE(current->sched_info.run_delay); + steal += vcpu->arch.steal.last_steal - last_steal; + kvm_put_guest(kvm, base + offset, cpu_to_le64(steal)); + } srcu_read_unlock(&kvm->srcu, idx); } @@ -66,7 +62,6 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) * Start counting stolen time from the time the guest requests * the feature enabled. */ - vcpu->arch.steal.steal = 0; vcpu->arch.steal.last_steal = current->sched_info.run_delay; idx = srcu_read_lock(&kvm->srcu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 84371fb06209..05e3c2fb3ef7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -749,6 +749,26 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len); +#define __kvm_get_guest(kvm, gfn, offset, v) \ +({ \ + unsigned long __addr = gfn_to_hva(kvm, gfn); \ + typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \ + int __ret = -EFAULT; \ + \ + if (!kvm_is_error_hva(__addr)) \ + __ret = get_user(v, __uaddr); \ + __ret; \ +}) + +#define kvm_get_guest(kvm, gpa, v) \ +({ \ + gpa_t __gpa = gpa; \ + struct kvm *__kvm = kvm; \ + \ + __kvm_get_guest(__kvm, __gpa >> PAGE_SHIFT, \ + offset_in_page(__gpa), v); \ +}) + #define __kvm_put_guest(kvm, gfn, offset, v) \ ({ \ unsigned long __addr = gfn_to_hva(kvm, gfn); \ -- cgit v1.2.3 From 004a01241c5a0d375266ebf1c72f208de99294e9 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 4 Aug 2020 19:06:04 +0200 Subject: arm64/x86: KVM: Introduce steal-time cap arm64 requires a vcpu fd (KVM_HAS_DEVICE_ATTR vcpu ioctl) to probe support for steal-time. However this is unnecessary, as only a KVM fd is required, and it complicates userspace (userspace may prefer delaying vcpu creation until after feature probing). Introduce a cap that can be checked instead. While x86 can already probe steal-time support with a kvm fd (KVM_GET_SUPPORTED_CPUID), we add the cap there too for consistency. Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier Reviewed-by: Steven Price Link: https://lore.kernel.org/r/20200804170604.42662-7-drjones@redhat.com --- Documentation/virt/kvm/api.rst | 13 +++++++++++++ arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 3 +++ arch/arm64/kvm/pvtime.c | 2 +- arch/x86/kvm/x86.c | 3 +++ include/uapi/linux/kvm.h | 1 + 6 files changed, 22 insertions(+), 1 deletion(-) (limited to 'arch/arm64') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 49af23d2b462..d2b733dc7892 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6160,3 +6160,16 @@ KVM can therefore start protected VMs. This capability governs the KVM_S390_PV_COMMAND ioctl and the KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected guests when the state change is invalid. + +8.24 KVM_CAP_STEAL_TIME +----------------------- + +:Architectures: arm64, x86 + +This capability indicates that KVM supports steal time accounting. +When steal time accounting is supported it may be enabled with +architecture-specific interfaces. This capability and the architecture- +specific interfaces must be consistent, i.e. if one says the feature +is supported, than the other should as well and vice versa. For arm64 +see Documentation/virt/kvm/devices/vcpu.rst "KVM_ARM_VCPU_PVTIME_CTRL". +For x86 see Documentation/virt/kvm/msr.rst "MSR_KVM_STEAL_TIME". diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index dd9c3b25aa1e..af4989a25bb7 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -543,6 +543,7 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu); gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu); void kvm_update_stolen_time(struct kvm_vcpu *vcpu); +bool kvm_arm_pvtime_supported(void); int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 691d21e4c717..57876b0b870b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -206,6 +206,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = 1; break; + case KVM_CAP_STEAL_TIME: + r = kvm_arm_pvtime_supported(); + break; default: r = kvm_arch_vm_ioctl_check_extension(kvm, ext); break; diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c index 75234321d896..920ac43077ad 100644 --- a/arch/arm64/kvm/pvtime.c +++ b/arch/arm64/kvm/pvtime.c @@ -71,7 +71,7 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) return base; } -static bool kvm_arm_pvtime_supported(void) +bool kvm_arm_pvtime_supported(void) { return !!sched_info_on(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 599d73206299..c44d3a73b8eb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3581,6 +3581,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SMALLER_MAXPHYADDR: r = (int) allow_smaller_maxphyaddr; break; + case KVM_CAP_STEAL_TIME: + r = sched_info_on(); + break; default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f6d86033c4fa..3d8023474f2a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1035,6 +1035,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_LAST_CPU 184 #define KVM_CAP_SMALLER_MAXPHYADDR 185 #define KVM_CAP_S390_DIAG318 186 +#define KVM_CAP_STEAL_TIME 187 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 3fb884ffe921c99483a84b0175f3c03f048e9069 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 2 Sep 2020 11:18:29 +0100 Subject: KVM: arm64: Do not try to map PUDs when they are folded into PMD For the obscure cases where PMD and PUD are the same size (64kB pages with 42bit VA, for example, which results in only two levels of page tables), we can't map anything as a PUD, because there is... erm... no PUD to speak of. Everything is either a PMD or a PTE. So let's only try and map a PUD when its size is different from that of a PMD. Cc: stable@vger.kernel.org Fixes: b8e0ba7c8bea ("KVM: arm64: Add support for creating PUD hugepages at stage 2") Reported-by: Gavin Shan Reported-by: Eric Auger Reviewed-by: Alexandru Elisei Reviewed-by: Gavin Shan Tested-by: Gavin Shan Tested-by: Eric Auger Tested-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- arch/arm64/kvm/mmu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/arm64') diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0121ef2c7c8d..16b8660ddbcc 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1964,7 +1964,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, (fault_status == FSC_PERM && stage2_is_exec(mmu, fault_ipa, vma_pagesize)); - if (vma_pagesize == PUD_SIZE) { + /* + * If PUD_SIZE == PMD_SIZE, there is no real PUD level, and + * all we have is a 2-level page table. Trying to map a PUD in + * this case would be fatally wrong. + */ + if (PUD_SIZE != PMD_SIZE && vma_pagesize == PUD_SIZE) { pud_t new_pud = kvm_pfn_pud(pfn, mem_type); new_pud = kvm_pud_mkhuge(new_pud); -- cgit v1.2.3 From 376426b1a953762b00df887e28d29e44ab4ff723 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 2 Sep 2020 11:53:03 +0100 Subject: KVM: arm64: Fix address truncation in traces Owing to their ARMv7 origins, the trace events are truncating most address values to 32bits. That's not really helpful. Expand the printing of such values to their full glory. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/trace_arm.h | 16 ++++++++-------- arch/arm64/kvm/trace_handle_exit.h | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h index 4691053c5ee4..ff0444352bba 100644 --- a/arch/arm64/kvm/trace_arm.h +++ b/arch/arm64/kvm/trace_arm.h @@ -23,7 +23,7 @@ TRACE_EVENT(kvm_entry, __entry->vcpu_pc = vcpu_pc; ), - TP_printk("PC: 0x%08lx", __entry->vcpu_pc) + TP_printk("PC: 0x%016lx", __entry->vcpu_pc) ); TRACE_EVENT(kvm_exit, @@ -42,7 +42,7 @@ TRACE_EVENT(kvm_exit, __entry->vcpu_pc = vcpu_pc; ), - TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", + TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%016lx", __print_symbolic(__entry->ret, kvm_arm_exception_type), __entry->esr_ec, __print_symbolic(__entry->esr_ec, kvm_arm_exception_class), @@ -69,7 +69,7 @@ TRACE_EVENT(kvm_guest_fault, __entry->ipa = ipa; ), - TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx", + TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#016lx", __entry->ipa, __entry->hsr, __entry->hxfar, __entry->vcpu_pc) ); @@ -131,7 +131,7 @@ TRACE_EVENT(kvm_mmio_emulate, __entry->cpsr = cpsr; ), - TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)", + TP_printk("Emulate MMIO at: 0x%016lx (instr: %08lx, cpsr: %08lx)", __entry->vcpu_pc, __entry->instr, __entry->cpsr) ); @@ -149,7 +149,7 @@ TRACE_EVENT(kvm_unmap_hva_range, __entry->end = end; ), - TP_printk("mmu notifier unmap range: %#08lx -- %#08lx", + TP_printk("mmu notifier unmap range: %#016lx -- %#016lx", __entry->start, __entry->end) ); @@ -165,7 +165,7 @@ TRACE_EVENT(kvm_set_spte_hva, __entry->hva = hva; ), - TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva) + TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva) ); TRACE_EVENT(kvm_age_hva, @@ -182,7 +182,7 @@ TRACE_EVENT(kvm_age_hva, __entry->end = end; ), - TP_printk("mmu notifier age hva: %#08lx -- %#08lx", + TP_printk("mmu notifier age hva: %#016lx -- %#016lx", __entry->start, __entry->end) ); @@ -198,7 +198,7 @@ TRACE_EVENT(kvm_test_age_hva, __entry->hva = hva; ), - TP_printk("mmu notifier test age hva: %#08lx", __entry->hva) + TP_printk("mmu notifier test age hva: %#016lx", __entry->hva) ); TRACE_EVENT(kvm_set_way_flush, diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h index 2c56d1e0f5bd..8d78acc4fba7 100644 --- a/arch/arm64/kvm/trace_handle_exit.h +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -22,7 +22,7 @@ TRACE_EVENT(kvm_wfx_arm64, __entry->is_wfe = is_wfe; ), - TP_printk("guest executed wf%c at: 0x%08lx", + TP_printk("guest executed wf%c at: 0x%016lx", __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc) ); @@ -42,7 +42,7 @@ TRACE_EVENT(kvm_hvc_arm64, __entry->imm = imm; ), - TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)", + TP_printk("HVC at 0x%016lx (r0: 0x%016lx, imm: 0x%lx)", __entry->vcpu_pc, __entry->r0, __entry->imm) ); @@ -135,7 +135,7 @@ TRACE_EVENT(trap_reg, __entry->write_value = write_value; ), - TP_printk("%s %s reg %d (0x%08llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value) + TP_printk("%s %s reg %d (0x%016llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value) ); TRACE_EVENT(kvm_handle_sys_reg, -- cgit v1.2.3 From 7b75cd5128421c673153efb1236705696a1a9812 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Tue, 1 Sep 2020 14:33:56 +0100 Subject: KVM: arm64: Update page shift if stage 2 block mapping not supported Commit 196f878a7ac2e (" KVM: arm/arm64: Signal SIGBUS when stage2 discovers hwpoison memory") modifies user_mem_abort() to send a SIGBUS signal when the fault IPA maps to a hwpoisoned page. Commit 1559b7583ff6 ("KVM: arm/arm64: Re-check VMA on detecting a poisoned page") changed kvm_send_hwpoison_signal() to use the page shift instead of the VMA because at that point the code had already released the mmap lock, which means userspace could have modified the VMA. If userspace uses hugetlbfs for the VM memory, user_mem_abort() tries to map the guest fault IPA using block mappings in stage 2. That is not always possible, if, for example, userspace uses dirty page logging for the VM. Update the page shift appropriately in those cases when we downgrade the stage 2 entry from a block mapping to a page. Fixes: 1559b7583ff6 ("KVM: arm/arm64: Re-check VMA on detecting a poisoned page") Signed-off-by: Alexandru Elisei Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20200901133357.52640-2-alexandru.elisei@arm.com --- arch/arm64/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/arm64') diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 16b8660ddbcc..f58d657a898d 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1871,6 +1871,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) { force_pte = true; vma_pagesize = PAGE_SIZE; + vma_shift = PAGE_SHIFT; } /* -- cgit v1.2.3