diff options
25 files changed, 505 insertions, 48 deletions
diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 003ef4c02585..433be2a24f31 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -1,5 +1,6 @@ #include <asm-generic/vmlinux.lds.h> #include <asm/thread_info.h> +#include <asm/cache.h> #include <asm/page.h> OUTPUT_FORMAT("elf64-alpha") @@ -38,7 +39,7 @@ SECTIONS __init_begin = ALIGN(PAGE_SIZE); INIT_TEXT_SECTION(PAGE_SIZE) INIT_DATA_SECTION(16) - PERCPU(PAGE_SIZE) + PERCPU(L1_CACHE_BYTES, PAGE_SIZE) /* Align to THREAD_SIZE rather than PAGE_SIZE here so any padding page needed for the THREAD_SIZE aligned init_task gets freed after init */ . = ALIGN(THREAD_SIZE); @@ -46,7 +47,7 @@ SECTIONS /* Freed after init ends here */ _data = .; - RW_DATA_SECTION(64, PAGE_SIZE, THREAD_SIZE) + RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) .got : { *(.got) diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 61462790757f..28fea9b2d129 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -78,7 +78,7 @@ SECTIONS #endif } - PERCPU(PAGE_SIZE) + PERCPU(32, PAGE_SIZE) #ifndef CONFIG_XIP_KERNEL . = ALIGN(PAGE_SIZE); diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index 4122678529c0..c40d07f708e8 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -136,7 +136,7 @@ SECTIONS . = ALIGN(16); INIT_DATA_SECTION(16) - PERCPU(4) + PERCPU(32, 4) .exit.data : { diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S index c49be845f96a..728bbd9e7d4c 100644 --- a/arch/cris/kernel/vmlinux.lds.S +++ b/arch/cris/kernel/vmlinux.lds.S @@ -102,7 +102,7 @@ SECTIONS #endif __vmlinux_end = .; /* Last address of the physical file. */ #ifdef CONFIG_ETRAX_ARCH_V32 - PERCPU(PAGE_SIZE) + PERCPU(32, PAGE_SIZE) .init.ramfs : { INIT_RAM_FS diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 8b973f3cc90e..0daae8af5787 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -37,7 +37,7 @@ SECTIONS _einittext = .; INIT_DATA_SECTION(8) - PERCPU(4096) + PERCPU(L1_CACHE_BYTES, 4096) . = ALIGN(PAGE_SIZE); __init_end = .; diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 5a4d044dcb1c..787de4a77d82 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -198,7 +198,7 @@ SECTIONS { /* Per-cpu data: */ . = ALIGN(PERCPU_PAGE_SIZE); - PERCPU_VADDR(PERCPU_ADDR, :percpu) + PERCPU_VADDR(SMP_CACHE_BYTES, PERCPU_ADDR, :percpu) __phys_per_cpu_start = __per_cpu_load; /* * ensure percpu data fits diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 7da94eaa082b..c194d64cdbb9 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -53,7 +53,7 @@ SECTIONS __init_begin = .; INIT_TEXT_SECTION(PAGE_SIZE) INIT_DATA_SECTION(16) - PERCPU(PAGE_SIZE) + PERCPU(32, PAGE_SIZE) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 570607b376b5..832afbb87588 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -115,7 +115,7 @@ SECTIONS EXIT_DATA } - PERCPU(PAGE_SIZE) + PERCPU(1 << CONFIG_MIPS_L1_CACHE_SHIFT, PAGE_SIZE) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S index febbeee7f2f5..968bcd2cb022 100644 --- a/arch/mn10300/kernel/vmlinux.lds.S +++ b/arch/mn10300/kernel/vmlinux.lds.S @@ -70,7 +70,7 @@ SECTIONS .exit.text : { EXIT_TEXT; } .exit.data : { EXIT_DATA; } - PERCPU(PAGE_SIZE) + PERCPU(32, PAGE_SIZE) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index d64a6bbec2aa..8f1e4efd143e 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -145,7 +145,7 @@ SECTIONS EXIT_DATA } - PERCPU(PAGE_SIZE) + PERCPU(L1_CACHE_BYTES, PAGE_SIZE) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 8a0deefac08d..b9150f07d266 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -160,7 +160,7 @@ SECTIONS INIT_RAM_FS } - PERCPU(PAGE_SIZE) + PERCPU(L1_CACHE_BYTES, PAGE_SIZE) . = ALIGN(8); .machine.desc : AT(ADDR(.machine.desc) - LOAD_OFFSET) { diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index a68ac10213b2..1bc18cdb525b 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -77,7 +77,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); INIT_DATA_SECTION(0x100) - PERCPU(PAGE_SIZE) + PERCPU(0x100, PAGE_SIZE) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 7f8a709c3ada..af4d46187a79 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -66,7 +66,7 @@ SECTIONS __machvec_end = .; } - PERCPU(PAGE_SIZE) + PERCPU(L1_CACHE_BYTES, PAGE_SIZE) /* * .exit.text is discarded at runtime, not link time, to deal with diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index 0c1e6783657f..92b557afe535 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -108,7 +108,7 @@ SECTIONS __sun4v_2insn_patch_end = .; } - PERCPU(PAGE_SIZE) + PERCPU(SMP_CACHE_BYTES, PAGE_SIZE) . = ALIGN(PAGE_SIZE); __init_end = .; diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S index 25fdc0c1839a..c6ce378e0678 100644 --- a/arch/tile/kernel/vmlinux.lds.S +++ b/arch/tile/kernel/vmlinux.lds.S @@ -63,7 +63,7 @@ SECTIONS *(.init.page) } :data =0 INIT_DATA_SECTION(16) - PERCPU(PAGE_SIZE) + PERCPU(L2_CACHE_BYTES, PAGE_SIZE) . = ALIGN(PAGE_SIZE); VMLINUX_SYMBOL(_einitdata) = .; diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S index ac55b9efa1ce..34bede8aad4a 100644 --- a/arch/um/include/asm/common.lds.S +++ b/arch/um/include/asm/common.lds.S @@ -42,7 +42,7 @@ INIT_SETUP(0) } - PERCPU(32) + PERCPU(32, 32) .initcall.init : { INIT_CALLS diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 7e172955ee57..a09e1f052d84 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -451,6 +451,26 @@ do { \ #define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #endif /* !CONFIG_M386 */ +#ifdef CONFIG_X86_CMPXCHG64 +#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \ +({ \ + char __ret; \ + typeof(o1) __o1 = o1; \ + typeof(o1) __n1 = n1; \ + typeof(o2) __o2 = o2; \ + typeof(o2) __n2 = n2; \ + typeof(o2) __dummy = n2; \ + asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \ + : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \ + : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \ + __ret; \ +}) + +#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#endif /* CONFIG_X86_CMPXCHG64 */ + /* * Per cpu atomic 64 bit operations are only available under 64 bit. * 32 bit must fall back to generic operations. @@ -480,6 +500,34 @@ do { \ #define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) #define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) #define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) + +/* + * Pretty complex macro to generate cmpxchg16 instruction. The instruction + * is not supported on early AMD64 processors so we must be able to emulate + * it in software. The address used in the cmpxchg16 instruction must be + * aligned to a 16 byte boundary. + */ +#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \ +({ \ + char __ret; \ + typeof(o1) __o1 = o1; \ + typeof(o1) __n1 = n1; \ + typeof(o2) __o2 = o2; \ + typeof(o2) __n2 = n2; \ + typeof(o2) __dummy; \ + alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \ + "cmpxchg16b %%gs:(%%rsi)\n\tsetz %0\n\t", \ + X86_FEATURE_CX16, \ + ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ + "S" (&pcp1), "b"(__n1), "c"(__n2), \ + "a"(__o1), "d"(__o2)); \ + __ret; \ +}) + +#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) +#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) +#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) + #endif /* This is not atomic against other CPUs -- CPU preemption needs to be off */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bf4700755184..cef446f8ac78 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -230,7 +230,7 @@ SECTIONS * output PHDR, so the next output section - .init.text - should * start another segment - init. */ - PERCPU_VADDR(0, :percpu) + PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) #endif INIT_TEXT_SECTION(PAGE_SIZE) @@ -305,7 +305,7 @@ SECTIONS } #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU(THREAD_SIZE) + PERCPU(INTERNODE_CACHE_BYTES, THREAD_SIZE) #endif . = ALIGN(PAGE_SIZE); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index e10cf070ede0..f2479f19ddde 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -42,4 +42,5 @@ else lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o + lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S new file mode 100644 index 000000000000..3e8b08a6de2b --- /dev/null +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -0,0 +1,59 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + */ +#include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/frame.h> +#include <asm/dwarf2.h> + +.text + +/* + * Inputs: + * %rsi : memory location to compare + * %rax : low 64 bits of old value + * %rdx : high 64 bits of old value + * %rbx : low 64 bits of new value + * %rcx : high 64 bits of new value + * %al : Operation successful + */ +ENTRY(this_cpu_cmpxchg16b_emu) +CFI_STARTPROC + +# +# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not +# via the ZF. Caller will access %al to get result. +# +# Note that this is only useful for a cpuops operation. Meaning that we +# do *not* have a fully atomic operation but just an operation that is +# *atomic* on a single cpu (as provided by the this_cpu_xx class of +# macros). +# +this_cpu_cmpxchg16b_emu: + pushf + cli + + cmpq %gs:(%rsi), %rax + jne not_same + cmpq %gs:8(%rsi), %rdx + jne not_same + + movq %rbx, %gs:(%rsi) + movq %rcx, %gs:8(%rsi) + + popf + mov $1, %al + ret + + not_same: + popf + xor %al,%al + ret + +CFI_ENDPROC + +ENDPROC(this_cpu_cmpxchg16b_emu) diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index 9b526154c9ba..a2820065927e 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -155,7 +155,7 @@ SECTIONS INIT_RAM_FS } - PERCPU(PAGE_SIZE) + PERCPU(XCHAL_ICACHE_LINESIZE, PAGE_SIZE) /* We need this dummy segment here */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index fe77e3395b40..22d3342d6af4 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -15,7 +15,7 @@ * HEAD_TEXT_SECTION * INIT_TEXT_SECTION(PAGE_SIZE) * INIT_DATA_SECTION(...) - * PERCPU(PAGE_SIZE) + * PERCPU(CACHELINE_SIZE, PAGE_SIZE) * __init_end = .; * * _stext = .; @@ -683,13 +683,18 @@ /** * PERCPU_VADDR - define output section for percpu area + * @cacheline: cacheline size * @vaddr: explicit base address (optional) * @phdr: destination PHDR (optional) * - * Macro which expands to output section for percpu area. If @vaddr - * is not blank, it specifies explicit base address and all percpu - * symbols will be offset from the given address. If blank, @vaddr - * always equals @laddr + LOAD_OFFSET. + * Macro which expands to output section for percpu area. + * + * @cacheline is used to align subsections to avoid false cacheline + * sharing between subsections for different purposes. + * + * If @vaddr is not blank, it specifies explicit base address and all + * percpu symbols will be offset from the given address. If blank, + * @vaddr always equals @laddr + LOAD_OFFSET. * * @phdr defines the output PHDR to use if not blank. Be warned that * output PHDR is sticky. If @phdr is specified, the next output @@ -700,7 +705,7 @@ * If there is no need to put the percpu section at a predetermined * address, use PERCPU(). */ -#define PERCPU_VADDR(vaddr, phdr) \ +#define PERCPU_VADDR(cacheline, vaddr, phdr) \ VMLINUX_SYMBOL(__per_cpu_load) = .; \ .data..percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load) \ - LOAD_OFFSET) { \ @@ -708,7 +713,9 @@ *(.data..percpu..first) \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ + . = ALIGN(cacheline); \ *(.data..percpu..readmostly) \ + . = ALIGN(cacheline); \ *(.data..percpu) \ *(.data..percpu..shared_aligned) \ VMLINUX_SYMBOL(__per_cpu_end) = .; \ @@ -717,18 +724,18 @@ /** * PERCPU - define output section for percpu area, simple version + * @cacheline: cacheline size * @align: required alignment * - * Align to @align and outputs output section for percpu area. This - * macro doesn't maniuplate @vaddr or @phdr and __per_cpu_load and + * Align to @align and outputs output section for percpu area. This macro + * doesn't manipulate @vaddr or @phdr and __per_cpu_load and * __per_cpu_start will be identical. * - * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except - * that __per_cpu_load is defined as a relative symbol against - * .data..percpu which is required for relocatable x86_32 - * configuration. + * This macro is equivalent to ALIGN(@align); PERCPU_VADDR(@cacheline,,) + * except that __per_cpu_load is defined as a relative symbol against + * .data..percpu which is required for relocatable x86_32 configuration. */ -#define PERCPU(align) \ +#define PERCPU(cacheline, align) \ . = ALIGN(align); \ .data..percpu : AT(ADDR(.data..percpu) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__per_cpu_load) = .; \ @@ -736,7 +743,9 @@ *(.data..percpu..first) \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ + . = ALIGN(cacheline); \ *(.data..percpu..readmostly) \ + . = ALIGN(cacheline); \ *(.data..percpu) \ *(.data..percpu..shared_aligned) \ VMLINUX_SYMBOL(__per_cpu_end) = .; \ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 27c3c6fcfad3..3a5c4449fd36 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -255,6 +255,30 @@ extern void __bad_size_call_parameter(void); pscr2_ret__; \ }) +/* + * Special handling for cmpxchg_double. cmpxchg_double is passed two + * percpu variables. The first has to be aligned to a double word + * boundary and the second has to follow directly thereafter. + */ +#define __pcpu_double_call_return_bool(stem, pcp1, pcp2, ...) \ +({ \ + bool pdcrb_ret__; \ + __verify_pcpu_ptr(&pcp1); \ + BUILD_BUG_ON(sizeof(pcp1) != sizeof(pcp2)); \ + VM_BUG_ON((unsigned long)(&pcp1) % (2 * sizeof(pcp1))); \ + VM_BUG_ON((unsigned long)(&pcp2) != \ + (unsigned long)(&pcp1) + sizeof(pcp1)); \ + switch(sizeof(pcp1)) { \ + case 1: pdcrb_ret__ = stem##1(pcp1, pcp2, __VA_ARGS__); break; \ + case 2: pdcrb_ret__ = stem##2(pcp1, pcp2, __VA_ARGS__); break; \ + case 4: pdcrb_ret__ = stem##4(pcp1, pcp2, __VA_ARGS__); break; \ + case 8: pdcrb_ret__ = stem##8(pcp1, pcp2, __VA_ARGS__); break; \ + default: \ + __bad_size_call_parameter(); break; \ + } \ + pdcrb_ret__; \ +}) + #define __pcpu_size_call(stem, variable, ...) \ do { \ __verify_pcpu_ptr(&(variable)); \ @@ -501,6 +525,45 @@ do { \ #endif /* + * cmpxchg_double replaces two adjacent scalars at once. The first + * two parameters are per cpu variables which have to be of the same + * size. A truth value is returned to indicate success or failure + * (since a double register result is difficult to handle). There is + * very limited hardware support for these operations, so only certain + * sizes may work. + */ +#define _this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ +({ \ + int ret__; \ + preempt_disable(); \ + ret__ = __this_cpu_generic_cmpxchg_double(pcp1, pcp2, \ + oval1, oval2, nval1, nval2); \ + preempt_enable(); \ + ret__; \ +}) + +#ifndef this_cpu_cmpxchg_double +# ifndef this_cpu_cmpxchg_double_1 +# define this_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + _this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef this_cpu_cmpxchg_double_2 +# define this_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + _this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef this_cpu_cmpxchg_double_4 +# define this_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + _this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef this_cpu_cmpxchg_double_8 +# define this_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + _this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __pcpu_double_call_return_bool(this_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2)) +#endif + +/* * Generic percpu operations that do not require preemption handling. * Either we do not care about races or the caller has the * responsibility of handling preemptions issues. Arch code can still @@ -703,6 +766,39 @@ do { \ __pcpu_size_call_return2(__this_cpu_cmpxchg_, pcp, oval, nval) #endif +#define __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ +({ \ + int __ret = 0; \ + if (__this_cpu_read(pcp1) == (oval1) && \ + __this_cpu_read(pcp2) == (oval2)) { \ + __this_cpu_write(pcp1, (nval1)); \ + __this_cpu_write(pcp2, (nval2)); \ + __ret = 1; \ + } \ + (__ret); \ +}) + +#ifndef __this_cpu_cmpxchg_double +# ifndef __this_cpu_cmpxchg_double_1 +# define __this_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef __this_cpu_cmpxchg_double_2 +# define __this_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef __this_cpu_cmpxchg_double_4 +# define __this_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef __this_cpu_cmpxchg_double_8 +# define __this_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# define __this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __pcpu_double_call_return_bool(__this_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2)) +#endif + /* * IRQ safe versions of the per cpu RMW operations. Note that these operations * are *not* safe against modification of the same variable from another @@ -823,4 +919,36 @@ do { \ __pcpu_size_call_return2(irqsafe_cpu_cmpxchg_, (pcp), oval, nval) #endif +#define irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ +({ \ + int ret__; \ + unsigned long flags; \ + local_irq_save(flags); \ + ret__ = __this_cpu_generic_cmpxchg_double(pcp1, pcp2, \ + oval1, oval2, nval1, nval2); \ + local_irq_restore(flags); \ + ret__; \ +}) + +#ifndef irqsafe_cpu_cmpxchg_double +# ifndef irqsafe_cpu_cmpxchg_double_1 +# define irqsafe_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef irqsafe_cpu_cmpxchg_double_2 +# define irqsafe_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef irqsafe_cpu_cmpxchg_double_4 +# define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# ifndef irqsafe_cpu_cmpxchg_double_8 +# define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) +# endif +# define irqsafe_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __pcpu_double_call_return_int(irqsafe_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2)) +#endif + #endif /* __LINUX_PERCPU_H */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index ae0093cc5189..90fbb6d87e11 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -35,7 +35,10 @@ enum stat_item { NR_SLUB_STAT_ITEMS }; struct kmem_cache_cpu { - void **freelist; /* Pointer to first free per cpu object */ + void **freelist; /* Pointer to next available object */ +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long tid; /* Globally unique transaction id */ +#endif struct page *page; /* The slab from which we are allocating */ int node; /* The node of the page (or -1 for debug) */ #ifdef CONFIG_SLUB_STATS @@ -70,6 +73,7 @@ struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; /* Used for retriving partial slabs etc */ unsigned long flags; + unsigned long min_partial; int size; /* The size of an object including meta data */ int objsize; /* The size of an object without meta data */ int offset; /* Free pointer offset. */ @@ -84,7 +88,6 @@ struct kmem_cache { int inuse; /* Offset to metadata */ int align; /* Alignment */ int reserved; /* Reserved bytes at the end of slabs */ - unsigned long min_partial; const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ #ifdef CONFIG_SYSFS diff --git a/mm/slub.c b/mm/slub.c index e841d8921c22..7e4f835e32ab 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -836,14 +836,24 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); -} -static inline void slab_free_hook_irq(struct kmem_cache *s, void *object) -{ - kmemcheck_slab_free(s, object, s->objsize); - debug_check_no_locks_freed(object, s->objsize); - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); + /* + * Trouble is that we may no longer disable interupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->objsize); + debug_check_no_locks_freed(x, s->objsize); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->objsize); + local_irq_restore(flags); + } +#endif } /* @@ -1130,9 +1140,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, static inline void slab_free_hook(struct kmem_cache *s, void *x) {} -static inline void slab_free_hook_irq(struct kmem_cache *s, - void *object) {} - #endif /* CONFIG_SLUB_DEBUG */ /* @@ -1533,6 +1540,77 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) } } +#ifdef CONFIG_CMPXCHG_LOCAL +#ifdef CONFIG_PREEMPT +/* + * Calculate the next globally unique transaction for disambiguiation + * during cmpxchg. The transactions start with the cpu number and are then + * incremented by CONFIG_NR_CPUS. + */ +#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) +#else +/* + * No preemption supported therefore also no need to check for + * different cpus. + */ +#define TID_STEP 1 +#endif + +static inline unsigned long next_tid(unsigned long tid) +{ + return tid + TID_STEP; +} + +static inline unsigned int tid_to_cpu(unsigned long tid) +{ + return tid % TID_STEP; +} + +static inline unsigned long tid_to_event(unsigned long tid) +{ + return tid / TID_STEP; +} + +static inline unsigned int init_tid(int cpu) +{ + return cpu; +} + +static inline void note_cmpxchg_failure(const char *n, + const struct kmem_cache *s, unsigned long tid) +{ +#ifdef SLUB_DEBUG_CMPXCHG + unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); + + printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); + +#ifdef CONFIG_PREEMPT + if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) + printk("due to cpu change %d -> %d\n", + tid_to_cpu(tid), tid_to_cpu(actual_tid)); + else +#endif + if (tid_to_event(tid) != tid_to_event(actual_tid)) + printk("due to cpu running other code. Event %ld->%ld\n", + tid_to_event(tid), tid_to_event(actual_tid)); + else + printk("for unknown reason: actual=%lx was=%lx target=%lx\n", + actual_tid, tid, next_tid(tid)); +#endif +} + +#endif + +void init_kmem_cache_cpus(struct kmem_cache *s) +{ +#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT) + int cpu; + + for_each_possible_cpu(cpu) + per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); +#endif + +} /* * Remove the cpu slab */ @@ -1564,6 +1642,9 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) page->inuse--; } c->page = NULL; +#ifdef CONFIG_CMPXCHG_LOCAL + c->tid = next_tid(c->tid); +#endif unfreeze_slab(s, page, tail); } @@ -1698,6 +1779,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, { void **object; struct page *new; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long flags; + + local_irq_save(flags); +#ifdef CONFIG_PREEMPT + /* + * We may have been preempted and rescheduled on a different + * cpu before disabling interrupts. Need to reload cpu area + * pointer. + */ + c = this_cpu_ptr(s->cpu_slab); +#endif +#endif /* We handle __GFP_ZERO in the caller */ gfpflags &= ~__GFP_ZERO; @@ -1724,6 +1818,10 @@ load_freelist: c->node = page_to_nid(c->page); unlock_out: slab_unlock(c->page); +#ifdef CONFIG_CMPXCHG_LOCAL + c->tid = next_tid(c->tid); + local_irq_restore(flags); +#endif stat(s, ALLOC_SLOWPATH); return object; @@ -1785,23 +1883,76 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, { void **object; struct kmem_cache_cpu *c; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long tid; +#else unsigned long flags; +#endif if (slab_pre_alloc_hook(s, gfpflags)) return NULL; +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_save(flags); +#else +redo: +#endif + + /* + * Must read kmem_cache cpu data via this cpu ptr. Preemption is + * enabled. We may switch back and forth between cpus while + * reading from one cpu area. That does not matter as long + * as we end up on the original cpu again when doing the cmpxchg. + */ c = __this_cpu_ptr(s->cpu_slab); + +#ifdef CONFIG_CMPXCHG_LOCAL + /* + * The transaction ids are globally unique per cpu and per operation on + * a per cpu queue. Thus they can be guarantee that the cmpxchg_double + * occurs on the right processor and that there was no operation on the + * linked list in between. + */ + tid = c->tid; + barrier(); +#endif + object = c->freelist; if (unlikely(!object || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { +#ifdef CONFIG_CMPXCHG_LOCAL + /* + * The cmpxchg will only match if there was no additonal + * operation and if we are on the right processor. + * + * The cmpxchg does the following atomically (without lock semantics!) + * 1. Relocate first pointer to the current per cpu area. + * 2. Verify that tid and freelist have not been changed + * 3. If they were not changed replace tid and freelist + * + * Since this is without lock semantics the protection is only against + * code executing on this cpu *not* from access by other cpus. + */ + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + object, tid, + get_freepointer(s, object), next_tid(tid)))) { + + note_cmpxchg_failure("slab_alloc", s, tid); + goto redo; + } +#else c->freelist = get_freepointer(s, object); +#endif stat(s, ALLOC_FASTPATH); } + +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_restore(flags); +#endif if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->objsize); @@ -1879,9 +2030,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page, { void *prior; void **object = (void *)x; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long flags; - stat(s, FREE_SLOWPATH); + local_irq_save(flags); +#endif slab_lock(page); + stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s)) goto debug; @@ -1911,6 +2066,9 @@ checks_ok: out_unlock: slab_unlock(page); +#ifdef CONFIG_CMPXCHG_LOCAL + local_irq_restore(flags); +#endif return; slab_empty: @@ -1922,6 +2080,9 @@ slab_empty: stat(s, FREE_REMOVE_PARTIAL); } slab_unlock(page); +#ifdef CONFIG_CMPXCHG_LOCAL + local_irq_restore(flags); +#endif stat(s, FREE_SLAB); discard_slab(s, page); return; @@ -1948,23 +2109,56 @@ static __always_inline void slab_free(struct kmem_cache *s, { void **object = (void *)x; struct kmem_cache_cpu *c; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long tid; +#else unsigned long flags; +#endif slab_free_hook(s, x); +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_save(flags); + +#else +redo: +#endif + + /* + * Determine the currently cpus per cpu slab. + * The cpu may change afterward. However that does not matter since + * data is retrieved via this pointer. If we are on the same cpu + * during the cmpxchg then the free will succedd. + */ c = __this_cpu_ptr(s->cpu_slab); - slab_free_hook_irq(s, x); +#ifdef CONFIG_CMPXCHG_LOCAL + tid = c->tid; + barrier(); +#endif if (likely(page == c->page && c->node != NUMA_NO_NODE)) { set_freepointer(s, object, c->freelist); + +#ifdef CONFIG_CMPXCHG_LOCAL + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + c->freelist, tid, + object, next_tid(tid)))) { + + note_cmpxchg_failure("slab_free", s, tid); + goto redo; + } +#else c->freelist = object; +#endif stat(s, FREE_FASTPATH); } else __slab_free(s, page, x, addr); +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_restore(flags); +#endif } void kmem_cache_free(struct kmem_cache *s, void *x) @@ -2156,9 +2350,23 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); +#ifdef CONFIG_CMPXCHG_LOCAL + /* + * Must align to double word boundary for the double cmpxchg instructions + * to work. + */ + s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *)); +#else + /* Regular alignment is sufficient */ s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); +#endif + + if (!s->cpu_slab) + return 0; + + init_kmem_cache_cpus(s); - return s->cpu_slab != NULL; + return 1; } static struct kmem_cache *kmem_cache_node; |