diff options
author | Linus Torvalds | 2018-04-02 15:45:30 -0700 |
---|---|---|
committer | Linus Torvalds | 2018-04-02 15:45:30 -0700 |
commit | d22fff81418edc92be534cad8d59da914049bf69 (patch) | |
tree | 96b22b20bbc789a76e744bcfc11a7f0854b62ece /arch | |
parent | 986b37c0ae4f0a3f93d8974d03a9cbc1502dd377 (diff) | |
parent | eaeb8e76cd5751e805f6e4a3fcec91d283e3b0c2 (diff) |
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
- Extend the memmap= boot parameter syntax to allow the redeclaration
and dropping of existing ranges, and to support all e820 range types
(Jan H. Schönherr)
- Improve the W+X boot time security checks to remove false positive
warnings on Xen (Jan Beulich)
- Support booting as Xen PVH guest (Juergen Gross)
- Improved 5-level paging (LA57) support, in particular it's possible
now to have a single kernel image for both 4-level and 5-level
hardware (Kirill A. Shutemov)
- AMD hardware RAM encryption support (SME/SEV) fixes (Tom Lendacky)
- Preparatory commits for hardware-encrypted RAM support on Intel CPUs.
(Kirill A. Shutemov)
- Improved Intel-MID support (Andy Shevchenko)
- Show EFI page tables in page_tables debug files (Andy Lutomirski)
- ... plus misc fixes and smaller cleanups
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (56 commits)
x86/cpu/tme: Fix spelling: "configuation" -> "configuration"
x86/boot: Fix SEV boot failure from change to __PHYSICAL_MASK_SHIFT
x86/mm: Update comment in detect_tme() regarding x86_phys_bits
x86/mm/32: Remove unused node_memmap_size_bytes() & CONFIG_NEED_NODE_MEMMAP_SIZE logic
x86/mm: Remove pointless checks in vmalloc_fault
x86/platform/intel-mid: Add special handling for ACPI HW reduced platforms
ACPI, x86/boot: Introduce the ->reduced_hw_early_init() ACPI callback
ACPI, x86/boot: Split out acpi_generic_reduce_hw_init() and export
x86/pconfig: Provide defines and helper to run MKTME_KEY_PROG leaf
x86/pconfig: Detect PCONFIG targets
x86/tme: Detect if TME and MKTME is activated by BIOS
x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G
x86/boot/compressed/64: Use page table in trampoline memory
x86/boot/compressed/64: Use stack from trampoline memory
x86/boot/compressed/64: Make sure we have a 32-bit code segment
x86/mm: Do not use paravirtualized calls in native_set_p4d()
kdump, vmcoreinfo: Export pgtable_l5_enabled value
x86/boot/compressed/64: Prepare new top-level page table for trampoline
x86/boot/compressed/64: Set up trampoline memory
x86/boot/compressed/64: Save and restore trampoline memory
...
Diffstat (limited to 'arch')
57 files changed, 1619 insertions, 970 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cb5b5907dbd6..518b41b097dc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1461,6 +1461,8 @@ config X86_PAE config X86_5LEVEL bool "Enable 5-level page tables support" + select DYNAMIC_MEMORY_LAYOUT + select SPARSEMEM_VMEMMAP depends on X86_64 ---help--- 5-level paging enables access to larger address space: @@ -1469,8 +1471,8 @@ config X86_5LEVEL It will be supported by future Intel CPUs. - Note: a kernel with this option enabled can only be booted - on machines that support the feature. + A kernel with the option enabled can be booted on machines that + support 4- or 5-level paging. See Documentation/x86/x86_64/5level-paging.txt for more information. @@ -1595,10 +1597,6 @@ config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM -config NEED_NODE_MEMMAP_SIZE - def_bool y - depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) - config ARCH_FLATMEM_ENABLE def_bool y depends on X86_32 && !NUMA @@ -2174,10 +2172,17 @@ config PHYSICAL_ALIGN Don't change this unless you know what you are doing. +config DYNAMIC_MEMORY_LAYOUT + bool + ---help--- + This option makes base addresses of vmalloc and vmemmap as well as + __PAGE_OFFSET movable during boot. + config RANDOMIZE_MEMORY bool "Randomize the kernel memory sections" depends on X86_64 depends on RANDOMIZE_BASE + select DYNAMIC_MEMORY_LAYOUT default RANDOMIZE_BASE ---help--- Randomizes the base virtual address of kernel memory sections diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f484ae0ece93..fa42f895fdde 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -78,7 +78,7 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 - vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o + vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o vmlinux-objs-y += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o endif diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fc313e29fe2c..fca012baba19 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -33,6 +33,7 @@ #include <asm/processor-flags.h> #include <asm/asm-offsets.h> #include <asm/bootparam.h> +#include "pgtable.h" /* * Locally defined symbols should be marked hidden: @@ -304,55 +305,77 @@ ENTRY(startup_64) /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp -#ifdef CONFIG_X86_5LEVEL /* - * Check if we need to enable 5-level paging. - * RSI holds real mode data and need to be preserved across - * a function call. + * At this point we are in long mode with 4-level paging enabled, + * but we might want to enable 5-level paging or vice versa. + * + * The problem is that we cannot do it directly. Setting or clearing + * CR4.LA57 in long mode would trigger #GP. So we need to switch off + * long mode and paging first. + * + * We also need a trampoline in lower memory to switch over from + * 4- to 5-level paging for cases when the bootloader puts the kernel + * above 4G, but didn't enable 5-level paging for us. + * + * The same trampoline can be used to switch from 5- to 4-level paging + * mode, like when starting 4-level paging kernel via kexec() when + * original kernel worked in 5-level paging mode. + * + * For the trampoline, we need the top page table to reside in lower + * memory as we don't have a way to load 64-bit values into CR3 in + * 32-bit mode. + * + * We go though the trampoline even if we don't have to: if we're + * already in a desired paging mode. This way the trampoline code gets + * tested on every boot. */ - pushq %rsi - call l5_paging_required - popq %rsi - /* If l5_paging_required() returned zero, we're done here. */ - cmpq $0, %rax - je lvl5 + /* Make sure we have GDT with 32-bit code segment */ + leaq gdt(%rip), %rax + movq %rax, gdt64+2(%rip) + lgdt gdt64(%rip) /* - * At this point we are in long mode with 4-level paging enabled, - * but we want to enable 5-level paging. + * paging_prepare() sets up the trampoline and checks if we need to + * enable 5-level paging. * - * The problem is that we cannot do it directly. Setting LA57 in - * long mode would trigger #GP. So we need to switch off long mode - * first. + * Address of the trampoline is returned in RAX. + * Non zero RDX on return means we need to enable 5-level paging. * - * NOTE: This is not going to work if bootloader put us above 4G - * limit. - * - * The first step is go into compatibility mode. + * RSI holds real mode data and needs to be preserved across + * this function call. */ + pushq %rsi + call paging_prepare + popq %rsi - /* Clear additional page table */ - leaq lvl5_pgtable(%rbx), %rdi - xorq %rax, %rax - movq $(PAGE_SIZE/8), %rcx - rep stosq + /* Save the trampoline address in RCX */ + movq %rax, %rcx /* - * Setup current CR3 as the first and only entry in a new top level - * page table. + * Load the address of trampoline_return() into RDI. + * It will be used by the trampoline to return to the main code. */ - movq %cr3, %rdi - leaq 0x7 (%rdi), %rax - movq %rax, lvl5_pgtable(%rbx) + leaq trampoline_return(%rip), %rdi /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ pushq $__KERNEL32_CS - leaq compatible_mode(%rip), %rax + leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax pushq %rax lretq -lvl5: -#endif +trampoline_return: + /* Restore the stack, the 32-bit trampoline uses its own stack */ + leaq boot_stack_end(%rbx), %rsp + + /* + * cleanup_trampoline() would restore trampoline memory. + * + * RSI holds real mode data and needs to be preserved across + * this function call. + */ + pushq %rsi + call cleanup_trampoline + popq %rsi /* Zero EFLAGS */ pushq $0 @@ -490,46 +513,82 @@ relocated: jmp *%rax .code32 -#ifdef CONFIG_X86_5LEVEL -compatible_mode: - /* Setup data and stack segments */ +/* + * This is the 32-bit trampoline that will be copied over to low memory. + * + * RDI contains the return address (might be above 4G). + * ECX contains the base address of the trampoline memory. + * Non zero RDX on return means we need to enable 5-level paging. + */ +ENTRY(trampoline_32bit_src) + /* Set up data and stack segments */ movl $__KERNEL_DS, %eax movl %eax, %ds movl %eax, %ss + /* Set up new stack */ + leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp + /* Disable paging */ movl %cr0, %eax btrl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 - /* Point CR3 to 5-level paging */ - leal lvl5_pgtable(%ebx), %eax - movl %eax, %cr3 + /* Check what paging mode we want to be in after the trampoline */ + cmpl $0, %edx + jz 1f - /* Enable PAE and LA57 mode */ + /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */ + movl %cr4, %eax + testl $X86_CR4_LA57, %eax + jnz 3f + jmp 2f +1: + /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */ movl %cr4, %eax - orl $(X86_CR4_PAE | X86_CR4_LA57), %eax + testl $X86_CR4_LA57, %eax + jz 3f +2: + /* Point CR3 to the trampoline's new top level page table */ + leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax + movl %eax, %cr3 +3: + /* Enable PAE and LA57 (if required) paging modes */ + movl $X86_CR4_PAE, %eax + cmpl $0, %edx + jz 1f + orl $X86_CR4_LA57, %eax +1: movl %eax, %cr4 - /* Calculate address we are running at */ - call 1f -1: popl %edi - subl $1b, %edi + /* Calculate address of paging_enabled() once we are executing in the trampoline */ + leal paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax - /* Prepare stack for far return to Long Mode */ + /* Prepare the stack for far return to Long Mode */ pushl $__KERNEL_CS - leal lvl5(%edi), %eax - push %eax + pushl %eax - /* Enable paging back */ + /* Enable paging again */ movl $(X86_CR0_PG | X86_CR0_PE), %eax movl %eax, %cr0 lret -#endif + .code64 +paging_enabled: + /* Return from the trampoline */ + jmp *%rdi + + /* + * The trampoline code has a size limit. + * Make sure we fail to compile if the trampoline code grows + * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. + */ + .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE + + .code32 no_longmode: - /* This isn't an x86-64 CPU so hang */ + /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: hlt jmp 1b @@ -537,6 +596,11 @@ no_longmode: #include "../../kernel/verify_cpu.S" .data +gdt64: + .word gdt_end - gdt + .long 0 + .word 0 + .quad 0 gdt: .word gdt_end - gdt .long gdt @@ -585,7 +649,3 @@ boot_stack_end: .balign 4096 pgtable: .fill BOOT_PGT_SIZE, 1, 0 -#ifdef CONFIG_X86_5LEVEL -lvl5_pgtable: - .fill PAGE_SIZE, 1, 0 -#endif diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 8199a6187251..66e42a098d70 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -46,6 +46,12 @@ #define STATIC #include <linux/decompress/mm.h> +#ifdef CONFIG_X86_5LEVEL +unsigned int pgtable_l5_enabled __ro_after_init; +unsigned int pgdir_shift __ro_after_init = 39; +unsigned int ptrs_per_p4d __ro_after_init = 1; +#endif + extern unsigned long get_cmd_line_ptr(void); /* Simplified build-specific string for starting entropy. */ @@ -723,6 +729,14 @@ void choose_random_location(unsigned long input, return; } +#ifdef CONFIG_X86_5LEVEL + if (__read_cr4() & X86_CR4_LA57) { + pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; + } +#endif + boot_params->hdr.loadflags |= KASLR_FLAG; /* Prepare to add new identity pagetables on demand. */ diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/kaslr_64.c index b5e5e02f8cde..522d11431433 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/kaslr_64.c @@ -16,13 +16,6 @@ #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x))) -/* - * The pgtable.h and mm/ident_map.c includes make use of the SME related - * information which is not used in the compressed image support. Un-define - * the SME support to avoid any compile and link errors. - */ -#undef CONFIG_AMD_MEM_ENCRYPT - /* No PAGE_TABLE_ISOLATION support needed either: */ #undef CONFIG_PAGE_TABLE_ISOLATION @@ -85,13 +78,14 @@ static struct x86_mapping_info mapping_info; /* Locates and clears a region for a new top level page table. */ void initialize_identity_maps(void) { - unsigned long sev_me_mask = get_sev_encryption_mask(); + /* If running as an SEV guest, the encryption mask is required. */ + set_sev_encryption_mask(); /* Init mapping_info with run-time function/buffer pointers. */ mapping_info.alloc_pgt_page = alloc_pgt_page; mapping_info.context = &pgt_data; - mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask; - mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask; + mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; + mapping_info.kernpg_flag = _KERNPG_TABLE; /* * It should be impossible for this not to already be true, diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index 54f5f6625a73..eaa843a52907 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -88,9 +88,7 @@ ENTRY(get_sev_encryption_bit) ENDPROC(get_sev_encryption_bit) .code64 -ENTRY(get_sev_encryption_mask) - xor %rax, %rax - +ENTRY(set_sev_encryption_mask) #ifdef CONFIG_AMD_MEM_ENCRYPT push %rbp push %rdx @@ -101,9 +99,7 @@ ENTRY(get_sev_encryption_mask) testl %eax, %eax jz .Lno_sev_mask - xor %rdx, %rdx - bts %rax, %rdx /* Create the encryption mask */ - mov %rdx, %rax /* ... and return it */ + bts %rax, sme_me_mask(%rip) /* Create the encryption mask */ .Lno_sev_mask: movq %rbp, %rsp /* Restore original stack pointer */ @@ -112,9 +108,16 @@ ENTRY(get_sev_encryption_mask) pop %rbp #endif + xor %rax, %rax ret -ENDPROC(get_sev_encryption_mask) +ENDPROC(set_sev_encryption_mask) .data enc_bit: .int 0xffffffff + +#ifdef CONFIG_AMD_MEM_ENCRYPT + .balign 8 +GLOBAL(sme_me_mask) + .quad 0 +#endif diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 252fee320816..8dd1d5ccae58 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -14,6 +14,7 @@ #include "misc.h" #include "error.h" +#include "pgtable.h" #include "../string.h" #include "../voffset.h" @@ -169,16 +170,6 @@ void __puthex(unsigned long value) } } -static bool l5_supported(void) -{ - /* Check if leaf 7 is supported. */ - if (native_cpuid_eax(0) < 7) - return 0; - - /* Check if la57 is supported. */ - return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)); -} - #if CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len, unsigned long virt_addr) @@ -376,12 +367,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, console_init(); debug_putstr("early console in extract_kernel\n"); - if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) { - error("This linux kernel as configured requires 5-level paging\n" - "This CPU does not support the required 'cr4.la57' feature\n" - "Unable to boot - please use a kernel appropriate for your CPU\n"); - } - free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; @@ -392,6 +377,11 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, debug_putaddr(output_len); debug_putaddr(kernel_total_size); +#ifdef CONFIG_X86_64 + /* Report address of 32-bit trampoline */ + debug_putaddr(trampoline_32bit); +#endif + /* * The memory hole needed for the kernel is the larger of either * the entire decompressed kernel plus relocation table, or the diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 9d323dc6b159..9e11be4cae19 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -12,6 +12,11 @@ #undef CONFIG_PARAVIRT_SPINLOCKS #undef CONFIG_KASAN +#ifdef CONFIG_X86_5LEVEL +/* cpu_feature_enabled() cannot be used that early */ +#define pgtable_l5_enabled __pgtable_l5_enabled +#endif + #include <linux/linkage.h> #include <linux/screen_info.h> #include <linux/elf.h> @@ -109,6 +114,6 @@ static inline void console_init(void) { } #endif -unsigned long get_sev_encryption_mask(void); +void set_sev_encryption_mask(void); #endif diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h new file mode 100644 index 000000000000..91f75638f6e6 --- /dev/null +++ b/arch/x86/boot/compressed/pgtable.h @@ -0,0 +1,20 @@ +#ifndef BOOT_COMPRESSED_PAGETABLE_H +#define BOOT_COMPRESSED_PAGETABLE_H + +#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) + +#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 + +#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE +#define TRAMPOLINE_32BIT_CODE_SIZE 0x60 + +#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE + +#ifndef __ASSEMBLER__ + +extern unsigned long *trampoline_32bit; + +extern void trampoline_32bit_src(void *return_ptr); + +#endif /* __ASSEMBLER__ */ +#endif /* BOOT_COMPRESSED_PAGETABLE_H */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index b4469a37e9a1..32af1cbcd903 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,4 +1,6 @@ #include <asm/processor.h> +#include "pgtable.h" +#include "../string.h" /* * __force_order is used by special_insns.h asm code to force instruction @@ -9,20 +11,144 @@ */ unsigned long __force_order; -int l5_paging_required(void) +#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ +#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ + +struct paging_config { + unsigned long trampoline_start; + unsigned long l5_required; +}; + +/* Buffer to preserve trampoline memory */ +static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; + +/* + * The page table is going to be used instead of page table in the trampoline + * memory. + * + * It must not be in BSS as BSS is cleared after cleanup_trampoline(). + */ +static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data); + +/* + * Trampoline address will be printed by extract_kernel() for debugging + * purposes. + * + * Avoid putting the pointer into .bss as it will be cleared between + * paging_prepare() and extract_kernel(). + */ +unsigned long *trampoline_32bit __section(.data); + +struct paging_config paging_prepare(void) { - /* Check if leaf 7 is supported. */ + struct paging_config paging_config = {}; + unsigned long bios_start, ebda_start; + + /* + * Check if LA57 is desired and supported. + * + * There are two parts to the check: + * - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y + * - if the machine supports 5-level paging: + * + CPUID leaf 7 is supported + * + the leaf has the feature bit set + * + * That's substitute for boot_cpu_has() in early boot code. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL) && + native_cpuid_eax(0) >= 7 && + (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) { + paging_config.l5_required = 1; + } + + /* + * Find a suitable spot for the trampoline. + * This code is based on reserve_bios_regions(). + */ + + ebda_start = *(unsigned short *)0x40e << 4; + bios_start = *(unsigned short *)0x413 << 10; - if (native_cpuid_eax(0) < 7) - return 0; + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) + bios_start = BIOS_START_MAX; + + if (ebda_start > BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; + + /* Place the trampoline just below the end of low memory, aligned to 4k */ + paging_config.trampoline_start = bios_start - TRAMPOLINE_32BIT_SIZE; + paging_config.trampoline_start = round_down(paging_config.trampoline_start, PAGE_SIZE); + + trampoline_32bit = (unsigned long *)paging_config.trampoline_start; + + /* Preserve trampoline memory */ + memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE); + + /* Clear trampoline memory first */ + memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE); + + /* Copy trampoline code in place */ + memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), + &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE); + + /* + * The code below prepares page table in trampoline memory. + * + * The new page table will be used by trampoline code for switching + * from 4- to 5-level paging or vice versa. + * + * If switching is not required, the page table is unused: trampoline + * code wouldn't touch CR3. + */ + + /* + * We are not going to use the page table in trampoline memory if we + * are already in the desired paging mode. + */ + if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57)) + goto out; + + if (paging_config.l5_required) { + /* + * For 4- to 5-level paging transition, set up current CR3 as + * the first and the only entry in a new top-level page table. + */ + trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC; + } else { + unsigned long src; + + /* + * For 5- to 4-level paging transition, copy page table pointed + * by first entry in the current top-level page table as our + * new top-level page table. + * + * We cannot just point to the page table from trampoline as it + * may be above 4G. + */ + src = *(unsigned long *)__native_read_cr3() & PAGE_MASK; + memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long), + (void *)src, PAGE_SIZE); + } + +out: + return paging_config; +} + +void cleanup_trampoline(void) +{ + void *trampoline_pgtable; - /* Check if la57 is supported. */ - if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) - return 0; + trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET; - /* Check if 5-level paging has already been enabled. */ - if (native_read_cr4() & X86_CR4_LA57) - return 0; + /* + * Move the top level page table out of trampoline memory, + * if it's there. + */ + if ((void *)__native_read_cr3() == trampoline_pgtable) { + memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE); + native_write_cr3((unsigned long)top_pgtable); + } - return 1; + /* Restore trampoline memory */ + memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE); } diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 18ed349b4f83..936e19642eab 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -260,8 +260,13 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) * Change top bits to match most significant bit (47th or 56th bit * depending on paging mode) in the address. */ +#ifdef CONFIG_X86_5LEVEL + ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \ + "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57 +#else shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx +#endif /* If this changed %rcx, it was not canonical */ cmpq %rcx, %r11 diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 11881726ed37..a303d7b7d763 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -31,6 +31,7 @@ #include <asm/mmu.h> #include <asm/mpspec.h> #include <asm/realmode.h> +#include <asm/x86_init.h> #ifdef CONFIG_ACPI_APEI # include <asm/pgtable_types.h> @@ -133,6 +134,14 @@ static inline bool acpi_has_cpu_in_madt(void) return !!acpi_lapic; } +#define ACPI_HAVE_ARCH_GET_ROOT_POINTER +static inline u64 acpi_arch_get_root_pointer(void) +{ + return x86_init.acpi.get_root_pointer(); +} + +void acpi_generic_reduced_hw_init(void); + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 @@ -142,6 +151,8 @@ static inline void acpi_noirq_set(void) { } static inline void acpi_disable_pci(void) { } static inline void disable_acpi(void) { } +static inline void acpi_generic_reduced_hw_init(void) { } + #endif /* !CONFIG_ACPI */ #define ARCH_HAS_POWER_INIT 1 diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h new file mode 100644 index 000000000000..3cb002b1d0f9 --- /dev/null +++ b/arch/x86/include/asm/intel_pconfig.h @@ -0,0 +1,65 @@ +#ifndef _ASM_X86_INTEL_PCONFIG_H +#define _ASM_X86_INTEL_PCONFIG_H + +#include <asm/asm.h> +#include <asm/processor.h> + +enum pconfig_target { + INVALID_TARGET = 0, + MKTME_TARGET = 1, + PCONFIG_TARGET_NR +}; + +int pconfig_target_supported(enum pconfig_target target); + +enum pconfig_leaf { + MKTME_KEY_PROGRAM = 0, + PCONFIG_LEAF_INVALID, +}; + +#define PCONFIG ".byte 0x0f, 0x01, 0xc5" + +/* Defines and structure for MKTME_KEY_PROGRAM of PCONFIG instruction */ + +/* mktme_key_program::keyid_ctrl COMMAND, bits [7:0] */ +#define MKTME_KEYID_SET_KEY_DIRECT 0 +#define MKTME_KEYID_SET_KEY_RANDOM 1 +#define MKTME_KEYID_CLEAR_KEY 2 +#define MKTME_KEYID_NO_ENCRYPT 3 + +/* mktme_key_program::keyid_ctrl ENC_ALG, bits [23:8] */ +#define MKTME_AES_XTS_128 (1 << 8) + +/* Return codes from the PCONFIG MKTME_KEY_PROGRAM */ +#define MKTME_PROG_SUCCESS 0 +#define MKTME_INVALID_PROG_CMD 1 +#define MKTME_ENTROPY_ERROR 2 +#define MKTME_INVALID_KEYID 3 +#define MKTME_INVALID_ENC_ALG 4 +#define MKTME_DEVICE_BUSY 5 + +/* Hardware requires the structure to be 256 byte alinged. Otherwise #GP(0). */ +struct mktme_key_program { + u16 keyid; + u32 keyid_ctrl; + u8 __rsvd[58]; + u8 key_field_1[64]; + u8 key_field_2[64]; +} __packed __aligned(256); + +static inline int mktme_key_program(struct mktme_key_program *key_program) +{ + unsigned long rax = MKTME_KEY_PROGRAM; + + if (!pconfig_target_supported(MKTME_TARGET)) + return -ENXIO; + + asm volatile(PCONFIG + : "=a" (rax), "=b" (key_program) + : "0" (rax), "1" (key_program) + : "memory", "cc"); + + return rax; +} + +#endif /* _ASM_X86_INTEL_PCONFIG_H */ diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index 460991e3b529..db7ba2feb947 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -5,10 +5,6 @@ unsigned long kaslr_get_random_long(const char *purpose); #ifdef CONFIG_RANDOMIZE_MEMORY -extern unsigned long page_offset_base; -extern unsigned long vmalloc_base; -extern unsigned long vmemmap_base; - void kernel_randomize_memory(void); #else static inline void kernel_randomize_memory(void) { } diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 22c5f3e6f820..8fe61ad21047 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -22,6 +22,7 @@ #ifdef CONFIG_AMD_MEM_ENCRYPT extern u64 sme_me_mask; +extern bool sev_enabled; void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, unsigned long decrypted_kernel_vaddr, diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 9ca8dae9c716..939b1cff4a7b 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -11,6 +11,10 @@ extern unsigned long max_pfn; extern unsigned long phys_base; +extern unsigned long page_offset_base; +extern unsigned long vmalloc_base; +extern unsigned long vmemmap_base; + static inline unsigned long __phys_addr_nodebug(unsigned long x) { unsigned long y = x - __START_KERNEL_map; diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index e1407312c412..2c5a966dc222 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -37,26 +37,24 @@ * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's * what Xen requires. */ -#ifdef CONFIG_X86_5LEVEL -#define __PAGE_OFFSET_BASE _AC(0xff10000000000000, UL) -#else -#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL) -#endif +#define __PAGE_OFFSET_BASE_L5 _AC(0xff10000000000000, UL) +#define __PAGE_OFFSET_BASE_L4 _AC(0xffff880000000000, UL) -#ifdef CONFIG_RANDOMIZE_MEMORY +#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT #define __PAGE_OFFSET page_offset_base #else -#define __PAGE_OFFSET __PAGE_OFFSET_BASE -#endif /* CONFIG_RANDOMIZE_MEMORY */ +#define __PAGE_OFFSET __PAGE_OFFSET_BASE_L4 +#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ -#ifdef CONFIG_X86_5LEVEL + #define __PHYSICAL_MASK_SHIFT 52 -#define __VIRTUAL_MASK_SHIFT 56 + +#ifdef CONFIG_X86_5LEVEL +#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled ? 56 : 47) #else -#define __PHYSICAL_MASK_SHIFT 46 #define __VIRTUAL_MASK_SHIFT 47 #endif diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c83a2f418cea..9be2bf13825b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -568,17 +568,22 @@ static inline p4dval_t p4d_val(p4d_t p4d) return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d); } -static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) +static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) { - pgdval_t val = native_pgd_val(pgd); - - PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val); + PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd)); } -static inline void pgd_clear(pgd_t *pgdp) -{ - set_pgd(pgdp, __pgd(0)); -} +#define set_pgd(pgdp, pgdval) do { \ + if (pgtable_l5_enabled) \ + __set_pgd(pgdp, pgdval); \ + else \ + set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \ +} while (0) + +#define pgd_clear(pgdp) do { \ + if (pgtable_l5_enabled) \ + set_pgd(pgdp, __pgd(0)); \ +} while (0) #endif /* CONFIG_PGTABLE_LEVELS == 5 */ diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index aff42e1da6ee..263c142a6a6c 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -167,6 +167,8 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, #if CONFIG_PGTABLE_LEVELS > 4 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) { + if (!pgtable_l5_enabled) + return; paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } @@ -191,7 +193,8 @@ extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long address) { - ___p4d_free_tlb(tlb, p4d); + if (pgtable_l5_enabled) + ___p4d_free_tlb(tlb, p4d); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 876b4c77d983..6a59a6d0cc50 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -44,5 +44,6 @@ typedef union { */ #define PTRS_PER_PTE 512 +#define MAX_POSSIBLE_PHYSMEM_BITS 36 #endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b444d83cfc95..89d5c8886c85 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -65,7 +65,7 @@ extern pmdval_t early_pmd_flags; #ifndef __PAGETABLE_P4D_FOLDED #define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) -#define pgd_clear(pgd) native_pgd_clear(pgd) +#define pgd_clear(pgd) (pgtable_l5_enabled ? native_pgd_clear(pgd) : 0) #endif #ifndef set_p4d @@ -859,6 +859,8 @@ static inline unsigned long p4d_index(unsigned long address) #if CONFIG_PGTABLE_LEVELS > 4 static inline int pgd_present(pgd_t pgd) { + if (!pgtable_l5_enabled) + return 1; return pgd_flags(pgd) & _PAGE_PRESENT; } @@ -876,6 +878,8 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) /* to find an entry in a page-table-directory. */ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { + if (!pgtable_l5_enabled) + return (p4d_t *)pgd; return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address); } @@ -883,6 +887,9 @@ static inline int pgd_bad(pgd_t pgd) { unsigned long ignore_flags = _PAGE_USER; + if (!pgtable_l5_enabled) + return 0; + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; @@ -891,6 +898,8 @@ static inline int pgd_bad(pgd_t pgd) static inline int pgd_none(pgd_t pgd) { + if (!pgtable_l5_enabled) + return 0; /* * There is no need to do a workaround for the KNL stray * A/D bit erratum here. PGDs only point to page tables diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index b3ec519e3982..88a056b01db4 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -34,6 +34,8 @@ static inline void check_pgt_cache(void) { } void paging_init(void); void sync_initial_page_table(void); +static inline int pgd_large(pgd_t pgd) { return 0; } + /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index 0777e18a1d23..e3225e83db7d 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -15,6 +15,8 @@ # include <asm/pgtable-2level_types.h> #endif +#define pgtable_l5_enabled 0 + #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 1149d2112b2e..877bc27718ae 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -218,29 +218,26 @@ static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { -#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL) - p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd); -#else - *p4dp = p4d; -#endif + pgd_t pgd; + + if (pgtable_l5_enabled || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) { + *p4dp = p4d; + return; + } + + pgd = native_make_pgd(native_p4d_val(p4d)); + pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd); + *p4dp = native_make_p4d(native_pgd_val(pgd)); } static inline void native_p4d_clear(p4d_t *p4d) { -#ifdef CONFIG_X86_5LEVEL native_set_p4d(p4d, native_make_p4d(0)); -#else - native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)}); -#endif } static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION *pgdp = pti_set_user_pgd(pgdp, pgd); -#else - *pgdp = pgd; -#endif } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 6b8f73dcbc2c..d5c21a382475 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -20,6 +20,18 @@ typedef unsigned long pgprotval_t; typedef struct { pteval_t pte; } pte_t; +#ifdef CONFIG_X86_5LEVEL +extern unsigned int __pgtable_l5_enabled; +#ifndef pgtable_l5_enabled +#define pgtable_l5_enabled cpu_feature_enabled(X86_FEATURE_LA57) +#endif +#else +#define pgtable_l5_enabled 0 +#endif + +extern unsigned int pgdir_shift; +extern unsigned int ptrs_per_p4d; + #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 @@ -29,24 +41,28 @@ typedef struct { pteval_t pte; } pte_t; /* * PGDIR_SHIFT determines what a top-level page table entry can map */ -#define PGDIR_SHIFT 48 +#define PGDIR_SHIFT pgdir_shift #define PTRS_PER_PGD 512 /* * 4th level page in 5-level paging case */ -#define P4D_SHIFT 39 -#define PTRS_PER_P4D 512 -#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) -#define P4D_MASK (~(P4D_SIZE - 1)) +#define P4D_SHIFT 39 +#define MAX_PTRS_PER_P4D 512 +#define PTRS_PER_P4D ptrs_per_p4d +#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) +#define P4D_MASK (~(P4D_SIZE - 1)) + +#define MAX_POSSIBLE_PHYSMEM_BITS 52 #else /* CONFIG_X86_5LEVEL */ /* * PGDIR_SHIFT determines what a top-level page table entry can map */ -#define PGDIR_SHIFT 39 -#define PTRS_PER_PGD 512 +#define PGDIR_SHIFT 39 +#define PTRS_PER_PGD 512 +#define MAX_PTRS_PER_P4D 1 #endif /* CONFIG_X86_5LEVEL */ @@ -82,31 +98,33 @@ typedef struct { pteval_t pte; } pte_t; * range must not overlap with anything except the KASAN shadow area, which * is correct as KASAN disables KASLR. */ -#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define MAXMEM (1UL << MAX_PHYSMEM_BITS) -#ifdef CONFIG_X86_5LEVEL -# define VMALLOC_SIZE_TB _AC(12800, UL) -# define __VMALLOC_BASE _AC(0xffa0000000000000, UL) -# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) -# define LDT_PGD_ENTRY _AC(-112, UL) -# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) -#else -# define VMALLOC_SIZE_TB _AC(32, UL) -# define __VMALLOC_BASE _AC(0xffffc90000000000, UL) -# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) -# define LDT_PGD_ENTRY _AC(-3, UL) -# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) -#endif +#define LDT_PGD_ENTRY_L4 -3UL +#define LDT_PGD_ENTRY_L5 -112UL +#define LDT_PGD_ENTRY (pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) +#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) + +#define __VMALLOC_BASE_L4 0xffffc90000000000 +#define __VMALLOC_BASE_L5 0xffa0000000000000 + +#define VMALLOC_SIZE_TB_L4 32UL +#define VMALLOC_SIZE_TB_L5 12800UL + +#define __VMEMMAP_BASE_L4 0xffffea0000000000 +#define __VMEMMAP_BASE_L5 0xffd4000000000000 -#ifdef CONFIG_RANDOMIZE_MEMORY +#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT # define VMALLOC_START vmalloc_base +# define VMALLOC_SIZE_TB (pgtable_l5_enabled ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4) # define VMEMMAP_START vmemmap_base #else -# define VMALLOC_START __VMALLOC_BASE -# define VMEMMAP_START __VMEMMAP_BASE -#endif /* CONFIG_RANDOMIZE_MEMORY */ +# define VMALLOC_START __VMALLOC_BASE_L4 +# define VMALLOC_SIZE_TB VMALLOC_SIZE_TB_L4 +# define VMEMMAP_START __VMEMMAP_BASE_L4 +#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ -#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) +#define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index fb3a6de7440b..6847d85400a8 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -53,12 +53,6 @@ # define NEED_MOVBE 0 #endif -#ifdef CONFIG_X86_5LEVEL -# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31)) -#else -# define NEED_LA57 0 -#endif - #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT /* Paravirtualized systems may not have PSE or PGE available */ @@ -104,7 +98,7 @@ #define REQUIRED_MASK13 0 #define REQUIRED_MASK14 0 #define REQUIRED_MASK15 0 -#define REQUIRED_MASK16 (NEED_LA57) +#define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 #define REQUIRED_MASK18 0 #define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 4fc1e9d3c43e..4617a2bf123c 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -27,13 +27,8 @@ # endif #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ -# ifdef CONFIG_X86_5LEVEL -# define MAX_PHYSADDR_BITS 52 -# define MAX_PHYSMEM_BITS 52 -# else -# define MAX_PHYSADDR_BITS 44 -# define MAX_PHYSMEM_BITS 46 -# endif +# define MAX_PHYSADDR_BITS (pgtable_l5_enabled ? 52 : 44) +# define MAX_PHYSMEM_BITS (pgtable_l5_enabled ? 52 : 46) #endif #endif /* CONFIG_SPARSEMEM */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 88306054bd98..199e15bd3ec5 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -131,6 +131,16 @@ struct x86_hyper_init { }; /** + * struct x86_init_acpi - x86 ACPI init functions + * @get_root_pointer: get RSDP address + * @reduced_hw_early_init: hardware reduced platform early init + */ +struct x86_init_acpi { + u64 (*get_root_pointer)(void); + void (*reduced_hw_early_init)(void); +}; + +/** * struct x86_init_ops - functions for platform specific setup * */ @@ -144,6 +154,7 @@ struct x86_init_ops { struct x86_init_iommu iommu; struct x86_init_pci pci; struct x86_hyper_init hyper; + struct x86_init_acpi acpi; }; /** diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2aa92094b59d..7a37d9357bc4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1376,17 +1376,21 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) * * We initialize the Hardware-reduced ACPI model here: */ +void __init acpi_generic_reduced_hw_init(void) +{ + /* + * Override x86_init functions and bypass legacy PIC in + * hardware reduced ACPI mode. + */ + x86_init.timers.timer_init = x86_init_noop; + x86_init.irqs.pre_vector_init = x86_init_noop; + legacy_pic = &null_legacy_pic; +} + static void __init acpi_reduced_hw_init(void) { - if (acpi_gbl_reduced_hardware) { - /* - * Override x86_init functions and bypass legacy pic - * in Hardware-reduced ACPI mode - */ - x86_init.timers.timer_init = x86_init_noop; - x86_init.irqs.pre_vector_init = x86_init_noop; - legacy_pic = &null_legacy_pic; - } + if (acpi_gbl_reduced_hardware) + x86_init.acpi.reduced_hw_early_init(); } /* diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 570e8bb1f386..a66229f51b12 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -28,7 +28,7 @@ obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c3af167d0a70..b9693b80fc21 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -509,6 +509,90 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c) } } +#define MSR_IA32_TME_ACTIVATE 0x982 + +/* Helpers to access TME_ACTIVATE MSR */ +#define TME_ACTIVATE_LOCKED(x) (x & 0x1) +#define TME_ACTIVATE_ENABLED(x) (x & 0x2) + +#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ +#define TME_ACTIVATE_POLICY_AES_XTS_128 0 + +#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ + +#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ +#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 + +/* Values for mktme_status (SW only construct) */ +#define MKTME_ENABLED 0 +#define MKTME_DISABLED 1 +#define MKTME_UNINITIALIZED 2 +static int mktme_status = MKTME_UNINITIALIZED; + +static void detect_tme(struct cpuinfo_x86 *c) +{ + u64 tme_activate, tme_policy, tme_crypto_algs; + int keyid_bits = 0, nr_keyids = 0; + static u64 tme_activate_cpu0 = 0; + + rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + + if (mktme_status != MKTME_UNINITIALIZED) { + if (tme_activate != tme_activate_cpu0) { + /* Broken BIOS? */ + pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); + pr_err_once("x86/tme: MKTME is not usable\n"); + mktme_status = MKTME_DISABLED; + + /* Proceed. We may need to exclude bits from x86_phys_bits. */ + } + } else { + tme_activate_cpu0 = tme_activate; + } + + if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { + pr_info_once("x86/tme: not enabled by BIOS\n"); + mktme_status = MKTME_DISABLED; + return; + } + + if (mktme_status != MKTME_UNINITIALIZED) + goto detect_keyid_bits; + + pr_info("x86/tme: enabled by BIOS\n"); + + tme_policy = TME_ACTIVATE_POLICY(tme_activate); + if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) + pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); + + tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); + if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { + pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", + tme_crypto_algs); + mktme_status = MKTME_DISABLED; + } +detect_keyid_bits: + keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); + nr_keyids = (1UL << keyid_bits) - 1; + if (nr_keyids) { + pr_info_once("x86/mktme: enabled by BIOS\n"); + pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); + } else { + pr_info_once("x86/mktme: disabled by BIOS\n"); + } + + if (mktme_status == MKTME_UNINITIALIZED) { + /* MKTME is usable */ + mktme_status = MKTME_ENABLED; + } + + /* + * KeyID bits effectively lower the number of physical address + * bits. Update cpuinfo_x86::x86_phys_bits accordingly. + */ + c->x86_phys_bits -= keyid_bits; +} + static void init_intel_energy_perf(struct cpuinfo_x86 *c) { u64 epb; @@ -679,6 +763,9 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); + if (cpu_has(c, X86_FEATURE_TME)) + detect_tme(c); + init_intel_energy_perf(c); init_intel_misc_features(c); diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c new file mode 100644 index 000000000000..0771a905b286 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_pconfig.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel PCONFIG instruction support. + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Kirill A. Shutemov <kirill.shutemov@linux.intel.com> + */ + +#include <asm/cpufeature.h> +#include <asm/intel_pconfig.h> + +#define PCONFIG_CPUID 0x1b + +#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1) + +/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */ +enum { + PCONFIG_CPUID_SUBLEAF_INVALID = 0, + PCONFIG_CPUID_SUBLEAF_TARGETID = 1, +}; + +/* Bitmask of supported targets */ +static u64 targets_supported __read_mostly; + +int pconfig_target_supported(enum pconfig_target target) +{ + /* + * We would need to re-think the implementation once we get > 64 + * PCONFIG targets. Spec allows up to 2^32 targets. + */ + BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64); + + if (WARN_ON_ONCE(target >= 64)) + return 0; + return targets_supported & (1ULL << target); +} + +static int __init intel_pconfig_init(void) +{ + int subleaf; + + if (!boot_cpu_has(X86_FEATURE_PCONFIG)) + return 0; + + /* + * Scan subleafs of PCONFIG CPUID leaf. + * + * Subleafs of the same type need not to be consecutive. + * + * Stop on the first invalid subleaf type. All subleafs after the first + * invalid are invalid too. + */ + for (subleaf = 0; subleaf < INT_MAX; subleaf++) { + struct cpuid_regs regs; + + cpuid_count(PCONFIG_CPUID, subleaf, + ®s.eax, ®s.ebx, ®s.ecx, ®s.edx); + + switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) { + case PCONFIG_CPUID_SUBLEAF_INVALID: + /* Stop on the first invalid subleaf */ + goto out; + case PCONFIG_CPUID_SUBLEAF_TARGETID: + /* Mark supported PCONFIG targets */ + if (regs.ebx < 64) + targets_supported |= (1ULL << regs.ebx); + if (regs.ecx < 64) + targets_supported |= (1ULL << regs.ecx); + if (regs.edx < 64) + targets_supported |= (1ULL << regs.edx); + break; + default: + /* Unknown CPUID.PCONFIG subleaf: ignore */ + break; + } + } +out: + return 0; +} +arch_initcall(intel_pconfig_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3c1eec17312b..42cf2880d0ed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1095,19 +1095,7 @@ static void mce_unmap_kpfn(unsigned long pfn) * a legal address. */ -/* - * Build time check to see if we have a spare virtual bit. Don't want - * to leave this until run time because most developers don't have a - * system that can exercise this code path. This will only become a - * problem if/when we move beyond 5-level page tables. - * - * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) - */ -#if PGDIR_SHIFT + 9 < 63 decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); -#else -#error "no unused virtual bit available" -#endif if (set_memory_np(decoy_addr, 1)) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); @@ -2357,6 +2345,12 @@ static __init int mcheck_init_device(void) { int err; + /* + * Check if we have a spare virtual bit. This will only become + * a problem if/when we move beyond 5-level page tables. + */ + MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63); + if (!mce_available(&boot_cpu_data)) { err = -EIO; goto err_out; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 71c11ad5643e..6a2cb1442e05 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -924,6 +924,24 @@ static int __init parse_memmap_one(char *p) } else if (*p == '!') { start_at = memparse(p+1, &p); e820__range_add(start_at, mem_size, E820_TYPE_PRAM); + } else if (*p == '%') { + enum e820_type from = 0, to = 0; + + start_at = memparse(p + 1, &p); + if (*p == '-') + from = simple_strtoull(p + 1, &p, 0); + if (*p == '+') + to = simple_strtoull(p + 1, &p, 0); + if (*p != '\0') + return -EINVAL; + if (from && to) + e820__range_update(start_at, mem_size, from, to); + else if (to) + e820__range_add(start_at, mem_size, to); + else if (from) + e820__range_remove(start_at, mem_size, from, 1); + else + e820__range_remove(start_at, mem_size, 0, 0); } else { e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 7ba5d819ebe3..0c855deee165 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -32,6 +32,11 @@ #include <asm/microcode.h> #include <asm/kasan.h> +#ifdef CONFIG_X86_5LEVEL +#undef pgtable_l5_enabled +#define pgtable_l5_enabled __pgtable_l5_enabled +#endif + /* * Manage page tables very early on. */ @@ -39,6 +44,24 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); +#ifdef CONFIG_X86_5LEVEL +unsigned int __pgtable_l5_enabled __ro_after_init; +EXPORT_SYMBOL(__pgtable_l5_enabled); +unsigned int pgdir_shift __ro_after_init = 39; +EXPORT_SYMBOL(pgdir_shift); +unsigned int ptrs_per_p4d __ro_after_init = 1; +EXPORT_SYMBOL(ptrs_per_p4d); +#endif + +#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT +unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; +EXPORT_SYMBOL(page_offset_base); +unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4; +EXPORT_SYMBOL(vmalloc_base); +unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; +EXPORT_SYMBOL(vmemmap_base); +#endif + #define __head __section(.head.text) static void __head *fixup_pointer(void *ptr, unsigned long physaddr) @@ -46,6 +69,41 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr) return ptr - (void *)_text + (void *)physaddr; } +static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr) +{ + return fixup_pointer(ptr, physaddr); +} + +#ifdef CONFIG_X86_5LEVEL +static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) +{ + return fixup_pointer(ptr, physaddr); +} + +static bool __head check_la57_support(unsigned long physaddr) +{ + if (native_cpuid_eax(0) < 7) + return false; + + if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) + return false; + + *fixup_int(&pgtable_l5_enabled, physaddr) = 1; + *fixup_int(&pgdir_shift, physaddr) = 48; + *fixup_int(&ptrs_per_p4d, physaddr) = 512; + *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5; + *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5; + *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5; + + return true; +} +#else +static bool __head check_la57_support(unsigned long physaddr) +{ + return false; +} +#endif + unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { @@ -55,9 +113,12 @@ unsigned long __head __startup_64(unsigned long physaddr, p4dval_t *p4d; pudval_t *pud; pmdval_t *pmd, pmd_entry; + bool la57; int i; unsigned int *next_pgt_ptr; + la57 = check_la57_support(physaddr); + /* Is the address too large? */ if (physaddr >> MAX_PHYSMEM_BITS) for (;;); @@ -81,9 +142,14 @@ unsigned long __head __startup_64(unsigned long physaddr, /* Fixup the physical addresses in the page table */ pgd = fixup_pointer(&early_top_pgt, physaddr); - pgd[pgd_index(__START_KERNEL_map)] += load_delta; - - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p = pgd + pgd_index(__START_KERNEL_map); + if (la57) + *p = (unsigned long)level4_kernel_pgt; + else + *p = (unsigned long)level3_kernel_pgt; + *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta; + + if (la57) { p4d = fixup_pointer(&level4_kernel_pgt, physaddr); p4d[511] += load_delta; } @@ -108,7 +174,7 @@ unsigned long __head __startup_64(unsigned long physaddr, pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (la57) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; @@ -154,8 +220,7 @@ unsigned long __head __startup_64(unsigned long physaddr, * Fixup phys_base - remove the memory encryption mask to obtain * the true physical address. */ - p = fixup_pointer(&phys_base, physaddr); - *p += load_delta - sme_get_me_mask(); + *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask(); /* Encrypt the kernel and related (if SME is active) */ sme_encrypt_kernel(bp); @@ -206,7 +271,7 @@ again: * critical -- __PAGE_OFFSET would point us back into the dynamic * range and we might end up looping forever... */ - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + if (!pgtable_l5_enabled) p4d_p = pgd_p; else if (pgd) p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); @@ -322,7 +387,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0); BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 0f545b3cf926..48385c1074a5 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -39,12 +39,12 @@ * */ +#define l4_index(x) (((x) >> 39) & 511) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) -PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) -PGD_START_KERNEL = pgd_index(__START_KERNEL_map) -#endif +L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4) +L4_START_KERNEL = l4_index(__START_KERNEL_map) + L3_START_KERNEL = pud_index(__START_KERNEL_map) .text @@ -125,7 +125,10 @@ ENTRY(secondary_startup_64) /* Enable PAE mode, PGE and LA57 */ movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL + testl $1, __pgtable_l5_enabled(%rip) + jz 1f orl $X86_CR4_LA57, %ecx +1: #endif movq %rcx, %cr4 @@ -374,12 +377,7 @@ GLOBAL(name) __INITDATA NEXT_PGD_PAGE(early_top_pgt) - .fill 511,8,0 -#ifdef CONFIG_X86_5LEVEL - .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC -#else - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC -#endif + .fill 512,8,0 .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(early_dynamic_pgts) @@ -390,9 +388,9 @@ NEXT_PAGE(early_dynamic_pgts) #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) NEXT_PGD_PAGE(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC - .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 + .org init_top_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC - .org init_top_pgt + PGD_START_KERNEL*8, 0 + .org init_top_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index faeea0b5cbd0..93bd4fb603d1 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -350,6 +350,7 @@ void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_NUMBER(phys_base); VMCOREINFO_SYMBOL(init_top_pgt); + VMCOREINFO_NUMBER(pgtable_l5_enabled); #ifdef CONFIG_NUMA VMCOREINFO_SYMBOL(node_data); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4c616be28506..6285697b6e56 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -189,9 +189,7 @@ struct ist_info ist_info; #endif #else -struct cpuinfo_x86 boot_cpu_data __read_mostly = { - .x86_phys_bits = MAX_PHYSMEM_BITS, -}; +struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); #endif @@ -851,6 +849,7 @@ void __init setup_arch(char **cmdline_p) __flush_tlb_all(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); + boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS; #endif /* diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 2bccd03bd654..ebda84a91510 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -8,6 +8,7 @@ #include <linux/export.h> #include <linux/pci.h> +#include <asm/acpi.h> #include <asm/bios_ebda.h> #include <asm/paravirt.h> #include <asm/pci_x86.h> @@ -26,10 +27,11 @@ void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } -int __init iommu_init_noop(void) { return 0; } -void iommu_shutdown_noop(void) { } -bool __init bool_x86_init_noop(void) { return false; } -void x86_op_int_noop(int cpu) { } +static int __init iommu_init_noop(void) { return 0; } +static void iommu_shutdown_noop(void) { } +static bool __init bool_x86_init_noop(void) { return false; } +static void x86_op_int_noop(int cpu) { } +static u64 u64_x86_init_noop(void) { return 0; } /* * The platform setup functions are preset with the default functions @@ -91,6 +93,11 @@ struct x86_init_ops x86_init __initdata = { .x2apic_available = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, }, + + .acpi = { + .get_root_pointer = u64_x86_init_noop, + .reduced_hw_early_init = acpi_generic_reduced_hw_init, + }, }; struct x86_cpuinit_ops x86_cpuinit = { diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 27e9e90a8d35..4b101dd6e52f 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,12 +1,15 @@ # SPDX-License-Identifier: GPL-2.0 -# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c -KCOV_INSTRUMENT_tlb.o := n -KCOV_INSTRUMENT_mem_encrypt.o := n +# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_mem_encrypt.o := n +KCOV_INSTRUMENT_mem_encrypt_identity.o := n -KASAN_SANITIZE_mem_encrypt.o := n +KASAN_SANITIZE_mem_encrypt.o := n +KASAN_SANITIZE_mem_encrypt_identity.o := n ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_mem_encrypt.o = -pg +CFLAGS_REMOVE_mem_encrypt.o = -pg +CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ @@ -16,6 +19,7 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_physaddr.o := $(nostackp) CFLAGS_setup_nx.o := $(nostackp) +CFLAGS_mem_encrypt_identity.o := $(nostackp) CFLAGS_fault.o := -I$(src)/../include/asm/trace @@ -47,4 +51,5 @@ obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index 421f2664ffa0..51a6f92da2bf 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -72,6 +72,31 @@ static const struct file_operations ptdump_curusr_fops = { }; #endif +#if defined(CONFIG_EFI) && defined(CONFIG_X86_64) +extern pgd_t *efi_pgd; +static struct dentry *pe_efi; + +static int ptdump_show_efi(struct seq_file *m, void *v) +{ + if (efi_pgd) + ptdump_walk_pgd_level_debugfs(m, efi_pgd, false); + return 0; +} + +static int ptdump_open_efi(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_efi, NULL); +} + +static const struct file_operations ptdump_efi_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_efi, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static struct dentry *dir, *pe_knl, *pe_curknl; static int __init pt_dump_debug_init(void) @@ -96,6 +121,13 @@ static int __init pt_dump_debug_init(void) if (!pe_curusr) goto err; #endif + +#if defined(CONFIG_EFI) && defined(CONFIG_X86_64) + pe_efi = debugfs_create_file("efi", 0400, dir, NULL, &ptdump_efi_fops); + if (!pe_efi) + goto err; +#endif + return 0; err: debugfs_remove_recursive(dir); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 2a4849e92831..62a7e9f65dec 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -29,6 +29,7 @@ struct pg_state { int level; pgprot_t current_prot; + pgprotval_t effective_prot; unsigned long start_address; unsigned long current_address; const struct addr_marker *marker; @@ -85,11 +86,15 @@ static struct addr_marker address_markers[] = { [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, #ifdef CONFIG_KASAN - [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, - [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, + /* + * These fields get initialized with the (dynamic) + * KASAN_SHADOW_{START,END} values in pt_dump_init(). + */ + [KASAN_SHADOW_START_NR] = { 0UL, "KASAN shadow" }, + [KASAN_SHADOW_END_NR] = { 0UL, "KASAN shadow end" }, #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL - [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" }, + [LDT_NR] = { 0UL, "LDT remap" }, #endif [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, #ifdef CONFIG_X86_ESPFIX64 @@ -231,9 +236,9 @@ static unsigned long normalize_addr(unsigned long u) * print what we collected so far. */ static void note_page(struct seq_file *m, struct pg_state *st, - pgprot_t new_prot, int level) + pgprot_t new_prot, pgprotval_t new_eff, int level) { - pgprotval_t prot, cur; + pgprotval_t prot, cur, eff; static const char units[] = "BKMGTPE"; /* @@ -243,23 +248,24 @@ static void note_page(struct seq_file *m, struct pg_state *st, */ prot = pgprot_val(new_prot); cur = pgprot_val(st->current_prot); + eff = st->effective_prot; if (!st->level) { /* First entry */ st->current_prot = new_prot; + st->effective_prot = new_eff; st->level = level; st->marker = address_markers; st->lines = 0; pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", st->marker->name); - } else if (prot != cur || level != st->level || + } else if (prot != cur || new_eff != eff || level != st->level || st->current_address >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; int width = sizeof(unsigned long) * 2; - pgprotval_t pr = pgprot_val(st->current_prot); - if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) { + if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) { WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %p/%pS\n", (void *)st->start_address, @@ -313,21 +319,30 @@ static void note_page(struct seq_file *m, struct pg_state *st, st->start_address = st->current_address; st->current_prot = new_prot; + st->effective_prot = new_eff; st->level = level; } } -static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P) +static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) +{ + return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | + ((prot1 | prot2) & _PAGE_NX); +} + +static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, + pgprotval_t eff_in, unsigned long P) { int i; pte_t *start; - pgprotval_t prot; + pgprotval_t prot, eff; start = (pte_t *)pmd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PTE; i++) { prot = pte_flags(*start); + eff = effective_prot(eff_in, prot); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); - note_page(m, st, __pgprot(prot), 5); + note_page(m, st, __pgprot(prot), eff, 5); start++; } } @@ -344,12 +359,10 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, void *pt) { if (__pa(pt) == __pa(kasan_zero_pmd) || -#ifdef CONFIG_X86_5LEVEL - __pa(pt) == __pa(kasan_zero_p4d) || -#endif + (pgtable_l5_enabled && __pa(pt) == __pa(kasan_zero_p4d)) || __pa(pt) == __pa(kasan_zero_pud)) { pgprotval_t prot = pte_flags(kasan_zero_pte[0]); - note_page(m, st, __pgprot(prot), 5); + note_page(m, st, __pgprot(prot), 0, 5); return true; } return false; @@ -364,42 +377,45 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, #if PTRS_PER_PMD > 1 -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, + pgprotval_t eff_in, unsigned long P) { int i; pmd_t *start, *pmd_start; - pgprotval_t prot; + pgprotval_t prot, eff; pmd_start = start = (pmd_t *)pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { + prot = pmd_flags(*start); + eff = effective_prot(eff_in, prot); if (pmd_large(*start) || !pmd_present(*start)) { - prot = pmd_flags(*start); - note_page(m, st, __pgprot(prot), 4); + note_page(m, st, __pgprot(prot), eff, 4); } else if (!kasan_page_table(m, st, pmd_start)) { - walk_pte_level(m, st, *start, + walk_pte_level(m, st, *start, eff, P + i * PMD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 4); + note_page(m, st, __pgprot(0), 0, 4); start++; } } #else -#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p) +#define walk_pmd_level(m,s,a,e,p) walk_pte_level(m,s,__pmd(pud_val(a)),e,p) #define pud_large(a) pmd_large(__pmd(pud_val(a))) #define pud_none(a) pmd_none(__pmd(pud_val(a))) #endif #if PTRS_PER_PUD > 1 -static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) +static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, + pgprotval_t eff_in, unsigned long P) { int i; pud_t *start, *pud_start; - pgprotval_t prot; + pgprotval_t prot, eff; pud_t *prev_pud = NULL; pud_start = start = (pud_t *)p4d_page_vaddr(addr); @@ -407,15 +423,16 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); if (!pud_none(*start)) { + prot = pud_flags(*start); + eff = effective_prot(eff_in, prot); if (pud_large(*start) || !pud_present(*start)) { - prot = pud_flags(*start); - note_page(m, st, __pgprot(prot), 3); + note_page(m, st, __pgprot(prot), eff, 3); } else if (!kasan_page_table(m, st, pud_start)) { - walk_pmd_level(m, st, *start, + walk_pmd_level(m, st, *start, eff, P + i * PUD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 3); + note_page(m, st, __pgprot(0), 0, 3); prev_pud = start; start++; @@ -423,43 +440,43 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, } #else -#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p) +#define walk_pud_level(m,s,a,e,p) walk_pmd_level(m,s,__pud(p4d_val(a)),e,p) #define p4d_large(a) pud_large(__pud(p4d_val(a))) #define p4d_none(a) pud_none(__pud(p4d_val(a))) #endif -#if PTRS_PER_P4D > 1 - -static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) +static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, + pgprotval_t eff_in, unsigned long P) { int i; p4d_t *start, *p4d_start; - pgprotval_t prot; + pgprotval_t prot, eff; + + if (PTRS_PER_P4D == 1) + return walk_pud_level(m, st, __p4d(pgd_val(addr)), eff_in, P); p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_P4D; i++) { st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); if (!p4d_none(*start)) { + prot = p4d_flags(*start); + eff = effective_prot(eff_in, prot); if (p4d_large(*start) || !p4d_present(*start)) { - prot = p4d_flags(*start); - note_page(m, st, __pgprot(prot), 2); + note_page(m, st, __pgprot(prot), eff, 2); } else if (!kasan_page_table(m, st, p4d_start)) { - walk_pud_level(m, st, *start, + walk_pud_level(m, st, *start, eff, P + i * P4D_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 2); + note_page(m, st, __pgprot(0), 0, 2); start++; } } -#else -#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p) -#define pgd_large(a) p4d_large(__p4d(pgd_val(a))) -#define pgd_none(a) p4d_none(__p4d(pgd_val(a))) -#endif +#define pgd_large(a) (pgtable_l5_enabled ? pgd_large(a) : p4d_large(__p4d(pgd_val(a)))) +#define pgd_none(a) (pgtable_l5_enabled ? pgd_none(a) : p4d_none(__p4d(pgd_val(a)))) static inline bool is_hypervisor_range(int idx) { @@ -483,7 +500,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, #else pgd_t *start = swapper_pg_dir; #endif - pgprotval_t prot; + pgprotval_t prot, eff; int i; struct pg_state st = {}; @@ -499,15 +516,20 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); if (!pgd_none(*start) && !is_hypervisor_range(i)) { + prot = pgd_flags(*start); +#ifdef CONFIG_X86_PAE + eff = _PAGE_USER | _PAGE_RW; +#else + eff = prot; +#endif if (pgd_large(*start) || !pgd_present(*start)) { - prot = pgd_flags(*start); - note_page(m, &st, __pgprot(prot), 1); + note_page(m, &st, __pgprot(prot), eff, 1); } else { - walk_p4d_level(m, &st, *start, + walk_p4d_level(m, &st, *start, eff, i * PGD_LEVEL_MULT); } } else - note_page(m, &st, __pgprot(0), 1); + note_page(m, &st, __pgprot(0), 0, 1); cond_resched(); start++; @@ -515,7 +537,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, /* Flush out the last page */ st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); - note_page(m, &st, __pgprot(0), 0); + note_page(m, &st, __pgprot(0), 0, 0); if (!checkwx) return; if (st.wx_pages) @@ -570,6 +592,13 @@ static int __init pt_dump_init(void) address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET; address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; +#ifdef CONFIG_MODIFY_LDT_SYSCALL + address_markers[LDT_NR].start_address = LDT_BASE_ADDR; +#endif +#ifdef CONFIG_KASAN + address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; + address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; +#endif #endif #ifdef CONFIG_X86_32 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f75ea0748b9f..73bd8c95ac71 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -417,11 +417,11 @@ void vmalloc_sync_all(void) */ static noinline int vmalloc_fault(unsigned long address) { - pgd_t *pgd, *pgd_ref; - p4d_t *p4d, *p4d_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; + pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) @@ -435,73 +435,51 @@ static noinline int vmalloc_fault(unsigned long address) * case just flush: */ pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) + pgd_k = pgd_offset_k(address); + if (pgd_none(*pgd_k)) return -1; - if (CONFIG_PGTABLE_LEVELS > 4) { + if (pgtable_l5_enabled) { if (pgd_none(*pgd)) { - set_pgd(pgd, *pgd_ref); + set_pgd(pgd, *pgd_k); arch_flush_lazy_mmu_mode(); } else { - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k)); } } /* With 4-level paging, copying happens on the p4d level. */ p4d = p4d_offset(pgd, address); - p4d_ref = p4d_offset(pgd_ref, address); - if (p4d_none(*p4d_ref)) + p4d_k = p4d_offset(pgd_k, address); + if (p4d_none(*p4d_k)) return -1; - if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) { - set_p4d(p4d, *p4d_ref); + if (p4d_none(*p4d) && !pgtable_l5_enabled) { + set_p4d(p4d, *p4d_k); arch_flush_lazy_mmu_mode(); } else { - BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref)); + BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k)); } - /* - * Below here mismatches are bugs because these lower tables - * are shared: - */ BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); pud = pud_offset(p4d, address); - pud_ref = pud_offset(p4d_ref, address); - if (pud_none(*pud_ref)) + if (pud_none(*pud)) return -1; - if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref)) - BUG(); - if (pud_large(*pud)) return 0; pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) + if (pmd_none(*pmd)) return -1; - if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref)) - BUG(); - if (pmd_large(*pmd)) return 0; - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); - - /* - * Don't use pte_page here, because the mappings can point - * outside mem_map, and the NUMA hash lookup cannot handle - * that: - */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); + if (!pte_present(*pte)) + return -1; return 0; } diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index ab33a32df2a8..9aa22be8331e 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -120,7 +120,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, result = ident_p4d_init(info, p4d, addr, next); if (result) return result; - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (pgtable_l5_enabled) { set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); } else { /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index af11a2890235..45241de66785 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -88,12 +88,7 @@ static int __init nonx32_setup(char *str) } __setup("noexec32=", nonx32_setup); -/* - * When memory was added make sure all the processes MM have - * suitable PGD entries in the local PGD level page. - */ -#ifdef CONFIG_X86_5LEVEL -void sync_global_pgds(unsigned long start, unsigned long end) +static void sync_global_pgds_l5(unsigned long start, unsigned long end) { unsigned long addr; @@ -129,8 +124,8 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_unlock(&pgd_lock); } } -#else -void sync_global_pgds(unsigned long start, unsigned long end) + +static void sync_global_pgds_l4(unsigned long start, unsigned long end) { unsigned long addr; @@ -143,7 +138,7 @@ void sync_global_pgds(unsigned long start, unsigned long end) * With folded p4d, pgd_none() is always false, we need to * handle synchonization on p4d level. */ - BUILD_BUG_ON(pgd_none(*pgd_ref)); + MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref)); p4d_ref = p4d_offset(pgd_ref, addr); if (p4d_none(*p4d_ref)) @@ -173,7 +168,18 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_unlock(&pgd_lock); } } -#endif + +/* + * When memory was added make sure all the processes MM have + * suitable PGD entries in the local PGD level page. + */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + if (pgtable_l5_enabled) + sync_global_pgds_l5(start, end); + else + sync_global_pgds_l4(start, end); +} /* * NOTE: This function is marked __ref because it calls __init function @@ -632,7 +638,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, unsigned long vaddr = (unsigned long)__va(paddr); int i = p4d_index(vaddr); - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + if (!pgtable_l5_enabled) return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask); for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { @@ -712,7 +718,7 @@ kernel_physical_mapping_init(unsigned long paddr_start, page_size_mask); spin_lock(&init_mm.page_table_lock); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) + if (pgtable_l5_enabled) pgd_populate(&init_mm, pgd, p4d); else p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); @@ -1089,7 +1095,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, * 5-level case we should free them. This code will have to change * to adapt for boot-time switching between 4 and 5 level page tables. */ - if (CONFIG_PGTABLE_LEVELS == 5) + if (pgtable_l5_enabled) free_pud_table(pud_base, p4d); } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index af6f2f9c6a26..d8ff013ea9d0 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -1,6 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #define DISABLE_BRANCH_PROFILING #define pr_fmt(fmt) "kasan: " fmt + +#ifdef CONFIG_X86_5LEVEL +/* Too early to use cpu_feature_enabled() */ +#define pgtable_l5_enabled __pgtable_l5_enabled +#endif + #include <linux/bootmem.h> #include <linux/kasan.h> #include <linux/kdebug.h> @@ -19,7 +25,7 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; -static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); +static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); static __init void *early_alloc(size_t size, int nid, bool panic) { @@ -176,10 +182,10 @@ static void __init clear_pgds(unsigned long start, * With folded p4d, pgd_clear() is nop, use p4d_clear() * instead. */ - if (CONFIG_PGTABLE_LEVELS < 5) - p4d_clear(p4d_offset(pgd, start)); - else + if (pgtable_l5_enabled) pgd_clear(pgd); + else + p4d_clear(p4d_offset(pgd, start)); } pgd = pgd_offset_k(start); @@ -191,7 +197,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) { unsigned long p4d; - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + if (!pgtable_l5_enabled) return (p4d_t *)pgd; p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK; @@ -272,7 +278,7 @@ void __init kasan_early_init(void) for (i = 0; i < PTRS_PER_PUD; i++) kasan_zero_pud[i] = __pud(pud_val); - for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++) + for (i = 0; pgtable_l5_enabled && i < PTRS_PER_P4D; i++) kasan_zero_p4d[i] = __p4d(p4d_val); kasan_map_early_shadow(early_top_pgt); @@ -303,7 +309,7 @@ void __init kasan_init(void) * bunch of things like kernel code, modules, EFI mapping, etc. * We need to take extra steps to not overwrite them. */ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (pgtable_l5_enabled) { void *ptr; ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index aedebd2ebf1e..615cc03ced84 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -34,23 +34,12 @@ #define TB_SHIFT 40 /* - * Virtual address start and end range for randomization. - * * The end address could depend on more configuration options to make the * highest amount of space for randomization available, but that's too hard * to keep straight and caused issues already. */ -static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; -/* Default values */ -unsigned long page_offset_base = __PAGE_OFFSET_BASE; -EXPORT_SYMBOL(page_offset_base); -unsigned long vmalloc_base = __VMALLOC_BASE; -EXPORT_SYMBOL(vmalloc_base); -unsigned long vmemmap_base = __VMEMMAP_BASE; -EXPORT_SYMBOL(vmemmap_base); - /* * Memory regions randomized by KASLR (except modules that use a separate logic * earlier during boot). The list is ordered based on virtual addresses. This @@ -60,8 +49,8 @@ static __initdata struct kaslr_memory_region { unsigned long *base; unsigned long size_tb; } kaslr_regions[] = { - { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ }, - { &vmalloc_base, VMALLOC_SIZE_TB }, + { &page_offset_base, 0 }, + { &vmalloc_base, 0 }, { &vmemmap_base, 1 }, }; @@ -84,11 +73,14 @@ static inline bool kaslr_memory_enabled(void) void __init kernel_randomize_memory(void) { size_t i; - unsigned long vaddr = vaddr_start; + unsigned long vaddr_start, vaddr; unsigned long rand, memory_tb; struct rnd_state rand_state; unsigned long remain_entropy; + vaddr_start = pgtable_l5_enabled ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4; + vaddr = vaddr_start; + /* * These BUILD_BUG_ON checks ensure the memory layout is consistent * with the vaddr_start/vaddr_end variables. These checks are very @@ -101,6 +93,9 @@ void __init kernel_randomize_memory(void) if (!kaslr_memory_enabled()) return; + kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT); + kaslr_regions[1].size_tb = VMALLOC_SIZE_TB; + /* * Update Physical memory mapping to available and * add padding if needed (especially for memory hotplug support). @@ -129,7 +124,7 @@ void __init kernel_randomize_memory(void) */ entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); prandom_bytes_state(&rand_state, &rand, sizeof(rand)); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) + if (pgtable_l5_enabled) entropy = (rand % (entropy + 1)) & P4D_MASK; else entropy = (rand % (entropy + 1)) & PUD_MASK; @@ -141,7 +136,7 @@ void __init kernel_randomize_memory(void) * randomization alignment. */ vaddr += get_padding(&kaslr_regions[i]); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) + if (pgtable_l5_enabled) vaddr = round_up(vaddr + 1, P4D_SIZE); else vaddr = round_up(vaddr + 1, PUD_SIZE); @@ -217,7 +212,7 @@ void __meminit init_trampoline(void) return; } - if (IS_ENABLED(CONFIG_X86_5LEVEL)) + if (pgtable_l5_enabled) init_trampoline_p4d(); else init_trampoline_pud(); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 1a53071e2e17..3a1b5fe4c2ca 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -25,17 +25,12 @@ #include <asm/bootparam.h> #include <asm/set_memory.h> #include <asm/cacheflush.h> -#include <asm/sections.h> #include <asm/processor-flags.h> #include <asm/msr.h> #include <asm/cmdline.h> #include "mm_internal.h" -static char sme_cmdline_arg[] __initdata = "mem_encrypt"; -static char sme_cmdline_on[] __initdata = "on"; -static char sme_cmdline_off[] __initdata = "off"; - /* * Since SME related variables are set early in the boot process they must * reside in the .data section so as not to be zeroed out when the .bss @@ -46,7 +41,7 @@ EXPORT_SYMBOL(sme_me_mask); DEFINE_STATIC_KEY_FALSE(sev_enable_key); EXPORT_SYMBOL_GPL(sev_enable_key); -static bool sev_enabled __section(.data); +bool sev_enabled __section(.data); /* Buffer used for early in-place encryption by BSP, no locking needed */ static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); @@ -463,574 +458,3 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) /* Make the SWIOTLB buffer area decrypted */ set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); } - -struct sme_populate_pgd_data { - void *pgtable_area; - pgd_t *pgd; - - pmdval_t pmd_flags; - pteval_t pte_flags; - unsigned long paddr; - - unsigned long vaddr; - unsigned long vaddr_end; -}; - -static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) -{ - unsigned long pgd_start, pgd_end, pgd_size; - pgd_t *pgd_p; - - pgd_start = ppd->vaddr & PGDIR_MASK; - pgd_end = ppd->vaddr_end & PGDIR_MASK; - - pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); - - pgd_p = ppd->pgd + pgd_index(ppd->vaddr); - - memset(pgd_p, 0, pgd_size); -} - -#define PGD_FLAGS _KERNPG_TABLE_NOENC -#define P4D_FLAGS _KERNPG_TABLE_NOENC -#define PUD_FLAGS _KERNPG_TABLE_NOENC -#define PMD_FLAGS _KERNPG_TABLE_NOENC - -#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) - -#define PMD_FLAGS_DEC PMD_FLAGS_LARGE -#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ - (_PAGE_PAT | _PAGE_PWT)) - -#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) - -#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) - -#define PTE_FLAGS_DEC PTE_FLAGS -#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ - (_PAGE_PAT | _PAGE_PWT)) - -#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) - -static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) -{ - pgd_t *pgd_p; - p4d_t *p4d_p; - pud_t *pud_p; - pmd_t *pmd_p; - - pgd_p = ppd->pgd + pgd_index(ppd->vaddr); - if (native_pgd_val(*pgd_p)) { - if (IS_ENABLED(CONFIG_X86_5LEVEL)) - p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); - else - pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); - } else { - pgd_t pgd; - - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p = ppd->pgtable_area; - memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); - ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; - - pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); - } else { - pud_p = ppd->pgtable_area; - memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; - - pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); - } - native_set_pgd(pgd_p, pgd); - } - - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p += p4d_index(ppd->vaddr); - if (native_p4d_val(*p4d_p)) { - pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); - } else { - p4d_t p4d; - - pud_p = ppd->pgtable_area; - memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; - - p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); - native_set_p4d(p4d_p, p4d); - } - } - - pud_p += pud_index(ppd->vaddr); - if (native_pud_val(*pud_p)) { - if (native_pud_val(*pud_p) & _PAGE_PSE) - return NULL; - - pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); - } else { - pud_t pud; - - pmd_p = ppd->pgtable_area; - memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); - ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; - - pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); - native_set_pud(pud_p, pud); - } - - return pmd_p; -} - -static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) -{ - pmd_t *pmd_p; - - pmd_p = sme_prepare_pgd(ppd); - if (!pmd_p) - return; - - pmd_p += pmd_index(ppd->vaddr); - if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) - native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags)); -} - -static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) -{ - pmd_t *pmd_p; - pte_t *pte_p; - - pmd_p = sme_prepare_pgd(ppd); - if (!pmd_p) - return; - - pmd_p += pmd_index(ppd->vaddr); - if (native_pmd_val(*pmd_p)) { - if (native_pmd_val(*pmd_p) & _PAGE_PSE) - return; - - pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK); - } else { - pmd_t pmd; - - pte_p = ppd->pgtable_area; - memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE); - ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE; - - pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS); - native_set_pmd(pmd_p, pmd); - } - - pte_p += pte_index(ppd->vaddr); - if (!native_pte_val(*pte_p)) - native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags)); -} - -static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) -{ - while (ppd->vaddr < ppd->vaddr_end) { - sme_populate_pgd_large(ppd); - - ppd->vaddr += PMD_PAGE_SIZE; - ppd->paddr += PMD_PAGE_SIZE; - } -} - -static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) -{ - while (ppd->vaddr < ppd->vaddr_end) { - sme_populate_pgd(ppd); - - ppd->vaddr += PAGE_SIZE; - ppd->paddr += PAGE_SIZE; - } -} - -static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, - pmdval_t pmd_flags, pteval_t pte_flags) -{ - unsigned long vaddr_end; - - ppd->pmd_flags = pmd_flags; - ppd->pte_flags = pte_flags; - - /* Save original end value since we modify the struct value */ - vaddr_end = ppd->vaddr_end; - - /* If start is not 2MB aligned, create PTE entries */ - ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE); - __sme_map_range_pte(ppd); - - /* Create PMD entries */ - ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK; - __sme_map_range_pmd(ppd); - - /* If end is not 2MB aligned, create PTE entries */ - ppd->vaddr_end = vaddr_end; - __sme_map_range_pte(ppd); -} - -static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) -{ - __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); -} - -static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) -{ - __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); -} - -static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) -{ - __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); -} - -static unsigned long __init sme_pgtable_calc(unsigned long len) -{ - unsigned long p4d_size, pud_size, pmd_size, pte_size; - unsigned long total; - - /* - * Perform a relatively simplistic calculation of the pagetable - * entries that are needed. Those mappings will be covered mostly - * by 2MB PMD entries so we can conservatively calculate the required - * number of P4D, PUD and PMD structures needed to perform the - * mappings. For mappings that are not 2MB aligned, PTE mappings - * would be needed for the start and end portion of the address range - * that fall outside of the 2MB alignment. This results in, at most, - * two extra pages to hold PTE entries for each range that is mapped. - * Incrementing the count for each covers the case where the addresses - * cross entries. - */ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; - p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; - pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } else { - p4d_size = 0; - pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } - pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; - pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; - pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE; - - total = p4d_size + pud_size + pmd_size + pte_size; - - /* - * Now calculate the added pagetable structures needed to populate - * the new pagetables. - */ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; - p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; - pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } else { - p4d_size = 0; - pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } - pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE; - pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; - - total += p4d_size + pud_size + pmd_size; - - return total; -} - -void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp) -{ - unsigned long workarea_start, workarea_end, workarea_len; - unsigned long execute_start, execute_end, execute_len; - unsigned long kernel_start, kernel_end, kernel_len; - unsigned long initrd_start, initrd_end, initrd_len; - struct sme_populate_pgd_data ppd; - unsigned long pgtable_area_len; - unsigned long decrypted_base; - - if (!sme_active()) - return; - - /* - * Prepare for encrypting the kernel and initrd by building new - * pagetables with the necessary attributes needed to encrypt the - * kernel in place. - * - * One range of virtual addresses will map the memory occupied - * by the kernel and initrd as encrypted. - * - * Another range of virtual addresses will map the memory occupied - * by the kernel and initrd as decrypted and write-protected. - * - * The use of write-protect attribute will prevent any of the - * memory from being cached. - */ - - /* Physical addresses gives us the identity mapped virtual addresses */ - kernel_start = __pa_symbol(_text); - kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); - kernel_len = kernel_end - kernel_start; - - initrd_start = 0; - initrd_end = 0; - initrd_len = 0; -#ifdef CONFIG_BLK_DEV_INITRD - initrd_len = (unsigned long)bp->hdr.ramdisk_size | - ((unsigned long)bp->ext_ramdisk_size << 32); - if (initrd_len) { - initrd_start = (unsigned long)bp->hdr.ramdisk_image | - ((unsigned long)bp->ext_ramdisk_image << 32); - initrd_end = PAGE_ALIGN(initrd_start + initrd_len); - initrd_len = initrd_end - initrd_start; - } -#endif - - /* Set the encryption workarea to be immediately after the kernel */ - workarea_start = kernel_end; - - /* - * Calculate required number of workarea bytes needed: - * executable encryption area size: - * stack page (PAGE_SIZE) - * encryption routine page (PAGE_SIZE) - * intermediate copy buffer (PMD_PAGE_SIZE) - * pagetable structures for the encryption of the kernel - * pagetable structures for workarea (in case not currently mapped) - */ - execute_start = workarea_start; - execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; - execute_len = execute_end - execute_start; - - /* - * One PGD for both encrypted and decrypted mappings and a set of - * PUDs and PMDs for each of the encrypted and decrypted mappings. - */ - pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; - pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; - if (initrd_len) - pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; - - /* PUDs and PMDs needed in the current pagetables for the workarea */ - pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); - - /* - * The total workarea includes the executable encryption area and - * the pagetable area. The start of the workarea is already 2MB - * aligned, align the end of the workarea on a 2MB boundary so that - * we don't try to create/allocate PTE entries from the workarea - * before it is mapped. - */ - workarea_len = execute_len + pgtable_area_len; - workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE); - - /* - * Set the address to the start of where newly created pagetable - * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable - * structures are created when the workarea is added to the current - * pagetables and when the new encrypted and decrypted kernel - * mappings are populated. - */ - ppd.pgtable_area = (void *)execute_end; - - /* - * Make sure the current pagetable structure has entries for - * addressing the workarea. - */ - ppd.pgd = (pgd_t *)native_read_cr3_pa(); - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start; - ppd.vaddr_end = workarea_end; - sme_map_range_decrypted(&ppd); - - /* Flush the TLB - no globals so cr3 is enough */ - native_write_cr3(__native_read_cr3()); - - /* - * A new pagetable structure is being built to allow for the kernel - * and initrd to be encrypted. It starts with an empty PGD that will - * then be populated with new PUDs and PMDs as the encrypted and - * decrypted kernel mappings are created. - */ - ppd.pgd = ppd.pgtable_area; - memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); - ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; - - /* - * A different PGD index/entry must be used to get different - * pagetable entries for the decrypted mapping. Choose the next - * PGD index and convert it to a virtual address to be used as - * the base of the mapping. - */ - decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); - if (initrd_len) { - unsigned long check_base; - - check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); - decrypted_base = max(decrypted_base, check_base); - } - decrypted_base <<= PGDIR_SHIFT; - - /* Add encrypted kernel (identity) mappings */ - ppd.paddr = kernel_start; - ppd.vaddr = kernel_start; - ppd.vaddr_end = kernel_end; - sme_map_range_encrypted(&ppd); - - /* Add decrypted, write-protected kernel (non-identity) mappings */ - ppd.paddr = kernel_start; - ppd.vaddr = kernel_start + decrypted_base; - ppd.vaddr_end = kernel_end + decrypted_base; - sme_map_range_decrypted_wp(&ppd); - - if (initrd_len) { - /* Add encrypted initrd (identity) mappings */ - ppd.paddr = initrd_start; - ppd.vaddr = initrd_start; - ppd.vaddr_end = initrd_end; - sme_map_range_encrypted(&ppd); - /* - * Add decrypted, write-protected initrd (non-identity) mappings - */ - ppd.paddr = initrd_start; - ppd.vaddr = initrd_start + decrypted_base; - ppd.vaddr_end = initrd_end + decrypted_base; - sme_map_range_decrypted_wp(&ppd); - } - - /* Add decrypted workarea mappings to both kernel mappings */ - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start; - ppd.vaddr_end = workarea_end; - sme_map_range_decrypted(&ppd); - - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start + decrypted_base; - ppd.vaddr_end = workarea_end + decrypted_base; - sme_map_range_decrypted(&ppd); - - /* Perform the encryption */ - sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, - kernel_len, workarea_start, (unsigned long)ppd.pgd); - - if (initrd_len) - sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, - initrd_len, workarea_start, - (unsigned long)ppd.pgd); - - /* - * At this point we are running encrypted. Remove the mappings for - * the decrypted areas - all that is needed for this is to remove - * the PGD entry/entries. - */ - ppd.vaddr = kernel_start + decrypted_base; - ppd.vaddr_end = kernel_end + decrypted_base; - sme_clear_pgd(&ppd); - - if (initrd_len) { - ppd.vaddr = initrd_start + decrypted_base; - ppd.vaddr_end = initrd_end + decrypted_base; - sme_clear_pgd(&ppd); - } - - ppd.vaddr = workarea_start + decrypted_base; - ppd.vaddr_end = workarea_end + decrypted_base; - sme_clear_pgd(&ppd); - - /* Flush the TLB - no globals so cr3 is enough */ - native_write_cr3(__native_read_cr3()); -} - -void __init __nostackprotector sme_enable(struct boot_params *bp) -{ - const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; - unsigned int eax, ebx, ecx, edx; - unsigned long feature_mask; - bool active_by_default; - unsigned long me_mask; - char buffer[16]; - u64 msr; - - /* Check for the SME/SEV support leaf */ - eax = 0x80000000; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - if (eax < 0x8000001f) - return; - -#define AMD_SME_BIT BIT(0) -#define AMD_SEV_BIT BIT(1) - /* - * Set the feature mask (SME or SEV) based on whether we are - * running under a hypervisor. - */ - eax = 1; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT; - - /* - * Check for the SME/SEV feature: - * CPUID Fn8000_001F[EAX] - * - Bit 0 - Secure Memory Encryption support - * - Bit 1 - Secure Encrypted Virtualization support - * CPUID Fn8000_001F[EBX] - * - Bits 5:0 - Pagetable bit position used to indicate encryption - */ - eax = 0x8000001f; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - if (!(eax & feature_mask)) - return; - - me_mask = 1UL << (ebx & 0x3f); - - /* Check if memory encryption is enabled */ - if (feature_mask == AMD_SME_BIT) { - /* For SME, check the SYSCFG MSR */ - msr = __rdmsr(MSR_K8_SYSCFG); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) - return; - } else { - /* For SEV, check the SEV MSR */ - msr = __rdmsr(MSR_AMD64_SEV); - if (!(msr & MSR_AMD64_SEV_ENABLED)) - return; - - /* SEV state cannot be controlled by a command line option */ - sme_me_mask = me_mask; - sev_enabled = true; - return; - } - - /* - * Fixups have not been applied to phys_base yet and we're running - * identity mapped, so we must obtain the address to the SME command - * line argument data using rip-relative addressing. - */ - asm ("lea sme_cmdline_arg(%%rip), %0" - : "=r" (cmdline_arg) - : "p" (sme_cmdline_arg)); - asm ("lea sme_cmdline_on(%%rip), %0" - : "=r" (cmdline_on) - : "p" (sme_cmdline_on)); - asm ("lea sme_cmdline_off(%%rip), %0" - : "=r" (cmdline_off) - : "p" (sme_cmdline_off)); - - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) - active_by_default = true; - else - active_by_default = false; - - cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | - ((u64)bp->ext_cmd_line_ptr << 32)); - - cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); - - if (!strncmp(buffer, cmdline_on, sizeof(buffer))) - sme_me_mask = me_mask; - else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) - sme_me_mask = 0; - else - sme_me_mask = active_by_default ? me_mask : 0; -} diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c new file mode 100644 index 000000000000..1b2197d13832 --- /dev/null +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -0,0 +1,564 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define DISABLE_BRANCH_PROFILING + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +/* + * Special hack: we have to be careful, because no indirections are + * allowed here, and paravirt_ops is a kind of one. As it will only run in + * baremetal anyway, we just keep it from happening. (This list needs to + * be extended when new paravirt and debugging variants are added.) + */ +#undef CONFIG_PARAVIRT +#undef CONFIG_PARAVIRT_SPINLOCKS + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mem_encrypt.h> + +#include <asm/setup.h> +#include <asm/sections.h> +#include <asm/cmdline.h> + +#include "mm_internal.h" + +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS _KERNPG_TABLE_NOENC + +#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +#define PMD_FLAGS_DEC PMD_FLAGS_LARGE +#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) + +#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) + +#define PTE_FLAGS_DEC PTE_FLAGS +#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) + +struct sme_populate_pgd_data { + void *pgtable_area; + pgd_t *pgd; + + pmdval_t pmd_flags; + pteval_t pte_flags; + unsigned long paddr; + + unsigned long vaddr; + unsigned long vaddr_end; +}; + +static char sme_cmdline_arg[] __initdata = "mem_encrypt"; +static char sme_cmdline_on[] __initdata = "on"; +static char sme_cmdline_off[] __initdata = "off"; + +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) +{ + unsigned long pgd_start, pgd_end, pgd_size; + pgd_t *pgd_p; + + pgd_start = ppd->vaddr & PGDIR_MASK; + pgd_end = ppd->vaddr_end & PGDIR_MASK; + + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); + + pgd_p = ppd->pgd + pgd_index(ppd->vaddr); + + memset(pgd_p, 0, pgd_size); +} + +static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = ppd->pgd + pgd_index(ppd->vaddr); + if (pgd_none(*pgd)) { + p4d = ppd->pgtable_area; + memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D); + ppd->pgtable_area += sizeof(*p4d) * PTRS_PER_P4D; + set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d))); + } + + p4d = p4d_offset(pgd, ppd->vaddr); + if (p4d_none(*p4d)) { + pud = ppd->pgtable_area; + memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD); + ppd->pgtable_area += sizeof(*pud) * PTRS_PER_PUD; + set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud))); + } + + pud = pud_offset(p4d, ppd->vaddr); + if (pud_none(*pud)) { + pmd = ppd->pgtable_area; + memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD); + ppd->pgtable_area += sizeof(*pmd) * PTRS_PER_PMD; + set_pud(pud, __pud(PUD_FLAGS | __pa(pmd))); + } + + if (pud_large(*pud)) + return NULL; + + return pud; +} + +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +{ + pud_t *pud; + pmd_t *pmd; + + pud = sme_prepare_pgd(ppd); + if (!pud) + return; + + pmd = pmd_offset(pud, ppd->vaddr); + if (pmd_large(*pmd)) + return; + + set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); +} + +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = sme_prepare_pgd(ppd); + if (!pud) + return; + + pmd = pmd_offset(pud, ppd->vaddr); + if (pmd_none(*pmd)) { + pte = ppd->pgtable_area; + memset(pte, 0, sizeof(pte) * PTRS_PER_PTE); + ppd->pgtable_area += sizeof(pte) * PTRS_PER_PTE; + set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte))); + } + + if (pmd_large(*pmd)) + return; + + pte = pte_offset_map(pmd, ppd->vaddr); + if (pte_none(*pte)) + set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); +} + +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd_large(ppd); + + ppd->vaddr += PMD_PAGE_SIZE; + ppd->paddr += PMD_PAGE_SIZE; + } +} + +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd(ppd); + + ppd->vaddr += PAGE_SIZE; + ppd->paddr += PAGE_SIZE; + } +} + +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, + pmdval_t pmd_flags, pteval_t pte_flags) +{ + unsigned long vaddr_end; + + ppd->pmd_flags = pmd_flags; + ppd->pte_flags = pte_flags; + + /* Save original end value since we modify the struct value */ + vaddr_end = ppd->vaddr_end; + + /* If start is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE); + __sme_map_range_pte(ppd); + + /* Create PMD entries */ + ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK; + __sme_map_range_pmd(ppd); + + /* If end is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = vaddr_end; + __sme_map_range_pte(ppd); +} + +static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); +} + +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); +} + +static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); +} + +static unsigned long __init sme_pgtable_calc(unsigned long len) +{ + unsigned long entries = 0, tables = 0; + + /* + * Perform a relatively simplistic calculation of the pagetable + * entries that are needed. Those mappings will be covered mostly + * by 2MB PMD entries so we can conservatively calculate the required + * number of P4D, PUD and PMD structures needed to perform the + * mappings. For mappings that are not 2MB aligned, PTE mappings + * would be needed for the start and end portion of the address range + * that fall outside of the 2MB alignment. This results in, at most, + * two extra pages to hold PTE entries for each range that is mapped. + * Incrementing the count for each covers the case where the addresses + * cross entries. + */ + + /* PGDIR_SIZE is equal to P4D_SIZE on 4-level machine. */ + if (PTRS_PER_P4D > 1) + entries += (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * sizeof(p4d_t) * PTRS_PER_P4D; + entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * sizeof(pud_t) * PTRS_PER_PUD; + entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * sizeof(pmd_t) * PTRS_PER_PMD; + entries += 2 * sizeof(pte_t) * PTRS_PER_PTE; + + /* + * Now calculate the added pagetable structures needed to populate + * the new pagetables. + */ + + if (PTRS_PER_P4D > 1) + tables += DIV_ROUND_UP(entries, PGDIR_SIZE) * sizeof(p4d_t) * PTRS_PER_P4D; + tables += DIV_ROUND_UP(entries, P4D_SIZE) * sizeof(pud_t) * PTRS_PER_PUD; + tables += DIV_ROUND_UP(entries, PUD_SIZE) * sizeof(pmd_t) * PTRS_PER_PMD; + + return entries + tables; +} + +void __init sme_encrypt_kernel(struct boot_params *bp) +{ + unsigned long workarea_start, workarea_end, workarea_len; + unsigned long execute_start, execute_end, execute_len; + unsigned long kernel_start, kernel_end, kernel_len; + unsigned long initrd_start, initrd_end, initrd_len; + struct sme_populate_pgd_data ppd; + unsigned long pgtable_area_len; + unsigned long decrypted_base; + + if (!sme_active()) + return; + + /* + * Prepare for encrypting the kernel and initrd by building new + * pagetables with the necessary attributes needed to encrypt the + * kernel in place. + * + * One range of virtual addresses will map the memory occupied + * by the kernel and initrd as encrypted. + * + * Another range of virtual addresses will map the memory occupied + * by the kernel and initrd as decrypted and write-protected. + * + * The use of write-protect attribute will prevent any of the + * memory from being cached. + */ + + /* Physical addresses gives us the identity mapped virtual addresses */ + kernel_start = __pa_symbol(_text); + kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); + kernel_len = kernel_end - kernel_start; + + initrd_start = 0; + initrd_end = 0; + initrd_len = 0; +#ifdef CONFIG_BLK_DEV_INITRD + initrd_len = (unsigned long)bp->hdr.ramdisk_size | + ((unsigned long)bp->ext_ramdisk_size << 32); + if (initrd_len) { + initrd_start = (unsigned long)bp->hdr.ramdisk_image | + ((unsigned long)bp->ext_ramdisk_image << 32); + initrd_end = PAGE_ALIGN(initrd_start + initrd_len); + initrd_len = initrd_end - initrd_start; + } +#endif + + /* Set the encryption workarea to be immediately after the kernel */ + workarea_start = kernel_end; + + /* + * Calculate required number of workarea bytes needed: + * executable encryption area size: + * stack page (PAGE_SIZE) + * encryption routine page (PAGE_SIZE) + * intermediate copy buffer (PMD_PAGE_SIZE) + * pagetable structures for the encryption of the kernel + * pagetable structures for workarea (in case not currently mapped) + */ + execute_start = workarea_start; + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; + execute_len = execute_end - execute_start; + + /* + * One PGD for both encrypted and decrypted mappings and a set of + * PUDs and PMDs for each of the encrypted and decrypted mappings. + */ + pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; + pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + if (initrd_len) + pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; + + /* PUDs and PMDs needed in the current pagetables for the workarea */ + pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); + + /* + * The total workarea includes the executable encryption area and + * the pagetable area. The start of the workarea is already 2MB + * aligned, align the end of the workarea on a 2MB boundary so that + * we don't try to create/allocate PTE entries from the workarea + * before it is mapped. + */ + workarea_len = execute_len + pgtable_area_len; + workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE); + + /* + * Set the address to the start of where newly created pagetable + * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable + * structures are created when the workarea is added to the current + * pagetables and when the new encrypted and decrypted kernel + * mappings are populated. + */ + ppd.pgtable_area = (void *)execute_end; + + /* + * Make sure the current pagetable structure has entries for + * addressing the workarea. + */ + ppd.pgd = (pgd_t *)native_read_cr3_pa(); + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); + + /* + * A new pagetable structure is being built to allow for the kernel + * and initrd to be encrypted. It starts with an empty PGD that will + * then be populated with new PUDs and PMDs as the encrypted and + * decrypted kernel mappings are created. + */ + ppd.pgd = ppd.pgtable_area; + memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); + ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; + + /* + * A different PGD index/entry must be used to get different + * pagetable entries for the decrypted mapping. Choose the next + * PGD index and convert it to a virtual address to be used as + * the base of the mapping. + */ + decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + if (initrd_len) { + unsigned long check_base; + + check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base = max(decrypted_base, check_base); + } + decrypted_base <<= PGDIR_SHIFT; + + /* Add encrypted kernel (identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start; + ppd.vaddr_end = kernel_end; + sme_map_range_encrypted(&ppd); + + /* Add decrypted, write-protected kernel (non-identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + + if (initrd_len) { + /* Add encrypted initrd (identity) mappings */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start; + ppd.vaddr_end = initrd_end; + sme_map_range_encrypted(&ppd); + /* + * Add decrypted, write-protected initrd (non-identity) mappings + */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + } + + /* Add decrypted workarea mappings to both kernel mappings */ + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); + + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_map_range_decrypted(&ppd); + + /* Perform the encryption */ + sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, + kernel_len, workarea_start, (unsigned long)ppd.pgd); + + if (initrd_len) + sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, + initrd_len, workarea_start, + (unsigned long)ppd.pgd); + + /* + * At this point we are running encrypted. Remove the mappings for + * the decrypted areas - all that is needed for this is to remove + * the PGD entry/entries. + */ + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_clear_pgd(&ppd); + + if (initrd_len) { + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_clear_pgd(&ppd); + } + + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_clear_pgd(&ppd); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); +} + +void __init sme_enable(struct boot_params *bp) +{ + const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; + unsigned int eax, ebx, ecx, edx; + unsigned long feature_mask; + bool active_by_default; + unsigned long me_mask; + char buffer[16]; + u64 msr; + + /* Check for the SME/SEV support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return; + +#define AMD_SME_BIT BIT(0) +#define AMD_SEV_BIT BIT(1) + /* + * Set the feature mask (SME or SEV) based on whether we are + * running under a hypervisor. + */ + eax = 1; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT; + + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (!(eax & feature_mask)) + return; + + me_mask = 1UL << (ebx & 0x3f); + + /* Check if memory encryption is enabled */ + if (feature_mask == AMD_SME_BIT) { + /* For SME, check the SYSCFG MSR */ + msr = __rdmsr(MSR_K8_SYSCFG); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + } else { + /* For SEV, check the SEV MSR */ + msr = __rdmsr(MSR_AMD64_SEV); + if (!(msr & MSR_AMD64_SEV_ENABLED)) + return; + + /* SEV state cannot be controlled by a command line option */ + sme_me_mask = me_mask; + sev_enabled = true; + return; + } + + /* + * Fixups have not been applied to phys_base yet and we're running + * identity mapped, so we must obtain the address to the SME command + * line argument data using rip-relative addressing. + */ + asm ("lea sme_cmdline_arg(%%rip), %0" + : "=r" (cmdline_arg) + : "p" (sme_cmdline_arg)); + asm ("lea sme_cmdline_on(%%rip), %0" + : "=r" (cmdline_on) + : "p" (sme_cmdline_on)); + asm ("lea sme_cmdline_off(%%rip), %0" + : "=r" (cmdline_off) + : "p" (sme_cmdline_off)); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) + active_by_default = true; + else + active_by_default = false; + + cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | + ((u64)bp->ext_cmd_line_ptr << 32)); + + cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); + + if (!strncmp(buffer, cmdline_on, sizeof(buffer))) + sme_me_mask = me_mask; + else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) + sme_me_mask = 0; + else + sme_me_mask = active_by_default ? me_mask : 0; +} diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index aca6295350f3..e8a4a09e20f1 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -60,17 +60,6 @@ void memory_present(int nid, unsigned long start, unsigned long end) } printk(KERN_CONT "\n"); } - -unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long nr_pages = end_pfn - start_pfn; - - if (!nr_pages) - return 0; - - return (nr_pages + 1) * sizeof(struct page); -} #endif extern unsigned long highend_pfn, highstart_pfn; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 7f1a51399674..e055d1a06699 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -157,7 +157,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm) unsigned long sp = current_stack_pointer; pgd_t *pgd = pgd_offset(mm, sp); - if (CONFIG_PGTABLE_LEVELS > 4) { + if (pgtable_l5_enabled) { if (unlikely(pgd_none(*pgd))) { pgd_t *pgd_ref = pgd_offset_k(sp); @@ -613,7 +613,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, { int cpu; - struct flush_tlb_info info = { + struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { .mm = mm, }; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index f9cfbc0d1f33..7f443bd1411d 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -27,6 +27,7 @@ #include <linux/ioport.h> #include <linux/mc146818rtc.h> #include <linux/efi.h> +#include <linux/export.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/reboot.h> @@ -190,7 +191,8 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) early_code_mapping_set_exec(0); } -static pgd_t *efi_pgd; +pgd_t *efi_pgd; +EXPORT_SYMBOL_GPL(efi_pgd); /* * We need our own copy of the higher levels of the page tables @@ -225,7 +227,7 @@ int __init efi_alloc_page_tables(void) pud = pud_alloc(&init_mm, p4d, EFI_VA_END); if (!pud) { - if (CONFIG_PGTABLE_LEVELS > 4) + if (pgtable_l5_enabled) free_page((unsigned long) pgd_page_vaddr(*pgd)); free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER); return -ENOMEM; @@ -255,8 +257,8 @@ void efi_sync_low_kernel_mappings(void) * only span a single PGD entry and that the entry also maps * other important kernel regions. */ - BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); - BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != + MAYBE_BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); + MAYBE_BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != (EFI_VA_END & PGDIR_MASK)); pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET); diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index fb1df9488e98..2ebdf31d9996 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -199,6 +199,12 @@ void __init x86_intel_mid_early_setup(void) legacy_pic = &null_legacy_pic; + /* + * Do nothing for now as everything needed done in + * x86_intel_mid_early_setup() below. + */ + x86_init.acpi.reduced_hw_early_init = x86_init_noop; + pm_power_off = intel_mid_power_off; machine_ops.emergency_restart = intel_mid_reboot; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 0ef5e5204968..74a532989308 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -50,7 +50,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; pud_t *pud; - p4d_t *p4d; + p4d_t *p4d = NULL; /* * The new mapping only has to cover the page containing the image @@ -66,7 +66,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) * tables used by the image kernel. */ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (pgtable_l5_enabled) { p4d = (p4d_t *)get_safe_page(GFP_ATOMIC); if (!p4d) return -ENOMEM; @@ -84,7 +84,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); set_pud(pud + pud_index(restore_jump_address), __pud(__pa(pmd) | _KERNPG_TABLE)); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (p4d) { set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE)); set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE)); } else { diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index f605825a04ab..c1f98f32c45f 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -18,9 +18,6 @@ config XEN_PV bool "Xen PV guest support" default y depends on XEN - # XEN_PV is not ready to work with 5-level paging. - # Changes to hypervisor are also required. - depends on !X86_5LEVEL select XEN_HAVE_PVMMU select XEN_HAVE_VPMU help @@ -79,6 +76,4 @@ config XEN_DEBUG_FS config XEN_PVH bool "Support for running as a PVH guest" depends on XEN && XEN_PVHVM && ACPI - # Pre-built page tables are not ready to handle 5-level paging. - depends on !X86_5LEVEL def_bool n diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 436c4f003e17..aa1c6a6831a9 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -6,6 +6,7 @@ #include <asm/io_apic.h> #include <asm/hypervisor.h> #include <asm/e820/api.h> +#include <asm/x86_init.h> #include <asm/xen/interface.h> #include <asm/xen/hypercall.h> @@ -16,15 +17,20 @@ /* * PVH variables. * - * xen_pvh and pvh_bootparams need to live in data segment since they - * are used after startup_{32|64}, which clear .bss, are invoked. + * xen_pvh pvh_bootparams and pvh_start_info need to live in data segment + * since they are used after startup_{32|64}, which clear .bss, are invoked. */ bool xen_pvh __attribute__((section(".data"))) = 0; struct boot_params pvh_bootparams __attribute__((section(".data"))); +struct hvm_start_info pvh_start_info __attribute__((section(".data"))); -struct hvm_start_info pvh_start_info; unsigned int pvh_start_info_sz = sizeof(pvh_start_info); +static u64 pvh_get_root_pointer(void) +{ + return pvh_start_info.rsdp_paddr; +} + static void __init init_pvh_bootparams(void) { struct xen_memory_map memmap; @@ -71,6 +77,8 @@ static void __init init_pvh_bootparams(void) */ pvh_bootparams.hdr.version = 0x212; pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ + + x86_init.acpi.get_root_pointer = pvh_get_root_pointer; } /* diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index aae88fec9941..d20763472920 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -538,6 +538,22 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val) xen_mc_issue(PARAVIRT_LAZY_MMU); } + +#if CONFIG_PGTABLE_LEVELS >= 5 +__visible p4dval_t xen_p4d_val(p4d_t p4d) +{ + return pte_mfn_to_pfn(p4d.p4d); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_p4d_val); + +__visible p4d_t xen_make_p4d(p4dval_t p4d) +{ + p4d = pte_pfn_to_mfn(p4d); + + return native_make_p4d(p4d); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d); +#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ #endif /* CONFIG_X86_64 */ static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, @@ -2411,6 +2427,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .alloc_pud = xen_alloc_pmd_init, .release_pud = xen_release_pmd_init, + +#if CONFIG_PGTABLE_LEVELS >= 5 + .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), + .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), +#endif #endif /* CONFIG_X86_64 */ .activate_mm = xen_activate_mm, |