From 7744ccdbc16f0ac4adae21b3678af93775b3a386 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:03 -0500 Subject: x86/mm: Add Secure Memory Encryption (SME) support Add support for Secure Memory Encryption (SME). This initial support provides a Kconfig entry to build the SME support into the kernel and defines the memory encryption mask that will be used in subsequent patches to mark pages as encrypted. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/a6c34d16caaed3bc3e2d6f0987554275bd291554.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- include/linux/mem_encrypt.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 include/linux/mem_encrypt.h (limited to 'include') diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h new file mode 100644 index 000000000000..59769f7287e4 --- /dev/null +++ b/include/linux/mem_encrypt.h @@ -0,0 +1,35 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MEM_ENCRYPT_H__ +#define __MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT + +#include + +#else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */ + +#define sme_me_mask 0UL + +#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ + +static inline bool sme_active(void) +{ + return !!sme_me_mask; +} + +#endif /* __ASSEMBLY__ */ + +#endif /* __MEM_ENCRYPT_H__ */ -- cgit v1.2.3 From 5868f3651fa0dff96a57f94d49247d3ef320ebe2 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:05 -0500 Subject: x86/mm: Add support to enable SME in early boot processing Add support to the early boot code to use Secure Memory Encryption (SME). Since the kernel has been loaded into memory in a decrypted state, encrypt the kernel in place and update the early pagetables with the memory encryption mask so that new pagetable entries will use memory encryption. The routines to set the encryption mask and perform the encryption are stub routines for now with functionality to be added in a later patch. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/e52ad781f085224bf835b3caff9aa3aee6febccb.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mem_encrypt.h | 8 ++++++ arch/x86/kernel/head64.c | 53 ++++++++++++++++++++++++++++++-------- arch/x86/kernel/head_64.S | 20 ++++++++++++-- arch/x86/mm/mem_encrypt.c | 9 +++++++ include/linux/mem_encrypt.h | 5 ++++ 5 files changed, 82 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index a1057961ac46..475e34f53793 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -15,14 +15,22 @@ #ifndef __ASSEMBLY__ +#include + #ifdef CONFIG_AMD_MEM_ENCRYPT extern unsigned long sme_me_mask; +void __init sme_encrypt_kernel(void); +void __init sme_enable(void); + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0UL +static inline void __init sme_encrypt_kernel(void) { } +static inline void __init sme_enable(void) { } + #endif /* CONFIG_AMD_MEM_ENCRYPT */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 46c3c73e7f43..1f0ddcc9675c 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -45,9 +46,10 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr) return ptr - (void *)_text + (void *)physaddr; } -void __head __startup_64(unsigned long physaddr) +unsigned long __head __startup_64(unsigned long physaddr) { unsigned long load_delta, *p; + unsigned long pgtable_flags; pgdval_t *pgd; p4dval_t *p4d; pudval_t *pud; @@ -68,6 +70,12 @@ void __head __startup_64(unsigned long physaddr) if (load_delta & ~PMD_PAGE_MASK) for (;;); + /* Activate Secure Memory Encryption (SME) if supported and enabled */ + sme_enable(); + + /* Include the SME encryption mask in the fixup value */ + load_delta += sme_get_me_mask(); + /* Fixup the physical addresses in the page table */ pgd = fixup_pointer(&early_top_pgt, physaddr); @@ -94,28 +102,30 @@ void __head __startup_64(unsigned long physaddr) pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + pgtable_flags = _KERNPG_TABLE + sme_get_me_mask(); if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE; - pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; - p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + p4d[i + 0] = (pgdval_t)pud + pgtable_flags; + p4d[i + 1] = (pgdval_t)pud + pgtable_flags; } else { i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; } i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; - pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE; - pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE; + pud[i + 0] = (pudval_t)pmd + pgtable_flags; + pud[i + 1] = (pudval_t)pmd + pgtable_flags; pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { @@ -136,9 +146,30 @@ void __head __startup_64(unsigned long physaddr) pmd[i] += load_delta; } - /* Fixup phys_base */ + /* + * Fixup phys_base - remove the memory encryption mask to obtain + * the true physical address. + */ p = fixup_pointer(&phys_base, physaddr); - *p += load_delta; + *p += load_delta - sme_get_me_mask(); + + /* Encrypt the kernel (if SME is active) */ + sme_encrypt_kernel(); + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +unsigned long __startup_secondary_64(void) +{ + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); } /* Wipe all early page tables except for the kernel symbol map */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6225550883df..ec5d5e90c8f1 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -73,12 +73,19 @@ startup_64: /* Sanitize CPU configuration */ call verify_cpu + /* + * Perform pagetable fixups. Additionally, if SME is active, encrypt + * the kernel and retrieve the modifier (SME encryption mask if SME + * is active) to be added to the initial pgdir entry that will be + * programmed into CR3. + */ leaq _text(%rip), %rdi pushq %rsi call __startup_64 popq %rsi - movq $(early_top_pgt - __START_KERNEL_map), %rax + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) /* @@ -98,7 +105,16 @@ ENTRY(secondary_startup_64) /* Sanitize CPU configuration */ call verify_cpu - movq $(init_top_pgt - __START_KERNEL_map), %rax + /* + * Retrieve the modifier (SME encryption mask if SME is active) to be + * added to the initial pgdir entry that will be programmed into CR3. + */ + pushq %rsi + call __startup_secondary_64 + popq %rsi + + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(init_top_pgt - __START_KERNEL_map), %rax 1: /* Enable PAE mode, PGE and LA57 */ diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index b99d469c73e7..3ac6f99b095c 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -11,6 +11,7 @@ */ #include +#include /* * Since SME related variables are set early in the boot process they must @@ -19,3 +20,11 @@ */ unsigned long sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); + +void __init sme_encrypt_kernel(void) +{ +} + +void __init sme_enable(void) +{ +} diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 59769f7287e4..570f4fcff13f 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -30,6 +30,11 @@ static inline bool sme_active(void) return !!sme_me_mask; } +static inline unsigned long sme_get_me_mask(void) +{ + return sme_me_mask; +} + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ -- cgit v1.2.3 From 21729f81ce8ae76a6995681d40e16f7ce8075db4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:07 -0500 Subject: x86/mm: Provide general kernel support for memory encryption Changes to the existing page table macros will allow the SME support to be enabled in a simple fashion with minimal changes to files that use these macros. Since the memory encryption mask will now be part of the regular pagetable macros, we introduce two new macros (_PAGE_TABLE_NOENC and _KERNPG_TABLE_NOENC) to allow for early pagetable creation/initialization without the encryption mask before SME becomes active. Two new pgprot() macros are defined to allow setting or clearing the page encryption mask. The FIXMAP_PAGE_NOCACHE define is introduced for use with MMIO. SME does not support encryption for MMIO areas so this define removes the encryption mask from the page attribute. Two new macros are introduced (__sme_pa() / __sme_pa_nodebug()) to allow creating a physical address with the encryption mask. These are used when working with the cr3 register so that the PGD can be encrypted. The current __va() macro is updated so that the virtual address is generated based off of the physical address without the encryption mask thus allowing the same virtual address to be generated regardless of whether encryption is enabled for that physical location or not. Also, an early initialization function is added for SME. If SME is active, this function: - Updates the early_pmd_flags so that early page faults create mappings with the encryption mask. - Updates the __supported_pte_mask to include the encryption mask. - Updates the protection_map entries to include the encryption mask so that user-space allocations will automatically have the encryption mask applied. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/b36e952c4c39767ae7f0a41cf5345adf27438480.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/pagetable.c | 7 ++++++ arch/x86/include/asm/fixmap.h | 7 ++++++ arch/x86/include/asm/mem_encrypt.h | 13 +++++++++++ arch/x86/include/asm/page_types.h | 3 ++- arch/x86/include/asm/pgtable.h | 9 ++++++++ arch/x86/include/asm/pgtable_types.h | 45 +++++++++++++++++++++++------------- arch/x86/include/asm/processor.h | 3 ++- arch/x86/kernel/espfix_64.c | 2 +- arch/x86/kernel/head64.c | 11 +++++++-- arch/x86/kernel/head_64.S | 20 ++++++++-------- arch/x86/mm/kasan_init_64.c | 4 ++-- arch/x86/mm/mem_encrypt.c | 17 ++++++++++++++ arch/x86/mm/pageattr.c | 3 +++ arch/x86/mm/tlb.c | 4 ++-- include/asm-generic/pgtable.h | 12 ++++++++++ include/linux/mem_encrypt.h | 8 +++++++ 16 files changed, 133 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 28029be47fbb..f1aa43854bed 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -15,6 +15,13 @@ #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x))) +/* + * The pgtable.h and mm/ident_map.c includes make use of the SME related + * information which is not used in the compressed image support. Un-define + * the SME support to avoid any compile and link errors. + */ +#undef CONFIG_AMD_MEM_ENCRYPT + #include "misc.h" /* These actually do the work of building the kernel identity maps. */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b65155cc3760..d9ff226cb489 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -157,6 +157,13 @@ static inline void __set_fixmap(enum fixed_addresses idx, } #endif +/* + * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not + * supported for MMIO addresses, so make sure that the memory encryption + * mask is not part of the page attributes. + */ +#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE + #include #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 475e34f53793..dbae7a5a347d 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -21,6 +21,8 @@ extern unsigned long sme_me_mask; +void __init sme_early_init(void); + void __init sme_encrypt_kernel(void); void __init sme_enable(void); @@ -28,11 +30,22 @@ void __init sme_enable(void); #define sme_me_mask 0UL +static inline void __init sme_early_init(void) { } + static inline void __init sme_encrypt_kernel(void) { } static inline void __init sme_enable(void) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ +/* + * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when + * writing to or comparing values from the cr3 register. Having the + * encryption mask set in cr3 enables the PGD entry to be encrypted and + * avoid special case handling of PGD allocations. + */ +#define __sme_pa(x) (__pa(x) | sme_me_mask) +#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) + #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 7bd0099384ca..b98ed9d14630 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,6 +3,7 @@ #include #include +#include /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -15,7 +16,7 @@ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) -#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) +#define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1))) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) /* Cast *PAGE_MASK to a signed type so that it is sign-extended if diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b64ea527edfb..c6452cb12c0b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H +#include #include #include @@ -13,6 +14,12 @@ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) +/* + * Macros to add or remove encryption attribute + */ +#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot))) +#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) + #ifndef __ASSEMBLY__ #include @@ -38,6 +45,8 @@ extern struct list_head pgd_list; extern struct mm_struct *pgd_page_get_mm(struct page *page); +extern pmdval_t early_pmd_flags; + #ifdef CONFIG_PARAVIRT #include #else /* !CONFIG_PARAVIRT */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index bf9638e1ee42..de32ca32928a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -2,6 +2,8 @@ #define _ASM_X86_PGTABLE_DEFS_H #include +#include + #include #define FIRST_USER_ADDRESS 0UL @@ -121,10 +123,10 @@ #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ - _PAGE_DIRTY) +#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_ACCESSED | _PAGE_DIRTY) /* * Set of bits not changed in pte_modify. The pte's @@ -191,18 +193,29 @@ enum page_cache_mode { #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) -#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) - -#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) -#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#ifndef __ASSEMBLY__ + +#define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | _PAGE_ENC) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC) +#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) + +#endif /* __ASSEMBLY__ */ /* xwr */ #define __P000 PAGE_NONE diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6a79547e8ee0..a68f70c3debc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -29,6 +29,7 @@ struct vm86; #include #include #include +#include /* * We handle most unaligned accesses in hardware. On the other hand @@ -241,7 +242,7 @@ static inline unsigned long read_cr3_pa(void) static inline void load_cr3(pgd_t *pgdir) { - write_cr3(__pa(pgdir)); + write_cr3(__sme_pa(pgdir)); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 6b91e2eb8d3f..9c4e7ba6870c 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -195,7 +195,7 @@ void init_espfix_ap(int cpu) pte_p = pte_offset_kernel(&pmd, addr); stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); - pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1f0ddcc9675c..5cd0b72a0283 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -102,7 +102,7 @@ unsigned long __head __startup_64(unsigned long physaddr) pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); - pgtable_flags = _KERNPG_TABLE + sme_get_me_mask(); + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); @@ -177,7 +177,7 @@ static void __init reset_early_page_tables(void) { memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_top_pgt)); + write_cr3(__sme_pa_nodebug(early_top_pgt)); } /* Create a new PMD entry */ @@ -310,6 +310,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_page(init_top_pgt); + /* + * SME support may update early_pmd_flags to include the memory + * encryption mask, so it needs to be called before anything + * that may generate a page fault. + */ + sme_early_init(); + kasan_early_init(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ec5d5e90c8f1..513cbb012ecc 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -351,9 +351,9 @@ GLOBAL(name) NEXT_PAGE(early_top_pgt) .fill 511,8,0 #ifdef CONFIG_X86_5LEVEL - .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #else - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(early_dynamic_pgts) @@ -366,15 +366,15 @@ NEXT_PAGE(init_top_pgt) .fill 512,8,0 #else NEXT_PAGE(init_top_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level3_ident_pgt) - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .fill 511, 8, 0 NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. @@ -386,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt) #ifdef CONFIG_X86_5LEVEL NEXT_PAGE(level4_kernel_pgt) .fill 511,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level2_kernel_pgt) /* @@ -411,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt) NEXT_PAGE(level2_fixmap_pgt) .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ .fill 5,8,0 diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 02c9d7553409..39d4daf5e289 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -87,7 +87,7 @@ static struct notifier_block kasan_die_notifier = { void __init kasan_early_init(void) { int i; - pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC; pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; @@ -153,7 +153,7 @@ void __init kasan_init(void) */ memset(kasan_zero_page, 0, PAGE_SIZE); for (i = 0; i < PTRS_PER_PTE; i++) { - pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC); set_pte(&kasan_zero_pte[i], pte); } /* Flush TLBs again to be sure that write protection applied. */ diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 3ac6f99b095c..f973d3dc3802 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -12,6 +12,7 @@ #include #include +#include /* * Since SME related variables are set early in the boot process they must @@ -21,6 +22,22 @@ unsigned long sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); +void __init sme_early_init(void) +{ + unsigned int i; + + if (!sme_me_mask) + return; + + early_pmd_flags = __sme_set(early_pmd_flags); + + __supported_pte_mask = __sme_set(__supported_pte_mask); + + /* Update the protection map with memory encryption mask */ + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); +} + void __init sme_encrypt_kernel(void) { } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 757b0bcdf712..7e2d6c0a64c4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -2020,6 +2020,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, if (!(page_flags & _PAGE_RW)) cpa.mask_clr = __pgprot(_PAGE_RW); + if (!(page_flags & _PAGE_ENC)) + cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 0); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 2c1b8881e9d3..593d2f76a54c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -115,7 +115,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, */ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); - write_cr3(__pa(next->pgd)); + write_cr3(__sme_pa(next->pgd)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } @@ -157,7 +157,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); this_cpu_write(cpu_tlbstate.loaded_mm, next); - write_cr3(__pa(next->pgd)); + write_cr3(__sme_pa(next->pgd)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 7dfa767dc680..4d7bb98f4134 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -582,6 +582,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ #endif /* CONFIG_MMU */ +/* + * No-op macros that just return the current protection value. Defined here + * because these macros can be used used even if CONFIG_MMU is not defined. + */ +#ifndef pgprot_encrypted +#define pgprot_encrypted(prot) (prot) +#endif + +#ifndef pgprot_decrypted +#define pgprot_decrypted(prot) (prot) +#endif + /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 570f4fcff13f..1255f09f5e42 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -35,6 +35,14 @@ static inline unsigned long sme_get_me_mask(void) return sme_me_mask; } +/* + * The __sme_set() and __sme_clr() macros are useful for adding or removing + * the encryption mask from a value (e.g. when dealing with pagetable + * entries). + */ +#define __sme_set(x) ((unsigned long)(x) | sme_me_mask) +#define __sme_clr(x) ((unsigned long)(x) & ~sme_me_mask) + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ -- cgit v1.2.3 From f88a68facd9a15b94f8c195d9d2c0b30c76c595a Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:09 -0500 Subject: x86/mm: Extend early_memremap() support with additional attrs Add early_memremap() support to be able to specify encrypted and decrypted mappings with and without write-protection. The use of write-protection is necessary when encrypting data "in place". The write-protect attribute is considered cacheable for loads, but not stores. This implies that the hardware will never give the core a dirty line with this memtype. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/479b5832c30fae3efa7932e48f81794e86397229.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/fixmap.h | 13 +++++++++++ arch/x86/include/asm/pgtable_types.h | 8 +++++++ arch/x86/mm/ioremap.c | 44 ++++++++++++++++++++++++++++++++++++ include/asm-generic/early_ioremap.h | 2 ++ mm/early_ioremap.c | 10 ++++++++ 6 files changed, 81 insertions(+) (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba7b93d08d00..8328bcb9ce8b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1440,6 +1440,10 @@ config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT If set to N, then the encryption of system memory can be activated with the mem_encrypt=on command line option. +config ARCH_USE_MEMREMAP_PROT + def_bool y + depends on AMD_MEM_ENCRYPT + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index d9ff226cb489..dcd9fb55e679 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -164,6 +164,19 @@ static inline void __set_fixmap(enum fixed_addresses idx, */ #define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE +/* + * Early memremap routines used for in-place encryption. The mappings created + * by these routines are intended to be used as temporary mappings. + */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size); + #include #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index de32ca32928a..32095af0fefb 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -161,6 +161,7 @@ enum page_cache_mode { #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) +#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -189,6 +190,7 @@ enum page_cache_mode { #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP) #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) @@ -202,6 +204,12 @@ enum page_cache_mode { #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ _PAGE_DIRTY | _PAGE_ENC) +#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) +#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) + +#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL) +#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP) + #define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 66ddf5e8ffc8..570201bbf442 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -417,6 +417,50 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); } +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +/* Remap memory with encryption */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC); +} + +/* + * Remap memory with encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); +} + +/* Remap memory without encryption */ +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC); +} + +/* + * Remap memory without encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); +} +#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */ + static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h index 734ad4db388c..2edef8d7fa6b 100644 --- a/include/asm-generic/early_ioremap.h +++ b/include/asm-generic/early_ioremap.h @@ -13,6 +13,8 @@ extern void *early_memremap(resource_size_t phys_addr, unsigned long size); extern void *early_memremap_ro(resource_size_t phys_addr, unsigned long size); +extern void *early_memremap_prot(resource_size_t phys_addr, + unsigned long size, unsigned long prot_val); extern void early_iounmap(void __iomem *addr, unsigned long size); extern void early_memunmap(void *addr, unsigned long size); diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 6d5717bd7197..d7d30da754ba 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -226,6 +226,16 @@ early_memremap_ro(resource_size_t phys_addr, unsigned long size) } #endif +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +void __init * +early_memremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return (__force void *)__early_ioremap(phys_addr, size, + __pgprot(prot_val)); +} +#endif + #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) -- cgit v1.2.3 From a19d66c56af1c52b8b463bf94d21116ae8c1aa5a Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:13 -0500 Subject: efi: Add an EFI table address match function Add a function that will determine if a supplied physical address matches the address of an EFI table. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Matt Fleming Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/e1e06441d80f44776df391e0e4cb485b345b7518.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- drivers/firmware/efi/efi.c | 33 +++++++++++++++++++++++++++++++++ include/linux/efi.h | 7 +++++++ 2 files changed, 40 insertions(+) (limited to 'include') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 045d6d311bde..69d4d130e055 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -55,6 +55,25 @@ struct efi __read_mostly efi = { }; EXPORT_SYMBOL(efi); +static unsigned long *efi_tables[] = { + &efi.mps, + &efi.acpi, + &efi.acpi20, + &efi.smbios, + &efi.smbios3, + &efi.sal_systab, + &efi.boot_info, + &efi.hcdp, + &efi.uga, + &efi.uv_systab, + &efi.fw_vendor, + &efi.runtime, + &efi.config_table, + &efi.esrt, + &efi.properties_table, + &efi.mem_attr_table, +}; + static bool disable_runtime; static int __init setup_noefi(char *arg) { @@ -855,6 +874,20 @@ int efi_status_to_err(efi_status_t status) return err; } +bool efi_is_table_address(unsigned long phys_addr) +{ + unsigned int i; + + if (phys_addr == EFI_INVALID_TABLE_ADDR) + return false; + + for (i = 0; i < ARRAY_SIZE(efi_tables); i++) + if (*(efi_tables[i]) == phys_addr) + return true; + + return false; +} + #ifdef CONFIG_KEXEC static int update_efi_random_seed(struct notifier_block *nb, unsigned long code, void *unused) diff --git a/include/linux/efi.h b/include/linux/efi.h index 8269bcb8ccf7..8e24f099bd3f 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1091,6 +1091,8 @@ static inline bool efi_enabled(int feature) return test_bit(feature, &efi.flags) != 0; } extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused); + +extern bool efi_is_table_address(unsigned long phys_addr); #else static inline bool efi_enabled(int feature) { @@ -1104,6 +1106,11 @@ efi_capsule_pending(int *reset_type) { return false; } + +static inline bool efi_is_table_address(unsigned long phys_addr) +{ + return false; +} #endif extern int efi_status_to_err(efi_status_t status); -- cgit v1.2.3 From f99afd08a45fbbd9ce35a7624ffd1d850a1906c0 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:14 -0500 Subject: efi: Update efi_mem_type() to return an error rather than 0 The efi_mem_type() function currently returns a 0, which maps to EFI_RESERVED_TYPE, if the function is unable to find a memmap entry for the supplied physical address. Returning EFI_RESERVED_TYPE implies that a memmap entry exists, when it doesn't. Instead of returning 0, change the function to return a negative error value when no memmap entry is found. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Matt Fleming Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/7fbf40a9dc414d5da849e1ddcd7f7c1285e4e181.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/ia64/kernel/efi.c | 4 ++-- arch/x86/platform/efi/efi.c | 6 +++--- include/linux/efi.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 121295637d0d..81416000c5e0 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -757,14 +757,14 @@ efi_memmap_intersects (unsigned long phys_addr, unsigned long size) return 0; } -u32 +int efi_mem_type (unsigned long phys_addr) { efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); if (md) return md->type; - return 0; + return -EINVAL; } u64 diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f084d8718ac4..6217b23e85f6 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -1035,12 +1035,12 @@ void __init efi_enter_virtual_mode(void) /* * Convenience functions to obtain memory types and attributes */ -u32 efi_mem_type(unsigned long phys_addr) +int efi_mem_type(unsigned long phys_addr) { efi_memory_desc_t *md; if (!efi_enabled(EFI_MEMMAP)) - return 0; + return -ENOTSUPP; for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && @@ -1048,7 +1048,7 @@ u32 efi_mem_type(unsigned long phys_addr) (md->num_pages << EFI_PAGE_SHIFT)))) return md->type; } - return 0; + return -EINVAL; } static int __init arch_parse_efi_cmdline(char *str) diff --git a/include/linux/efi.h b/include/linux/efi.h index 8e24f099bd3f..4e47f78430be 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -985,7 +985,7 @@ static inline void efi_esrt_init(void) { } extern int efi_config_parse_tables(void *config_tables, int count, int sz, efi_config_table_type_t *arch_tables); extern u64 efi_get_iobase (void); -extern u32 efi_mem_type (unsigned long phys_addr); +extern int efi_mem_type(unsigned long phys_addr); extern u64 efi_mem_attributes (unsigned long phys_addr); extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); extern int __init efi_uart_console_only (void); -- cgit v1.2.3 From 8f716c9b5febf6ed0f5fedb7c9407cd0c25b2796 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:16 -0500 Subject: x86/mm: Add support to access boot related data in the clear Boot data (such as EFI related data) is not encrypted when the system is booted because UEFI/BIOS does not run with SME active. In order to access this data properly it needs to be mapped decrypted. Update early_memremap() to provide an arch specific routine to modify the pagetable protection attributes before they are applied to the new mapping. This is used to remove the encryption mask for boot related data. Update memremap() to provide an arch specific routine to determine if RAM remapping is allowed. RAM remapping will cause an encrypted mapping to be generated. By preventing RAM remapping, ioremap_cache() will be used instead, which will provide a decrypted mapping of the boot related data. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Matt Fleming Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/81fb6b4117a5df6b9f2eda342f81bbef4b23d2e5.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 5 ++ arch/x86/mm/ioremap.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/io.h | 2 + kernel/memremap.c | 20 ++++-- mm/early_ioremap.c | 18 ++++- 5 files changed, 218 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 7afb0e2f07f4..09c5557b1454 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -381,4 +381,9 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc #endif +extern bool arch_memremap_can_ram_remap(resource_size_t offset, + unsigned long size, + unsigned long flags); +#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 570201bbf442..8986b2868944 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include @@ -21,6 +23,7 @@ #include #include #include +#include #include "physaddr.h" @@ -417,6 +420,183 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); } +/* + * Examine the physical address to determine if it is an area of memory + * that should be mapped decrypted. If the memory is not part of the + * kernel usable area it was accessed and created decrypted, so these + * areas should be mapped decrypted. + */ +static bool memremap_should_map_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + /* Check if the address is outside kernel usable area */ + switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) { + case E820_TYPE_RESERVED: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is EFI data. Check + * it against the boot params structure and EFI tables and memory types. + */ +static bool memremap_is_efi_data(resource_size_t phys_addr, + unsigned long size) +{ + u64 paddr; + + /* Check if the address is part of EFI boot/runtime data */ + if (!efi_enabled(EFI_BOOT)) + return false; + + paddr = boot_params.efi_info.efi_memmap_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_memmap; + if (phys_addr == paddr) + return true; + + paddr = boot_params.efi_info.efi_systab_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_systab; + if (phys_addr == paddr) + return true; + + if (efi_is_table_address(phys_addr)) + return true; + + switch (efi_mem_type(phys_addr)) { + case EFI_BOOT_SERVICES_DATA: + case EFI_RUNTIME_SERVICES_DATA: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain. + */ +static bool memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = memremap(paddr, sizeof(*data), + MEMREMAP_WB | MEMREMAP_DEC); + + paddr_next = data->next; + len = data->len; + + memunmap(data); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain (early boot version). + */ +static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = early_memremap_decrypted(paddr, sizeof(*data)); + + paddr_next = data->next; + len = data->len; + + early_memunmap(data, sizeof(*data)); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Architecture function to determine if RAM remap is allowed. By default, a + * RAM remap will map the data as encrypted. Determine if a RAM remap should + * not be done so that the data will be mapped decrypted. + */ +bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, + unsigned long flags) +{ + if (!sme_active()) + return true; + + if (flags & MEMREMAP_ENC) + return true; + + if (flags & MEMREMAP_DEC) + return false; + + if (memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + return false; + + return true; +} + +/* + * Architecture override of __weak function to adjust the protection attributes + * used when remapping memory. By default, early_memremap() will map the data + * as encrypted. Determine if an encrypted mapping should not be done and set + * the appropriate protection attributes. + */ +pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + if (!sme_active()) + return prot; + + if (early_memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + prot = pgprot_decrypted(prot); + else + prot = pgprot_encrypted(prot); + + return prot; +} + #ifdef CONFIG_ARCH_USE_MEMREMAP_PROT /* Remap memory with encryption */ void __init *early_memremap_encrypted(resource_size_t phys_addr, diff --git a/include/linux/io.h b/include/linux/io.h index 2195d9ea4aaa..32e30e8fb9db 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -157,6 +157,8 @@ enum { MEMREMAP_WB = 1 << 0, MEMREMAP_WT = 1 << 1, MEMREMAP_WC = 1 << 2, + MEMREMAP_ENC = 1 << 3, + MEMREMAP_DEC = 1 << 4, }; void *memremap(resource_size_t offset, size_t size, unsigned long flags); diff --git a/kernel/memremap.c b/kernel/memremap.c index 124bed776532..9afdc434fb49 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -34,13 +34,24 @@ static void *arch_memremap_wb(resource_size_t offset, unsigned long size) } #endif -static void *try_ram_remap(resource_size_t offset, size_t size) +#ifndef arch_memremap_can_ram_remap +static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + return true; +} +#endif + +static void *try_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) { unsigned long pfn = PHYS_PFN(offset); /* In the simple case just return the existing linear address */ - if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && + arch_memremap_can_ram_remap(offset, size, flags)) return __va(offset); + return NULL; /* fallback to arch_memremap_wb */ } @@ -48,7 +59,8 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * memremap() - remap an iomem_resource as cacheable memory * @offset: iomem resource start address * @size: size of remap - * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC + * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, + * MEMREMAP_ENC, MEMREMAP_DEC * * memremap() is "ioremap" for cases where it is known that the resource * being mapped does not have i/o side effects and the __iomem @@ -95,7 +107,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) * the requested range is potentially in System RAM. */ if (is_ram == REGION_INTERSECTS) - addr = try_ram_remap(offset, size); + addr = try_ram_remap(offset, size, flags); if (!addr) addr = arch_memremap_wb(offset, size); } diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index d7d30da754ba..b1dd4a948fc0 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -30,6 +30,13 @@ early_param("early_ioremap_debug", early_ioremap_debug_setup); static int after_paging_init __initdata; +pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + return prot; +} + void __init __weak early_ioremap_shutdown(void) { } @@ -215,14 +222,19 @@ early_ioremap(resource_size_t phys_addr, unsigned long size) void __init * early_memremap(resource_size_t phys_addr, unsigned long size) { - return (__force void *)__early_ioremap(phys_addr, size, - FIXMAP_PAGE_NORMAL); + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, + FIXMAP_PAGE_NORMAL); + + return (__force void *)__early_ioremap(phys_addr, size, prot); } #ifdef FIXMAP_PAGE_RO void __init * early_memremap_ro(resource_size_t phys_addr, unsigned long size) { - return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO); + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, + FIXMAP_PAGE_RO); + + return (__force void *)__early_ioremap(phys_addr, size, prot); } #endif -- cgit v1.2.3 From c7753208a94c73d5beb1e4bd843081d6dc7d4678 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:21 -0500 Subject: x86, swiotlb: Add memory encryption support Since DMA addresses will effectively look like 48-bit addresses when the memory encryption mask is set, SWIOTLB is needed if the DMA mask of the device performing the DMA does not support 48-bits. SWIOTLB will be initialized to create decrypted bounce buffers for use by these devices. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/aa2d29b78ae7d508db8881e46a3215231b9327a7.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dma-mapping.h | 5 ++-- arch/x86/include/asm/mem_encrypt.h | 5 ++++ arch/x86/kernel/pci-dma.c | 11 +++++--- arch/x86/kernel/pci-nommu.c | 2 +- arch/x86/kernel/pci-swiotlb.c | 15 +++++++++-- arch/x86/mm/mem_encrypt.c | 22 ++++++++++++++++ include/linux/swiotlb.h | 1 + init/main.c | 10 +++++++ lib/swiotlb.c | 54 ++++++++++++++++++++++++++++++++------ 9 files changed, 108 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 398c79889f5c..1387dafdba2d 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_ISA # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) @@ -57,12 +58,12 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - return paddr; + return __sme_set(paddr); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { - return daddr; + return __sme_clr(daddr); } #endif /* CONFIG_X86_DMA_REMAP */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index ab1fe77c2f73..70e55f6b9adf 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -34,6 +34,11 @@ void __init sme_early_init(void); void __init sme_encrypt_kernel(void); void __init sme_enable(void); +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void); + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0UL diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 5e16d3f29594..0accc2404b92 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -93,9 +93,12 @@ again: if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size), flag); - if (page && page_to_phys(page) + size > dma_mask) { - dma_release_from_contiguous(dev, page, count); - page = NULL; + if (page) { + addr = phys_to_dma(dev, page_to_phys(page)); + if (addr + size > dma_mask) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } } } /* fallback */ @@ -104,7 +107,7 @@ again: if (!page) return NULL; - addr = page_to_phys(page); + addr = phys_to_dma(dev, page_to_phys(page)); if (addr + size > dma_mask) { __free_pages(page, get_order(size)); diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a6d404087fe3..4fc3cb60ea11 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -32,7 +32,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t bus = page_to_phys(page) + offset; + dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) return NOMMU_MAPPING_ERROR; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 1e23577e17cf..677077510e30 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -6,12 +6,14 @@ #include #include #include +#include #include #include #include #include #include + int swiotlb __read_mostly; void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, @@ -79,8 +81,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override, pci_swiotlb_late_init); /* - * if 4GB or more detected (and iommu=off not set) return 1 - * and set swiotlb to 1. + * If 4GB or more detected (and iommu=off not set) or if SME is active + * then set swiotlb to 1 and return 1. */ int __init pci_swiotlb_detect_4gb(void) { @@ -89,6 +91,15 @@ int __init pci_swiotlb_detect_4gb(void) if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif + + /* + * If SME is active then swiotlb will be set to 1 so that bounce + * buffers are allocated and used for devices that do not support + * the addressing range required for the encryption mask. + */ + if (sme_active()) + swiotlb = 1; + return swiotlb; } IOMMU_INIT(pci_swiotlb_detect_4gb, diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 0843d023da4a..a7400ec8538b 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -13,11 +13,14 @@ #include #include #include +#include +#include #include #include #include #include +#include /* * Since SME related variables are set early in the boot process they must @@ -177,6 +180,25 @@ void __init sme_early_init(void) protection_map[i] = pgprot_encrypted(protection_map[i]); } +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void) +{ + if (!sme_me_mask) + return; + + /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ + swiotlb_update_mem_attributes(); +} + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) +{ + WARN(PAGE_ALIGN(size) != size, + "size is not page-aligned (%#lx)\n", size); + + /* Make the SWIOTLB buffer area decrypted */ + set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); +} + void __init sme_encrypt_kernel(void) { } diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 4ee479f2f355..15e7160751a8 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -35,6 +35,7 @@ int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); extern unsigned long swiotlb_nr_tbl(void); unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); +extern void __init swiotlb_update_mem_attributes(void); /* * Enumeration for sync targets diff --git a/init/main.c b/init/main.c index 052481fbe363..9789ab7fe85e 100644 --- a/init/main.c +++ b/init/main.c @@ -488,6 +488,8 @@ void __init __weak thread_stack_cache_init(void) } #endif +void __init __weak mem_encrypt_init(void) { } + /* * Set up kernel memory allocators */ @@ -641,6 +643,14 @@ asmlinkage __visible void __init start_kernel(void) */ locking_selftest(); + /* + * This needs to be called before any devices perform DMA + * operations that might use the SWIOTLB bounce buffers. It will + * mark the bounce buffers as decrypted so that their usage will + * not cause "plain-text" data to be decrypted when accessed. + */ + mem_encrypt_init(); + #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { diff --git a/lib/swiotlb.c b/lib/swiotlb.c index a8d74a733a38..04ac91acf193 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -155,6 +156,15 @@ unsigned long swiotlb_size_or_default(void) return size ? size : (IO_TLB_DEFAULT_SIZE); } +void __weak swiotlb_set_mem_attributes(void *vaddr, unsigned long size) { } + +/* For swiotlb, clear memory encryption mask from dma addresses */ +static dma_addr_t swiotlb_phys_to_dma(struct device *hwdev, + phys_addr_t address) +{ + return __sme_clr(phys_to_dma(hwdev, address)); +} + /* Note that this doesn't work with highmem page */ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, volatile void *address) @@ -183,6 +193,31 @@ void swiotlb_print_info(void) bytes >> 20, vstart, vend - 1); } +/* + * Early SWIOTLB allocation may be too early to allow an architecture to + * perform the desired operations. This function allows the architecture to + * call SWIOTLB when the operations are possible. It needs to be called + * before the SWIOTLB memory is used. + */ +void __init swiotlb_update_mem_attributes(void) +{ + void *vaddr; + unsigned long bytes; + + if (no_iotlb_memory || late_alloc) + return; + + vaddr = phys_to_virt(io_tlb_start); + bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); + swiotlb_set_mem_attributes(vaddr, bytes); + memset(vaddr, 0, bytes); + + vaddr = phys_to_virt(io_tlb_overflow_buffer); + bytes = PAGE_ALIGN(io_tlb_overflow); + swiotlb_set_mem_attributes(vaddr, bytes); + memset(vaddr, 0, bytes); +} + int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { void *v_overflow_buffer; @@ -320,6 +355,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) io_tlb_start = virt_to_phys(tlb); io_tlb_end = io_tlb_start + bytes; + swiotlb_set_mem_attributes(tlb, bytes); memset(tlb, 0, bytes); /* @@ -330,6 +366,8 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) if (!v_overflow_buffer) goto cleanup2; + swiotlb_set_mem_attributes(v_overflow_buffer, io_tlb_overflow); + memset(v_overflow_buffer, 0, io_tlb_overflow); io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); /* @@ -581,7 +619,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size, return SWIOTLB_MAP_ERROR; } - start_dma_addr = phys_to_dma(hwdev, io_tlb_start); + start_dma_addr = swiotlb_phys_to_dma(hwdev, io_tlb_start); return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, dir, attrs); } @@ -702,7 +740,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, goto err_warn; ret = phys_to_virt(paddr); - dev_addr = phys_to_dma(hwdev, paddr); + dev_addr = swiotlb_phys_to_dma(hwdev, paddr); /* Confirm address can be DMA'd by device */ if (dev_addr + size - 1 > dma_mask) { @@ -812,10 +850,10 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, map = map_single(dev, phys, size, dir, attrs); if (map == SWIOTLB_MAP_ERROR) { swiotlb_full(dev, size, dir, 1); - return phys_to_dma(dev, io_tlb_overflow_buffer); + return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer); } - dev_addr = phys_to_dma(dev, map); + dev_addr = swiotlb_phys_to_dma(dev, map); /* Ensure that the address returned is DMA'ble */ if (dma_capable(dev, dev_addr, size)) @@ -824,7 +862,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, attrs |= DMA_ATTR_SKIP_CPU_SYNC; swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); - return phys_to_dma(dev, io_tlb_overflow_buffer); + return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer); } EXPORT_SYMBOL_GPL(swiotlb_map_page); @@ -958,7 +996,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, sg_dma_len(sgl) = 0; return 0; } - sg->dma_address = phys_to_dma(hwdev, map); + sg->dma_address = swiotlb_phys_to_dma(hwdev, map); } else sg->dma_address = dev_addr; sg_dma_len(sg) = sg->length; @@ -1026,7 +1064,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_device); int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) { - return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer)); + return (dma_addr == swiotlb_phys_to_dma(hwdev, io_tlb_overflow_buffer)); } EXPORT_SYMBOL(swiotlb_dma_mapping_error); @@ -1039,6 +1077,6 @@ EXPORT_SYMBOL(swiotlb_dma_mapping_error); int swiotlb_dma_supported(struct device *hwdev, u64 mask) { - return phys_to_dma(hwdev, io_tlb_end - 1) <= mask; + return swiotlb_phys_to_dma(hwdev, io_tlb_end - 1) <= mask; } EXPORT_SYMBOL(swiotlb_dma_supported); -- cgit v1.2.3 From 648babb7078c6310d2af5b8aa01f086030916968 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:22 -0500 Subject: swiotlb: Add warnings for use of bounce buffers with SME Add warnings to let the user know when bounce buffers are being used for DMA when SME is active. Since the bounce buffers are not in encrypted memory, these notifications are to allow the user to determine some appropriate action - if necessary. Actions can range from utilizing an IOMMU, replacing the device with another device that can support 64-bit DMA, ignoring the message if the device isn't used much, etc. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/d112564053c3f2e86ca634a8d4fa4abc0eb53a6a.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- include/linux/dma-mapping.h | 13 +++++++++++++ lib/swiotlb.c | 3 +++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 843ab866e0f4..fce2369ecf82 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -10,6 +10,7 @@ #include #include #include +#include /** * List of possible attributes associated with a DMA mapping. The semantics @@ -548,6 +549,12 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) return 0; } +static inline void dma_check_mask(struct device *dev, u64 mask) +{ + if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1))) + dev_warn(dev, "SME is active, device will require DMA bounce buffers\n"); +} + static inline int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -564,6 +571,9 @@ static inline int dma_set_mask(struct device *dev, u64 mask) { if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; + + dma_check_mask(dev, mask); + *dev->dma_mask = mask; return 0; } @@ -583,6 +593,9 @@ static inline int dma_set_coherent_mask(struct device *dev, u64 mask) { if (!dma_supported(dev, mask)) return -EIO; + + dma_check_mask(dev, mask); + dev->coherent_dma_mask = mask; return 0; } diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 04ac91acf193..8c6c83ef57a4 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -507,6 +507,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, if (no_iotlb_memory) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + if (sme_active()) + pr_warn_once("SME is active and system is using DMA bounce buffers\n"); + mask = dma_get_seg_boundary(hwdev); tbl_dma_addr &= mask; -- cgit v1.2.3 From bba4ed011a52d494aa7ef5e08cf226709bbf3f60 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:28 -0500 Subject: x86/mm, kexec: Allow kexec to be used with SME Provide support so that kexec can be used to boot a kernel when SME is enabled. Support is needed to allocate pages for kexec without encryption. This is needed in order to be able to reboot in the kernel in the same manner as originally booted. Additionally, when shutting down all of the CPUs we need to be sure to flush the caches and then halt. This is needed when booting from a state where SME was not active into a state where SME is active (or vice-versa). Without these steps, it is possible for cache lines to exist for the same physical location but tagged both with and without the encryption bit. This can cause random memory corruption when caches are flushed depending on which cacheline is written last. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/b95ff075db3e7cd545313f2fb609a49619a09625.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/init.h | 1 + arch/x86/include/asm/kexec.h | 8 ++++++++ arch/x86/include/asm/pgtable_types.h | 1 + arch/x86/kernel/machine_kexec_64.c | 22 +++++++++++++++++++++- arch/x86/kernel/process.c | 17 +++++++++++++++-- arch/x86/mm/ident_map.c | 12 ++++++++---- include/linux/kexec.h | 8 ++++++++ kernel/kexec_core.c | 12 +++++++++++- 8 files changed, 73 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 474eb8c66fee..05c4aa00cc86 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -7,6 +7,7 @@ struct x86_mapping_info { unsigned long page_flag; /* page flag for PMD or PUD entry */ unsigned long offset; /* ident mapping offset */ bool direct_gbpages; /* PUD level 1GB page support */ + unsigned long kernpg_flag; /* kernel pagetable flag override */ }; int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 70ef205489f0..e8183acf931f 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -207,6 +207,14 @@ struct kexec_entry64_regs { uint64_t r15; uint64_t rip; }; + +extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, + gfp_t gfp); +#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages + +extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); +#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages + #endif typedef void crash_vmclear_fn(void); diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 32095af0fefb..830992fc5a06 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -213,6 +213,7 @@ enum page_cache_mode { #define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC) #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index cb0a30473c23..9cf8daacc046 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -87,7 +87,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); } pte = pte_offset_kernel(pmd, vaddr); - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); return 0; err: free_transition_pgtable(image); @@ -115,6 +115,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) .alloc_pgt_page = alloc_pgt_page, .context = image, .page_flag = __PAGE_KERNEL_LARGE_EXEC, + .kernpg_flag = _KERNPG_TABLE_NOENC, }; unsigned long mstart, mend; pgd_t *level4p; @@ -602,3 +603,22 @@ void arch_kexec_unprotect_crashkres(void) { kexec_mark_crashkres(false); } + +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) +{ + /* + * If SME is active we need to be sure that kexec pages are + * not encrypted because when we boot to the new kernel the + * pages won't be accessed encrypted (initially). + */ + return set_memory_decrypted((unsigned long)vaddr, pages); +} + +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) +{ + /* + * If SME is active we need to reset the pages back to being + * an encrypted mapping before freeing them. + */ + set_memory_encrypted((unsigned long)vaddr, pages); +} diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3ca198080ea9..bd6b85fac666 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -355,6 +355,7 @@ bool xen_set_default_idle(void) return ret; } #endif + void stop_this_cpu(void *dummy) { local_irq_disable(); @@ -365,8 +366,20 @@ void stop_this_cpu(void *dummy) disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); - for (;;) - halt(); + for (;;) { + /* + * Use wbinvd followed by hlt to stop the processor. This + * provides support for kexec on a processor that supports + * SME. With kexec, going from SME inactive to SME active + * requires clearing cache entries so that addresses without + * the encryption bit set don't corrupt the same physical + * address that has the encryption bit set when caches are + * flushed. To achieve this a wbinvd is performed followed by + * a hlt. Even if the processor is not in the kexec/SME + * scenario this only adds a wbinvd to a halting processor. + */ + asm volatile("wbinvd; hlt" : : : "memory"); + } } /* diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index adab1595f4bd..31cea988fa36 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -51,7 +51,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, if (!pmd) return -ENOMEM; ident_pmd_init(info, pmd, addr, next); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag)); } return 0; @@ -79,7 +79,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, if (!pud) return -ENOMEM; ident_pud_init(info, pud, addr, next); - set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag)); } return 0; @@ -93,6 +93,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long next; int result; + /* Set the default pagetable flags if not supplied */ + if (!info->kernpg_flag) + info->kernpg_flag = _KERNPG_TABLE; + for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); p4d_t *p4d; @@ -116,14 +120,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, if (result) return result; if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); } else { /* * With p4d folded, pgd is equal to p4d. * The pgd entry has to point to the pud page table in this case. */ pud_t *pud = pud_offset(p4d, 0); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag)); } } diff --git a/include/linux/kexec.h b/include/linux/kexec.h index dd056fab9e35..2b7590f5483a 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -327,6 +327,14 @@ static inline void *boot_phys_to_virt(unsigned long entry) return phys_to_virt(boot_phys_to_phys(entry)); } +#ifndef arch_kexec_post_alloc_pages +static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { return 0; } +#endif + +#ifndef arch_kexec_pre_free_pages +static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { } +#endif + #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1ae7c41c33c1..20fef1a38602 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -301,7 +301,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; - pages = alloc_pages(gfp_mask, order); + pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; @@ -310,6 +310,13 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); + + arch_kexec_post_alloc_pages(page_address(pages), count, + gfp_mask); + + if (gfp_mask & __GFP_ZERO) + for (i = 0; i < count; i++) + clear_highpage(pages + i); } return pages; @@ -321,6 +328,9 @@ static void kimage_free_pages(struct page *page) order = page_private(page); count = 1 << order; + + arch_kexec_pre_free_pages(page_address(page), count); + for (i = 0; i < count; i++) ClearPageReserved(page + i); __free_pages(page, order); -- cgit v1.2.3 From 7375ae3a0b79ea072f4c672039f08f5db633b9e1 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:34 -0500 Subject: compiler-gcc.h: Introduce __nostackprotector function attribute Create a new function attribute, __nostackprotector, that can used to turn off stack protection on a per function basis. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/0576fd5c74440ad0250f16ac6609ecf587812456.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- include/linux/compiler-gcc.h | 2 ++ include/linux/compiler.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index cd4bbe8242bd..c28cedde973f 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -166,6 +166,8 @@ #if GCC_VERSION >= 40100 # define __compiletime_object_size(obj) __builtin_object_size(obj, 0) + +#define __nostackprotector __attribute__((__optimize__("no-stack-protector"))) #endif #if GCC_VERSION >= 40300 diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 219f82f3ec1a..3f8c88e29a46 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -470,6 +470,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s #define __visible #endif +#ifndef __nostackprotector +# define __nostackprotector +#endif + /* * Assume alignment of return value. */ -- cgit v1.2.3 From ce0fa3e56ad20f04d8252353dcd24e924abdafca Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 16 Aug 2017 10:18:03 -0700 Subject: x86/mm, mm/hwpoison: Clear PRESENT bit for kernel 1:1 mappings of poison pages Speculative processor accesses may reference any memory that has a valid page table entry. While a speculative access won't generate a machine check, it will log the error in a machine check bank. That could cause escalation of a subsequent error since the overflow bit will be then set in the machine check bank status register. Code has to be double-plus-tricky to avoid mentioning the 1:1 virtual address of the page we want to map out otherwise we may trigger the very problem we are trying to avoid. We use a non-canonical address that passes through the usual Linux table walking code to get to the same "pte". Thanks to Dave Hansen for reviewing several iterations of this. Also see: http://marc.info/?l=linux-mm&m=149860136413338&w=2 Signed-off-by: Tony Luck Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Elliott, Robert (Persistent Memory) Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170816171803.28342-1-tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page_64.h | 4 ++++ arch/x86/kernel/cpu/mcheck/mce.c | 43 ++++++++++++++++++++++++++++++++++++++++ include/linux/mm_inline.h | 6 ++++++ mm/memory-failure.c | 2 ++ 4 files changed, 55 insertions(+) (limited to 'include') diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index b4a0d43248cf..b50df06ad251 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -51,6 +51,10 @@ static inline void clear_page(void *page) void copy_page(void *to, void *from); +#ifdef CONFIG_X86_MCE +#define arch_unmap_kpfn arch_unmap_kpfn +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6dde0497efc7..3b413065c613 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "mce-internal.h" @@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m) return ret; } +#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE) + +void arch_unmap_kpfn(unsigned long pfn) +{ + unsigned long decoy_addr; + + /* + * Unmap this page from the kernel 1:1 mappings to make sure + * we don't log more errors because of speculative access to + * the page. + * We would like to just call: + * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); + * but doing that would radically increase the odds of a + * speculative access to the posion page because we'd have + * the virtual address of the kernel 1:1 mapping sitting + * around in registers. + * Instead we get tricky. We create a non-canonical address + * that looks just like the one we want, but has bit 63 flipped. + * This relies on set_memory_np() not checking whether we passed + * a legal address. + */ + +/* + * Build time check to see if we have a spare virtual bit. Don't want + * to leave this until run time because most developers don't have a + * system that can exercise this code path. This will only become a + * problem if/when we move beyond 5-level page tables. + * + * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) + */ +#if PGDIR_SHIFT + 9 < 63 + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); +#else +#error "no unused virtual bit available" +#endif + + if (set_memory_np(decoy_addr, 1)) + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); + +} +#endif + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index e030a68ead7e..25438b2b6f22 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -126,4 +126,10 @@ static __always_inline enum lru_list page_lru(struct page *page) #define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) +#ifdef arch_unmap_kpfn +extern void arch_unmap_kpfn(unsigned long pfn); +#else +static __always_inline void arch_unmap_kpfn(unsigned long pfn) { } +#endif + #endif diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1cd3b3569af8..88366626c0b7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } + arch_unmap_kpfn(pfn); + orig_head = hpage = compound_head(p); num_poisoned_pages_inc(); -- cgit v1.2.3