diff options
Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 974 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu_internal.h | 88 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmutrace.h | 21 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 50 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 318 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 252 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.c | 182 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.h | 60 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 1157 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.h | 48 |
10 files changed, 2443 insertions, 707 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 71aa3da2a0b7..17587f496ec7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -19,10 +19,12 @@ #include "ioapic.h" #include "mmu.h" #include "mmu_internal.h" +#include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" #include "cpuid.h" +#include "spte.h" #include <linux/kvm_host.h> #include <linux/types.h> @@ -45,7 +47,6 @@ #include <asm/page.h> #include <asm/memtype.h> #include <asm/cmpxchg.h> -#include <asm/e820/api.h> #include <asm/io.h> #include <asm/vmx.h> #include <asm/kvm_page_track.h> @@ -64,12 +65,12 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 60; static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); -static struct kernel_param_ops nx_huge_pages_ops = { +static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, .get = param_get_bool, }; -static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { +static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { .set = set_nx_huge_pages_recovery_ratio, .get = param_get_uint, }; @@ -104,45 +105,13 @@ enum { AUDIT_POST_SYNC }; -#undef MMU_DEBUG - #ifdef MMU_DEBUG -static bool dbg = 0; +bool dbg = 0; module_param(dbg, bool, 0644); - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) -#define MMU_WARN_ON(x) WARN_ON(x) -#else -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) -#define MMU_WARN_ON(x) do { } while (0) #endif #define PTE_PREFETCH_NUM 8 -#define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 54 - -/* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. - */ -#define SPTE_SPECIAL_MASK (3ULL << 52) -#define SPTE_AD_ENABLED_MASK (0ULL << 52) -#define SPTE_AD_DISABLED_MASK (1ULL << 52) -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) -#define SPTE_MMIO_MASK (3ULL << 52) - -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) - - #define PT32_LEVEL_BITS 10 #define PT32_LEVEL_SHIFT(level) \ @@ -156,18 +125,6 @@ module_param(dbg, bool, 0644); (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) -#else -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#endif -#define PT64_LVL_ADDR_MASK(level) \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) - #define PT32_BASE_ADDR_MASK PAGE_MASK #define PT32_DIR_BASE_ADDR_MASK \ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) @@ -175,42 +132,11 @@ module_param(dbg, bool, 0644); (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ - | shadow_x_mask | shadow_nx_mask | shadow_me_mask) - -#define ACC_EXEC_MASK 1 -#define ACC_WRITE_MASK PT_WRITABLE_MASK -#define ACC_USER_MASK PT_USER_MASK -#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) - -/* The mask for the R/X bits in EPT PTEs */ -#define PT64_EPT_READABLE_MASK 0x1ull -#define PT64_EPT_EXECUTABLE_MASK 0x4ull - #include <trace/events/kvm.h> -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) -#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) - -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - /* make pte_list_desc fit well in cache line */ #define PTE_LIST_EXT 3 -/* - * Return values of handle_mmio_page_fault and mmu.page_fault: - * RET_PF_RETRY: let CPU fault again on the address. - * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. - * - * For handle_mmio_page_fault only: - * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. - */ -enum { - RET_PF_RETRY = 0, - RET_PF_EMULATE = 1, - RET_PF_INVALID = 2, -}; - struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; @@ -242,65 +168,10 @@ struct kvm_shadow_walk_iterator { __shadow_walk_next(&(_walker), spte)) static struct kmem_cache *pte_list_desc_cache; -static struct kmem_cache *mmu_page_header_cache; +struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; -static u64 __read_mostly shadow_nx_mask; -static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ -static u64 __read_mostly shadow_user_mask; -static u64 __read_mostly shadow_accessed_mask; -static u64 __read_mostly shadow_dirty_mask; -static u64 __read_mostly shadow_mmio_value; -static u64 __read_mostly shadow_mmio_access_mask; -static u64 __read_mostly shadow_present_mask; -static u64 __read_mostly shadow_me_mask; - -/* - * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; - * shadow_acc_track_mask is the set of bits to be cleared in non-accessed - * pages. - */ -static u64 __read_mostly shadow_acc_track_mask; - -/* - * The mask/shift to use for saving the original R/X bits when marking the PTE - * as not-present for access tracking purposes. We do not save the W bit as the - * PTEs being access tracked also need to be dirty tracked, so the W bit will be - * restored only when a write is attempted to the page. - */ -static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | - PT64_EPT_EXECUTABLE_MASK; -static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; - -/* - * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order - * to guard against L1TF attacks. - */ -static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; - -/* - * The number of high-order 1 bits to use in the mask above. - */ -static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; - -/* - * In some cases, we need to preserve the GFN of a non-present or reserved - * SPTE when we usurp the upper five bits of the physical address space to - * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll - * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask - * left into the reserved bits, i.e. the GFN in the SPTE will be split into - * high and low parts. This mask covers the lower bits of the GFN. - */ -static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; - -/* - * The number of non-reserved physical address bits irrespective of features - * that repurpose legal bits, e.g. MKTME. - */ -static u8 __read_mostly shadow_phys_bits; - static void mmu_spte_set(u64 *sptep, u64 spte); -static bool is_executable_pte(u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); @@ -325,7 +196,7 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); } -static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, +void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages) { struct kvm_tlb_range range; @@ -336,143 +207,17 @@ static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) -{ - BUG_ON((u64)(unsigned)access_mask != access_mask); - WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); - WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); - shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; - shadow_mmio_access_mask = access_mask; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); - -static bool is_mmio_spte(u64 spte) -{ - return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; -} - -static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) -{ - return sp->role.ad_disabled; -} - -static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) -{ - /* - * When using the EPT page-modification log, the GPAs in the log - * would come from L2 rather than L1. Therefore, we need to rely - * on write protection to record dirty pages. This also bypasses - * PML, since writes now result in a vmexit. - */ - return vcpu->arch.mmu == &vcpu->arch.guest_mmu; -} - -static inline bool spte_ad_enabled(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; -} - -static inline bool spte_ad_need_write_protect(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; -} - -static bool is_nx_huge_page_enabled(void) +bool is_nx_huge_page_enabled(void) { return READ_ONCE(nx_huge_pages); } -static inline u64 spte_shadow_accessed_mask(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; -} - -static inline u64 spte_shadow_dirty_mask(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; -} - -static inline bool is_access_track_spte(u64 spte) -{ - return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; -} - -/* - * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of - * the memslots generation and is derived as follows: - * - * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 - * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 - * - * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in - * the MMIO generation number, as doing so would require stealing a bit from - * the "real" generation number and thus effectively halve the maximum number - * of MMIO generations that can be handled before encountering a wrap (which - * requires a full MMU zap). The flag is instead explicitly queried when - * checking for MMIO spte cache hits. - */ -#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) - -#define MMIO_SPTE_GEN_LOW_START 3 -#define MMIO_SPTE_GEN_LOW_END 11 -#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ - MMIO_SPTE_GEN_LOW_START) - -#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT -#define MMIO_SPTE_GEN_HIGH_END 62 -#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ - MMIO_SPTE_GEN_HIGH_START) - -static u64 generation_mmio_spte_mask(u64 gen) -{ - u64 mask; - - WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); - - mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; - mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; - return mask; -} - -static u64 get_mmio_spte_generation(u64 spte) -{ - u64 gen; - - gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; - gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; - return gen; -} - -static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) -{ - - u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; - u64 mask = generation_mmio_spte_mask(gen); - u64 gpa = gfn << PAGE_SHIFT; - - access &= shadow_mmio_access_mask; - mask |= shadow_mmio_value | access; - mask |= gpa | shadow_nonpresent_or_rsvd_mask; - mask |= (gpa & shadow_nonpresent_or_rsvd_mask) - << shadow_nonpresent_or_rsvd_mask_len; - - return mask; -} - static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { u64 mask = make_mmio_spte(vcpu, gfn, access); - unsigned int gen = get_mmio_spte_generation(mask); - - access = mask & ACC_ALL; - trace_mark_mmio_spte(sptep, gfn, access, gen); + trace_mark_mmio_spte(sptep, gfn, mask); mmu_spte_set(sptep, mask); } @@ -521,7 +266,7 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { /* Check if guest physical address doesn't exceed guest maximum */ - if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) { + if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) { exception->error_code |= PFERR_RSVD_MASK; return UNMAPPED_GVA; } @@ -529,90 +274,6 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, return gpa; } -/* - * Sets the shadow PTE masks used by the MMU. - * - * Assumptions: - * - Setting either @accessed_mask or @dirty_mask requires setting both - * - At least one of @accessed_mask or @acc_track_mask must be set - */ -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask) -{ - BUG_ON(!dirty_mask != !accessed_mask); - BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); - - shadow_user_mask = user_mask; - shadow_accessed_mask = accessed_mask; - shadow_dirty_mask = dirty_mask; - shadow_nx_mask = nx_mask; - shadow_x_mask = x_mask; - shadow_present_mask = p_mask; - shadow_acc_track_mask = acc_track_mask; - shadow_me_mask = me_mask; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); - -static u8 kvm_get_shadow_phys_bits(void) -{ - /* - * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected - * in CPU detection code, but the processor treats those reduced bits as - * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at - * the physical address bits reported by CPUID. - */ - if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) - return cpuid_eax(0x80000008) & 0xff; - - /* - * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with - * custom CPUID. Proceed with whatever the kernel found since these features - * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). - */ - return boot_cpu_data.x86_phys_bits; -} - -static void kvm_mmu_reset_all_pte_masks(void) -{ - u8 low_phys_bits; - - shadow_user_mask = 0; - shadow_accessed_mask = 0; - shadow_dirty_mask = 0; - shadow_nx_mask = 0; - shadow_x_mask = 0; - shadow_present_mask = 0; - shadow_acc_track_mask = 0; - - shadow_phys_bits = kvm_get_shadow_phys_bits(); - - /* - * If the CPU has 46 or less physical address bits, then set an - * appropriate mask to guard against L1TF attacks. Otherwise, it is - * assumed that the CPU is not vulnerable to L1TF. - * - * Some Intel CPUs address the L1 cache using more PA bits than are - * reported by CPUID. Use the PA width of the L1 cache when possible - * to achieve more effective mitigation, e.g. if system RAM overlaps - * the most significant bits of legal physical address space. - */ - shadow_nonpresent_or_rsvd_mask = 0; - low_phys_bits = boot_cpu_data.x86_phys_bits; - if (boot_cpu_has_bug(X86_BUG_L1TF) && - !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= - 52 - shadow_nonpresent_or_rsvd_mask_len)) { - low_phys_bits = boot_cpu_data.x86_cache_bits - - shadow_nonpresent_or_rsvd_mask_len; - shadow_nonpresent_or_rsvd_mask = - rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); - } - - shadow_nonpresent_or_rsvd_lower_gfn_mask = - GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); -} - static int is_cpuid_PSE36(void) { return 1; @@ -623,35 +284,6 @@ static int is_nx(struct kvm_vcpu *vcpu) return vcpu->arch.efer & EFER_NX; } -static int is_shadow_present_pte(u64 pte) -{ - return (pte != 0) && !is_mmio_spte(pte); -} - -static int is_large_pte(u64 pte) -{ - return pte & PT_PAGE_SIZE_MASK; -} - -static int is_last_spte(u64 pte, int level) -{ - if (level == PG_LEVEL_4K) - return 1; - if (is_large_pte(pte)) - return 1; - return 0; -} - -static bool is_executable_pte(u64 spte) -{ - return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; -} - -static kvm_pfn_t spte_to_pfn(u64 pte) -{ - return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - static gfn_t pse36_gfn_delta(u32 gpte) { int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; @@ -796,12 +428,6 @@ retry: } #endif -static bool spte_can_locklessly_be_made_writable(u64 spte) -{ - return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == - (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); -} - static bool spte_has_volatile_bits(u64 spte) { if (!is_shadow_present_pte(spte)) @@ -826,21 +452,6 @@ static bool spte_has_volatile_bits(u64 spte) return false; } -static bool is_accessed_spte(u64 spte) -{ - u64 accessed_mask = spte_shadow_accessed_mask(spte); - - return accessed_mask ? spte & accessed_mask - : !is_access_track_spte(spte); -} - -static bool is_dirty_spte(u64 spte) -{ - u64 dirty_mask = spte_shadow_dirty_mask(spte); - - return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; -} - /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -976,34 +587,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -static u64 mark_spte_for_access_track(u64 spte) -{ - if (spte_ad_enabled(spte)) - return spte & ~shadow_accessed_mask; - - if (is_access_track_spte(spte)) - return spte; - - /* - * Making an Access Tracking PTE will result in removal of write access - * from the PTE. So, verify that we will be able to restore the write - * access in the fast page fault path later on. - */ - WARN_ONCE((spte & PT_WRITABLE_MASK) && - !spte_can_locklessly_be_made_writable(spte), - "kvm: Writable SPTE is not locklessly dirty-trackable\n"); - - WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << - shadow_acc_track_saved_bits_shift), - "kvm: Access Tracking saved bit locations are not zero\n"); - - spte |= (spte & shadow_acc_track_saved_bits_mask) << - shadow_acc_track_saved_bits_shift; - spte &= ~shadow_acc_track_mask; - - return spte; -} - /* Restore an acc-track PTE back to a regular PTE */ static u64 restore_acc_track_spte(u64 spte) { @@ -1193,7 +776,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } -static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { if (sp->lpage_disallowed) return; @@ -1221,7 +804,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } -static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { --kvm->stat.nx_lpage_splits; sp->lpage_disallowed = false; @@ -1640,6 +1223,9 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, true); while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1666,6 +1252,9 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, false); while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1710,6 +1299,10 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, write_protected |= __rmap_write_protect(kvm, rmap_head, true); } + if (kvm->arch.tdp_mmu_enabled) + write_protected |= + kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); + return write_protected; } @@ -1769,13 +1362,8 @@ restart: pte_list_remove(rmap_head, sptep); goto restart; } else { - new_spte = *sptep & ~PT64_BASE_ADDR_MASK; - new_spte |= (u64)new_pfn << PAGE_SHIFT; - - new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~SPTE_HOST_WRITEABLE; - - new_spte = mark_spte_for_access_track(new_spte); + new_spte = kvm_mmu_changed_pte_notifier_make_spte( + *sptep, new_pfn); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -1919,12 +1507,26 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags) { - return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + int r; + + r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + + if (kvm->arch.tdp_mmu_enabled) + r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end); + + return r; } int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { - return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + int r; + + r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + + if (kvm->arch.tdp_mmu_enabled) + r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte); + + return r; } static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1973,12 +1575,24 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); + int young = false; + + young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); + if (kvm->arch.tdp_mmu_enabled) + young |= kvm_tdp_mmu_age_hva_range(kvm, start, end); + + return young; } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); + int young = false; + + young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); + if (kvm->arch.tdp_mmu_enabled) + young |= kvm_tdp_mmu_test_age_hva(kvm, hva); + + return young; } #ifdef MMU_DEBUG @@ -2577,13 +2191,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_me_mask; - - if (sp_ad_disabled(sp)) - spte |= SPTE_AD_DISABLED_MASK; - else - spte |= shadow_accessed_mask; + spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); @@ -2615,8 +2223,9 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } -static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, - u64 *spte) +/* Returns the number of zapped non-leaf child shadow pages. */ +static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *spte, struct list_head *invalid_list) { u64 pte; struct kvm_mmu_page *child; @@ -2630,23 +2239,34 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, } else { child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); - } - return true; - } - if (is_mmio_spte(pte)) + /* + * Recursively zap nested TDP SPs, parentless SPs are + * unlikely to be used again in the near future. This + * avoids retaining a large number of stale nested SPs. + */ + if (tdp_enabled && invalid_list && + child->role.guest_mode && !child->parent_ptes.val) + return kvm_mmu_prepare_zap_page(kvm, child, + invalid_list); + } + } else if (is_mmio_spte(pte)) { mmu_spte_clear_no_track(spte); - - return false; + } + return 0; } -static void kvm_mmu_page_unlink_children(struct kvm *kvm, - struct kvm_mmu_page *sp) +static int kvm_mmu_page_unlink_children(struct kvm *kvm, + struct kvm_mmu_page *sp, + struct list_head *invalid_list) { + int zapped = 0; unsigned i; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - mmu_page_zap_pte(kvm, sp, sp->spt + i); + zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); + + return zapped; } static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -2692,7 +2312,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); - kvm_mmu_page_unlink_children(kvm, sp); + *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); kvm_mmu_unlink_parents(kvm, sp); /* Zapping children means active_mmu_pages has become unstable. */ @@ -2885,8 +2505,8 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) kvm_mmu_mark_parents_unsync(sp); } -static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) +bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) { struct kvm_mmu_page *sp; @@ -2946,132 +2566,42 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return false; } -static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) -{ - if (pfn_valid(pfn)) - return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && - /* - * Some reserved pages, such as those from NVDIMM - * DAX devices, are not for MMIO, and can be mapped - * with cached memory type for better performance. - * However, the above check misconceives those pages - * as MMIO, and results in KVM mapping them with UC - * memory type, which would hurt the performance. - * Therefore, we check the host memory type in addition - * and only treat UC/UC-/WC pages as MMIO. - */ - (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); - - return !e820__mapped_raw_any(pfn_to_hpa(pfn), - pfn_to_hpa(pfn + 1) - 1, - E820_TYPE_RAM); -} - -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) - static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned int pte_access, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte = 0; - int ret = 0; + u64 spte; struct kvm_mmu_page *sp; + int ret; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; sp = sptep_to_sp(sptep); - if (sp_ad_disabled(sp)) - spte |= SPTE_AD_DISABLED_MASK; - else if (kvm_vcpu_ad_need_write_protect(vcpu)) - spte |= SPTE_AD_WRPROT_ONLY_MASK; - - /* - * For the EPT case, shadow_present_mask is 0 if hardware - * supports exec-only page table entries. In that case, - * ACC_USER_MASK and shadow_user_mask are used to represent - * read access. See FNAME(gpte_access) in paging_tmpl.h. - */ - spte |= shadow_present_mask; - if (!speculative) - spte |= spte_shadow_accessed_mask(spte); - if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { - pte_access &= ~ACC_EXEC_MASK; - } + ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, + can_unsync, host_writable, sp_ad_disabled(sp), &spte); - if (pte_access & ACC_EXEC_MASK) - spte |= shadow_x_mask; - else - spte |= shadow_nx_mask; - - if (pte_access & ACC_USER_MASK) - spte |= shadow_user_mask; - - if (level > PG_LEVEL_4K) - spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) - spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); - - if (host_writable) - spte |= SPTE_HOST_WRITEABLE; - else - pte_access &= ~ACC_WRITE_MASK; - - if (!kvm_is_mmio_pfn(pfn)) - spte |= shadow_me_mask; - - spte |= (u64)pfn << PAGE_SHIFT; - - if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; - - /* - * Optimization: for pte sync, if spte was writable the hash - * lookup is unnecessary (and expensive). Write protection - * is responsibility of mmu_get_page / kvm_sync_page. - * Same reasoning can be applied to dirty page accounting. - */ - if (!can_unsync && is_writable_pte(*sptep)) - goto set_pte; - - if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { - pgprintk("%s: found shadow page for %llx, marking ro\n", - __func__, gfn); - ret |= SET_SPTE_WRITE_PROTECTED_PT; - pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); - } - } - - if (pte_access & ACC_WRITE_MASK) { + if (spte & PT_WRITABLE_MASK) kvm_vcpu_mark_page_dirty(vcpu, gfn); - spte |= spte_shadow_dirty_mask(spte); - } - if (speculative) - spte = mark_spte_for_access_track(spte); - -set_pte: - if (mmu_spte_update(sptep, spte)) + if (*sptep == spte) + ret |= SET_SPTE_SPURIOUS; + else if (mmu_spte_update(sptep, spte)) ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; return ret; } static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, int write_fault, int level, + unsigned int pte_access, bool write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; int set_spte_ret; - int ret = RET_PF_RETRY; + int ret = RET_PF_FIXED; bool flush = false; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, @@ -3113,6 +2643,15 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (unlikely(is_mmio_spte(*sptep))) ret = RET_PF_EMULATE; + /* + * The fault is fully spurious if and only if the new SPTE and old SPTE + * are identical, and emulation is not required. + */ + if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) { + WARN_ON_ONCE(!was_rmapped); + return RET_PF_SPURIOUS; + } + pgprintk("%s: setting spte %llx\n", __func__, *sptep); trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped && is_large_pte(*sptep)) @@ -3161,7 +2700,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, + mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn, page_to_pfn(pages[i]), true, true); put_page(pages[i]); } @@ -3239,8 +2778,9 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp) +int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp, + bool huge_page_disallowed, int *req_level) { struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; @@ -3248,6 +2788,8 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t mask; int level; + *req_level = PG_LEVEL_4K; + if (unlikely(max_level == PG_LEVEL_4K)) return PG_LEVEL_4K; @@ -3272,7 +2814,14 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (level == PG_LEVEL_4K) return level; - level = min(level, max_level); + *req_level = level = min(level, max_level); + + /* + * Enforce the iTLB multihit workaround after capturing the requested + * level, which will be used to do precise, accurate accounting. + */ + if (huge_page_disallowed) + return PG_LEVEL_4K; /* * mmu_notifier_retry() was successful and mmu_lock is held, so @@ -3285,14 +2834,12 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, - gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) +void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *goal_levelp) { - int level = *levelp; - u64 spte = *it.sptep; + int level = *goal_levelp; - if (it.level == level && level > PG_LEVEL_4K && - is_nx_huge_page_enabled() && + if (cur_level == level && level > PG_LEVEL_4K && is_shadow_present_pte(spte) && !is_large_pte(spte)) { /* @@ -3302,26 +2849,32 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, * patching back for them into pfn the next 9 bits of * the address. */ - u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - + KVM_PAGES_PER_HPAGE(level - 1); *pfnp |= gfn & page_mask; - (*levelp)--; + (*goal_levelp)--; } } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault, bool account_disallowed_nx_lpage) + bool prefault, bool is_tdp) { + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int level, ret; + int level, req_level, ret; gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn); + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, + huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { @@ -3329,7 +2882,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - disallowed_hugepage_adjust(it, gfn, &pfn, &level); + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(*it.sptep, gfn, it.level, + &pfn, &level); base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) @@ -3341,7 +2896,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); - if (account_disallowed_nx_lpage) + if (is_tdp && huge_page_disallowed && + req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } @@ -3349,6 +2905,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, write, level, base_gfn, pfn, prefault, map_writable); + if (ret == RET_PF_SPURIOUS) + return ret; + direct_pte_prefetch(vcpu, it.sptep); ++vcpu->stat.pf_fixed; return ret; @@ -3479,21 +3038,19 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) } /* - * Return value: - * - true: let the vcpu to access on the same address again. - * - false: let the real page fault path to fix it. + * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. */ -static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 error_code) +static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u32 error_code) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - bool fault_handled = false; + int ret = RET_PF_INVALID; u64 spte = 0ull; uint retry_count = 0; if (!page_fault_can_be_fast(error_code)) - return false; + return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3519,7 +3076,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * they are always ACC_ALL. */ if (is_access_allowed(error_code, spte)) { - fault_handled = true; + ret = RET_PF_SPURIOUS; break; } @@ -3562,11 +3119,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - fault_handled = fast_pf_fix_direct_spte(vcpu, sp, - iterator.sptep, spte, - new_spte); - if (fault_handled) + if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, + new_spte)) { + ret = RET_PF_FIXED; break; + } if (++retry_count > 4) { printk_once(KERN_WARNING @@ -3577,10 +3134,10 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } while (true); trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, - spte, fault_handled); + spte, ret); walk_shadow_page_lockless_end(vcpu); - return fault_handled; + return ret; } static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, @@ -3592,9 +3149,13 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, return; sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); - --sp->root_count; - if (!sp->root_count && sp->role.invalid) - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + + if (kvm_mmu_put_root(kvm, sp)) { + if (sp->tdp_mmu_page) + kvm_tdp_mmu_free_root(kvm, sp); + else if (sp->role.invalid) + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + } *root_hpa = INVALID_PAGE; } @@ -3603,6 +3164,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, ulong roots_to_free) { + struct kvm *kvm = vcpu->kvm; int i; LIST_HEAD(invalid_list); bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; @@ -3620,22 +3182,21 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return; } - spin_lock(&vcpu->kvm->mmu_lock); + spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) - mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa, + mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, &invalid_list); if (free_active_root) { if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { - mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, - &invalid_list); + mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); } else { for (i = 0; i < 4; ++i) if (mmu->pae_root[i] != 0) - mmu_free_root_page(vcpu->kvm, + mmu_free_root_page(kvm, &mmu->pae_root[i], &invalid_list); mmu->root_hpa = INVALID_PAGE; @@ -3643,8 +3204,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, mmu->root_pgd = 0; } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - spin_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); @@ -3684,8 +3245,16 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) hpa_t root; unsigned i; - if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + if (vcpu->kvm->arch.tdp_mmu_enabled) { + root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); + + if (!VALID_PAGE(root)) + return -ENOSPC; + vcpu->arch.mmu->root_hpa = root; + } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, + true); + if (!VALID_PAGE(root)) return -ENOSPC; vcpu->arch.mmu->root_hpa = root; @@ -3910,54 +3479,82 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) return vcpu_match_mmio_gva(vcpu, addr); } -/* return true if reserved bit is detected on spte. */ -static bool -walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +/* + * Return the level of the lowest level SPTE added to sptes. + * That SPTE may be non-present. + */ +static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) { struct kvm_shadow_walk_iterator iterator; - u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull; - struct rsvd_bits_validate *rsvd_check; - int root, leaf; - bool reserved = false; + int leaf = vcpu->arch.mmu->root_level; + u64 spte; - rsvd_check = &vcpu->arch.mmu->shadow_zero_check; walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), - leaf = root = iterator.level; + for (shadow_walk_init(&iterator, vcpu, addr); shadow_walk_okay(&iterator); __shadow_walk_next(&iterator, spte)) { + leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); sptes[leaf - 1] = spte; - leaf--; if (!is_shadow_present_pte(spte)) break; + } + + walk_shadow_page_lockless_end(vcpu); + + return leaf; +} + +/* return true if reserved bit is detected on spte. */ +static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +{ + u64 sptes[PT64_ROOT_MAX_LEVEL]; + struct rsvd_bits_validate *rsvd_check; + int root = vcpu->arch.mmu->root_level; + int leaf; + int level; + bool reserved = false; + + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { + *sptep = 0ull; + return reserved; + } + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes); + else + leaf = get_walk(vcpu, addr, sptes); + + rsvd_check = &vcpu->arch.mmu->shadow_zero_check; + + for (level = root; level >= leaf; level--) { + if (!is_shadow_present_pte(sptes[level - 1])) + break; /* * Use a bitwise-OR instead of a logical-OR to aggregate the * reserved bit and EPT's invalid memtype/XWR checks to avoid * adding a Jcc in the loop. */ - reserved |= __is_bad_mt_xwr(rsvd_check, spte) | - __is_rsvd_bits_set(rsvd_check, spte, iterator.level); + reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) | + __is_rsvd_bits_set(rsvd_check, sptes[level - 1], + level); } - walk_shadow_page_lockless_end(vcpu); - if (reserved) { pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", __func__, addr); - while (root > leaf) { + for (level = root; level >= leaf; level--) pr_err("------ spte 0x%llx level %d.\n", - sptes[root - 1], root); - root--; - } + sptes[level - 1], level); } - *sptep = spte; + *sptep = sptes[leaf - 1]; + return reserved; } @@ -3969,7 +3566,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) if (mmio_info_in_cache(vcpu, addr, direct)) return RET_PF_EMULATE; - reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); + reserved = get_mmio_spte(vcpu, addr, &spte); if (WARN_ON(reserved)) return -EINVAL; @@ -4080,8 +3677,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault, int max_level, bool is_tdp) { bool write = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool lpage_disallowed = exec && is_nx_huge_page_enabled(); bool map_writable; gfn_t gfn = gpa >> PAGE_SHIFT; @@ -4092,16 +3687,16 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (fast_page_fault(vcpu, gpa, error_code)) - return RET_PF_RETRY; + if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) { + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; + } r = mmu_topup_memory_caches(vcpu, false); if (r) return r; - if (lpage_disallowed) - max_level = PG_LEVEL_4K; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -4118,8 +3713,13 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, - prefault, is_tdp && lpage_disallowed); + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, + pfn, prefault); + else + r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, + prefault, is_tdp); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); @@ -4292,7 +3892,13 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa)); + /* + * If this is a direct root page, it doesn't have a write flooding + * count. Otherwise, clear the write flooding count. + */ + if (!new_role.direct) + __clear_sp_write_flooding_count( + to_shadow_page(vcpu->arch.mmu->root_hpa)); } void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, @@ -5400,7 +5006,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u32 base_role = vcpu->arch.mmu->mmu_role.base.word; entry = *spte; - mmu_page_zap_pte(vcpu->kvm, sp, spte); + mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && !((sp->role.word ^ base_role) & ~role_ign.word) && rmap_can_add(vcpu)) @@ -5450,13 +5056,14 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); - WARN_ON(r == RET_PF_INVALID); + if (WARN_ON_ONCE(r == RET_PF_INVALID)) + return -EIO; } - if (r == RET_PF_RETRY) - return 1; if (r < 0) return r; + if (r != RET_PF_EMULATE) + return 1; /* * Before emulating the instruction, check if the error code @@ -5485,18 +5092,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) emulation_type |= EMULTYPE_ALLOW_RETRY_PF; emulate: - /* - * On AMD platforms, under certain conditions insn_len may be zero on #NPF. - * This can happen if a guest gets a page-fault on data access but the HW - * table walker is not able to read the instruction page (e.g instruction - * page is not present in memory). In those cases we simply restart the - * guest, with the exception of AMD Erratum 1096 which is unrecoverable. - */ - if (unlikely(insn && !insn_len)) { - if (!kvm_x86_ops.need_emulation_on_page_fault(vcpu)) - return 1; - } - return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } @@ -5682,11 +5277,17 @@ static void free_mmu_pages(struct kvm_mmu *mmu) free_page((unsigned long)mmu->lm_root); } -static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { struct page *page; int i; + mmu->root_hpa = INVALID_PAGE; + mmu->root_pgd = 0; + mmu->translate_gpa = translate_gpa; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + /* * When using PAE paging, the four PDPTEs are treated as 'root' pages, * while the PDP table is a per-vCPU construct that's allocated at MMU @@ -5712,7 +5313,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { - uint i; int ret; vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; @@ -5726,25 +5326,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; - vcpu->arch.root_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.root_mmu.root_pgd = 0; - vcpu->arch.root_mmu.translate_gpa = translate_gpa; - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - - vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.guest_mmu.root_pgd = 0; - vcpu->arch.guest_mmu.translate_gpa = translate_gpa; - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; - ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu); + ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); if (ret) return ret; - ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu); + ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); if (ret) goto fail_allocate_root; @@ -5841,6 +5429,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_reload_remote_mmus(kvm); kvm_zap_obsolete_pages(kvm); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_all(kvm); + spin_unlock(&kvm->mmu_lock); } @@ -5860,6 +5452,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + kvm_mmu_init_tdp_mmu(kvm); + node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); @@ -5870,6 +5464,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; kvm_page_track_unregister_notifier(kvm, node); + + kvm_mmu_uninit_tdp_mmu(kvm); } void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) @@ -5877,6 +5473,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i; + bool flush; spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { @@ -5896,6 +5493,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } } + if (kvm->arch.tdp_mmu_enabled) { + flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end); + if (flush) + kvm_flush_remote_tlbs(kvm); + } + spin_unlock(&kvm->mmu_lock); } @@ -5914,6 +5517,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K); spin_unlock(&kvm->mmu_lock); /* @@ -5977,6 +5582,9 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, spin_lock(&kvm->mmu_lock); slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, kvm_mmu_zap_collapsible_spte, true); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); spin_unlock(&kvm->mmu_lock); } @@ -6002,6 +5610,8 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); spin_unlock(&kvm->mmu_lock); /* @@ -6023,6 +5633,8 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); spin_unlock(&kvm->mmu_lock); if (flush) @@ -6037,6 +5649,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); spin_unlock(&kvm->mmu_lock); if (flush) @@ -6062,6 +5676,10 @@ restart: } kvm_mmu_commit_zap_page(kvm, &invalid_list); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_all(kvm); + spin_unlock(&kvm->mmu_lock); } @@ -6357,7 +5975,10 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; - while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { + for ( ; to_zap; --to_zap) { + if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) + break; + /* * We use a separate list instead of just using active_mmu_pages * because the number of lpage_disallowed pages is expected to @@ -6367,15 +5988,20 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page, lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - WARN_ON_ONCE(sp->lpage_disallowed); + if (sp->tdp_mmu_page) + kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, + sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); + else { + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + } - if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (to_zap) - cond_resched_lock(&kvm->mmu_lock); + cond_resched_lock(&kvm->mmu_lock); } } + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 3acf3b8eb469..bfc6389edc28 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -3,9 +3,23 @@ #define __KVM_X86_MMU_INTERNAL_H #include <linux/types.h> - +#include <linux/kvm_host.h> #include <asm/kvm_host.h> +#undef MMU_DEBUG + +#ifdef MMU_DEBUG +extern bool dbg; + +#define pgprintk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) +#define MMU_WARN_ON(x) WARN_ON(x) +#else +#define pgprintk(x...) do { } while (0) +#define rmap_printk(x...) do { } while (0) +#define MMU_WARN_ON(x) do { } while (0) +#endif + struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; @@ -41,8 +55,12 @@ struct kvm_mmu_page { /* Number of writes since the last time traversal visited this page. */ atomic_t write_flooding_count; + + bool tdp_mmu_page; }; +extern struct kmem_cache *mmu_page_header_cache; + static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); @@ -55,9 +73,77 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +{ + /* + * When using the EPT page-modification log, the GPAs in the log + * would come from L2 rather than L1. Therefore, we need to rely + * on write protection to record dirty pages. This also bypasses + * PML, since writes now result in a vmexit. + */ + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; +} + +bool is_nx_huge_page_enabled(void); +bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync); + void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); +void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, + u64 start_gfn, u64 pages); + +static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + BUG_ON(!sp->root_count); + lockdep_assert_held(&kvm->mmu_lock); + + ++sp->root_count; +} + +static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + lockdep_assert_held(&kvm->mmu_lock); + --sp->root_count; + + return !sp->root_count; +} + +/* + * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). + * + * RET_PF_RETRY: let CPU fault again on the address. + * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. + * RET_PF_FIXED: The faulting entry has been fixed. + * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + */ +enum { + RET_PF_RETRY = 0, + RET_PF_EMULATE, + RET_PF_INVALID, + RET_PF_FIXED, + RET_PF_SPURIOUS, +}; + +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) +#define SET_SPTE_SPURIOUS BIT(2) + +int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp, + bool huge_page_disallowed, int *req_level); +void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *goal_levelp); + +bool is_nx_huge_page_enabled(void); + +void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); + +void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 9d15bc0c535b..213699b27b44 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -202,8 +202,8 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TRACE_EVENT( mark_mmio_spte, - TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen), - TP_ARGS(sptep, gfn, access, gen), + TP_PROTO(u64 *sptep, gfn_t gfn, u64 spte), + TP_ARGS(sptep, gfn, spte), TP_STRUCT__entry( __field(void *, sptep) @@ -215,8 +215,8 @@ TRACE_EVENT( TP_fast_assign( __entry->sptep = sptep; __entry->gfn = gfn; - __entry->access = access; - __entry->gen = gen; + __entry->access = spte & ACC_ALL; + __entry->gen = get_mmio_spte_generation(spte); ), TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep, @@ -244,14 +244,11 @@ TRACE_EVENT( __entry->access) ); -#define __spte_satisfied(__spte) \ - (__entry->retry && is_writable_pte(__entry->__spte)) - TRACE_EVENT( fast_page_fault, TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, - u64 *sptep, u64 old_spte, bool retry), - TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry), + u64 *sptep, u64 old_spte, int ret), + TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret), TP_STRUCT__entry( __field(int, vcpu_id) @@ -260,7 +257,7 @@ TRACE_EVENT( __field(u64 *, sptep) __field(u64, old_spte) __field(u64, new_spte) - __field(bool, retry) + __field(int, ret) ), TP_fast_assign( @@ -270,7 +267,7 @@ TRACE_EVENT( __entry->sptep = sptep; __entry->old_spte = old_spte; __entry->new_spte = *sptep; - __entry->retry = retry; + __entry->ret = ret; ), TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx" @@ -278,7 +275,7 @@ TRACE_EVENT( __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|", kvm_mmu_trace_pferr_flags), __entry->sptep, __entry->old_spte, __entry->new_spte, - __spte_satisfied(old_spte), __spte_satisfied(new_spte) + __entry->ret == RET_PF_SPURIOUS, __entry->ret == RET_PF_FIXED ) ); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 4dd6b1e5b8cf..50e268eb8e1a 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -550,7 +550,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * we call mmu_set_spte() with host_writable = true because * pte_prefetch_gfn_to_pfn always gets a writable pfn. */ - mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn, + mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn, true, true); kvm_release_pfn_clean(pfn); @@ -625,15 +625,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * emulate this operation, return 1 to indicate this case. */ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, - struct guest_walker *gw, - int write_fault, int max_level, - kvm_pfn_t pfn, bool map_writable, bool prefault, - bool lpage_disallowed) + struct guest_walker *gw, u32 error_code, + int max_level, kvm_pfn_t pfn, bool map_writable, + bool prefault) { + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write_fault = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, hlevel, ret; + int top_level, level, req_level, ret; gfn_t base_gfn = gw->gfn; direct_access = gw->pte_access; @@ -679,7 +682,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, link_shadow_page(vcpu, it.sptep, sp); } - hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn); + level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, + huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); @@ -690,10 +694,12 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel); + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level, + &pfn, &level); base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == hlevel) + if (it.level == level) break; validate_direct_spte(vcpu, it.sptep, direct_access); @@ -704,13 +710,16 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, sp = kvm_mmu_get_page(vcpu, base_gfn, addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); - if (lpage_disallowed) + if (huge_page_disallowed && req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, it.level, base_gfn, pfn, prefault, map_writable); + if (ret == RET_PF_SPURIOUS) + return ret; + FNAME(pte_prefetch)(vcpu, gw, it.sptep); ++vcpu->stat.pf_fixed; return ret; @@ -738,7 +747,7 @@ out_gpte_changed: */ static bool FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, int user_fault, + struct guest_walker *walker, bool user_fault, bool *write_fault_to_shadow_pgtable) { int level; @@ -776,15 +785,13 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, bool prefault) { - int write_fault = error_code & PFERR_WRITE_MASK; - int user_fault = error_code & PFERR_USER_MASK; + bool write_fault = error_code & PFERR_WRITE_MASK; + bool user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; int r; kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && - is_nx_huge_page_enabled(); int max_level; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -825,7 +832,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (lpage_disallowed || is_self_change_mapping) + if (is_self_change_mapping) max_level = PG_LEVEL_4K; else max_level = walker.level; @@ -869,8 +876,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, - map_writable, prefault, lpage_disallowed); + r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn, + map_writable, prefault); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: @@ -895,6 +902,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; + u64 old_spte; int level; u64 *sptep; @@ -917,7 +925,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) sptep = iterator.sptep; sp = sptep_to_sp(sptep); - if (is_last_spte(*sptep, level)) { + old_spte = *sptep; + if (is_last_spte(old_spte, level)) { pt_element_t gpte; gpa_t pte_gpa; @@ -927,7 +936,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) pte_gpa = FNAME(get_level1_sp_gpa)(sp); pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); - if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) + mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); + if (is_shadow_present_pte(old_spte)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c new file mode 100644 index 000000000000..d9c5665a55e9 --- /dev/null +++ b/arch/x86/kvm/mmu/spte.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Kernel-based Virtual Machine driver for Linux + * + * Macros and functions to access KVM PTEs (also known as SPTEs) + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2020 Red Hat, Inc. and/or its affiliates. + */ + + +#include <linux/kvm_host.h> +#include "mmu.h" +#include "mmu_internal.h" +#include "x86.h" +#include "spte.h" + +#include <asm/e820/api.h> + +u64 __read_mostly shadow_nx_mask; +u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ +u64 __read_mostly shadow_user_mask; +u64 __read_mostly shadow_accessed_mask; +u64 __read_mostly shadow_dirty_mask; +u64 __read_mostly shadow_mmio_value; +u64 __read_mostly shadow_mmio_access_mask; +u64 __read_mostly shadow_present_mask; +u64 __read_mostly shadow_me_mask; +u64 __read_mostly shadow_acc_track_mask; + +u64 __read_mostly shadow_nonpresent_or_rsvd_mask; +u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + +u8 __read_mostly shadow_phys_bits; + +static u64 generation_mmio_spte_mask(u64 gen) +{ + u64 mask; + + WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); + BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); + + mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; + mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; + return mask; +} + +u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) +{ + u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; + u64 mask = generation_mmio_spte_mask(gen); + u64 gpa = gfn << PAGE_SHIFT; + + access &= shadow_mmio_access_mask; + mask |= shadow_mmio_value | access; + mask |= gpa | shadow_nonpresent_or_rsvd_mask; + mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + << shadow_nonpresent_or_rsvd_mask_len; + + return mask; +} + +static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) +{ + if (pfn_valid(pfn)) + return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && + /* + * Some reserved pages, such as those from NVDIMM + * DAX devices, are not for MMIO, and can be mapped + * with cached memory type for better performance. + * However, the above check misconceives those pages + * as MMIO, and results in KVM mapping them with UC + * memory type, which would hurt the performance. + * Therefore, we check the host memory type in addition + * and only treat UC/UC-/WC pages as MMIO. + */ + (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); + + return !e820__mapped_raw_any(pfn_to_hpa(pfn), + pfn_to_hpa(pfn + 1) - 1, + E820_TYPE_RAM); +} + +int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, + bool can_unsync, bool host_writable, bool ad_disabled, + u64 *new_spte) +{ + u64 spte = 0; + int ret = 0; + + if (ad_disabled) + spte |= SPTE_AD_DISABLED_MASK; + else if (kvm_vcpu_ad_need_write_protect(vcpu)) + spte |= SPTE_AD_WRPROT_ONLY_MASK; + + /* + * For the EPT case, shadow_present_mask is 0 if hardware + * supports exec-only page table entries. In that case, + * ACC_USER_MASK and shadow_user_mask are used to represent + * read access. See FNAME(gpte_access) in paging_tmpl.h. + */ + spte |= shadow_present_mask; + if (!speculative) + spte |= spte_shadow_accessed_mask(spte); + + if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && + is_nx_huge_page_enabled()) { + pte_access &= ~ACC_EXEC_MASK; + } + + if (pte_access & ACC_EXEC_MASK) + spte |= shadow_x_mask; + else + spte |= shadow_nx_mask; + + if (pte_access & ACC_USER_MASK) + spte |= shadow_user_mask; + + if (level > PG_LEVEL_4K) + spte |= PT_PAGE_SIZE_MASK; + if (tdp_enabled) + spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); + + if (host_writable) + spte |= SPTE_HOST_WRITEABLE; + else + pte_access &= ~ACC_WRITE_MASK; + + if (!kvm_is_mmio_pfn(pfn)) + spte |= shadow_me_mask; + + spte |= (u64)pfn << PAGE_SHIFT; + + if (pte_access & ACC_WRITE_MASK) { + spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; + + /* + * Optimization: for pte sync, if spte was writable the hash + * lookup is unnecessary (and expensive). Write protection + * is responsibility of mmu_get_page / kvm_sync_page. + * Same reasoning can be applied to dirty page accounting. + */ + if (!can_unsync && is_writable_pte(old_spte)) + goto out; + + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { + pgprintk("%s: found shadow page for %llx, marking ro\n", + __func__, gfn); + ret |= SET_SPTE_WRITE_PROTECTED_PT; + pte_access &= ~ACC_WRITE_MASK; + spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + } + } + + if (pte_access & ACC_WRITE_MASK) + spte |= spte_shadow_dirty_mask(spte); + + if (speculative) + spte = mark_spte_for_access_track(spte); + +out: + *new_spte = spte; + return ret; +} + +u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) +{ + u64 spte; + + spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | + shadow_user_mask | shadow_x_mask | shadow_me_mask; + + if (ad_disabled) + spte |= SPTE_AD_DISABLED_MASK; + else + spte |= shadow_accessed_mask; + + return spte; +} + +u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) +{ + u64 new_spte; + + new_spte = old_spte & ~PT64_BASE_ADDR_MASK; + new_spte |= (u64)new_pfn << PAGE_SHIFT; + + new_spte &= ~PT_WRITABLE_MASK; + new_spte &= ~SPTE_HOST_WRITEABLE; + + new_spte = mark_spte_for_access_track(new_spte); + + return new_spte; +} + +static u8 kvm_get_shadow_phys_bits(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected + * in CPU detection code, but the processor treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at + * the physical address bits reported by CPUID. + */ + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) + return cpuid_eax(0x80000008) & 0xff; + + /* + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with + * custom CPUID. Proceed with whatever the kernel found since these features + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). + */ + return boot_cpu_data.x86_phys_bits; +} + +u64 mark_spte_for_access_track(u64 spte) +{ + if (spte_ad_enabled(spte)) + return spte & ~shadow_accessed_mask; + + if (is_access_track_spte(spte)) + return spte; + + /* + * Making an Access Tracking PTE will result in removal of write access + * from the PTE. So, verify that we will be able to restore the write + * access in the fast page fault path later on. + */ + WARN_ONCE((spte & PT_WRITABLE_MASK) && + !spte_can_locklessly_be_made_writable(spte), + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift), + "kvm: Access Tracking saved bit locations are not zero\n"); + + spte |= (spte & shadow_acc_track_saved_bits_mask) << + shadow_acc_track_saved_bits_shift; + spte &= ~shadow_acc_track_mask; + + return spte; +} + +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) +{ + BUG_ON((u64)(unsigned)access_mask != access_mask); + WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); + WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; + shadow_mmio_access_mask = access_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); + +/* + * Sets the shadow PTE masks used by the MMU. + * + * Assumptions: + * - Setting either @accessed_mask or @dirty_mask requires setting both + * - At least one of @accessed_mask or @acc_track_mask must be set + */ +void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask, u64 me_mask) +{ + BUG_ON(!dirty_mask != !accessed_mask); + BUG_ON(!accessed_mask && !acc_track_mask); + BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); + + shadow_user_mask = user_mask; + shadow_accessed_mask = accessed_mask; + shadow_dirty_mask = dirty_mask; + shadow_nx_mask = nx_mask; + shadow_x_mask = x_mask; + shadow_present_mask = p_mask; + shadow_acc_track_mask = acc_track_mask; + shadow_me_mask = me_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); + +void kvm_mmu_reset_all_pte_masks(void) +{ + u8 low_phys_bits; + + shadow_user_mask = 0; + shadow_accessed_mask = 0; + shadow_dirty_mask = 0; + shadow_nx_mask = 0; + shadow_x_mask = 0; + shadow_present_mask = 0; + shadow_acc_track_mask = 0; + + shadow_phys_bits = kvm_get_shadow_phys_bits(); + + /* + * If the CPU has 46 or less physical address bits, then set an + * appropriate mask to guard against L1TF attacks. Otherwise, it is + * assumed that the CPU is not vulnerable to L1TF. + * + * Some Intel CPUs address the L1 cache using more PA bits than are + * reported by CPUID. Use the PA width of the L1 cache when possible + * to achieve more effective mitigation, e.g. if system RAM overlaps + * the most significant bits of legal physical address space. + */ + shadow_nonpresent_or_rsvd_mask = 0; + low_phys_bits = boot_cpu_data.x86_phys_bits; + if (boot_cpu_has_bug(X86_BUG_L1TF) && + !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= + 52 - shadow_nonpresent_or_rsvd_mask_len)) { + low_phys_bits = boot_cpu_data.x86_cache_bits + - shadow_nonpresent_or_rsvd_mask_len; + shadow_nonpresent_or_rsvd_mask = + rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); + } + + shadow_nonpresent_or_rsvd_lower_gfn_mask = + GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); +} diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h new file mode 100644 index 000000000000..4ecf40e0b8fe --- /dev/null +++ b/arch/x86/kvm/mmu/spte.h @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef KVM_X86_MMU_SPTE_H +#define KVM_X86_MMU_SPTE_H + +#include "mmu_internal.h" + +#define PT_FIRST_AVAIL_BITS_SHIFT 10 +#define PT64_SECOND_AVAIL_BITS_SHIFT 54 + +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. + */ +#define SPTE_SPECIAL_MASK (3ULL << 52) +#define SPTE_AD_ENABLED_MASK (0ULL << 52) +#define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) +#define SPTE_MMIO_MASK (3ULL << 52) + +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK +#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#else +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#endif +#define PT64_LVL_ADDR_MASK(level) \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) +#define PT64_LVL_OFFSET_MASK(level) \ + (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) + +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ + | shadow_x_mask | shadow_nx_mask | shadow_me_mask) + +#define ACC_EXEC_MASK 1 +#define ACC_WRITE_MASK PT_WRITABLE_MASK +#define ACC_USER_MASK PT_USER_MASK +#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) + +/* The mask for the R/X bits in EPT PTEs */ +#define PT64_EPT_READABLE_MASK 0x1ull +#define PT64_EPT_EXECUTABLE_MASK 0x4ull + +#define PT64_LEVEL_BITS 9 + +#define PT64_LEVEL_SHIFT(level) \ + (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) + +#define PT64_INDEX(address, level)\ + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + + +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) + +/* + * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of + * the memslots generation and is derived as follows: + * + * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 + * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 + * + * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in + * the MMIO generation number, as doing so would require stealing a bit from + * the "real" generation number and thus effectively halve the maximum number + * of MMIO generations that can be handled before encountering a wrap (which + * requires a full MMU zap). The flag is instead explicitly queried when + * checking for MMIO spte cache hits. + */ +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) + +#define MMIO_SPTE_GEN_LOW_START 3 +#define MMIO_SPTE_GEN_LOW_END 11 +#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ + MMIO_SPTE_GEN_LOW_START) + +#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT +#define MMIO_SPTE_GEN_HIGH_END 62 +#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ + MMIO_SPTE_GEN_HIGH_START) + +extern u64 __read_mostly shadow_nx_mask; +extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ +extern u64 __read_mostly shadow_user_mask; +extern u64 __read_mostly shadow_accessed_mask; +extern u64 __read_mostly shadow_dirty_mask; +extern u64 __read_mostly shadow_mmio_value; +extern u64 __read_mostly shadow_mmio_access_mask; +extern u64 __read_mostly shadow_present_mask; +extern u64 __read_mostly shadow_me_mask; + +/* + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed + * pages. + */ +extern u64 __read_mostly shadow_acc_track_mask; + +/* + * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order + * to guard against L1TF attacks. + */ +extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. + */ +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | + PT64_EPT_EXECUTABLE_MASK; +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; + +/* + * The number of high-order 1 bits to use in the mask above. + */ +static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; + +/* + * In some cases, we need to preserve the GFN of a non-present or reserved + * SPTE when we usurp the upper five bits of the physical address space to + * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll + * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask + * left into the reserved bits, i.e. the GFN in the SPTE will be split into + * high and low parts. This mask covers the lower bits of the GFN. + */ +extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + +/* + * The number of non-reserved physical address bits irrespective of features + * that repurpose legal bits, e.g. MKTME. + */ +extern u8 __read_mostly shadow_phys_bits; + +static inline bool is_mmio_spte(u64 spte) +{ + return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; +} + +static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) +{ + return sp->role.ad_disabled; +} + +static inline bool spte_ad_enabled(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; +} + +static inline bool spte_ad_need_write_protect(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; +} + +static inline u64 spte_shadow_accessed_mask(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; +} + +static inline u64 spte_shadow_dirty_mask(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; +} + +static inline bool is_access_track_spte(u64 spte) +{ + return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; +} + +static inline int is_shadow_present_pte(u64 pte) +{ + return (pte != 0) && !is_mmio_spte(pte); +} + +static inline int is_large_pte(u64 pte) +{ + return pte & PT_PAGE_SIZE_MASK; +} + +static inline int is_last_spte(u64 pte, int level) +{ + if (level == PG_LEVEL_4K) + return 1; + if (is_large_pte(pte)) + return 1; + return 0; +} + +static inline bool is_executable_pte(u64 spte) +{ + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; +} + +static inline kvm_pfn_t spte_to_pfn(u64 pte) +{ + return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; +} + +static inline bool is_accessed_spte(u64 spte) +{ + u64 accessed_mask = spte_shadow_accessed_mask(spte); + + return accessed_mask ? spte & accessed_mask + : !is_access_track_spte(spte); +} + +static inline bool is_dirty_spte(u64 spte) +{ + u64 dirty_mask = spte_shadow_dirty_mask(spte); + + return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; +} + +static inline bool spte_can_locklessly_be_made_writable(u64 spte) +{ + return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == + (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); +} + +static inline u64 get_mmio_spte_generation(u64 spte) +{ + u64 gen; + + gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; + gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; + return gen; +} + +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) +#define SET_SPTE_SPURIOUS BIT(2) + +int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, + bool can_unsync, bool host_writable, bool ad_disabled, + u64 *new_spte); +u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); +u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); +u64 mark_spte_for_access_track(u64 spte); +u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn); + +void kvm_mmu_reset_all_pte_masks(void); + +#endif diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c new file mode 100644 index 000000000000..87b7e16911db --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "mmu_internal.h" +#include "tdp_iter.h" +#include "spte.h" + +/* + * Recalculates the pointer to the SPTE for the current GFN and level and + * reread the SPTE. + */ +static void tdp_iter_refresh_sptep(struct tdp_iter *iter) +{ + iter->sptep = iter->pt_path[iter->level - 1] + + SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); + iter->old_spte = READ_ONCE(*iter->sptep); +} + +static gfn_t round_gfn_for_level(gfn_t gfn, int level) +{ + return gfn & -KVM_PAGES_PER_HPAGE(level); +} + +/* + * Sets a TDP iterator to walk a pre-order traversal of the paging structure + * rooted at root_pt, starting with the walk to translate goal_gfn. + */ +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + int min_level, gfn_t goal_gfn) +{ + WARN_ON(root_level < 1); + WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); + + iter->goal_gfn = goal_gfn; + iter->root_level = root_level; + iter->min_level = min_level; + iter->level = root_level; + iter->pt_path[iter->level - 1] = root_pt; + + iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + iter->valid = true; +} + +/* + * Given an SPTE and its level, returns a pointer containing the host virtual + * address of the child page table referenced by the SPTE. Returns null if + * there is no such entry. + */ +u64 *spte_to_child_pt(u64 spte, int level) +{ + /* + * There's no child entry if this entry isn't present or is a + * last-level entry. + */ + if (!is_shadow_present_pte(spte) || is_last_spte(spte, level)) + return NULL; + + return __va(spte_to_pfn(spte) << PAGE_SHIFT); +} + +/* + * Steps down one level in the paging structure towards the goal GFN. Returns + * true if the iterator was able to step down a level, false otherwise. + */ +static bool try_step_down(struct tdp_iter *iter) +{ + u64 *child_pt; + + if (iter->level == iter->min_level) + return false; + + /* + * Reread the SPTE before stepping down to avoid traversing into page + * tables that are no longer linked from this entry. + */ + iter->old_spte = READ_ONCE(*iter->sptep); + + child_pt = spte_to_child_pt(iter->old_spte, iter->level); + if (!child_pt) + return false; + + iter->level--; + iter->pt_path[iter->level - 1] = child_pt; + iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + return true; +} + +/* + * Steps to the next entry in the current page table, at the current page table + * level. The next entry could point to a page backing guest memory or another + * page table, or it could be non-present. Returns true if the iterator was + * able to step to the next entry in the page table, false if the iterator was + * already at the end of the current page table. + */ +static bool try_step_side(struct tdp_iter *iter) +{ + /* + * Check if the iterator is already at the end of the current page + * table. + */ + if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == + (PT64_ENT_PER_PAGE - 1)) + return false; + + iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); + iter->goal_gfn = iter->gfn; + iter->sptep++; + iter->old_spte = READ_ONCE(*iter->sptep); + + return true; +} + +/* + * Tries to traverse back up a level in the paging structure so that the walk + * can continue from the next entry in the parent page table. Returns true on a + * successful step up, false if already in the root page. + */ +static bool try_step_up(struct tdp_iter *iter) +{ + if (iter->level == iter->root_level) + return false; + + iter->level++; + iter->gfn = round_gfn_for_level(iter->gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + return true; +} + +/* + * Step to the next SPTE in a pre-order traversal of the paging structure. + * To get to the next SPTE, the iterator either steps down towards the goal + * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a + * highter GFN. + * + * The basic algorithm is as follows: + * 1. If the current SPTE is a non-last-level SPTE, step down into the page + * table it points to. + * 2. If the iterator cannot step down, it will try to step to the next SPTE + * in the current page of the paging structure. + * 3. If the iterator cannot step to the next entry in the current page, it will + * try to step up to the parent paging structure page. In this case, that + * SPTE will have already been visited, and so the iterator must also step + * to the side again. + */ +void tdp_iter_next(struct tdp_iter *iter) +{ + if (try_step_down(iter)) + return; + + do { + if (try_step_side(iter)) + return; + } while (try_step_up(iter)); + iter->valid = false; +} + +/* + * Restart the walk over the paging structure from the root, starting from the + * highest gfn the iterator had previously reached. Assumes that the entire + * paging structure, except the root page, may have been completely torn down + * and rebuilt. + */ +void tdp_iter_refresh_walk(struct tdp_iter *iter) +{ + gfn_t goal_gfn = iter->goal_gfn; + + if (iter->gfn > goal_gfn) + goal_gfn = iter->gfn; + + tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], + iter->root_level, iter->min_level, goal_gfn); +} + +u64 *tdp_iter_root_pt(struct tdp_iter *iter) +{ + return iter->pt_path[iter->root_level - 1]; +} + diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h new file mode 100644 index 000000000000..47170d0dc98e --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef __KVM_X86_MMU_TDP_ITER_H +#define __KVM_X86_MMU_TDP_ITER_H + +#include <linux/kvm_host.h> + +#include "mmu.h" + +/* + * A TDP iterator performs a pre-order walk over a TDP paging structure. + */ +struct tdp_iter { + /* + * The iterator will traverse the paging structure towards the mapping + * for this GFN. + */ + gfn_t goal_gfn; + /* Pointers to the page tables traversed to reach the current SPTE */ + u64 *pt_path[PT64_ROOT_MAX_LEVEL]; + /* A pointer to the current SPTE */ + u64 *sptep; + /* The lowest GFN mapped by the current SPTE */ + gfn_t gfn; + /* The level of the root page given to the iterator */ + int root_level; + /* The lowest level the iterator should traverse to */ + int min_level; + /* The iterator's current level within the paging structure */ + int level; + /* A snapshot of the value at sptep */ + u64 old_spte; + /* + * Whether the iterator has a valid state. This will be false if the + * iterator walks off the end of the paging structure. + */ + bool valid; +}; + +/* + * Iterates over every SPTE mapping the GFN range [start, end) in a + * preorder traversal. + */ +#define for_each_tdp_pte_min_level(iter, root, root_level, min_level, start, end) \ + for (tdp_iter_start(&iter, root, root_level, min_level, start); \ + iter.valid && iter.gfn < end; \ + tdp_iter_next(&iter)) + +#define for_each_tdp_pte(iter, root, root_level, start, end) \ + for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) + +u64 *spte_to_child_pt(u64 pte, int level); + +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + int min_level, gfn_t goal_gfn); +void tdp_iter_next(struct tdp_iter *iter); +void tdp_iter_refresh_walk(struct tdp_iter *iter); +u64 *tdp_iter_root_pt(struct tdp_iter *iter); + +#endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c new file mode 100644 index 000000000000..e246d71b8ea2 --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -0,0 +1,1157 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "mmu.h" +#include "mmu_internal.h" +#include "mmutrace.h" +#include "tdp_iter.h" +#include "tdp_mmu.h" +#include "spte.h" + +#ifdef CONFIG_X86_64 +static bool __read_mostly tdp_mmu_enabled = false; +module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); +#endif + +static bool is_tdp_mmu_enabled(void) +{ +#ifdef CONFIG_X86_64 + return tdp_enabled && READ_ONCE(tdp_mmu_enabled); +#else + return false; +#endif /* CONFIG_X86_64 */ +} + +/* Initializes the TDP MMU for the VM, if enabled. */ +void kvm_mmu_init_tdp_mmu(struct kvm *kvm) +{ + if (!is_tdp_mmu_enabled()) + return; + + /* This should not be changed for the lifetime of the VM. */ + kvm->arch.tdp_mmu_enabled = true; + + INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); + INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); +} + +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) +{ + if (!kvm->arch.tdp_mmu_enabled) + return; + + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); +} + +#define for_each_tdp_mmu_root(_kvm, _root) \ + list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) + +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +{ + struct kvm_mmu_page *sp; + + sp = to_shadow_page(hpa); + + return sp->tdp_mmu_page && sp->root_count; +} + +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield); + +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) +{ + gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT); + + lockdep_assert_held(&kvm->mmu_lock); + + WARN_ON(root->root_count); + WARN_ON(!root->tdp_mmu_page); + + list_del(&root->link); + + zap_gfn_range(kvm, root, 0, max_gfn, false); + + free_page((unsigned long)root->spt); + kmem_cache_free(mmu_page_header_cache, root); +} + +static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, + int level) +{ + union kvm_mmu_page_role role; + + role = vcpu->arch.mmu->mmu_role.base; + role.level = level; + role.direct = true; + role.gpte_is_8_bytes = true; + role.access = ACC_ALL; + + return role; +} + +static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, + int level) +{ + struct kvm_mmu_page *sp; + + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + sp->role.word = page_role_for_level(vcpu, level).word; + sp->gfn = gfn; + sp->tdp_mmu_page = true; + + return sp; +} + +static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_page_role role; + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_page *root; + + role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); + + spin_lock(&kvm->mmu_lock); + + /* Check for an existing root before allocating a new one. */ + for_each_tdp_mmu_root(kvm, root) { + if (root->role.word == role.word) { + kvm_mmu_get_root(kvm, root); + spin_unlock(&kvm->mmu_lock); + return root; + } + } + + root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); + root->root_count = 1; + + list_add(&root->link, &kvm->arch.tdp_mmu_roots); + + spin_unlock(&kvm->mmu_lock); + + return root; +} + +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *root; + + root = get_tdp_mmu_vcpu_root(vcpu); + if (!root) + return INVALID_PAGE; + + return __pa(root->spt); +} + +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level); + +static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +{ + return sp->role.smm ? 1 : 0; +} + +static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) +{ + bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + + if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) + return; + + if (is_accessed_spte(old_spte) && + (!is_accessed_spte(new_spte) || pfn_changed)) + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); +} + +static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + bool pfn_changed; + struct kvm_memory_slot *slot; + + if (level > PG_LEVEL_4K) + return; + + pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + + if ((!is_writable_pte(old_spte) || pfn_changed) && + is_writable_pte(new_spte)) { + slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); + mark_page_dirty_in_slot(slot, gfn); + } +} + +/** + * handle_changed_spte - handle bookkeeping associated with an SPTE change + * @kvm: kvm instance + * @as_id: the address space of the paging structure the SPTE was a part of + * @gfn: the base GFN that was mapped by the SPTE + * @old_spte: The value of the SPTE before the change + * @new_spte: The value of the SPTE after the change + * @level: the level of the PT the SPTE is part of in the paging structure + * + * Handle bookkeeping that might result from the modification of a SPTE. + * This function must be called for all TDP SPTE modifications. + */ +static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + bool was_present = is_shadow_present_pte(old_spte); + bool is_present = is_shadow_present_pte(new_spte); + bool was_leaf = was_present && is_last_spte(old_spte, level); + bool is_leaf = is_present && is_last_spte(new_spte, level); + bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + u64 *pt; + struct kvm_mmu_page *sp; + u64 old_child_spte; + int i; + + WARN_ON(level > PT64_ROOT_MAX_LEVEL); + WARN_ON(level < PG_LEVEL_4K); + WARN_ON(gfn % KVM_PAGES_PER_HPAGE(level)); + + /* + * If this warning were to trigger it would indicate that there was a + * missing MMU notifier or a race with some notifier handler. + * A present, leaf SPTE should never be directly replaced with another + * present leaf SPTE pointing to a differnt PFN. A notifier handler + * should be zapping the SPTE before the main MM's page table is + * changed, or the SPTE should be zeroed, and the TLBs flushed by the + * thread before replacement. + */ + if (was_leaf && is_leaf && pfn_changed) { + pr_err("Invalid SPTE change: cannot replace a present leaf\n" + "SPTE with another present leaf SPTE mapping a\n" + "different PFN!\n" + "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", + as_id, gfn, old_spte, new_spte, level); + + /* + * Crash the host to prevent error propagation and guest data + * courruption. + */ + BUG(); + } + + if (old_spte == new_spte) + return; + + /* + * The only times a SPTE should be changed from a non-present to + * non-present state is when an MMIO entry is installed/modified/ + * removed. In that case, there is nothing to do here. + */ + if (!was_present && !is_present) { + /* + * If this change does not involve a MMIO SPTE, it is + * unexpected. Log the change, though it should not impact the + * guest since both the former and current SPTEs are nonpresent. + */ + if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte))) + pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" + "should not be replaced with another,\n" + "different nonpresent SPTE, unless one or both\n" + "are MMIO SPTEs.\n" + "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", + as_id, gfn, old_spte, new_spte, level); + return; + } + + + if (was_leaf && is_dirty_spte(old_spte) && + (!is_dirty_spte(new_spte) || pfn_changed)) + kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + + /* + * Recursively handle child PTs if the change removed a subtree from + * the paging structure. + */ + if (was_present && !was_leaf && (pfn_changed || !is_present)) { + pt = spte_to_child_pt(old_spte, level); + sp = sptep_to_sp(pt); + + list_del(&sp->link); + + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + old_child_spte = READ_ONCE(*(pt + i)); + WRITE_ONCE(*(pt + i), 0); + handle_changed_spte(kvm, as_id, + gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), + old_child_spte, 0, level - 1); + } + + kvm_flush_remote_tlbs_with_address(kvm, gfn, + KVM_PAGES_PER_HPAGE(level)); + + free_page((unsigned long)pt); + kmem_cache_free(mmu_page_header_cache, sp); + } +} + +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); + handle_changed_spte_acc_track(old_spte, new_spte, level); + handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, + new_spte, level); +} + +static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte, bool record_acc_track, + bool record_dirty_log) +{ + u64 *root_pt = tdp_iter_root_pt(iter); + struct kvm_mmu_page *root = sptep_to_sp(root_pt); + int as_id = kvm_mmu_page_as_id(root); + + WRITE_ONCE(*iter->sptep, new_spte); + + __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, + iter->level); + if (record_acc_track) + handle_changed_spte_acc_track(iter->old_spte, new_spte, + iter->level); + if (record_dirty_log) + handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, + iter->old_spte, new_spte, + iter->level); +} + +static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, true, true); +} + +static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, false, true); +} + +static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, true, false); +} + +#define tdp_root_for_each_pte(_iter, _root, _start, _end) \ + for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) + +#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ + tdp_root_for_each_pte(_iter, _root, _start, _end) \ + if (!is_shadow_present_pte(_iter.old_spte) || \ + !is_last_spte(_iter.old_spte, _iter.level)) \ + continue; \ + else + +#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ + for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ + _mmu->shadow_root_level, _start, _end) + +/* + * Flush the TLB if the process should drop kvm->mmu_lock. + * Return whether the caller still needs to flush the tlb. + */ +static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter) +{ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_flush_remote_tlbs(kvm); + cond_resched_lock(&kvm->mmu_lock); + tdp_iter_refresh_walk(iter); + return false; + } else { + return true; + } +} + +static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) +{ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + cond_resched_lock(&kvm->mmu_lock); + tdp_iter_refresh_walk(iter); + } +} + +/* + * Tears down the mappings for the range of gfns, [start, end), and frees the + * non-root pages mapping GFNs strictly within that range. Returns true if + * SPTEs have been cleared and a TLB flush is needed before releasing the + * MMU lock. + * If can_yield is true, will release the MMU lock and reschedule if the + * scheduler needs the CPU or there is contention on the MMU lock. If this + * function cannot yield, it will not release the MMU lock or reschedule and + * the caller must ensure it does not supply too large a GFN range, or the + * operation can cause a soft lockup. + */ +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield) +{ + struct tdp_iter iter; + bool flush_needed = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte)) + continue; + + /* + * If this is a non-last-level SPTE that covers a larger range + * than should be zapped, continue, and zap the mappings at a + * lower level. + */ + if ((iter.gfn < start || + iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && + !is_last_spte(iter.old_spte, iter.level)) + continue; + + tdp_mmu_set_spte(kvm, &iter, 0); + + if (can_yield) + flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + else + flush_needed = true; + } + return flush_needed; +} + +/* + * Tears down the mappings for the range of gfns, [start, end), and frees the + * non-root pages mapping GFNs strictly within that range. Returns true if + * SPTEs have been cleared and a TLB flush is needed before releasing the + * MMU lock. + */ +bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) +{ + struct kvm_mmu_page *root; + bool flush = false; + + for_each_tdp_mmu_root(kvm, root) { + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + flush |= zap_gfn_range(kvm, root, start, end, true); + + kvm_mmu_put_root(kvm, root); + } + + return flush; +} + +void kvm_tdp_mmu_zap_all(struct kvm *kvm) +{ + gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT); + bool flush; + + flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn); + if (flush) + kvm_flush_remote_tlbs(kvm); +} + +/* + * Installs a last-level SPTE to handle a TDP page fault. + * (NPT/EPT violation/misconfiguration) + */ +static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, + int map_writable, + struct tdp_iter *iter, + kvm_pfn_t pfn, bool prefault) +{ + u64 new_spte; + int ret = 0; + int make_spte_ret = 0; + + if (unlikely(is_noslot_pfn(pfn))) { + new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); + trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); + } else + make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, + pfn, iter->old_spte, prefault, true, + map_writable, !shadow_accessed_mask, + &new_spte); + + if (new_spte == iter->old_spte) + ret = RET_PF_SPURIOUS; + else + tdp_mmu_set_spte(vcpu->kvm, iter, new_spte); + + /* + * If the page fault was caused by a write but the page is write + * protected, emulation is needed. If the emulation was skipped, + * the vCPU would have the same fault again. + */ + if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { + if (write) + ret = RET_PF_EMULATE; + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ + if (unlikely(is_mmio_spte(new_spte))) + ret = RET_PF_EMULATE; + + trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); + if (!prefault) + vcpu->stat.pf_fixed++; + + return ret; +} + +/* + * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing + * page tables and SPTEs to translate the faulting guest physical address. + */ +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault) +{ + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; + struct kvm_mmu *mmu = vcpu->arch.mmu; + struct tdp_iter iter; + struct kvm_mmu_page *sp; + u64 *child_pt; + u64 new_spte; + int ret; + gfn_t gfn = gpa >> PAGE_SHIFT; + int level; + int req_level; + + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) + return RET_PF_RETRY; + if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))) + return RET_PF_RETRY; + + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, + huge_page_disallowed, &req_level); + + trace_kvm_mmu_spte_requested(gpa, level, pfn); + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(iter.old_spte, gfn, + iter.level, &pfn, &level); + + if (iter.level == level) + break; + + /* + * If there is an SPTE mapping a large page at a higher level + * than the target, that SPTE must be cleared and replaced + * with a non-leaf SPTE. + */ + if (is_shadow_present_pte(iter.old_spte) && + is_large_pte(iter.old_spte)) { + tdp_mmu_set_spte(vcpu->kvm, &iter, 0); + + kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, + KVM_PAGES_PER_HPAGE(iter.level)); + + /* + * The iter must explicitly re-read the spte here + * because the new value informs the !present + * path below. + */ + iter.old_spte = READ_ONCE(*iter.sptep); + } + + if (!is_shadow_present_pte(iter.old_spte)) { + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); + list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); + child_pt = sp->spt; + clear_page(child_pt); + new_spte = make_nonleaf_spte(child_pt, + !shadow_accessed_mask); + + trace_kvm_mmu_get_page(sp, true); + if (huge_page_disallowed && req_level >= iter.level) + account_huge_nx_page(vcpu->kvm, sp); + + tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); + } + } + + if (WARN_ON(iter.level != level)) + return RET_PF_RETRY; + + ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, + pfn, prefault); + + return ret; +} + +static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end, unsigned long data, + int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, + gfn_t end, unsigned long data)) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + struct kvm_mmu_page *root; + int ret = 0; + int as_id; + + for_each_tdp_mmu_root(kvm, root) { + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + as_id = kvm_mmu_page_as_id(root); + slots = __kvm_memslots(kvm, as_id); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn_start, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn_start = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + ret |= handler(kvm, memslot, root, gfn_start, + gfn_end, data); + } + + kvm_mmu_put_root(kvm, root); + } + + return ret; +} + +static int zap_gfn_range_hva_wrapper(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, + gfn_t end, unsigned long unused) +{ + return zap_gfn_range(kvm, root, start, end, false); +} + +int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, + zap_gfn_range_hva_wrapper); +} + +/* + * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero + * if any of the GFNs in the range have been accessed. + */ +static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, gfn_t end, + unsigned long unused) +{ + struct tdp_iter iter; + int young = 0; + u64 new_spte = 0; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + /* + * If we have a non-accessed entry we don't need to change the + * pte. + */ + if (!is_accessed_spte(iter.old_spte)) + continue; + + new_spte = iter.old_spte; + + if (spte_ad_enabled(new_spte)) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)&new_spte); + } else { + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. + */ + if (is_writable_pte(new_spte)) + kvm_set_pfn_dirty(spte_to_pfn(new_spte)); + + new_spte = mark_spte_for_access_track(new_spte); + } + new_spte &= ~shadow_dirty_mask; + + tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); + young = 1; + } + + return young; +} + +int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, + age_gfn_range); +} + +static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, + unsigned long unused2) +{ + struct tdp_iter iter; + + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) + if (is_accessed_spte(iter.old_spte)) + return 1; + + return 0; +} + +int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0, + test_age_gfn); +} + +/* + * Handle the changed_pte MMU notifier for the TDP MMU. + * data is a pointer to the new pte_t mapping the HVA specified by the MMU + * notifier. + * Returns non-zero if a flush is needed before releasing the MMU lock. + */ +static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, + unsigned long data) +{ + struct tdp_iter iter; + pte_t *ptep = (pte_t *)data; + kvm_pfn_t new_pfn; + u64 new_spte; + int need_flush = 0; + + WARN_ON(pte_huge(*ptep)); + + new_pfn = pte_pfn(*ptep); + + tdp_root_for_each_pte(iter, root, gfn, gfn + 1) { + if (iter.level != PG_LEVEL_4K) + continue; + + if (!is_shadow_present_pte(iter.old_spte)) + break; + + tdp_mmu_set_spte(kvm, &iter, 0); + + kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1); + + if (!pte_write(*ptep)) { + new_spte = kvm_mmu_changed_pte_notifier_make_spte( + iter.old_spte, new_pfn); + + tdp_mmu_set_spte(kvm, &iter, new_spte); + } + + need_flush = 1; + } + + if (need_flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + + return 0; +} + +int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, + pte_t *host_ptep) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1, + (unsigned long)host_ptep, + set_tdp_spte); +} + +/* + * Remove write access from all the SPTEs mapping GFNs [start, end). If + * skip_4k is set, SPTEs that map 4k pages, will not be write-protected. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, int min_level) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); + + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, + min_level, start, end) { + if (!is_shadow_present_pte(iter.old_spte) || + !is_last_spte(iter.old_spte, iter.level)) + continue; + + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + return spte_set; +} + +/* + * Remove write access from all the SPTEs mapping GFNs in the memslot. Will + * only affect leaf SPTEs down to min_level. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, + int min_level) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages, min_level); + + kvm_mmu_put_root(kvm, root); + } + + return spte_set; +} + +/* + * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If + * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. + * If AD bits are not enabled, this will require clearing the writable bit on + * each SPTE. Returns true if an SPTE has been changed and the TLBs need to + * be flushed. + */ +static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + if (spte_ad_need_write_protect(iter.old_spte)) { + if (is_writable_pte(iter.old_spte)) + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + else + continue; + } else { + if (iter.old_spte & shadow_dirty_mask) + new_spte = iter.old_spte & ~shadow_dirty_mask; + else + continue; + } + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + return spte_set; +} + +/* + * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If + * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. + * If AD bits are not enabled, this will require clearing the writable bit on + * each SPTE. Returns true if an SPTE has been changed and the TLBs need to + * be flushed. + */ +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } + + return spte_set; +} + +/* + * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is + * set in mask, starting at gfn. The given memslot is expected to contain all + * the GFNs represented by set bits in the mask. If AD bits are enabled, + * clearing the dirty status will involve clearing the dirty bit on each SPTE + * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + */ +static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t gfn, unsigned long mask, bool wrprot) +{ + struct tdp_iter iter; + u64 new_spte; + + tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), + gfn + BITS_PER_LONG) { + if (!mask) + break; + + if (iter.level > PG_LEVEL_4K || + !(mask & (1UL << (iter.gfn - gfn)))) + continue; + + if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { + if (is_writable_pte(iter.old_spte)) + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + else + continue; + } else { + if (iter.old_spte & shadow_dirty_mask) + new_spte = iter.old_spte & ~shadow_dirty_mask; + else + continue; + } + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + + mask &= ~(1UL << (iter.gfn - gfn)); + } +} + +/* + * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is + * set in mask, starting at gfn. The given memslot is expected to contain all + * the GFNs represented by set bits in the mask. If AD bits are enabled, + * clearing the dirty status will involve clearing the dirty bit on each SPTE + * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + */ +void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn, unsigned long mask, + bool wrprot) +{ + struct kvm_mmu_page *root; + int root_as_id; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); + } +} + +/* + * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is + * only used for PML, and so will involve setting the dirty bit on each SPTE. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte)) + continue; + + new_spte = iter.old_spte | shadow_dirty_mask; + + tdp_mmu_set_spte(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + + return spte_set; +} + +/* + * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is + * only used for PML, and so will involve setting the dirty bit on each SPTE. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } + return spte_set; +} + +/* + * Clear non-leaf entries (and free associated page tables) which could + * be replaced by large mappings, for GFNs within the slot. + */ +static void zap_collapsible_spte_range(struct kvm *kvm, + struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + kvm_pfn_t pfn; + bool spte_set = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte) || + is_last_spte(iter.old_spte, iter.level)) + continue; + + pfn = spte_to_pfn(iter.old_spte); + if (kvm_is_reserved_pfn(pfn) || + !PageTransCompoundMap(pfn_to_page(pfn))) + continue; + + tdp_mmu_set_spte(kvm, &iter, 0); + + spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + } + + if (spte_set) + kvm_flush_remote_tlbs(kvm); +} + +/* + * Clear non-leaf entries (and free associated page tables) which could + * be replaced by large mappings, for GFNs within the slot. + */ +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + zap_collapsible_spte_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } +} + +/* + * Removes write access on the last level SPTE mapping this GFN and unsets the + * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * Returns true if an SPTE was set and a TLB flush is needed. + */ +static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t gfn) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { + if (!is_writable_pte(iter.old_spte)) + break; + + new_spte = iter.old_spte & + ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + + tdp_mmu_set_spte(kvm, &iter, new_spte); + spte_set = true; + } + + return spte_set; +} + +/* + * Removes write access on the last level SPTE mapping this GFN and unsets the + * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * Returns true if an SPTE was set and a TLB flush is needed. + */ +bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + spte_set |= write_protect_gfn(kvm, root, gfn); + } + return spte_set; +} + +/* + * Return the level of the lowest level SPTE added to sptes. + * That SPTE may be non-present. + */ +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) +{ + struct tdp_iter iter; + struct kvm_mmu *mmu = vcpu->arch.mmu; + int leaf = vcpu->arch.mmu->shadow_root_level; + gfn_t gfn = addr >> PAGE_SHIFT; + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + leaf = iter.level; + sptes[leaf - 1] = iter.old_spte; + } + + return leaf; +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h new file mode 100644 index 000000000000..556e065503f6 --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef __KVM_X86_MMU_TDP_MMU_H +#define __KVM_X86_MMU_TDP_MMU_H + +#include <linux/kvm_host.h> + +void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); + +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root); +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); + +bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end); +void kvm_tdp_mmu_zap_all(struct kvm *kvm); + +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault); + +int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end); + +int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end); +int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva); + +int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, + pte_t *host_ptep); + +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, + int min_level); +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, + struct kvm_memory_slot *slot); +void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn, unsigned long mask, + bool wrprot); +bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot); + +bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn); + +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes); +#endif /* __KVM_X86_MMU_TDP_MMU_H */ |