diff options
-rw-r--r-- | Documentation/virt/kvm/api.rst | 16 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu_internal.h | 7 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 7 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 30 | ||||
-rw-r--r-- | include/uapi/linux/kvm.h | 1 |
8 files changed, 60 insertions, 8 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 320cb04f7bd9..bafaeedd455c 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8206,6 +8206,22 @@ PV guests. The `KVM_PV_DUMP` command is available for the dump related UV data. Also the vcpu ioctl `KVM_S390_PV_CPU_COMMAND` is available and supports the `KVM_PV_DUMP_CPU` subcommand. +8.38 KVM_CAP_VM_DISABLE_NX_HUGE_PAGES +--------------------------- + +:Capability KVM_CAP_VM_DISABLE_NX_HUGE_PAGES +:Architectures: x86 +:Type: vm +:Parameters: arg[0] must be 0. +:Returns 0 on success, -EPERM if the userspace process does not + have CAP_SYS_BOOT, -EINVAL if args[0] is not 0 or any vCPUs have been + created. + +This capability disables the NX huge pages mitigation for iTLB MULTIHIT. + +The capability has no effect if the nx_huge_pages module parameter is not set. + +This capability may only be set before any vCPUs are created. 9. Known KVM API problems ========================= diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e37727a74d0a..7e4c31b57a75 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1336,6 +1336,8 @@ struct kvm_arch { * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. */ u32 max_vcpu_ids; + + bool disable_nx_huge_pages; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 5e1e3c8f8aaa..bb9d12ac0db3 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -155,9 +155,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) +static inline bool is_nx_huge_page_enabled(struct kvm *kvm) { - return READ_ONCE(nx_huge_pages); + return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages; } struct kvm_page_fault { @@ -256,7 +256,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .user = err & PFERR_USER_MASK, .prefetch = prefetch, .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), - .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + .nx_huge_page_workaround_enabled = + is_nx_huge_page_enabled(vcpu->kvm), .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 242e4828d7df..db294c1beea2 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -147,7 +147,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= spte_shadow_accessed_mask(spte); if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { + is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; } @@ -246,7 +246,8 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level, + int index) { u64 child_spte; int child_level; @@ -274,7 +275,7 @@ u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) * When splitting to a 4K page, mark the page executable as the * NX hugepage mitigation no longer applies. */ - if (is_nx_huge_page_enabled()) + if (is_nx_huge_page_enabled(kvm)) child_spte = make_spte_executable(child_spte); } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 121c5eaaec77..256f90587e8d 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -421,7 +421,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index); +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level, + int index); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1ea40809ef1f..522e2532343b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1478,7 +1478,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * not been linked in yet and thus is not reachable from any other CPU. */ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); + sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, level, i); /* * Replace the huge spte with a pointer to the populated lower level diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1b3b2ea8ee0..7ce0c6fe166d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4324,6 +4324,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SYS_ATTRIBUTES: case KVM_CAP_VAPIC: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -6184,6 +6185,35 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + r = -EINVAL; + + /* + * Since the risk of disabling NX hugepages is a guest crashing + * the system, ensure the userspace process has permission to + * reboot the system. + * + * Note that unlike the reboot() syscall, the process must have + * this capability in the root namespace because exposing + * /dev/kvm into a container does not limit the scope of the + * iTLB multihit bug to that container. In other words, + * this must use capable(), not ns_capable(). + */ + if (!capable(CAP_SYS_BOOT)) { + r = -EPERM; + break; + } + + if (cap->args[0]) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.disable_nx_huge_pages = true; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7569b4ec199c..a36e78710382 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1166,6 +1166,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_PROTECTED_DUMP 217 #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 #define KVM_CAP_X86_NOTIFY_VMEXIT 219 +#define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 #ifdef KVM_CAP_IRQ_ROUTING |