aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorLinus Torvalds2023-05-01 12:06:20 -0700
committerLinus Torvalds2023-05-01 12:06:20 -0700
commitc8c655c34e33544aec9d64b660872ab33c29b5f1 (patch)
tree4aad88f698f04cef9e5d9d573a6df6283085dadd /arch/x86
parentd75439d64a1e2b35e0f08906205b00279753cbed (diff)
parentb3c98052d46948a8d65d2778c7f306ff38366aac (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini: "s390: - More phys_to_virt conversions - Improvement of AP management for VSIE (nested virtualization) ARM64: - Numerous fixes for the pathological lock inversion issue that plagued KVM/arm64 since... forever. - New framework allowing SMCCC-compliant hypercalls to be forwarded to userspace, hopefully paving the way for some more features being moved to VMMs rather than be implemented in the kernel. - Large rework of the timer code to allow a VM-wide offset to be applied to both virtual and physical counters as well as a per-timer, per-vcpu offset that complements the global one. This last part allows the NV timer code to be implemented on top. - A small set of fixes to make sure that we don't change anything affecting the EL1&0 translation regime just after having taken an exception to EL2 until we have executed a DSB. This ensures that speculative walks started in EL1&0 have completed. - The usual selftest fixes and improvements. x86: - Optimize CR0.WP toggling by avoiding an MMU reload when TDP is enabled, and by giving the guest control of CR0.WP when EPT is enabled on VMX (VMX-only because SVM doesn't support per-bit controls) - Add CR0/CR4 helpers to query single bits, and clean up related code where KVM was interpreting kvm_read_cr4_bits()'s "unsigned long" return as a bool - Move AMD_PSFD to cpufeatures.h and purge KVM's definition - Avoid unnecessary writes+flushes when the guest is only adding new PTEs - Overhaul .sync_page() and .invlpg() to utilize .sync_page()'s optimizations when emulating invalidations - Clean up the range-based flushing APIs - Revamp the TDP MMU's reaping of Accessed/Dirty bits to clear a single A/D bit using a LOCK AND instead of XCHG, and skip all of the "handle changed SPTE" overhead associated with writing the entire entry - Track the number of "tail" entries in a pte_list_desc to avoid having to walk (potentially) all descriptors during insertion and deletion, which gets quite 
expensive if the guest is spamming fork() - Disallow virtualizing legacy LBRs if architectural LBRs are available, the two are mutually exclusive in hardware - Disallow writes to immutable feature MSRs (notably PERF_CAPABILITIES) after KVM_RUN, similar to CPUID features - Overhaul the vmx_pmu_caps selftest to better validate PERF_CAPABILITIES - Apply PMU filters to emulated events and add test coverage to the pmu_event_filter selftest - AMD SVM: - Add support for virtual NMIs - Fixes for edge cases related to virtual interrupts - Intel AMX: - Don't advertise XTILE_CFG in KVM_GET_SUPPORTED_CPUID if XTILE_DATA is not being reported due to userspace not opting in via prctl() - Fix a bug in emulation of ENCLS in compatibility mode - Allow emulation of NOP and PAUSE for L2 - AMX selftests improvements - Misc cleanups MIPS: - Constify MIPS's internal callbacks (a leftover from the hardware enabling rework that landed in 6.3) Generic: - Drop unnecessary casts from "void *" throughout kvm_main.c - Tweak the layout of "struct kvm_mmu_memory_cache" to shrink the struct size by 8 bytes on 64-bit kernels by utilizing a padding hole Documentation: - Fix goof introduced by the conversion to rST" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (211 commits) KVM: s390: pci: fix virtual-physical confusion on module unload/load KVM: s390: vsie: clarifications on setting the APCB KVM: s390: interrupt: fix virtual-physical confusion for next alert GISA KVM: arm64: Have kvm_psci_vcpu_on() use WRITE_ONCE() to update mp_state KVM: arm64: Acquire mp_state_lock in kvm_arch_vcpu_ioctl_vcpu_init() KVM: selftests: Test the PMU event "Instructions retired" KVM: selftests: Copy full counter values from guest in PMU event filter test KVM: selftests: Use error codes to signal errors in PMU event filter test KVM: selftests: Print detailed info in PMU event filter asserts KVM: selftests: Add helpers for PMC asserts in PMU event filter test KVM: selftests: Add a common helper for the 
PMU event filter guest code KVM: selftests: Fix spelling mistake "perrmited" -> "permitted" KVM: arm64: vhe: Drop extra isb() on guest exit KVM: arm64: vhe: Synchronise with page table walker on MMU update KVM: arm64: pkvm: Document the side effects of kvm_flush_dcache_to_poc() KVM: arm64: nvhe: Synchronise with page table walker on TLBI KVM: arm64: Handle 32bit CNTPCTSS traps KVM: arm64: nvhe: Synchronise with page table walker on vcpu run KVM: arm64: vgic: Don't acquire its_lock before config_lock KVM: selftests: Add test to verify KVM's supported XCR0 ...
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/include/asm/cpufeatures.h9
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h6
-rw-r--r--arch/x86/include/asm/kvm_host.h89
-rw-r--r--arch/x86/include/asm/svm.h10
-rw-r--r--arch/x86/include/uapi/asm/kvm.h3
-rw-r--r--arch/x86/kvm/cpuid.c18
-rw-r--r--arch/x86/kvm/emulate.c8
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h18
-rw-r--r--arch/x86/kvm/kvm_onhyperv.c33
-rw-r--r--arch/x86/kvm/kvm_onhyperv.h7
-rw-r--r--arch/x86/kvm/mmu.h28
-rw-r--r--arch/x86/kvm/mmu/mmu.c524
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h20
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h275
-rw-r--r--arch/x86/kvm/mmu/spte.c2
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h48
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c215
-rw-r--r--arch/x86/kvm/pmu.c25
-rw-r--r--arch/x86/kvm/pmu.h2
-rw-r--r--arch/x86/kvm/svm/nested.c91
-rw-r--r--arch/x86/kvm/svm/pmu.c2
-rw-r--r--arch/x86/kvm/svm/svm.c201
-rw-r--r--arch/x86/kvm/svm/svm.h29
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h5
-rw-r--r--arch/x86/kvm/vmx/hyperv.c107
-rw-r--r--arch/x86/kvm/vmx/hyperv.h115
-rw-r--r--arch/x86/kvm/vmx/nested.c126
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c135
-rw-r--r--arch/x86/kvm/vmx/sgx.c4
-rw-r--r--arch/x86/kvm/vmx/vmx.c96
-rw-r--r--arch/x86/kvm/vmx/vmx.h20
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h22
-rw-r--r--arch/x86/kvm/x86.c256
-rw-r--r--arch/x86/kvm/x86.h64
34 files changed, 1502 insertions, 1111 deletions
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 353b054812de..cb8ca46213be 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -226,10 +226,9 @@
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
-#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
-#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* Intel FlexPriority */
+#define X86_FEATURE_EPT ( 8*32+ 2) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID ( 8*32+ 3) /* Intel Virtual Processor ID */
#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */
#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
@@ -338,6 +337,7 @@
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */
+#define X86_FEATURE_AMD_PSFD (13*32+28) /* "" Predictive Store Forwarding Disable */
#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */
#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */
@@ -370,6 +370,7 @@
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */
+#define X86_FEATURE_VNMI (15*32+25) /* Virtual NMI */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 8dc345cc6318..13bc212cd4bc 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -54,8 +54,8 @@ KVM_X86_OP(set_rflags)
KVM_X86_OP(get_if_flag)
KVM_X86_OP(flush_tlb_all)
KVM_X86_OP(flush_tlb_current)
-KVM_X86_OP_OPTIONAL(tlb_remote_flush)
-KVM_X86_OP_OPTIONAL(tlb_remote_flush_with_range)
+KVM_X86_OP_OPTIONAL(flush_remote_tlbs)
+KVM_X86_OP_OPTIONAL(flush_remote_tlbs_range)
KVM_X86_OP(flush_tlb_gva)
KVM_X86_OP(flush_tlb_guest)
KVM_X86_OP(vcpu_pre_run)
@@ -68,6 +68,8 @@ KVM_X86_OP(get_interrupt_shadow)
KVM_X86_OP(patch_hypercall)
KVM_X86_OP(inject_irq)
KVM_X86_OP(inject_nmi)
+KVM_X86_OP_OPTIONAL_RET0(is_vnmi_pending)
+KVM_X86_OP_OPTIONAL_RET0(set_vnmi_pending)
KVM_X86_OP(inject_exception)
KVM_X86_OP(cancel_injection)
KVM_X86_OP(interrupt_allowed)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 808c292ad3f4..fb9d1f2d6136 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -420,6 +420,10 @@ struct kvm_mmu_root_info {
#define KVM_MMU_NUM_PREV_ROOTS 3
+#define KVM_MMU_ROOT_CURRENT BIT(0)
+#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i)
+#define KVM_MMU_ROOTS_ALL (BIT(1 + KVM_MMU_NUM_PREV_ROOTS) - 1)
+
#define KVM_HAVE_MMU_RWLOCK
struct kvm_mmu_page;
@@ -439,9 +443,8 @@ struct kvm_mmu {
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gpa_t gva_or_gpa, u64 access,
struct x86_exception *exception);
- int (*sync_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp);
- void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
+ int (*sync_spte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, int i);
struct kvm_mmu_root_info root;
union kvm_cpu_role cpu_role;
union kvm_mmu_page_role root_role;
@@ -479,11 +482,6 @@ struct kvm_mmu {
u64 pdptrs[4]; /* pae */
};
-struct kvm_tlb_range {
- u64 start_gfn;
- u64 pages;
-};
-
enum pmc_type {
KVM_PMC_GP = 0,
KVM_PMC_FIXED,
@@ -515,6 +513,7 @@ struct kvm_pmc {
#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
#define KVM_AMD_PMC_MAX_GENERIC 6
struct kvm_pmu {
+ u8 version;
unsigned nr_arch_gp_counters;
unsigned nr_arch_fixed_counters;
unsigned available_event_types;
@@ -527,7 +526,6 @@ struct kvm_pmu {
u64 global_ovf_ctrl_mask;
u64 reserved_bits;
u64 raw_event_mask;
- u8 version;
struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
struct irq_work irq_work;
@@ -876,7 +874,8 @@ struct kvm_vcpu_arch {
u64 tsc_scaling_ratio; /* current scaling ratio */
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
- unsigned nmi_pending; /* NMI queued after currently running handler */
+ /* Number of NMIs pending injection, not including hardware vNMIs. */
+ unsigned int nmi_pending;
bool nmi_injected; /* Trying to inject an NMI this entry */
bool smi_pending; /* SMI queued after currently running handler */
u8 handling_intr_from_guest;
@@ -947,23 +946,6 @@ struct kvm_vcpu_arch {
u64 msr_kvm_poll_control;
- /*
- * Indicates the guest is trying to write a gfn that contains one or
- * more of the PTEs used to translate the write itself, i.e. the access
- * is changing its own translation in the guest page tables. KVM exits
- * to userspace if emulation of the faulting instruction fails and this
- * flag is set, as KVM cannot make forward progress.
- *
- * If emulation fails for a write to guest page tables, KVM unprotects
- * (zaps) the shadow page for the target gfn and resumes the guest to
- * retry the non-emulatable instruction (on hardware). Unprotecting the
- * gfn doesn't allow forward progress for a self-changing access because
- * doing so also zaps the translation for the gfn, i.e. retrying the
- * instruction will hit a !PRESENT fault, which results in a new shadow
- * page and sends KVM back to square one.
- */
- bool write_fault_to_shadow_pgtable;
-
/* set at EPT violation at this point */
unsigned long exit_qualification;
@@ -1602,9 +1584,9 @@ struct kvm_x86_ops {
void (*flush_tlb_all)(struct kvm_vcpu *vcpu);
void (*flush_tlb_current)(struct kvm_vcpu *vcpu);
- int (*tlb_remote_flush)(struct kvm *kvm);
- int (*tlb_remote_flush_with_range)(struct kvm *kvm,
- struct kvm_tlb_range *range);
+ int (*flush_remote_tlbs)(struct kvm *kvm);
+ int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn,
+ gfn_t nr_pages);
/*
* Flush any TLB entries associated with the given GVA.
@@ -1638,6 +1620,13 @@ struct kvm_x86_ops {
int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
+ /* Whether or not a virtual NMI is pending in hardware. */
+ bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu);
+ /*
+ * Attempt to pend a virtual NMI in hardware. Returns %true on success
+ * to allow using static_call_ret0 as the fallback.
+ */
+ bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu);
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
@@ -1808,8 +1797,8 @@ void kvm_arch_free_vm(struct kvm *kvm);
#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
{
- if (kvm_x86_ops.tlb_remote_flush &&
- !static_call(kvm_x86_tlb_remote_flush)(kvm))
+ if (kvm_x86_ops.flush_remote_tlbs &&
+ !static_call(kvm_x86_flush_remote_tlbs)(kvm))
return 0;
else
return -ENOTSUPP;
@@ -1907,6 +1896,25 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
* EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility
* state and inject single-step #DBs after skipping
* an instruction (after completing userspace I/O).
+ *
+ * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that
+ * is attempting to write a gfn that contains one or
+ * more of the PTEs used to translate the write itself,
+ * and the owning page table is being shadowed by KVM.
+ * If emulation of the faulting instruction fails and
+ * this flag is set, KVM will exit to userspace instead
+ * of retrying emulation as KVM cannot make forward
+ * progress.
+ *
+ * If emulation fails for a write to guest page tables,
+ * KVM unprotects (zaps) the shadow page for the target
+ * gfn and resumes the guest to retry the non-emulatable
+ * instruction (on hardware). Unprotecting the gfn
+ * doesn't allow forward progress for a self-changing
+ * access because doing so also zaps the translation for
+ * the gfn, i.e. retrying the instruction will hit a
+ * !PRESENT fault, which results in a new shadow page
+ * and sends KVM back to square one.
*/
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
@@ -1916,6 +1924,7 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
#define EMULTYPE_VMWARE_GP (1 << 5)
#define EMULTYPE_PF (1 << 6)
#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
+#define EMULTYPE_WRITE_PF_TO_SP (1 << 8)
int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
@@ -1994,14 +2003,11 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
return !!(*irq_state);
}
-#define KVM_MMU_ROOT_CURRENT BIT(0)
-#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i)
-#define KVM_MMU_ROOTS_ALL (~0UL)
-
int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
+int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
void kvm_update_dr7(struct kvm_vcpu *vcpu);
@@ -2041,8 +2047,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
-void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gva_t gva, hpa_t root_hpa);
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, unsigned long roots);
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
@@ -2204,4 +2210,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
+/*
+ * KVM previously used a u32 field in kvm_run to indicate the hypercall was
+ * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the
+ * remaining 31 lower bits must be 0 to preserve ABI.
+ */
+#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1)
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 770dcf75eaa9..e7c7379d6ac7 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -183,6 +183,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define V_GIF_SHIFT 9
#define V_GIF_MASK (1 << V_GIF_SHIFT)
+#define V_NMI_PENDING_SHIFT 11
+#define V_NMI_PENDING_MASK (1 << V_NMI_PENDING_SHIFT)
+
+#define V_NMI_BLOCKING_SHIFT 12
+#define V_NMI_BLOCKING_MASK (1 << V_NMI_BLOCKING_SHIFT)
+
#define V_INTR_PRIO_SHIFT 16
#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
@@ -197,6 +203,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define V_GIF_ENABLE_SHIFT 25
#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT)
+#define V_NMI_ENABLE_SHIFT 26
+#define V_NMI_ENABLE_MASK (1 << V_NMI_ENABLE_SHIFT)
+
#define AVIC_ENABLE_SHIFT 31
#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)
@@ -278,7 +287,6 @@ static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_
static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
struct vmcb_seg {
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 7f467fe05d42..1a6a1f987949 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -559,4 +559,7 @@ struct kvm_pmu_event_filter {
#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
+/* x86-specific KVM_EXIT_HYPERCALL flags. */
+#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0)
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 599aebec2d52..123bf8b97a4b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -60,12 +60,6 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
return ret;
}
-/*
- * This one is tied to SSB in the user API, and not
- * visible in /proc/cpuinfo.
- */
-#define KVM_X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
-
#define F feature_bit
/* Scattered Flag - For features that are scattered by cpufeatures.h. */
@@ -266,7 +260,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
/* Update OSXSAVE bit */
if (boot_cpu_has(X86_FEATURE_XSAVE))
cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
- kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE));
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE));
cpuid_entry_change(best, X86_FEATURE_APIC,
vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
@@ -275,7 +269,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
best = cpuid_entry2_find(entries, nent, 7, 0);
if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
cpuid_entry_change(best, X86_FEATURE_OSPKE,
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE));
best = cpuid_entry2_find(entries, nent, 0xD, 0);
if (best)
@@ -420,7 +414,7 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
* KVM_SET_CPUID{,2} again. To support this legacy behavior, check
* whether the supplied CPUID data is equal to what's already set.
*/
- if (vcpu->arch.last_vmentry_cpu != -1) {
+ if (kvm_vcpu_has_run(vcpu)) {
r = kvm_cpuid_check_equal(vcpu, e2, nent);
if (r)
return r;
@@ -653,7 +647,7 @@ void kvm_set_cpu_caps(void)
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
- F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16)
+ F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
);
/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
@@ -715,7 +709,7 @@ void kvm_set_cpu_caps(void)
F(CLZERO) | F(XSAVEERPTR) |
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) |
- __feature_bit(KVM_X86_FEATURE_AMD_PSFD)
+ F(AMD_PSFD)
);
/*
@@ -1002,7 +996,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->eax = entry->ebx = entry->ecx = 0;
break;
case 0xd: {
- u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm();
+ u64 permitted_xcr0 = kvm_get_filtered_xcr0();
u64 permitted_xss = kvm_caps.supported_xss;
entry->eax &= permitted_xcr0;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a20bec931764..936a397a08cd 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1640,6 +1640,14 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto exception;
break;
case VCPU_SREG_CS:
+ /*
+ * KVM uses "none" when loading CS as part of emulating Real
+ * Mode exceptions and IRET (handled above). In all other
+ * cases, loading CS without a control transfer is a KVM bug.
+ */
+ if (WARN_ON_ONCE(transfer == X86_TRANSFER_NONE))
+ goto exception;
+
if (!(seg_desc.type & 8))
goto exception;
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 4c91f626c058..75eae9c4998a 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -4,7 +4,7 @@
#include <linux/kvm_host.h>
-#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
+#define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP)
#define KVM_POSSIBLE_CR4_GUEST_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE)
@@ -157,6 +157,14 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
return vcpu->arch.cr0 & mask;
}
+static __always_inline bool kvm_is_cr0_bit_set(struct kvm_vcpu *vcpu,
+ unsigned long cr0_bit)
+{
+ BUILD_BUG_ON(!is_power_of_2(cr0_bit));
+
+ return !!kvm_read_cr0_bits(vcpu, cr0_bit);
+}
+
static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
{
return kvm_read_cr0_bits(vcpu, ~0UL);
@@ -171,6 +179,14 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
return vcpu->arch.cr4 & mask;
}
+static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu,
+ unsigned long cr4_bit)
+{
+ BUILD_BUG_ON(!is_power_of_2(cr4_bit));
+
+ return !!kvm_read_cr4_bits(vcpu, cr4_bit);
+}
+
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
index 482d6639ef88..ded0bd688c65 100644
--- a/arch/x86/kvm/kvm_onhyperv.c
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -10,17 +10,22 @@
#include "hyperv.h"
#include "kvm_onhyperv.h"
+struct kvm_hv_tlb_range {
+ u64 start_gfn;
+ u64 pages;
+};
+
static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
void *data)
{
- struct kvm_tlb_range *range = data;
+ struct kvm_hv_tlb_range *range = data;
return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
range->pages);
}
static inline int hv_remote_flush_root_tdp(hpa_t root_tdp,
- struct kvm_tlb_range *range)
+ struct kvm_hv_tlb_range *range)
{
if (range)
return hyperv_flush_guest_mapping_range(root_tdp,
@@ -29,8 +34,8 @@ static inline int hv_remote_flush_root_tdp(hpa_t root_tdp,
return hyperv_flush_guest_mapping(root_tdp);
}
-int hv_remote_flush_tlb_with_range(struct kvm *kvm,
- struct kvm_tlb_range *range)
+static int __hv_flush_remote_tlbs_range(struct kvm *kvm,
+ struct kvm_hv_tlb_range *range)
{
struct kvm_arch *kvm_arch = &kvm->arch;
struct kvm_vcpu *vcpu;
@@ -86,19 +91,29 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm,
spin_unlock(&kvm_arch->hv_root_tdp_lock);
return ret;
}
-EXPORT_SYMBOL_GPL(hv_remote_flush_tlb_with_range);
-int hv_remote_flush_tlb(struct kvm *kvm)
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, gfn_t nr_pages)
+{
+ struct kvm_hv_tlb_range range = {
+ .start_gfn = start_gfn,
+ .pages = nr_pages,
+ };
+
+ return __hv_flush_remote_tlbs_range(kvm, &range);
+}
+EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs_range);
+
+int hv_flush_remote_tlbs(struct kvm *kvm)
{
- return hv_remote_flush_tlb_with_range(kvm, NULL);
+ return __hv_flush_remote_tlbs_range(kvm, NULL);
}
-EXPORT_SYMBOL_GPL(hv_remote_flush_tlb);
+EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs);
void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
{
struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
- if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+ if (kvm_x86_ops.flush_remote_tlbs == hv_flush_remote_tlbs) {
spin_lock(&kvm_arch->hv_root_tdp_lock);
vcpu->arch.hv_root_tdp = root_tdp;
if (root_tdp != kvm_arch->hv_root_tdp)
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
index 6272dabec02d..f9ca3e7432b2 100644
--- a/arch/x86/kvm/kvm_onhyperv.h
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -7,12 +7,11 @@
#define __ARCH_X86_KVM_KVM_ONHYPERV_H__
#if IS_ENABLED(CONFIG_HYPERV)
-int hv_remote_flush_tlb_with_range(struct kvm *kvm,
- struct kvm_tlb_range *range);
-int hv_remote_flush_tlb(struct kvm *kvm);
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages);
+int hv_flush_remote_tlbs(struct kvm *kvm);
void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
#else /* !CONFIG_HYPERV */
-static inline int hv_remote_flush_tlb(struct kvm *kvm)
+static inline int hv_flush_remote_tlbs(struct kvm *kvm)
{
return -EOPNOTSUPP;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 168c46fd8dd1..92d5a1924fc1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -113,6 +113,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
+void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
@@ -132,7 +134,7 @@ static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3)
{
BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0);
- return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)
? cr3 & X86_CR3_PCID_MASK
: 0;
}
@@ -153,6 +155,24 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
vcpu->arch.mmu->root_role.level);
}
+static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ /*
+ * When EPT is enabled, KVM may passthrough CR0.WP to the guest, i.e.
+ * @mmu's snapshot of CR0.WP and thus all related paging metadata may
+ * be stale. Refresh CR0.WP and the metadata on-demand when checking
+ * for permission faults. Exempt nested MMUs, i.e. MMUs for shadowing
+ * nEPT and nNPT, as CR0.WP is ignored in both cases. Note, KVM does
+ * need to refresh nested_mmu, a.k.a. the walker used to translate L2
+ * GVAs to GPAs, as that "MMU" needs to honor L2's CR0.WP.
+ */
+ if (!tdp_enabled || mmu == &vcpu->arch.guest_mmu)
+ return;
+
+ __kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+}
+
/*
* Check if a given access (described through the I/D, W/R and U/S bits of a
* page fault error code pfec) causes a permission fault with the given PTE
@@ -184,8 +204,12 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
- bool fault = (mmu->permissions[index] >> pte_access) & 1;
u32 errcode = PFERR_PRESENT_MASK;
+ bool fault;
+
+ kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+
+ fault = (mmu->permissions[index] >> pte_access) & 1;
WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
if (unlikely(mmu->pkru_mask)) {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c8ebe542c565..c8961f45e3b1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -125,17 +125,31 @@ module_param(dbg, bool, 0644);
#define PTE_LIST_EXT 14
/*
- * Slight optimization of cacheline layout, by putting `more' and `spte_count'
- * at the start; then accessing it will only use one single cacheline for
- * either full (entries==PTE_LIST_EXT) case or entries<=6.
+ * struct pte_list_desc is the core data structure used to implement a custom
+ * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
+ * given GFN when used in the context of rmaps. Using a custom list allows KVM
+ * to optimize for the common case where many GFNs will have at most a handful
+ * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
+ * memory footprint, which in turn improves runtime performance by exploiting
+ * cache locality.
+ *
+ * A list is comprised of one or more pte_list_desc objects (descriptors).
+ * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor
+ * is full and a new SPTE needs to be added, a new descriptor is allocated and
+ * becomes the head of the list. This means that by definition, all tail
+ * descriptors are full.
+ *
+ * Note, the meta data fields are deliberately placed at the start of the
+ * structure to optimize the cacheline layout; accessing the descriptor will
+ * touch only a single cacheline so long as @spte_count<=6 (or if only the
+ * descriptor's metadata is accessed).
*/
struct pte_list_desc {
struct pte_list_desc *more;
- /*
- * Stores number of entries stored in the pte_list_desc. No need to be
- * u64 but just for easier alignment. When PTE_LIST_EXT, means full.
- */
- u64 spte_count;
+ /* The number of PTEs stored in _this_ descriptor. */
+ u32 spte_count;
+ /* The number of PTEs stored in all tails of this descriptor. */
+ u32 tail_count;
u64 *sptes[PTE_LIST_EXT];
};
@@ -242,32 +256,35 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
return regs;
}
-static inline bool kvm_available_flush_tlb_with_range(void)
+static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops.tlb_remote_flush_with_range;
+ return kvm_read_cr3(vcpu);
}
-static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
- struct kvm_tlb_range *range)
+static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
{
- int ret = -ENOTSUPP;
-
- if (range && kvm_x86_ops.tlb_remote_flush_with_range)
- ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
+ if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
+ return kvm_read_cr3(vcpu);
- if (ret)
- kvm_flush_remote_tlbs(kvm);
+ return mmu->get_guest_pgd(vcpu);
}
-void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
- u64 start_gfn, u64 pages)
+static inline bool kvm_available_flush_remote_tlbs_range(void)
{
- struct kvm_tlb_range range;
+ return kvm_x86_ops.flush_remote_tlbs_range;
+}
- range.start_gfn = start_gfn;
- range.pages = pages;
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
+ gfn_t nr_pages)
+{
+ int ret = -EOPNOTSUPP;
- kvm_flush_remote_tlbs_with_range(kvm, &range);
+ if (kvm_x86_ops.flush_remote_tlbs_range)
+ ret = static_call(kvm_x86_flush_remote_tlbs_range)(kvm, start_gfn,
+ nr_pages);
+ if (ret)
+ kvm_flush_remote_tlbs(kvm);
}
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
@@ -888,9 +905,9 @@ static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
untrack_possible_nx_huge_page(kvm, sp);
}
-static struct kvm_memory_slot *
-gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool no_dirty_log)
+static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ bool no_dirty_log)
{
struct kvm_memory_slot *slot;
@@ -929,53 +946,69 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
desc->spte_count = 2;
+ desc->tail_count = 0;
rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
rmap_printk("%p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
- while (desc->spte_count == PTE_LIST_EXT) {
- count += PTE_LIST_EXT;
- if (!desc->more) {
- desc->more = kvm_mmu_memory_cache_alloc(cache);
- desc = desc->more;
- desc->spte_count = 0;
- break;
- }
- desc = desc->more;
+ count = desc->tail_count + desc->spte_count;
+
+ /*
+ * If the previous head is full, allocate a new head descriptor
+ * as tail descriptors are always kept full.
+ */
+ if (desc->spte_count == PTE_LIST_EXT) {
+ desc = kvm_mmu_memory_cache_alloc(cache);
+ desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc->spte_count = 0;
+ desc->tail_count = count;
+ rmap_head->val = (unsigned long)desc | 1;
}
- count += desc->spte_count;
desc->sptes[desc->spte_count++] = spte;
}
return count;
}
-static void
-pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
- struct pte_list_desc *desc, int i,
- struct pte_list_desc *prev_desc)
+static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
+ struct pte_list_desc *desc, int i)
{
- int j = desc->spte_count - 1;
+ struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ int j = head_desc->spte_count - 1;
+
+ /*
+ * The head descriptor should never be empty. A new head is added only
+ * when adding an entry and the previous head is full, and heads are
+ * removed (this flow) when they become empty.
+ */
+ BUG_ON(j < 0);
- desc->sptes[i] = desc->sptes[j];
- desc->sptes[j] = NULL;
- desc->spte_count--;
- if (desc->spte_count)
+ /*
+ * Replace the to-be-freed SPTE with the last valid entry from the head
+ * descriptor to ensure that tail descriptors are full at all times.
+ * Note, this also means that tail_count is stable for each descriptor.
+ */
+ desc->sptes[i] = head_desc->sptes[j];
+ head_desc->sptes[j] = NULL;
+ head_desc->spte_count--;
+ if (head_desc->spte_count)
return;
- if (!prev_desc && !desc->more)
+
+ /*
+ * The head descriptor is empty. If there are no tail descriptors,
+ * nullify the rmap head to mark the list as empty, else point the rmap
+ * head at the next descriptor, i.e. the new head.
+ */
+ if (!head_desc->more)
rmap_head->val = 0;
else
- if (prev_desc)
- prev_desc->more = desc->more;
- else
- rmap_head->val = (unsigned long)desc->more | 1;
- mmu_free_pte_list_desc(desc);
+ rmap_head->val = (unsigned long)head_desc->more | 1;
+ mmu_free_pte_list_desc(head_desc);
}
static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
- struct pte_list_desc *prev_desc;
int i;
if (!rmap_head->val) {
@@ -991,16 +1024,13 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
} else {
rmap_printk("%p many->many\n", spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
- prev_desc = NULL;
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
- pte_list_desc_remove_entry(rmap_head,
- desc, i, prev_desc);
+ pte_list_desc_remove_entry(rmap_head, desc, i);
return;
}
}
- prev_desc = desc;
desc = desc->more;
}
pr_err("%s: %p many->many\n", __func__, spte);
@@ -1047,7 +1077,6 @@ out:
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
- unsigned int count = 0;
if (!rmap_head->val)
return 0;
@@ -1055,13 +1084,7 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
return 1;
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-
- while (desc) {
- count += desc->spte_count;
- desc = desc->more;
- }
-
- return count;
+ return desc->tail_count + desc->spte_count;
}
static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
@@ -1073,14 +1096,6 @@ static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
}
-static bool rmap_can_add(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_memory_cache *mc;
-
- mc = &vcpu->arch.mmu_pte_list_desc_cache;
- return kvm_mmu_memory_cache_nr_free_objects(mc);
-}
-
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
struct kvm_memslots *slots;
@@ -1479,7 +1494,7 @@ restart:
}
}
- if (need_flush && kvm_available_flush_tlb_with_range()) {
+ if (need_flush && kvm_available_flush_remote_tlbs_range()) {
kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
return false;
}
@@ -1504,8 +1519,8 @@ struct slot_rmap_walk_iterator {
struct kvm_rmap_head *end_rmap;
};
-static void
-rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
+static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator,
+ int level)
{
iterator->level = level;
iterator->gfn = iterator->start_gfn;
@@ -1513,10 +1528,10 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
}
-static void
-slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
- const struct kvm_memory_slot *slot, int start_level,
- int end_level, gfn_t start_gfn, gfn_t end_gfn)
+static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+ const struct kvm_memory_slot *slot,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn)
{
iterator->slot = slot;
iterator->start_level = start_level;
@@ -1789,12 +1804,6 @@ static void mark_unsync(u64 *spte)
kvm_mmu_mark_parents_unsync(sp);
}
-static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- return -1;
-}
-
#define KVM_PAGE_ARRAY_NR 16
struct kvm_mmu_pages {
@@ -1914,10 +1923,79 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
+static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
+
+ /*
+ * Ignore various flags when verifying that it's safe to sync a shadow
+ * page using the current MMU context.
+ *
+ * - level: not part of the overall MMU role and will never match as the MMU's
+ * level tracks the root level
+ * - access: updated based on the new guest PTE
+ * - quadrant: not part of the overall MMU role (similar to level)
+ */
+ const union kvm_mmu_page_role sync_role_ign = {
+ .level = 0xf,
+ .access = 0x7,
+ .quadrant = 0x3,
+ .passthrough = 0x1,
+ };
+
+ /*
+ * Direct pages can never be unsync, and KVM should never attempt to
+ * sync a shadow page for a different MMU context, e.g. if the role
+ * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
+ * reserved bits checks will be wrong, etc...
+ */
+ if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
+ (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
+ return false;
+
+ return true;
+}
+
+static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
+{
+ if (!sp->spt[i])
+ return 0;
+
+ return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
+}
+
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ int flush = 0;
+ int i;
+
+ if (!kvm_sync_page_check(vcpu, sp))
+ return -1;
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ int ret = kvm_sync_spte(vcpu, sp, i);
+
+ if (ret < -1)
+ return -1;
+ flush |= ret;
+ }
+
+ /*
+ * Note, any flush is purely for KVM's correctness, e.g. when dropping
+ * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
+ * unmap or dirty logging event doesn't fail to flush. The guest is
+ * responsible for flushing the TLB to ensure any changes in protection
+ * bits are recognized, i.e. until the guest flushes or page faults on
+ * a relevant address, KVM is architecturally allowed to let vCPUs use
+ * cached translations with the old protection bits.
+ */
+ return flush;
+}
+
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
+ int ret = __kvm_sync_page(vcpu, sp);
if (ret < 0)
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
@@ -3304,9 +3382,9 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
* Returns true if the SPTE was fixed successfully. Otherwise,
* someone else modified the SPTE from its original value.
*/
-static bool
-fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
- u64 *sptep, u64 old_spte, u64 new_spte)
+static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ u64 *sptep, u64 old_spte, u64 new_spte)
{
/*
* Theoretically we could also set dirty bit (and flush TLB) here in
@@ -3513,6 +3591,8 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
LIST_HEAD(invalid_list);
bool free_active_root;
+ WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL);
+
BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
/* Before acquiring the MMU lock, see if we need to do any real work. */
@@ -3731,7 +3811,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
int quadrant, i, r;
hpa_t root;
- root_pgd = mmu->get_guest_pgd(vcpu);
+ root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
root_gfn = root_pgd >> PAGE_SHIFT;
if (mmu_check_root(vcpu, root_gfn))
@@ -4181,7 +4261,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
arch.token = alloc_apf_token(vcpu);
arch.gfn = gfn;
arch.direct_map = vcpu->arch.mmu->root_role.direct;
- arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
+ arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
return kvm_setup_async_pf(vcpu, cr2_or_gpa,
kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
@@ -4200,10 +4280,10 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
return;
if (!vcpu->arch.mmu->root_role.direct &&
- work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
+ work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
return;
- kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
}
static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -4469,8 +4549,7 @@ static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->sync_page = nonpaging_sync_page;
- context->invlpg = NULL;
+ context->sync_spte = NULL;
}
static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
@@ -4604,11 +4683,6 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
}
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
-static unsigned long get_cr3(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr3(vcpu);
-}
-
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
unsigned int access)
{
@@ -4638,10 +4712,9 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
#include "paging_tmpl.h"
#undef PTTYPE
-static void
-__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
- u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
- bool pse, bool amd)
+static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
+ u64 pa_bits_rsvd, int level, bool nx,
+ bool gbpages, bool pse, bool amd)
{
u64 gbpages_bit_rsvd = 0;
u64 nonleaf_bit8_rsvd = 0;
@@ -4754,9 +4827,9 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
guest_cpuid_is_amd_or_hygon(vcpu));
}
-static void
-__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
- u64 pa_bits_rsvd, bool execonly, int huge_page_level)
+static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
+ u64 pa_bits_rsvd, bool execonly,
+ int huge_page_level)
{
u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
@@ -4856,8 +4929,7 @@ static inline bool boot_cpu_is_amd(void)
* the direct page table on host, use as much mmu features as
* possible, however, kvm currently does not do execution-protection.
*/
-static void
-reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
+static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
{
struct rsvd_bits_validate *shadow_zero_check;
int i;
@@ -5060,20 +5132,18 @@ static void paging64_init_context(struct kvm_mmu *context)
{
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
- context->sync_page = paging64_sync_page;
- context->invlpg = paging64_invlpg;
+ context->sync_spte = paging64_sync_spte;
}
static void paging32_init_context(struct kvm_mmu *context)
{
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
- context->sync_page = paging32_sync_page;
- context->invlpg = paging32_invlpg;
+ context->sync_spte = paging32_sync_spte;
}
-static union kvm_cpu_role
-kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
+static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
+ const struct kvm_mmu_role_regs *regs)
{
union kvm_cpu_role role = {0};
@@ -5112,6 +5182,21 @@ kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
return role;
}
+void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP);
+
+ BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP);
+ BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS));
+
+ if (is_cr0_wp(mmu) == cr0_wp)
+ return;
+
+ mmu->cpu_role.base.cr0_wp = cr0_wp;
+ reset_guest_paging_metadata(vcpu, mmu);
+}
+
static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
{
/* tdp_root_level is architecture forced level, use it if nonzero */
@@ -5157,9 +5242,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
context->cpu_role.as_u64 = cpu_role.as_u64;
context->root_role.word = root_role.word;
context->page_fault = kvm_tdp_page_fault;
- context->sync_page = nonpaging_sync_page;
- context->invlpg = NULL;
- context->get_guest_pgd = get_cr3;
+ context->sync_spte = NULL;
+ context->get_guest_pgd = get_guest_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
@@ -5289,8 +5373,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->page_fault = ept_page_fault;
context->gva_to_gpa = ept_gva_to_gpa;
- context->sync_page = ept_sync_page;
- context->invlpg = ept_invlpg;
+ context->sync_spte = ept_sync_spte;
update_permission_bitmask(context, true);
context->pkru_mask = 0;
@@ -5309,7 +5392,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
kvm_init_shadow_mmu(vcpu, cpu_role);
- context->get_guest_pgd = get_cr3;
+ context->get_guest_pgd = get_guest_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
}
@@ -5323,7 +5406,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
return;
g_context->cpu_role.as_u64 = new_mode.as_u64;
- g_context->get_guest_pgd = get_cr3;
+ g_context->get_guest_pgd = get_guest_cr3;
g_context->get_pdptr = kvm_pdptr_read;
g_context->inject_page_fault = kvm_inject_page_fault;
@@ -5331,7 +5414,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
* L2 page tables are never shadowed, so there is no need to sync
* SPTEs.
*/
- g_context->invlpg = NULL;
+ g_context->sync_spte = NULL;
/*
* Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
@@ -5393,7 +5476,7 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
* Changing guest CPUID after KVM_RUN is forbidden, see the comment in
* kvm_arch_vcpu_ioctl().
*/
- KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
+ KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -5664,7 +5747,8 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
if (r == RET_PF_INVALID) {
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
- lower_32_bits(error_code), false);
+ lower_32_bits(error_code), false,
+ &emulation_type);
if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
}
@@ -5706,48 +5790,77 @@ emulate:
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
-void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gva_t gva, hpa_t root_hpa)
+static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, hpa_t root_hpa)
+{
+ struct kvm_shadow_walk_iterator iterator;
+
+ vcpu_clear_mmio_info(vcpu, addr);
+
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) {
+ struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep);
+
+ if (sp->unsync) {
+ int ret = kvm_sync_spte(vcpu, sp, iterator.index);
+
+ if (ret < 0)
+ mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL);
+ if (ret)
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep);
+ }
+
+ if (!sp->unsync_children)
+ break;
+ }
+ write_unlock(&vcpu->kvm->mmu_lock);
+}
+
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, unsigned long roots)
{
int i;
+ WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
+
/* It's actually a GPA for vcpu->arch.guest_mmu. */
if (mmu != &vcpu->arch.guest_mmu) {
/* INVLPG on a non-canonical address is a NOP according to the SDM. */
- if (is_noncanonical_address(gva, vcpu))
+ if (is_noncanonical_address(addr, vcpu))
return;
- static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, addr);
}
- if (!mmu->invlpg)
+ if (!mmu->sync_spte)
return;
- if (root_hpa == INVALID_PAGE) {
- mmu->invlpg(vcpu, gva, mmu->root.hpa);
+ if (roots & KVM_MMU_ROOT_CURRENT)
+ __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa);
- /*
- * INVLPG is required to invalidate any global mappings for the VA,
- * irrespective of PCID. Since it would take us roughly similar amount
- * of work to determine whether any of the prev_root mappings of the VA
- * is marked global, or to just sync it blindly, so we might as well
- * just always sync it.
- *
- * Mappings not reachable via the current cr3 or the prev_roots will be
- * synced when switching to that cr3, so nothing needs to be done here
- * for them.
- */
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- if (VALID_PAGE(mmu->prev_roots[i].hpa))
- mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
- } else {
- mmu->invlpg(vcpu, gva, root_hpa);
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (roots & KVM_MMU_ROOT_PREVIOUS(i))
+ __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa);
}
}
+EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
+ /*
+ * INVLPG is required to invalidate any global mappings for the VA,
+ * irrespective of PCID. Blindly sync all roots as it would take
+ * roughly the same amount of work/time to determine whether any of the
+ * previous roots have a global mapping.
+ *
+ * Mappings not reachable via the current or previous cached roots will
+ * be synced when switching to that new cr3, so nothing needs to be
+ * done here for them.
+ */
+ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
@@ -5756,27 +5869,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- bool tlb_flush = false;
+ unsigned long roots = 0;
uint i;
- if (pcid == kvm_get_active_pcid(vcpu)) {
- if (mmu->invlpg)
- mmu->invlpg(vcpu, gva, mmu->root.hpa);
- tlb_flush = true;
- }
+ if (pcid == kvm_get_active_pcid(vcpu))
+ roots |= KVM_MMU_ROOT_CURRENT;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
- pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
- if (mmu->invlpg)
- mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
- tlb_flush = true;
- }
+ pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd))
+ roots |= KVM_MMU_ROOT_PREVIOUS(i);
}
- if (tlb_flush)
- static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
-
+ if (roots)
+ kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
++vcpu->stat.invlpg;
/*
@@ -5813,29 +5919,30 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm,
+typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
const struct kvm_memory_slot *slot);
-/* The caller should hold mmu-lock before calling this function. */
-static __always_inline bool
-slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
- slot_level_handler fn, int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
- bool flush)
+static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn,
+ bool flush_on_yield, bool flush)
{
struct slot_rmap_walk_iterator iterator;
- for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
end_gfn, &iterator) {
if (iterator.rmap)
- flush |= fn(kvm, iterator.rmap, memslot);
+ flush |= fn(kvm, iterator.rmap, slot);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
if (flush && flush_on_yield) {
- kvm_flush_remote_tlbs_with_address(kvm,
- start_gfn,
- iterator.gfn - start_gfn + 1);
+ kvm_flush_remote_tlbs_range(kvm, start_gfn,
+ iterator.gfn - start_gfn + 1);
flush = false;
}
cond_resched_rwlock_write(&kvm->mmu_lock);
@@ -5845,23 +5952,23 @@ slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
return flush;
}
-static __always_inline bool
-slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
- slot_level_handler fn, int start_level, int end_level,
- bool flush_on_yield)
+static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ bool flush_on_yield)
{
- return slot_handle_level_range(kvm, memslot, fn, start_level,
- end_level, memslot->base_gfn,
- memslot->base_gfn + memslot->npages - 1,
- flush_on_yield, false);
+ return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
+ slot->base_gfn, slot->base_gfn + slot->npages - 1,
+ flush_on_yield, false);
}
-static __always_inline bool
-slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool flush_on_yield)
+static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ bool flush_on_yield)
{
- return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
- PG_LEVEL_4K, flush_on_yield);
+ return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
}
static void free_mmu_pages(struct kvm_mmu *mmu)
@@ -6156,9 +6263,9 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
if (WARN_ON_ONCE(start >= end))
continue;
- flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap,
- PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true, flush);
+ flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true, flush);
}
}
@@ -6190,8 +6297,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
}
if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
- gfn_end - gfn_start);
+ kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
kvm_mmu_invalidate_end(kvm, 0, -1ul);
@@ -6211,8 +6317,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
{
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
- slot_handle_level(kvm, memslot, slot_rmap_write_protect,
- start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
+ walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect,
+ start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
write_unlock(&kvm->mmu_lock);
}
@@ -6447,10 +6553,9 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
* all the way to the target level. There's no need to split pages
* already at the target level.
*/
- for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) {
- slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages,
- level, level, start, end - 1, true, false);
- }
+ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
+ __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
+ level, level, start, end - 1, true, false);
}
/* Must be called with the mmu_lock held in write-mode. */
@@ -6529,7 +6634,7 @@ restart:
PG_LEVEL_NUM)) {
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
- if (kvm_available_flush_tlb_with_range())
+ if (kvm_available_flush_remote_tlbs_range())
kvm_flush_remote_tlbs_sptep(kvm, sptep);
else
need_tlb_flush = 1;
@@ -6548,8 +6653,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
* Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
* pages that are already mapped at the maximum hugepage level.
*/
- if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte,
- PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
+ if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
}
@@ -6580,8 +6685,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
* is observed by any other operation on the same memslot.
*/
lockdep_assert_held(&kvm->slots_lock);
- kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
- memslot->npages);
+ kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
}
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
@@ -6593,7 +6697,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
* Clear dirty bits only on 4k SPTEs since the legacy MMU only
* support dirty logging at a 4k granularity.
*/
- slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
+ walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false);
write_unlock(&kvm->mmu_lock);
}
@@ -6663,8 +6767,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
}
}
-static unsigned long
-mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long mmu_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
{
struct kvm *kvm;
int nr_to_scan = sc->nr_to_scan;
@@ -6722,8 +6826,8 @@ unlock:
return freed;
}
-static unsigned long
-mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long mmu_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index cc58631e2336..d39af5639ce9 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -170,14 +170,14 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn,
int min_level);
-void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
- u64 start_gfn, u64 pages);
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
+ gfn_t nr_pages);
/* Flush the given page (huge or not) of guest memory. */
static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level)
{
- kvm_flush_remote_tlbs_with_address(kvm, gfn_round_for_level(gfn, level),
- KVM_PAGES_PER_HPAGE(level));
+ kvm_flush_remote_tlbs_range(kvm, gfn_round_for_level(gfn, level),
+ KVM_PAGES_PER_HPAGE(level));
}
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
@@ -240,6 +240,13 @@ struct kvm_page_fault {
kvm_pfn_t pfn;
hva_t hva;
bool map_writable;
+
+ /*
+ * Indicates the guest is trying to write a gfn that contains one or
+ * more of the PTEs used to translate the write itself, i.e. the access
+ * is changing its own translation in the guest page tables.
+ */
+ bool write_fault_to_shadow_pgtable;
};
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
@@ -273,7 +280,7 @@ enum {
};
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 err, bool prefetch)
+ u32 err, bool prefetch, int *emulation_type)
{
struct kvm_page_fault fault = {
.addr = cr2_or_gpa,
@@ -312,6 +319,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
else
r = vcpu->arch.mmu->page_fault(vcpu, &fault);
+ if (fault.write_fault_to_shadow_pgtable && emulation_type)
+ *emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
+
/*
* Similar to above, prefetch faults aren't truly spurious, and the
* async #PF path doesn't do emulation. Do count faults that are fixed
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 57f0b75c80f9..0662e0278e70 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -324,7 +324,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
walker->level = mmu->cpu_role.base.level;
- pte = mmu->get_guest_pgd(vcpu);
+ pte = kvm_mmu_get_guest_pgd(vcpu, mmu);
have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
#if PTTYPE == 64
@@ -519,7 +519,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, pt_element_t gpte, bool no_dirty_log)
+ u64 *spte, pt_element_t gpte)
{
struct kvm_memory_slot *slot;
unsigned pte_access;
@@ -535,8 +535,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn,
- no_dirty_log && (pte_access & ACC_WRITE_MASK));
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK);
if (!slot)
return false;
@@ -605,7 +604,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (is_shadow_present_pte(*spte))
continue;
- if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
+ if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i]))
break;
}
}
@@ -685,8 +684,17 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (sp != ERR_PTR(-EEXIST))
link_shadow_page(vcpu, it.sptep, sp);
+
+ if (fault->write && table_gfn == fault->gfn)
+ fault->write_fault_to_shadow_pgtable = true;
}
+ /*
+ * Adjust the hugepage size _after_ resolving indirect shadow pages.
+ * KVM doesn't support mapping hugepages into the guest for gfns that
+ * are being shadowed by KVM, i.e. allocating a new shadow page may
+ * affect the allowed hugepage size.
+ */
kvm_mmu_hugepage_adjust(vcpu, fault);
trace_kvm_mmu_spte_requested(fault);
@@ -731,46 +739,6 @@ out_gpte_changed:
return RET_PF_RETRY;
}
- /*
- * To see whether the mapped gfn can write its page table in the current
- * mapping.
- *
- * It is the helper function of FNAME(page_fault). When guest uses large page
- * size to map the writable gfn which is used as current page table, we should
- * force kvm to use small page size to map it because new shadow page will be
- * created when kvm establishes shadow page table that stop kvm using large
- * page size. Do it early can avoid unnecessary #PF and emulation.
- *
- * @write_fault_to_shadow_pgtable will return true if the fault gfn is
- * currently used as its page table.
- *
- * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
- * since the PDPT is always shadowed, that means, we can not use large page
- * size to map the gfn which is used as PDPT.
- */
-static bool
-FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
- struct guest_walker *walker, bool user_fault,
- bool *write_fault_to_shadow_pgtable)
-{
- int level;
- gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
- bool self_changed = false;
-
- if (!(walker->pte_access & ACC_WRITE_MASK ||
- (!is_cr0_wp(vcpu->arch.mmu) && !user_fault)))
- return false;
-
- for (level = walker->level; level <= walker->max_level; level++) {
- gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
-
- self_changed |= !(gfn & mask);
- *write_fault_to_shadow_pgtable |= !gfn;
- }
-
- return self_changed;
-}
-
/*
* Page fault handler. There are several causes for a page fault:
* - there is no shadow pte for the guest pte
@@ -789,7 +757,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
{
struct guest_walker walker;
int r;
- bool is_self_change_mapping;
pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
WARN_ON_ONCE(fault->is_tdp);
@@ -814,6 +781,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
}
fault->gfn = walker.gfn;
+ fault->max_level = walker.level;
fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
if (page_fault_handle_page_track(vcpu, fault)) {
@@ -825,16 +793,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r)
return r;
- vcpu->arch.write_fault_to_shadow_pgtable = false;
-
- is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
- &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable);
-
- if (is_self_change_mapping)
- fault->max_level = PG_LEVEL_4K;
- else
- fault->max_level = walker.level;
-
r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
if (r != RET_PF_CONTINUE)
return r;
@@ -887,64 +845,6 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}
-static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
-{
- struct kvm_shadow_walk_iterator iterator;
- struct kvm_mmu_page *sp;
- u64 old_spte;
- int level;
- u64 *sptep;
-
- vcpu_clear_mmio_info(vcpu, gva);
-
- /*
- * No need to check return value here, rmap_can_add() can
- * help us to skip pte prefetch later.
- */
- mmu_topup_memory_caches(vcpu, true);
-
- if (!VALID_PAGE(root_hpa)) {
- WARN_ON(1);
- return;
- }
-
- write_lock(&vcpu->kvm->mmu_lock);
- for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
- level = iterator.level;
- sptep = iterator.sptep;
-
- sp = sptep_to_sp(sptep);
- old_spte = *sptep;
- if (is_last_spte(old_spte, level)) {
- pt_element_t gpte;
- gpa_t pte_gpa;
-
- if (!sp->unsync)
- break;
-
- pte_gpa = FNAME(get_level1_sp_gpa)(sp);
- pte_gpa += spte_index(sptep) * sizeof(pt_element_t);
-
- mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
- if (is_shadow_present_pte(old_spte))
- kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
-
- if (!rmap_can_add(vcpu))
- break;
-
- if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- break;
-
- FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false);
- }
-
- if (!sp->unsync_children)
- break;
- }
- write_unlock(&vcpu->kvm->mmu_lock);
-}
-
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gpa_t addr, u64 access,
@@ -977,114 +877,75 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
* can't change unless all sptes pointing to it are nuked first.
*
* Returns
- * < 0: the sp should be zapped
- * 0: the sp is synced and no tlb flushing is required
- * > 0: the sp is synced and tlb flushing is required
+ * < 0: failed to sync spte
+ * 0: the spte is synced and no tlb flushing is required
+ * > 0: the spte is synced and tlb flushing is required
*/
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
- union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
- int i;
bool host_writable;
gpa_t first_pte_gpa;
- bool flush = false;
-
- /*
- * Ignore various flags when verifying that it's safe to sync a shadow
- * page using the current MMU context.
- *
- * - level: not part of the overall MMU role and will never match as the MMU's
- * level tracks the root level
- * - access: updated based on the new guest PTE
- * - quadrant: not part of the overall MMU role (similar to level)
- */
- const union kvm_mmu_page_role sync_role_ign = {
- .level = 0xf,
- .access = 0x7,
- .quadrant = 0x3,
- .passthrough = 0x1,
- };
+ u64 *sptep, spte;
+ struct kvm_memory_slot *slot;
+ unsigned pte_access;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+ gfn_t gfn;
- /*
- * Direct pages can never be unsync, and KVM should never attempt to
- * sync a shadow page for a different MMU context, e.g. if the role
- * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
- * reserved bits checks will be wrong, etc...
- */
- if (WARN_ON_ONCE(sp->role.direct ||
- (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
- return -1;
+ if (WARN_ON_ONCE(!sp->spt[i]))
+ return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
+ pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
- for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
- u64 *sptep, spte;
- struct kvm_memory_slot *slot;
- unsigned pte_access;
- pt_element_t gpte;
- gpa_t pte_gpa;
- gfn_t gfn;
-
- if (!sp->spt[i])
- continue;
-
- pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
-
- if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- return -1;
-
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- flush = true;
- continue;
- }
-
- gfn = gpte_to_gfn(gpte);
- pte_access = sp->role.access;
- pte_access &= FNAME(gpte_access)(gpte);
- FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
-
- if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
- continue;
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return -1;
- /*
- * Drop the SPTE if the new protections would result in a RWX=0
- * SPTE or if the gfn is changing. The RWX=0 case only affects
- * EPT with execute-only support, i.e. EPT without an effective
- * "present" bit, as all other paging modes will create a
- * read-only SPTE if pte_access is zero.
- */
- if ((!pte_access && !shadow_present_mask) ||
- gfn != kvm_mmu_page_get_gfn(sp, i)) {
- drop_spte(vcpu->kvm, &sp->spt[i]);
- flush = true;
- continue;
- }
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte))
+ return 1;
- /* Update the shadowed access bits in case they changed. */
- kvm_mmu_page_set_access(sp, i, pte_access);
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access;
+ pte_access &= FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- sptep = &sp->spt[i];
- spte = *sptep;
- host_writable = spte & shadow_host_writable_mask;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- make_spte(vcpu, sp, slot, pte_access, gfn,
- spte_to_pfn(spte), spte, true, false,
- host_writable, &spte);
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
+ return 0;
- flush |= mmu_spte_update(sptep, spte);
+ /*
+ * Drop the SPTE if the new protections would result in a RWX=0
+ * SPTE or if the gfn is changing. The RWX=0 case only affects
+ * EPT with execute-only support, i.e. EPT without an effective
+ * "present" bit, as all other paging modes will create a
+ * read-only SPTE if pte_access is zero.
+ */
+ if ((!pte_access && !shadow_present_mask) ||
+ gfn != kvm_mmu_page_get_gfn(sp, i)) {
+ drop_spte(vcpu->kvm, &sp->spt[i]);
+ return 1;
}
-
/*
- * Note, any flush is purely for KVM's correctness, e.g. when dropping
- * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
- * unmap or dirty logging event doesn't fail to flush. The guest is
- * responsible for flushing the TLB to ensure any changes in protection
- * bits are recognized, i.e. until the guest flushes or page faults on
- * a relevant address, KVM is architecturally allowed to let vCPUs use
- * cached translations with the old protection bits.
+ * Do nothing if the permissions are unchanged. The existing SPTE is
+ * still valid, and prefetch_invalid_gpte() has verified that the A/D
+ * bits are set in the "new" gPTE, i.e. there is no danger of missing an
+ * A/D update due to A/D bits being set in the SPTE but not the gPTE.
*/
- return flush;
+ if (kvm_mmu_page_get_access(sp, i) == pte_access)
+ return 0;
+
+ /* Update the shadowed access bits in case they changed. */
+ kvm_mmu_page_set_access(sp, i, pte_access);
+
+ sptep = &sp->spt[i];
+ spte = *sptep;
+ host_writable = spte & shadow_host_writable_mask;
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ make_spte(vcpu, sp, slot, pte_access, gfn,
+ spte_to_pfn(spte), spte, true, false,
+ host_writable, &spte);
+
+ return mmu_spte_update(sptep, spte);
}
#undef pt_element_t
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index c15bfca3ed15..cf2c6426a6fc 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -164,7 +164,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
/*
* For simplicity, enforce the NX huge page mitigation even if not
* strictly necessary. KVM could ignore the mitigation if paging is
- * disabled in the guest, as the guest doesn't have an page tables to
+ * disabled in the guest, as the guest doesn't have any page tables to
* abuse. But to safely ignore the mitigation, KVM would have to
* ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG
* is toggled on, and that's a net negative for performance when TDP is
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index f0af385c56e0..fae559559a80 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -29,29 +29,49 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
WRITE_ONCE(*rcu_dereference(sptep), new_spte);
}
+/*
+ * SPTEs must be modified atomically if they are shadow-present, leaf
+ * SPTEs, and have volatile bits, i.e. have bits that can be set outside
+ * of mmu_lock. The Writable bit can be set by KVM's fast page fault
+ * handler, and Accessed and Dirty bits can be set by the CPU.
+ *
+ * Note, non-leaf SPTEs do have Accessed bits and those bits are
+ * technically volatile, but KVM doesn't consume the Accessed bit of
+ * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
+ * logic needs to be reassessed if KVM were to use non-leaf Accessed
+ * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+ */
+static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
+{
+ return is_shadow_present_pte(old_spte) &&
+ is_last_spte(old_spte, level) &&
+ spte_has_volatile_bits(old_spte);
+}
+
static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
u64 new_spte, int level)
{
- /*
- * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
- * volatile bits, i.e. has bits that can be set outside of mmu_lock.
- * The Writable bit can be set by KVM's fast page fault handler, and
- * Accessed and Dirty bits can be set by the CPU.
- *
- * Note, non-leaf SPTEs do have Accessed bits and those bits are
- * technically volatile, but KVM doesn't consume the Accessed bit of
- * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
- * logic needs to be reassessed if KVM were to use non-leaf Accessed
- * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
- */
- if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
- spte_has_volatile_bits(old_spte))
+ if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
__kvm_tdp_mmu_write_spte(sptep, new_spte);
return old_spte;
}
+static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
+ u64 mask, int level)
+{
+ atomic64_t *sptep_atomic;
+
+ if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) {
+ sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
+ return (u64)atomic64_fetch_and(~mask, sptep_atomic);
+ }
+
+ __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
+ return old_spte;
+}
+
/*
* A TDP iterator performs a pre-order walk over a TDP paging structure.
*/
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7c25dbf32ecc..b2fca11b91ff 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -334,35 +334,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared);
-static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
-{
- if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
- return;
-
- if (is_accessed_spte(old_spte) &&
- (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
- spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
- kvm_set_pfn_accessed(spte_to_pfn(old_spte));
-}
-
-static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level)
-{
- bool pfn_changed;
- struct kvm_memory_slot *slot;
-
- if (level > PG_LEVEL_4K)
- return;
-
- pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-
- if ((!is_writable_pte(old_spte) || pfn_changed) &&
- is_writable_pte(new_spte)) {
- slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
- mark_page_dirty_in_slot(kvm, slot, gfn);
- }
-}
-
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, +1);
@@ -505,7 +476,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
}
/**
- * __handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
* @as_id: the address space of the paging structure the SPTE was a part of
* @gfn: the base GFN that was mapped by the SPTE
@@ -516,12 +487,13 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* the MMU lock and the operation must synchronize with other
* threads that might be modifying SPTEs.
*
- * Handle bookkeeping that might result from the modification of a SPTE.
- * This function must be called for all TDP SPTE modifications.
+ * Handle bookkeeping that might result from the modification of a SPTE. Note,
+ * dirty logging updates are handled in common code, not here (see make_spte()
+ * and fast_pf_fix_direct_spte()).
*/
-static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level,
- bool shared)
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_spte, u64 new_spte, int level,
+ bool shared)
{
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
@@ -605,17 +577,10 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
if (was_present && !was_leaf &&
(is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
-}
-static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level,
- bool shared)
-{
- __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
- shared);
- handle_changed_spte_acc_track(old_spte, new_spte, level);
- handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
- new_spte, level);
+ if (was_leaf && is_accessed_spte(old_spte) &&
+ (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
+ kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
/*
@@ -658,9 +623,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
return -EBUSY;
- __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
- new_spte, iter->level, true);
- handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
+ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
return 0;
}
@@ -696,7 +660,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
/*
- * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
+ * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
* @kvm: KVM instance
* @as_id: Address space ID, i.e. regular vs. SMM
* @sptep: Pointer to the SPTE
@@ -704,23 +668,12 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* @new_spte: The new value that will be set for the SPTE
* @gfn: The base GFN that was (or will be) mapped by the SPTE
* @level: The level _containing_ the SPTE (its parent PT's level)
- * @record_acc_track: Notify the MM subsystem of changes to the accessed state
- * of the page. Should be set unless handling an MMU
- * notifier for access tracking. Leaving record_acc_track
- * unset in that case prevents page accesses from being
- * double counted.
- * @record_dirty_log: Record the page as dirty in the dirty bitmap if
- * appropriate for the change being made. Should be set
- * unless performing certain dirty logging operations.
- * Leaving record_dirty_log unset in that case prevents page
- * writes from being double counted.
*
* Returns the old SPTE value, which _may_ be different than @old_spte if the
* SPTE had voldatile bits.
*/
-static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
- u64 old_spte, u64 new_spte, gfn_t gfn, int level,
- bool record_acc_track, bool record_dirty_log)
+static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+ u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
lockdep_assert_held_write(&kvm->mmu_lock);
@@ -735,46 +688,17 @@ static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
- __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
-
- if (record_acc_track)
- handle_changed_spte_acc_track(old_spte, new_spte, level);
- if (record_dirty_log)
- handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
- new_spte, level);
+ handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
return old_spte;
}
-static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
- u64 new_spte, bool record_acc_track,
- bool record_dirty_log)
+static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
+ u64 new_spte)
{
WARN_ON_ONCE(iter->yielded);
-
- iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
- iter->old_spte, new_spte,
- iter->gfn, iter->level,
- record_acc_track, record_dirty_log);
-}
-
-static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
- u64 new_spte)
-{
- _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
-}
-
-static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
-{
- _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
-}
-
-static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
-{
- _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
+ iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+ iter->old_spte, new_spte,
+ iter->gfn, iter->level);
}
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
@@ -866,7 +790,7 @@ retry:
continue;
if (!shared)
- tdp_mmu_set_spte(kvm, &iter, 0);
+ tdp_mmu_iter_set_spte(kvm, &iter, 0);
else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
goto retry;
}
@@ -923,8 +847,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
return false;
- __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
- sp->gfn, sp->role.level + 1, true, true);
+ tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
+ sp->gfn, sp->role.level + 1);
return true;
}
@@ -958,7 +882,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- tdp_mmu_set_spte(kvm, &iter, 0);
+ tdp_mmu_iter_set_spte(kvm, &iter, 0);
flush = true;
}
@@ -1128,7 +1052,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
if (ret)
return ret;
} else {
- tdp_mmu_set_spte(kvm, iter, spte);
+ tdp_mmu_iter_set_spte(kvm, iter, spte);
}
tdp_account_mmu_page(kvm, sp);
@@ -1262,33 +1186,42 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
/*
* Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
* if any of the GFNs in the range have been accessed.
+ *
+ * No need to mark the corresponding PFN as accessed as this call is coming
+ * from the clear_young() or clear_flush_young() notifier, which uses the
+ * return value to determine if the page has been accessed.
*/
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_gfn_range *range)
{
- u64 new_spte = 0;
+ u64 new_spte;
/* If we have a non-accessed entry we don't need to change the pte. */
if (!is_accessed_spte(iter->old_spte))
return false;
- new_spte = iter->old_spte;
-
- if (spte_ad_enabled(new_spte)) {
- new_spte &= ~shadow_accessed_mask;
+ if (spte_ad_enabled(iter->old_spte)) {
+ iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
+ iter->old_spte,
+ shadow_accessed_mask,
+ iter->level);
+ new_spte = iter->old_spte & ~shadow_accessed_mask;
} else {
/*
* Capture the dirty status of the page, so that it doesn't get
* lost when the SPTE is marked for access tracking.
*/
- if (is_writable_pte(new_spte))
- kvm_set_pfn_dirty(spte_to_pfn(new_spte));
+ if (is_writable_pte(iter->old_spte))
+ kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
- new_spte = mark_spte_for_access_track(new_spte);
+ new_spte = mark_spte_for_access_track(iter->old_spte);
+ iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
+ iter->old_spte, new_spte,
+ iter->level);
}
- tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
-
+ trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
+ iter->old_spte, new_spte);
return true;
}
@@ -1324,15 +1257,15 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
* Note, when changing a read-only SPTE, it's not strictly necessary to
* zero the SPTE before setting the new PFN, but doing so preserves the
* invariant that the PFN of a present * leaf SPTE can never change.
- * See __handle_changed_spte().
+ * See handle_changed_spte().
*/
- tdp_mmu_set_spte(kvm, iter, 0);
+ tdp_mmu_iter_set_spte(kvm, iter, 0);
if (!pte_write(range->pte)) {
new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
pte_pfn(range->pte));
- tdp_mmu_set_spte(kvm, iter, new_spte);
+ tdp_mmu_iter_set_spte(kvm, iter, new_spte);
}
return true;
@@ -1349,7 +1282,7 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
/*
* No need to handle the remote TLB flush under RCU protection, the
* target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
- * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
+ * shadow page. See the WARN on pfn_changed in handle_changed_spte().
*/
return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}
@@ -1607,8 +1540,8 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
{
+ u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
struct tdp_iter iter;
- u64 new_spte;
bool spte_set = false;
rcu_read_lock();
@@ -1621,19 +1554,13 @@ retry:
if (!is_shadow_present_pte(iter.old_spte))
continue;
- if (spte_ad_need_write_protect(iter.old_spte)) {
- if (is_writable_pte(iter.old_spte))
- new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- else
- continue;
- } else {
- if (iter.old_spte & shadow_dirty_mask)
- new_spte = iter.old_spte & ~shadow_dirty_mask;
- else
- continue;
- }
+ MMU_WARN_ON(kvm_ad_enabled() &&
+ spte_ad_need_write_protect(iter.old_spte));
- if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
+ if (!(iter.old_spte & dbit))
+ continue;
+
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
goto retry;
spte_set = true;
@@ -1675,8 +1602,9 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t gfn, unsigned long mask, bool wrprot)
{
+ u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
+ shadow_dirty_mask;
struct tdp_iter iter;
- u64 new_spte;
rcu_read_lock();
@@ -1685,25 +1613,26 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
if (!mask)
break;
+ MMU_WARN_ON(kvm_ad_enabled() &&
+ spte_ad_need_write_protect(iter.old_spte));
+
if (iter.level > PG_LEVEL_4K ||
!(mask & (1UL << (iter.gfn - gfn))))
continue;
mask &= ~(1UL << (iter.gfn - gfn));
- if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
- if (is_writable_pte(iter.old_spte))
- new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- else
- continue;
- } else {
- if (iter.old_spte & shadow_dirty_mask)
- new_spte = iter.old_spte & ~shadow_dirty_mask;
- else
- continue;
- }
+ if (!(iter.old_spte & dbit))
+ continue;
+
+ iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
+ iter.old_spte, dbit,
+ iter.level);
- tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+ trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
+ iter.old_spte,
+ iter.old_spte & ~dbit);
+ kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
}
rcu_read_unlock();
@@ -1821,7 +1750,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
if (new_spte == iter.old_spte)
break;
- tdp_mmu_set_spte(kvm, &iter, new_spte);
+ tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
spte_set = true;
}
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 612e6c70ce2e..1690d41c1830 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -93,7 +93,7 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
#undef __KVM_X86_PMU_OP
}
-static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
+static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc)
{
return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc);
}
@@ -400,6 +400,12 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc)
return is_fixed_event_allowed(filter, pmc->idx);
}
+static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
+{
+ return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
+ check_pmu_event_filter(pmc);
+}
+
static void reprogram_counter(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -409,10 +415,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
pmc_pause_counter(pmc);
- if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
- goto reprogram_complete;
-
- if (!check_pmu_event_filter(pmc))
+ if (!pmc_event_is_allowed(pmc))
goto reprogram_complete;
if (pmc->counter < pmc->prev_counter)
@@ -540,9 +543,9 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
if (!pmc)
return 1;
- if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
(static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
- (kvm_read_cr0(vcpu) & X86_CR0_PE))
+ kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
return 1;
*data = pmc_read_counter(pmc) & mask;
@@ -589,6 +592,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
*/
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
+ if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
+ return;
+
+ bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX);
static_call(kvm_x86_pmu_refresh)(vcpu);
}
@@ -646,7 +653,7 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
pmc->prev_counter = pmc->counter;
pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
- kvm_pmu_request_counter_reprogam(pmc);
+ kvm_pmu_request_counter_reprogram(pmc);
}
static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
@@ -684,7 +691,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
- if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
+ if (!pmc || !pmc_event_is_allowed(pmc))
continue;
/* Ignore checks for edge detect, pin control, invert and CMASK bits */
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index be62c16f2265..5c7bbf03b599 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -195,7 +195,7 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
KVM_PMC_MAX_FIXED);
}
-static inline void kvm_pmu_request_counter_reprogam(struct kvm_pmc *pmc)
+static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc)
{
set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 05d38944a6c0..96936ddf1b3c 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -139,13 +139,18 @@ void recalc_intercepts(struct vcpu_svm *svm)
if (g->int_ctl & V_INTR_MASKING_MASK) {
/*
- * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8
- * does not affect any interrupt we may want to inject;
- * therefore, writes to CR8 are irrelevant to L0, as are
- * interrupt window vmexits.
+ * If L2 is active and V_INTR_MASKING is enabled in vmcb12,
+ * disable intercept of CR8 writes as L2's CR8 does not affect
+ * any interrupt KVM may want to inject.
+ *
+ * Similarly, disable intercept of virtual interrupts (used to
+ * detect interrupt windows) if the saved RFLAGS.IF is '0', as
+ * the effective RFLAGS.IF for L1 interrupts will never be set
+ * while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs).
*/
vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
- vmcb_clr_intercept(c, INTERCEPT_VINTR);
+ if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF))
+ vmcb_clr_intercept(c, INTERCEPT_VINTR);
}
/*
@@ -276,6 +281,11 @@ static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl)))
return false;
+ if (CC((control->int_ctl & V_NMI_ENABLE_MASK) &&
+ !vmcb12_is_intercept(control, INTERCEPT_NMI))) {
+ return false;
+ }
+
return true;
}
@@ -416,22 +426,24 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
/* Only a few fields of int_ctl are written by the processor. */
mask = V_IRQ_MASK | V_TPR_MASK;
- if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) &&
- svm_is_intercept(svm, INTERCEPT_VINTR)) {
- /*
- * In order to request an interrupt window, L0 is usurping
- * svm->vmcb->control.int_ctl and possibly setting V_IRQ
- * even if it was clear in L1's VMCB. Restoring it would be
- * wrong. However, in this case V_IRQ will remain true until
- * interrupt_window_interception calls svm_clear_vintr and
- * restores int_ctl. We can just leave it aside.
- */
+ /*
+ * Don't sync vmcb02 V_IRQ back to vmcb12 if KVM (L0) is intercepting
+ * virtual interrupts in order to request an interrupt window, as KVM
+ * has usurped vmcb02's int_ctl. If an interrupt window opens before
+ * the next VM-Exit, svm_clear_vintr() will restore vmcb12's int_ctl.
+ * If no window opens, V_IRQ will be correctly preserved in vmcb12's
+ * int_ctl (because it was never recognized while L2 was running).
+ */
+ if (svm_is_intercept(svm, INTERCEPT_VINTR) &&
+ !test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts))
mask &= ~V_IRQ_MASK;
- }
if (nested_vgif_enabled(svm))
mask |= V_GIF_MASK;
+ if (nested_vnmi_enabled(svm))
+ mask |= V_NMI_BLOCKING_MASK | V_NMI_PENDING_MASK;
+
svm->nested.ctl.int_ctl &= ~mask;
svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
}
@@ -651,6 +663,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
else
int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+ if (vnmi) {
+ if (vmcb01->control.int_ctl & V_NMI_PENDING_MASK) {
+ svm->vcpu.arch.nmi_pending++;
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ }
+ if (nested_vnmi_enabled(svm))
+ int_ctl_vmcb12_bits |= (V_NMI_PENDING_MASK |
+ V_NMI_ENABLE_MASK |
+ V_NMI_BLOCKING_MASK);
+ }
+
/* Copied from vmcb01. msrpm_base can be overwritten later. */
vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
@@ -1021,6 +1044,28 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm_switch_vmcb(svm, &svm->vmcb01);
+ /*
+ * Rules for synchronizing int_ctl bits from vmcb02 to vmcb01:
+ *
+ * V_IRQ, V_IRQ_VECTOR, V_INTR_PRIO_MASK, V_IGN_TPR: If L1 doesn't
+ * intercept interrupts, then KVM will use vmcb02's V_IRQ (and related
+ * flags) to detect interrupt windows for L1 IRQs (even if L1 uses
+ * virtual interrupt masking). Raise KVM_REQ_EVENT to ensure that
+ * KVM re-requests an interrupt window if necessary, which implicitly
+ * copies these bits from vmcb02 to vmcb01.
+ *
+ * V_TPR: If L1 doesn't use virtual interrupt masking, then L1's vTPR
+ * is stored in vmcb02, but its value doesn't need to be copied from/to
+ * vmcb01 because it is copied from/to the virtual APIC's TPR register
+ * on each VM entry/exit.
+ *
+ * V_GIF: If nested vGIF is not used, KVM uses vmcb02's V_GIF for L1's
+ * V_GIF. However, GIF is architecturally clear on each VM exit, thus
+ * there is no need to copy V_GIF from vmcb02 to vmcb01.
+ */
+ if (!nested_exit_on_intr(svm))
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
svm_copy_lbrs(vmcb12, vmcb02);
svm_update_lbrv(vcpu);
@@ -1029,6 +1074,20 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm_update_lbrv(vcpu);
}
+ if (vnmi) {
+ if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
+ vmcb01->control.int_ctl |= V_NMI_BLOCKING_MASK;
+ else
+ vmcb01->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
+
+ if (vcpu->arch.nmi_pending) {
+ vcpu->arch.nmi_pending--;
+ vmcb01->control.int_ctl |= V_NMI_PENDING_MASK;
+ } else {
+ vmcb01->control.int_ctl &= ~V_NMI_PENDING_MASK;
+ }
+ }
+
/*
* On vmexit the GIF is set to false and
* no event can be injected in L1.
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index cc77a0681800..5fa939e411d8 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -161,7 +161,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
data &= ~pmu->reserved_bits;
if (data != pmc->eventsel) {
pmc->eventsel = data;
- kvm_pmu_request_counter_reprogam(pmc);
+ kvm_pmu_request_counter_reprogram(pmc);
}
return 0;
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a1b08359769b..ca32389f3c36 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -99,6 +99,7 @@ static const struct svm_direct_access_msrs {
#endif
{ .index = MSR_IA32_SPEC_CTRL, .always = false },
{ .index = MSR_IA32_PRED_CMD, .always = false },
+ { .index = MSR_IA32_FLUSH_CMD, .always = false },
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
@@ -234,6 +235,8 @@ module_param(dump_invalid_vmcb, bool, 0644);
bool intercept_smi = true;
module_param(intercept_smi, bool, 0444);
+bool vnmi = true;
+module_param(vnmi, bool, 0444);
static bool svm_gp_erratum_intercept = true;
@@ -1315,6 +1318,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
if (kvm_vcpu_apicv_active(vcpu))
avic_init_vmcb(svm, vmcb);
+ if (vnmi)
+ svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
+
if (vgif) {
svm_clr_intercept(svm, INTERCEPT_STGI);
svm_clr_intercept(svm, INTERCEPT_CLGI);
@@ -1588,6 +1594,16 @@ static void svm_set_vintr(struct vcpu_svm *svm)
svm_set_intercept(svm, INTERCEPT_VINTR);
/*
+ * Recalculating intercepts may have cleared the VINTR intercept. If
+ * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
+ * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
+ * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
+ * interrupts will never be unblocked while L2 is running.
+ */
+ if (!svm_is_intercept(svm, INTERCEPT_VINTR))
+ return;
+
+ /*
* This is just a dummy VINTR to actually cause a vmexit to happen.
* Actual injection of virtual interrupts happens through EVENTINJ.
*/
@@ -2484,16 +2500,29 @@ static int task_switch_interception(struct kvm_vcpu *vcpu)
has_error_code, error_code);
}
+/* Drop the IRET intercept; does nothing for SEV-ES guests. */
+static void svm_clr_iret_intercept(struct vcpu_svm *svm)
+{
+	if (sev_es_guest(svm->vcpu.kvm))
+		return;
+
+	svm_clr_intercept(svm, INTERCEPT_IRET);
+}
+
+/* Arm the IRET intercept; does nothing for SEV-ES guests. */
+static void svm_set_iret_intercept(struct vcpu_svm *svm)
+{
+	if (sev_es_guest(svm->vcpu.kvm))
+		return;
+
+	svm_set_intercept(svm, INTERCEPT_IRET);
+}
+
static int iret_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
++vcpu->stat.nmi_window_exits;
svm->awaiting_iret_completion = true;
- if (!sev_es_guest(vcpu->kvm)) {
- svm_clr_intercept(svm, INTERCEPT_IRET);
+
+ svm_clr_iret_intercept(svm);
+ if (!sev_es_guest(vcpu->kvm))
svm->nmi_iret_rip = kvm_rip_read(vcpu);
- }
+
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
@@ -2876,7 +2905,7 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- int r;
+ int ret = 0;
u32 ecx = msr->index;
u64 data = msr->data;
@@ -2946,21 +2975,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
*/
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
break;
- case MSR_IA32_PRED_CMD:
- if (!msr->host_initiated &&
- !guest_has_pred_cmd_msr(vcpu))
- return 1;
-
- if (data & ~PRED_CMD_IBPB)
- return 1;
- if (!boot_cpu_has(X86_FEATURE_IBPB))
- return 1;
- if (!data)
- break;
-
- wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
- break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
@@ -3013,10 +3027,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* guest via direct_access_msrs, and switch it via user return.
*/
preempt_disable();
- r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
+ ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
preempt_enable();
- if (r)
- return 1;
+ if (ret)
+ break;
svm->tsc_aux = data;
break;
@@ -3074,7 +3088,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
default:
return kvm_set_msr_common(vcpu, msr);
}
- return 0;
+ return ret;
}
static int msr_interception(struct kvm_vcpu *vcpu)
@@ -3485,11 +3499,43 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
return;
svm->nmi_masked = true;
- if (!sev_es_guest(vcpu->kvm))
- svm_set_intercept(svm, INTERCEPT_IRET);
+ svm_set_iret_intercept(svm);
++vcpu->stat.nmi_injections;
}
+/*
+ * Report whether a virtual NMI is queued in the current VMCB, i.e. whether
+ * V_NMI_PENDING is set in int_ctl.  Always false when vNMI is not enabled
+ * for this vCPU.
+ */
+static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!is_vnmi_enabled(svm))
+		return false;
+
+	/*
+	 * Test V_NMI_PENDING, the bit svm_set_vnmi_pending() sets when it
+	 * queues an NMI.  V_NMI_BLOCKING is the NMI-masked state reported by
+	 * svm_get_nmi_mask() and says nothing about a queued NMI.
+	 */
+	return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
+}
+
+/*
+ * Queue a virtual NMI by setting V_NMI_PENDING in the current VMCB.
+ *
+ * Returns true if the NMI was queued, false if vNMI is not enabled or a
+ * virtual NMI is already pending.
+ */
+static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!is_vnmi_enabled(svm) ||
+	    (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK))
+		return false;
+
+	svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
+	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+
+	/*
+	 * Hardware services the pending NMI, so KVM never observes the actual
+	 * injection.  Handing the NMI to hardware is therefore counted as the
+	 * injection for statistics purposes.
+	 */
+	++vcpu->stat.nmi_injections;
+
+	return true;
+}
+
static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3585,6 +3631,35 @@ static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
}
+/*
+ * Return whether NMIs are currently masked: V_NMI_BLOCKING from the VMCB when
+ * vNMI is enabled, otherwise the software-tracked nmi_masked flag.
+ */
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!is_vnmi_enabled(svm))
+		return svm->nmi_masked;
+
+	return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
+}
+
+/*
+ * Mask or unmask NMIs.  Without vNMI the state lives in svm->nmi_masked and
+ * the IRET intercept is toggled to match; with vNMI only V_NMI_BLOCKING in
+ * the VMCB is updated.
+ */
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!is_vnmi_enabled(svm)) {
+		svm->nmi_masked = masked;
+		if (masked)
+			svm_set_iret_intercept(svm);
+		else
+			svm_clr_iret_intercept(svm);
+		return;
+	}
+
+	if (masked)
+		svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
+	else
+		svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
+}
+
bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3596,8 +3671,10 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
return false;
- return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
- svm->nmi_masked;
+ if (svm_get_nmi_mask(vcpu))
+ return true;
+
+ return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
}
static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
@@ -3615,26 +3692,6 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return 1;
}
-static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
-{
- return to_svm(vcpu)->nmi_masked;
-}
-
-static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (masked) {
- svm->nmi_masked = true;
- if (!sev_es_guest(vcpu->kvm))
- svm_set_intercept(svm, INTERCEPT_IRET);
- } else {
- svm->nmi_masked = false;
- if (!sev_es_guest(vcpu->kvm))
- svm_clr_intercept(svm, INTERCEPT_IRET);
- }
-}
-
bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3715,7 +3772,16 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm->nmi_masked && !svm->awaiting_iret_completion)
+ /*
+ * KVM should never request an NMI window when vNMI is enabled, as KVM
+ * allows at most one to-be-injected NMI and one pending NMI, i.e. if
+ * two NMIs arrive simultaneously, KVM will inject one and set
+ * V_NMI_PENDING for the other. WARN, but continue with the standard
+ * single-step approach to try and salvage the pending NMI.
+ */
+ WARN_ON_ONCE(is_vnmi_enabled(svm));
+
+ if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
return; /* IRET will cause a vm exit */
if (!gif_set(svm)) {
@@ -3777,13 +3843,13 @@ static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
{
/*
* When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
- * flushes should be routed to hv_remote_flush_tlb() without requesting
+ * flushes should be routed to hv_flush_remote_tlbs() without requesting
* a "regular" remote flush. Reaching this point means either there's
- * a KVM bug or a prior hv_remote_flush_tlb() call failed, both of
+ * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of
* which might be fatal to the guest. Yell, but try to recover.
*/
if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
- hv_remote_flush_tlb(vcpu->kvm);
+ hv_flush_remote_tlbs(vcpu->kvm);
svm_flush_tlb_asid(vcpu);
}
@@ -4142,7 +4208,7 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_MCG_EXT_CTL:
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
return false;
case MSR_IA32_SMBASE:
if (!IS_ENABLED(CONFIG_KVM_SMM))
@@ -4184,8 +4250,18 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
+ svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI);
+
svm_recalc_instruction_intercepts(vcpu, svm);
+ if (boot_cpu_has(X86_FEATURE_IBPB))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
+ !!guest_has_pred_cmd_msr(vcpu));
+
+ if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
+ !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
/* For sev guests, the memory encryption bit is not reserved in CR3. */
if (sev_guest(vcpu->kvm)) {
best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
@@ -4563,7 +4639,6 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
{
bool smep, smap, is_user;
- unsigned long cr4;
u64 error_code;
/* Emulation is always possible when KVM has access to all guest state. */
@@ -4655,9 +4730,8 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
goto resume_guest;
- cr4 = kvm_read_cr4(vcpu);
- smep = cr4 & X86_CR4_SMEP;
- smap = cr4 & X86_CR4_SMAP;
+ smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
+ smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
@@ -4795,6 +4869,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.patch_hypercall = svm_patch_hypercall,
.inject_irq = svm_inject_irq,
.inject_nmi = svm_inject_nmi,
+ .is_vnmi_pending = svm_is_vnmi_pending,
+ .set_vnmi_pending = svm_set_vnmi_pending,
.inject_exception = svm_inject_exception,
.cancel_injection = svm_cancel_injection,
.interrupt_allowed = svm_interrupt_allowed,
@@ -4937,6 +5013,9 @@ static __init void svm_set_cpu_caps(void)
if (vgif)
kvm_cpu_cap_set(X86_FEATURE_VGIF);
+ if (vnmi)
+ kvm_cpu_cap_set(X86_FEATURE_VNMI);
+
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
@@ -5088,6 +5167,16 @@ static __init int svm_hardware_setup(void)
pr_info("Virtual GIF supported\n");
}
+ vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
+ if (vnmi)
+ pr_info("Virtual NMI enabled\n");
+
+ if (!vnmi) {
+ svm_x86_ops.is_vnmi_pending = NULL;
+ svm_x86_ops.set_vnmi_pending = NULL;
+ }
+
+
if (lbrv) {
if (!boot_cpu_has(X86_FEATURE_LBRV))
lbrv = false;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 839809972da1..f44751dd8d5d 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -36,6 +36,7 @@ extern bool npt_enabled;
extern int vgif;
extern bool intercept_smi;
extern bool x2avic_enabled;
+extern bool vnmi;
/*
* Clean bits in VMCB.
@@ -265,6 +266,7 @@ struct vcpu_svm {
bool pause_filter_enabled : 1;
bool pause_threshold_enabled : 1;
bool vgif_enabled : 1;
+ bool vnmi_enabled : 1;
u32 ldr_reg;
u32 dfr_reg;
@@ -539,6 +541,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}
+/*
+ * True when vNMI is enabled for this vCPU (svm->vnmi_enabled) and the cached
+ * nested control area has V_NMI_ENABLE set in int_ctl.
+ */
+static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
+{
+	if (!svm->vnmi_enabled)
+		return false;
+
+	return !!(svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK);
+}
+
static inline bool is_x2apic_msrpm_offset(u32 offset)
{
/* 4 msrs per u8, and 4 u8 in u32 */
@@ -548,6 +556,27 @@ static inline bool is_x2apic_msrpm_offset(u32 offset)
(msr < (APIC_BASE_MSR + 0x100));
}
+/*
+ * Return vmcb01 when the vnmi module parameter is set and the vCPU is not in
+ * guest mode; return NULL otherwise.
+ */
+static inline struct vmcb *get_vnmi_vmcb_l1(struct vcpu_svm *svm)
+{
+	if (!vnmi || is_guest_mode(&svm->vcpu))
+		return NULL;
+
+	return svm->vmcb01.ptr;
+}
+
+/*
+ * True when get_vnmi_vmcb_l1() yields a VMCB and that VMCB has V_NMI_ENABLE
+ * set in int_ctl.
+ */
+static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
+{
+	struct vmcb *l1_vmcb = get_vnmi_vmcb_l1(svm);
+
+	return l1_vmcb && (l1_vmcb->control.int_ctl & V_NMI_ENABLE_MASK);
+}
+
/* svm.c */
#define MSR_INVALID 0xffffffffU
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index 786d46d73a8e..f85bc617ffe4 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -45,9 +45,8 @@ static inline __init void svm_hv_hardware_setup(void)
if (npt_enabled &&
ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n");
- svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
- svm_x86_ops.tlb_remote_flush_with_range =
- hv_remote_flush_tlb_with_range;
+ svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
}
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c
index 22daca752797..79450e1ed7cf 100644
--- a/arch/x86/kvm/vmx/hyperv.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -13,7 +13,110 @@
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
-DEFINE_STATIC_KEY_FALSE(enable_evmcs);
+/*
+ * Enlightened VMCSv1 doesn't support these:
+ *
+ * POSTED_INTR_NV = 0x00000002,
+ * GUEST_INTR_STATUS = 0x00000810,
+ * APIC_ACCESS_ADDR = 0x00002014,
+ * POSTED_INTR_DESC_ADDR = 0x00002016,
+ * EOI_EXIT_BITMAP0 = 0x0000201c,
+ * EOI_EXIT_BITMAP1 = 0x0000201e,
+ * EOI_EXIT_BITMAP2 = 0x00002020,
+ * EOI_EXIT_BITMAP3 = 0x00002022,
+ * GUEST_PML_INDEX = 0x00000812,
+ * PML_ADDRESS = 0x0000200e,
+ * VM_FUNCTION_CONTROL = 0x00002018,
+ * EPTP_LIST_ADDRESS = 0x00002024,
+ * VMREAD_BITMAP = 0x00002026,
+ * VMWRITE_BITMAP = 0x00002028,
+ *
+ * TSC_MULTIPLIER = 0x00002032,
+ * PLE_GAP = 0x00004020,
+ * PLE_WINDOW = 0x00004022,
+ * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ *
+ * Currently unsupported in KVM:
+ * GUEST_IA32_RTIT_CTL = 0x00002814,
+ */
+#define EVMCS1_SUPPORTED_PINCTRL \
+ (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ PIN_BASED_EXT_INTR_MASK | \
+ PIN_BASED_NMI_EXITING | \
+ PIN_BASED_VIRTUAL_NMIS)
+
+#define EVMCS1_SUPPORTED_EXEC_CTRL \
+ (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ CPU_BASED_HLT_EXITING | \
+ CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING | \
+ CPU_BASED_UNCOND_IO_EXITING | \
+ CPU_BASED_MOV_DR_EXITING | \
+ CPU_BASED_USE_TSC_OFFSETTING | \
+ CPU_BASED_MWAIT_EXITING | \
+ CPU_BASED_MONITOR_EXITING | \
+ CPU_BASED_INVLPG_EXITING | \
+ CPU_BASED_RDPMC_EXITING | \
+ CPU_BASED_INTR_WINDOW_EXITING | \
+ CPU_BASED_CR8_LOAD_EXITING | \
+ CPU_BASED_CR8_STORE_EXITING | \
+ CPU_BASED_RDTSC_EXITING | \
+ CPU_BASED_TPR_SHADOW | \
+ CPU_BASED_USE_IO_BITMAPS | \
+ CPU_BASED_MONITOR_TRAP_FLAG | \
+ CPU_BASED_USE_MSR_BITMAPS | \
+ CPU_BASED_NMI_WINDOW_EXITING | \
+ CPU_BASED_PAUSE_EXITING | \
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+
+#define EVMCS1_SUPPORTED_2NDEXEC \
+ (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \
+ SECONDARY_EXEC_WBINVD_EXITING | \
+ SECONDARY_EXEC_ENABLE_VPID | \
+ SECONDARY_EXEC_ENABLE_EPT | \
+ SECONDARY_EXEC_UNRESTRICTED_GUEST | \
+ SECONDARY_EXEC_DESC | \
+ SECONDARY_EXEC_ENABLE_RDTSCP | \
+ SECONDARY_EXEC_ENABLE_INVPCID | \
+ SECONDARY_EXEC_XSAVES | \
+ SECONDARY_EXEC_RDSEED_EXITING | \
+ SECONDARY_EXEC_RDRAND_EXITING | \
+ SECONDARY_EXEC_TSC_SCALING | \
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \
+ SECONDARY_EXEC_PT_USE_GPA | \
+ SECONDARY_EXEC_PT_CONCEAL_VMX | \
+ SECONDARY_EXEC_BUS_LOCK_DETECTION | \
+ SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_ENCLS_EXITING)
+
+#define EVMCS1_SUPPORTED_3RDEXEC (0ULL)
+
+#define EVMCS1_SUPPORTED_VMEXIT_CTRL \
+ (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_EXIT_SAVE_DEBUG_CONTROLS | \
+ VM_EXIT_ACK_INTR_ON_EXIT | \
+ VM_EXIT_HOST_ADDR_SPACE_SIZE | \
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_IA32_PAT | \
+ VM_EXIT_LOAD_IA32_PAT | \
+ VM_EXIT_SAVE_IA32_EFER | \
+ VM_EXIT_LOAD_IA32_EFER | \
+ VM_EXIT_CLEAR_BNDCFGS | \
+ VM_EXIT_PT_CONCEAL_PIP | \
+ VM_EXIT_CLEAR_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMENTRY_CTRL \
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_IA32E_MODE | \
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_ENTRY_LOAD_IA32_PAT | \
+ VM_ENTRY_LOAD_IA32_EFER | \
+ VM_ENTRY_LOAD_BNDCFGS | \
+ VM_ENTRY_PT_CONCEAL_PIP | \
+ VM_ENTRY_LOAD_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMFUNC (0)
#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
@@ -506,6 +609,8 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
}
#if IS_ENABLED(CONFIG_HYPERV)
+DEFINE_STATIC_KEY_FALSE(__kvm_is_using_evmcs);
+
/*
* KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption
* is: in case a feature has corresponding fields in eVMCS described and it was
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index 78d17667e7ec..9623fe1651c4 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -16,117 +16,10 @@
struct vmcs_config;
-DECLARE_STATIC_KEY_FALSE(enable_evmcs);
-
#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
#define KVM_EVMCS_VERSION 1
-/*
- * Enlightened VMCSv1 doesn't support these:
- *
- * POSTED_INTR_NV = 0x00000002,
- * GUEST_INTR_STATUS = 0x00000810,
- * APIC_ACCESS_ADDR = 0x00002014,
- * POSTED_INTR_DESC_ADDR = 0x00002016,
- * EOI_EXIT_BITMAP0 = 0x0000201c,
- * EOI_EXIT_BITMAP1 = 0x0000201e,
- * EOI_EXIT_BITMAP2 = 0x00002020,
- * EOI_EXIT_BITMAP3 = 0x00002022,
- * GUEST_PML_INDEX = 0x00000812,
- * PML_ADDRESS = 0x0000200e,
- * VM_FUNCTION_CONTROL = 0x00002018,
- * EPTP_LIST_ADDRESS = 0x00002024,
- * VMREAD_BITMAP = 0x00002026,
- * VMWRITE_BITMAP = 0x00002028,
- *
- * TSC_MULTIPLIER = 0x00002032,
- * PLE_GAP = 0x00004020,
- * PLE_WINDOW = 0x00004022,
- * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
- *
- * Currently unsupported in KVM:
- * GUEST_IA32_RTIT_CTL = 0x00002814,
- */
-#define EVMCS1_SUPPORTED_PINCTRL \
- (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
- PIN_BASED_EXT_INTR_MASK | \
- PIN_BASED_NMI_EXITING | \
- PIN_BASED_VIRTUAL_NMIS)
-
-#define EVMCS1_SUPPORTED_EXEC_CTRL \
- (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
- CPU_BASED_HLT_EXITING | \
- CPU_BASED_CR3_LOAD_EXITING | \
- CPU_BASED_CR3_STORE_EXITING | \
- CPU_BASED_UNCOND_IO_EXITING | \
- CPU_BASED_MOV_DR_EXITING | \
- CPU_BASED_USE_TSC_OFFSETTING | \
- CPU_BASED_MWAIT_EXITING | \
- CPU_BASED_MONITOR_EXITING | \
- CPU_BASED_INVLPG_EXITING | \
- CPU_BASED_RDPMC_EXITING | \
- CPU_BASED_INTR_WINDOW_EXITING | \
- CPU_BASED_CR8_LOAD_EXITING | \
- CPU_BASED_CR8_STORE_EXITING | \
- CPU_BASED_RDTSC_EXITING | \
- CPU_BASED_TPR_SHADOW | \
- CPU_BASED_USE_IO_BITMAPS | \
- CPU_BASED_MONITOR_TRAP_FLAG | \
- CPU_BASED_USE_MSR_BITMAPS | \
- CPU_BASED_NMI_WINDOW_EXITING | \
- CPU_BASED_PAUSE_EXITING | \
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
-
-#define EVMCS1_SUPPORTED_2NDEXEC \
- (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \
- SECONDARY_EXEC_WBINVD_EXITING | \
- SECONDARY_EXEC_ENABLE_VPID | \
- SECONDARY_EXEC_ENABLE_EPT | \
- SECONDARY_EXEC_UNRESTRICTED_GUEST | \
- SECONDARY_EXEC_DESC | \
- SECONDARY_EXEC_ENABLE_RDTSCP | \
- SECONDARY_EXEC_ENABLE_INVPCID | \
- SECONDARY_EXEC_XSAVES | \
- SECONDARY_EXEC_RDSEED_EXITING | \
- SECONDARY_EXEC_RDRAND_EXITING | \
- SECONDARY_EXEC_TSC_SCALING | \
- SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \
- SECONDARY_EXEC_PT_USE_GPA | \
- SECONDARY_EXEC_PT_CONCEAL_VMX | \
- SECONDARY_EXEC_BUS_LOCK_DETECTION | \
- SECONDARY_EXEC_NOTIFY_VM_EXITING | \
- SECONDARY_EXEC_ENCLS_EXITING)
-
-#define EVMCS1_SUPPORTED_3RDEXEC (0ULL)
-
-#define EVMCS1_SUPPORTED_VMEXIT_CTRL \
- (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \
- VM_EXIT_SAVE_DEBUG_CONTROLS | \
- VM_EXIT_ACK_INTR_ON_EXIT | \
- VM_EXIT_HOST_ADDR_SPACE_SIZE | \
- VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
- VM_EXIT_SAVE_IA32_PAT | \
- VM_EXIT_LOAD_IA32_PAT | \
- VM_EXIT_SAVE_IA32_EFER | \
- VM_EXIT_LOAD_IA32_EFER | \
- VM_EXIT_CLEAR_BNDCFGS | \
- VM_EXIT_PT_CONCEAL_PIP | \
- VM_EXIT_CLEAR_IA32_RTIT_CTL)
-
-#define EVMCS1_SUPPORTED_VMENTRY_CTRL \
- (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \
- VM_ENTRY_LOAD_DEBUG_CONTROLS | \
- VM_ENTRY_IA32E_MODE | \
- VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \
- VM_ENTRY_LOAD_IA32_PAT | \
- VM_ENTRY_LOAD_IA32_EFER | \
- VM_ENTRY_LOAD_BNDCFGS | \
- VM_ENTRY_PT_CONCEAL_PIP | \
- VM_ENTRY_LOAD_IA32_RTIT_CTL)
-
-#define EVMCS1_SUPPORTED_VMFUNC (0)
-
struct evmcs_field {
u16 offset;
u16 clean_field;
@@ -174,6 +67,13 @@ static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs,
#if IS_ENABLED(CONFIG_HYPERV)
+DECLARE_STATIC_KEY_FALSE(__kvm_is_using_evmcs);
+
+/*
+ * Accessor for the __kvm_is_using_evmcs static key, which defaults to false;
+ * the branch is marked unlikely.
+ */
+static __always_inline bool kvm_is_using_evmcs(void)
+{
+ return static_branch_unlikely(&__kvm_is_using_evmcs);
+}
+
static __always_inline int get_evmcs_offset(unsigned long field,
u16 *clean_field)
{
@@ -263,6 +163,7 @@ static inline void evmcs_load(u64 phys_addr)
void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */
+static __always_inline bool kvm_is_using_evmcs(void) { return false; }
static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
static __always_inline void evmcs_write32(unsigned long field, u32 value) {}
static __always_inline void evmcs_write16(unsigned long field, u16 value) {}
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 768487611db7..e35cf0bd0df9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -358,6 +358,7 @@ static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
gpa_t addr)
{
+ unsigned long roots = 0;
uint i;
struct kvm_mmu_root_info *cached_root;
@@ -368,8 +369,10 @@ static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
eptp))
- vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
+ roots |= KVM_MMU_ROOT_PREVIOUS(i);
}
+ if (roots)
+ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
}
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
@@ -654,6 +657,9 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_PRED_CMD, MSR_TYPE_W);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
+
kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
vmx->nested.force_msr_bitmap_recalc = false;
@@ -4483,7 +4489,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
* CR0_GUEST_HOST_MASK is already set in the original vmcs01
* (KVM doesn't change it);
*/
- vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+ vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
vmx_set_cr0(vcpu, vmcs12->host_cr0);
/* Same as above - no reason to call set_cr4_guest_host_mask(). */
@@ -4634,7 +4640,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
*/
vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
- vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+ vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
@@ -5156,7 +5162,7 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
* does force CR0.PE=1, but only to also force VM86 in order to emulate
* Real Mode, and so there's no need to check CR0.PE manually.
*/
- if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -6755,36 +6761,9 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void)
return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}
-/*
- * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
- * returned for the various VMX controls MSRs when nested VMX is enabled.
- * The same values should also be used to verify that vmcs12 control fields are
- * valid during nested entry from L1 to L2.
- * Each of these control msrs has a low and high 32-bit half: A low bit is on
- * if the corresponding bit in the (32-bit) control field *must* be on, and a
- * bit in the high half is on if the corresponding bit in the control field
- * may be on. See also vmx_control_verify().
- */
-void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
+static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
{
- struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
-
- /*
- * Note that as a general rule, the high half of the MSRs (bits in
- * the control fields which may be 1) should be initialized by the
- * intersection of the underlying hardware's MSR (i.e., features which
- * can be supported) and the list of features we want to expose -
- * because they are known to be properly supported in our code.
- * Also, usually, the low half of the MSRs (bits which must be 1) can
- * be set to 0, meaning that L1 may turn off any of these bits. The
- * reason is that if one of these bits is necessary, it will appear
- * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
- * fields of vmcs01 and vmcs02, will turn these bits off - and
- * nested_vmx_l1_wants_exit() will not pass related exits to L1.
- * These rules have exceptions below.
- */
-
- /* pin-based controls */
msrs->pinbased_ctls_low =
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
@@ -6797,8 +6776,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
msrs->pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
+}
- /* exit controls */
+static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
msrs->exit_ctls_low =
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
@@ -6817,8 +6799,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
/* We support free control of debug control saving. */
msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
+}
- /* entry controls */
+static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
msrs->entry_ctls_low =
VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
@@ -6834,8 +6819,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
/* We support free control of debug control loading. */
msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+}
- /* cpu-based controls */
+static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
msrs->procbased_ctls_low =
CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
@@ -6867,12 +6855,12 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
/* We support free control of CR3 access interception. */
msrs->procbased_ctls_low &=
~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
+}
- /*
- * secondary cpu-based controls. Do not include those that
- * depend on CPUID bits, they are added later by
- * vmx_vcpu_after_set_cpuid.
- */
+static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
+ struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
msrs->secondary_ctls_low = 0;
msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
@@ -6950,8 +6938,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
if (enable_sgx)
msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
+}
- /* miscellaneous data */
+static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
msrs->misc_low |=
MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
@@ -6959,7 +6950,10 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
VMX_MISC_ACTIVITY_HLT |
VMX_MISC_ACTIVITY_WAIT_SIPI;
msrs->misc_high = 0;
+}
+static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
+{
/*
* This MSR reports some information about VMX support. We
* should return information about the VMX we emulate for the
@@ -6974,7 +6968,10 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
if (cpu_has_vmx_basic_inout())
msrs->basic |= VMX_BASIC_INOUT;
+}
+static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
+{
/*
* These MSRs specify bits which the guest must keep fixed on
* while L1 is in VMXON mode (in L1's root mode, or running an L2).
@@ -6991,6 +6988,51 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
if (vmx_umip_emulated())
msrs->cr4_fixed1 |= X86_CR4_UMIP;
+}
+
+/*
+ * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
+ * returned for the various VMX controls MSRs when nested VMX is enabled.
+ * The same values should also be used to verify that vmcs12 control fields are
+ * valid during nested entry from L1 to L2.
+ * Each of these control msrs has a low and high 32-bit half: A low bit is on
+ * if the corresponding bit in the (32-bit) control field *must* be on, and a
+ * bit in the high half is on if the corresponding bit in the control field
+ * may be on. See also vmx_control_verify().
+ *
+ * The results are stored in vmcs_conf->nested. Each group of MSRs is filled
+ * in by its own nested_vmx_setup_*() helper; @ept_caps is passed only to the
+ * secondary-controls helper, and the VMCS enumeration MSR value is computed
+ * last by nested_vmx_calc_vmcs_enum_msr().
+ */
+void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
+{
+ struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
+
+ /*
+ * Note that as a general rule, the high half of the MSRs (bits in
+ * the control fields which may be 1) should be initialized by the
+ * intersection of the underlying hardware's MSR (i.e., features which
+ * can be supported) and the list of features we want to expose -
+ * because they are known to be properly supported in our code.
+ * Also, usually, the low half of the MSRs (bits which must be 1) can
+ * be set to 0, meaning that L1 may turn off any of these bits. The
+ * reason is that if one of these bits is necessary, it will appear
+ * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
+ * fields of vmcs01 and vmcs02, will turn these bits on - and
+ * nested_vmx_l1_wants_exit() will not pass related exits to L1.
+ * NOTE(review): "vmcs01 and vmcs02" above presumably means vmcs01 and
+ * vmcs12 (the controls requested by L1), with vmcs02 being the result
+ * of the merge - confirm against prepare_vmcs02().
+ * These rules have exceptions below.
+ */
+ nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_exit_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_entry_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);
+
+ nested_vmx_setup_misc_data(vmcs_conf, msrs);
+
+ nested_vmx_setup_basic(msrs);
+
+ nested_vmx_setup_cr_fixed(msrs);
msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index e8a3be0b9df9..741efe2c497b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -57,7 +57,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
__set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
- kvm_pmu_request_counter_reprogam(pmc);
+ kvm_pmu_request_counter_reprogram(pmc);
}
}
@@ -76,13 +76,13 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
static void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
{
int bit;
- struct kvm_pmc *pmc;
- for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) {
- pmc = intel_pmc_idx_to_pmc(pmu, bit);
- if (pmc)
- kvm_pmu_request_counter_reprogam(pmc);
- }
+ if (!diff)
+ return;
+
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+ set_bit(bit, pmu->reprogram_pmi);
+ kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu));
}
static bool intel_hw_event_available(struct kvm_pmc *pmc)
@@ -351,45 +351,47 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
msr_info->data = pmu->fixed_ctr_ctrl;
- return 0;
+ break;
case MSR_CORE_PERF_GLOBAL_STATUS:
msr_info->data = pmu->global_status;
- return 0;
+ break;
case MSR_CORE_PERF_GLOBAL_CTRL:
msr_info->data = pmu->global_ctrl;
- return 0;
+ break;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
msr_info->data = 0;
- return 0;
+ break;
case MSR_IA32_PEBS_ENABLE:
msr_info->data = pmu->pebs_enable;
- return 0;
+ break;
case MSR_IA32_DS_AREA:
msr_info->data = pmu->ds_area;
- return 0;
+ break;
case MSR_PEBS_DATA_CFG:
msr_info->data = pmu->pebs_data_cfg;
- return 0;
+ break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
u64 val = pmc_read_counter(pmc);
msr_info->data =
val & pmu->counter_bitmask[KVM_PMC_GP];
- return 0;
+ break;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
u64 val = pmc_read_counter(pmc);
msr_info->data =
val & pmu->counter_bitmask[KVM_PMC_FIXED];
- return 0;
+ break;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
msr_info->data = pmc->eventsel;
- return 0;
- } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true))
- return 0;
+ break;
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) {
+ break;
+ }
+ return 1;
}
- return 1;
+ return 0;
}
static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -402,44 +404,43 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
- if (pmu->fixed_ctr_ctrl == data)
- return 0;
- if (!(data & pmu->fixed_ctr_ctrl_mask)) {
+ if (data & pmu->fixed_ctr_ctrl_mask)
+ return 1;
+
+ if (pmu->fixed_ctr_ctrl != data)
reprogram_fixed_counters(pmu, data);
- return 0;
- }
break;
case MSR_CORE_PERF_GLOBAL_STATUS:
- if (msr_info->host_initiated) {
- pmu->global_status = data;
- return 0;
- }
- break; /* RO MSR */
+ if (!msr_info->host_initiated)
+ return 1; /* RO MSR */
+
+ pmu->global_status = data;
+ break;
case MSR_CORE_PERF_GLOBAL_CTRL:
- if (pmu->global_ctrl == data)
- return 0;
- if (kvm_valid_perf_global_ctrl(pmu, data)) {
+ if (!kvm_valid_perf_global_ctrl(pmu, data))
+ return 1;
+
+ if (pmu->global_ctrl != data) {
diff = pmu->global_ctrl ^ data;
pmu->global_ctrl = data;
reprogram_counters(pmu, diff);
- return 0;
}
break;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- if (!(data & pmu->global_ovf_ctrl_mask)) {
- if (!msr_info->host_initiated)
- pmu->global_status &= ~data;
- return 0;
- }
+ if (data & pmu->global_ovf_ctrl_mask)
+ return 1;
+
+ if (!msr_info->host_initiated)
+ pmu->global_status &= ~data;
break;
case MSR_IA32_PEBS_ENABLE:
- if (pmu->pebs_enable == data)
- return 0;
- if (!(data & pmu->pebs_enable_mask)) {
+ if (data & pmu->pebs_enable_mask)
+ return 1;
+
+ if (pmu->pebs_enable != data) {
diff = pmu->pebs_enable ^ data;
pmu->pebs_enable = data;
reprogram_counters(pmu, diff);
- return 0;
}
break;
case MSR_IA32_DS_AREA:
@@ -447,15 +448,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
if (is_noncanonical_address(data, vcpu))
return 1;
+
pmu->ds_area = data;
- return 0;
+ break;
case MSR_PEBS_DATA_CFG:
- if (pmu->pebs_data_cfg == data)
- return 0;
- if (!(data & pmu->pebs_data_cfg_mask)) {
- pmu->pebs_data_cfg = data;
- return 0;
- }
+ if (data & pmu->pebs_data_cfg_mask)
+ return 1;
+
+ pmu->pebs_data_cfg = data;
break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
@@ -463,33 +463,38 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((msr & MSR_PMC_FULL_WIDTH_BIT) &&
(data & ~pmu->counter_bitmask[KVM_PMC_GP]))
return 1;
+
if (!msr_info->host_initiated &&
!(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
pmc->counter += data - pmc_read_counter(pmc);
pmc_update_sample_period(pmc);
- return 0;
+ break;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
pmc->counter += data - pmc_read_counter(pmc);
pmc_update_sample_period(pmc);
- return 0;
+ break;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
- if (data == pmc->eventsel)
- return 0;
reserved_bits = pmu->reserved_bits;
if ((pmc->idx == 2) &&
(pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED))
reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
- if (!(data & reserved_bits)) {
+ if (data & reserved_bits)
+ return 1;
+
+ if (data != pmc->eventsel) {
pmc->eventsel = data;
- kvm_pmu_request_counter_reprogam(pmc);
- return 0;
+ kvm_pmu_request_counter_reprogram(pmc);
}
- } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
- return 0;
+ break;
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) {
+ break;
+ }
+ /* Not a known PMU MSR. */
+ return 1;
}
- return 1;
+ return 0;
}
static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
@@ -531,6 +536,16 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->pebs_enable_mask = ~0ull;
pmu->pebs_data_cfg_mask = ~0ull;
+ memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
+
+ /*
+ * Setting passthrough of LBR MSRs is done only in the VM-Entry loop,
+ * and PMU refresh is disallowed after the vCPU has run, i.e. this code
+ * should never be reached while KVM is passing through MSRs.
+ */
+ if (KVM_BUG_ON(lbr_desc->msr_passthrough, vcpu->kvm))
+ return;
+
entry = kvm_find_cpuid_entry(vcpu, 0xa);
if (!entry || !vcpu->kvm->arch.enable_pmu)
return;
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index aa53c98034bf..0574030b071f 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -29,14 +29,14 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
/* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */
*gva = offset;
- if (!is_long_mode(vcpu)) {
+ if (!is_64_bit_mode(vcpu)) {
vmx_get_segment(vcpu, &s, VCPU_SREG_DS);
*gva += s.base;
}
if (!IS_ALIGNED(*gva, alignment)) {
fault = true;
- } else if (likely(is_long_mode(vcpu))) {
+ } else if (likely(is_64_bit_mode(vcpu))) {
fault = is_noncanonical_address(*gva, vcpu);
} else {
*gva &= 0xffffffff;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d2d6e1b6c788..44fb619803b8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -164,6 +164,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
MSR_IA32_SPEC_CTRL,
MSR_IA32_PRED_CMD,
+ MSR_IA32_FLUSH_CMD,
MSR_IA32_TSC,
#ifdef CONFIG_X86_64
MSR_FS_BASE,
@@ -579,7 +580,7 @@ static __init void hv_init_evmcs(void)
if (enlightened_vmcs) {
pr_info("Using Hyper-V Enlightened VMCS\n");
- static_branch_enable(&enable_evmcs);
+ static_branch_enable(&__kvm_is_using_evmcs);
}
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
@@ -595,7 +596,7 @@ static void hv_reset_evmcs(void)
{
struct hv_vp_assist_page *vp_ap;
- if (!static_branch_unlikely(&enable_evmcs))
+ if (!kvm_is_using_evmcs())
return;
/*
@@ -1945,7 +1946,7 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
{
switch (msr->index) {
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!nested)
return 1;
return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
@@ -2030,7 +2031,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
break;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!nested_vmx_allowed(vcpu))
return 1;
if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
@@ -2285,33 +2286,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
return 1;
goto find_uret_msr;
- case MSR_IA32_PRED_CMD:
- if (!msr_info->host_initiated &&
- !guest_has_pred_cmd_msr(vcpu))
- return 1;
-
- if (data & ~PRED_CMD_IBPB)
- return 1;
- if (!boot_cpu_has(X86_FEATURE_IBPB))
- return 1;
- if (!data)
- break;
-
- wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
-
- /*
- * For non-nested:
- * When it's written (to non-zero) for the first time, pass
- * it through.
- *
- * For nested:
- * The handling of the MSR bitmap for L2 guests is done in
- * nested_vmx_prepare_msr_bitmap. We should not touch the
- * vmcs02.msr_bitmap here since it gets completely overwritten
- * in the merging.
- */
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
- break;
case MSR_IA32_CR_PAT:
if (!kvm_pat_valid(data))
return 1;
@@ -2366,7 +2340,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx->msr_ia32_sgxlepubkeyhash
[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
break;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!msr_info->host_initiated)
return 1; /* they are read-only */
if (!nested_vmx_allowed(vcpu))
@@ -2816,8 +2790,7 @@ static int vmx_hardware_enable(void)
* This can happen if we hot-added a CPU but failed to allocate
* VP assist page for it.
*/
- if (static_branch_unlikely(&enable_evmcs) &&
- !hv_get_vp_assist_page(cpu))
+ if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
return -EFAULT;
intel_pt_handle_vmx(1);
@@ -2869,7 +2842,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
memset(vmcs, 0, vmcs_config.size);
/* KVM supports Enlightened VMCS v1 only */
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
else
vmcs->hdr.revision_id = vmcs_config.revision_id;
@@ -2964,7 +2937,7 @@ static __init int alloc_kvm_area(void)
* still be marked with revision_id reported by
* physical CPU.
*/
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
vmcs->hdr.revision_id = vmcs_config.revision_id;
per_cpu(vmxarea, cpu) = vmcs;
@@ -3931,7 +3904,7 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
* 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
* bitmap has changed.
*/
- if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) {
+ if (kvm_is_using_evmcs()) {
struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
if (evmcs->hv_enlightenments_control.msr_bitmap)
@@ -4773,7 +4746,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
/* 22.2.1, 20.8.1 */
vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
- vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+ vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
set_cr4_guest_host_mask(vmx);
@@ -5163,7 +5136,7 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
return true;
- return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
+ return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
(kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
}
@@ -5500,7 +5473,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
break;
case 3: /* lmsw */
val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
- trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
+ trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
kvm_lmsw(vcpu, val);
return kvm_skip_emulated_instruction(vcpu);
@@ -6957,7 +6930,7 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
* real mode.
*/
return enable_unrestricted_guest || emulate_invalid_guest_state;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
return nested;
case MSR_AMD64_VIRT_SPEC_CTRL:
case MSR_AMD64_TSC_RATIO:
@@ -7310,7 +7283,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
/* All fields are clean at this point */
- if (static_branch_unlikely(&enable_evmcs)) {
+ if (kvm_is_using_evmcs()) {
current_evmcs->hv_clean_fields |=
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
@@ -7440,7 +7413,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
* feature only for vmcs01, KVM currently isn't equipped to realize any
* performance benefits from enabling it for vmcs02.
*/
- if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
+ if (kvm_is_using_evmcs() &&
(ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
@@ -7558,7 +7531,7 @@ static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
- if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
+ if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
cache = MTRR_TYPE_WRBACK;
else
@@ -7744,6 +7717,13 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
!guest_cpuid_has(vcpu, X86_FEATURE_XFD));
+ if (boot_cpu_has(X86_FEATURE_IBPB))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
+
+ if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
set_cr4_guest_host_mask(vmx);
@@ -7776,9 +7756,11 @@ static u64 vmx_get_perf_capabilities(void)
if (boot_cpu_has(X86_FEATURE_PDCM))
rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
- x86_perf_get_lbr(&lbr);
- if (lbr.nr)
- perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+ if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
+ x86_perf_get_lbr(&lbr);
+ if (lbr.nr)
+ perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+ }
if (vmx_pebs_supported()) {
perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
@@ -7918,6 +7900,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
break;
+ case x86_intercept_pause:
+ /*
+ * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
+ * with vanilla NOPs in the emulator. Apply the interception
+ * check only to actual PAUSE instructions. Don't check
+ * PAUSE-loop-exiting, software can't expect a given PAUSE to
+ * exit, i.e. KVM is within its rights to allow L2 to execute
+ * the PAUSE.
+ *
+ * PAUSE-exiting is a primary processor-based control, so it
+ * must be looked up with nested_cpu_has(). nested_cpu_has2()
+ * tests the secondary controls, where the same bit position
+ * has an unrelated meaning, and would make this check depend
+ * on the wrong vmcs12 field.
+ */
+ if ((info->rep_prefix != REPE_PREFIX) ||
+ !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
+ return X86EMUL_CONTINUE;
+
+ break;
+
/* TODO: check more intercepts... */
default:
break;
@@ -8415,9 +8412,8 @@ static __init int hardware_setup(void)
#if IS_ENABLED(CONFIG_HYPERV)
if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
&& enable_ept) {
- vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
- vmx_x86_ops.tlb_remote_flush_with_range =
- hv_remote_flush_tlb_with_range;
+ vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
}
#endif
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 2acdc54bc34b..9e66531861cf 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -369,7 +369,7 @@ struct vcpu_vmx {
struct lbr_desc lbr_desc;
/* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS 16
struct {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -640,6 +640,24 @@ BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64)
(1 << VCPU_EXREG_EXIT_INFO_1) | \
(1 << VCPU_EXREG_EXIT_INFO_2))
+static inline unsigned long vmx_l1_guest_owned_cr0_bits(void)
+{
+ unsigned long bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+
+ /*
+ * Returns the set of CR0 bits that L1 is allowed to own: all of
+ * KVM_POSSIBLE_CR0_GUEST_BITS, minus CR0.WP when EPT is disabled.
+ *
+ * CR0.WP needs to be intercepted when KVM is shadowing legacy paging
+ * in order to construct shadow PTEs with the correct protections.
+ * Note! CR0.WP technically can be passed through to the guest if
+ * paging is disabled, but checking CR0.PG would generate a cyclical
+ * dependency of sorts due to forcing the caller to ensure CR0 holds
+ * the correct value prior to determining which CR0 bits can be owned
+ * by L1. Keep it simple and limit the optimization to EPT.
+ */
+ if (!enable_ept)
+ bits &= ~X86_CR0_WP;
+ return bits;
+}
+
static __always_inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
return container_of(kvm, struct kvm_vmx, kvm);
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index db95bde52998..ce47dc265f89 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -147,7 +147,7 @@ do_exception:
static __always_inline u16 vmcs_read16(unsigned long field)
{
vmcs_check16(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_read16(field);
return __vmcs_readl(field);
}
@@ -155,7 +155,7 @@ static __always_inline u16 vmcs_read16(unsigned long field)
static __always_inline u32 vmcs_read32(unsigned long field)
{
vmcs_check32(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_read32(field);
return __vmcs_readl(field);
}
@@ -163,7 +163,7 @@ static __always_inline u32 vmcs_read32(unsigned long field)
static __always_inline u64 vmcs_read64(unsigned long field)
{
vmcs_check64(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_read64(field);
#ifdef CONFIG_X86_64
return __vmcs_readl(field);
@@ -175,7 +175,7 @@ static __always_inline u64 vmcs_read64(unsigned long field)
static __always_inline unsigned long vmcs_readl(unsigned long field)
{
vmcs_checkl(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_read64(field);
return __vmcs_readl(field);
}
@@ -222,7 +222,7 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val
static __always_inline void vmcs_write16(unsigned long field, u16 value)
{
vmcs_check16(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_write16(field, value);
__vmcs_writel(field, value);
@@ -231,7 +231,7 @@ static __always_inline void vmcs_write16(unsigned long field, u16 value)
static __always_inline void vmcs_write32(unsigned long field, u32 value)
{
vmcs_check32(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_write32(field, value);
__vmcs_writel(field, value);
@@ -240,7 +240,7 @@ static __always_inline void vmcs_write32(unsigned long field, u32 value)
static __always_inline void vmcs_write64(unsigned long field, u64 value)
{
vmcs_check64(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_write64(field, value);
__vmcs_writel(field, value);
@@ -252,7 +252,7 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value)
static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
{
vmcs_checkl(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_write64(field, value);
__vmcs_writel(field, value);
@@ -262,7 +262,7 @@ static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
"vmcs_clear_bits does not support 64-bit fields");
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_write32(field, evmcs_read32(field) & ~mask);
__vmcs_writel(field, __vmcs_readl(field) & ~mask);
@@ -272,7 +272,7 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
"vmcs_set_bits does not support 64-bit fields");
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_write32(field, evmcs_read32(field) | mask);
__vmcs_writel(field, __vmcs_readl(field) | mask);
@@ -289,7 +289,7 @@ static inline void vmcs_load(struct vmcs *vmcs)
{
u64 phys_addr = __pa(vmcs);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_load(phys_addr);
vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b1d82647195..ceb7c5e9cf9e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -196,7 +196,7 @@ bool __read_mostly eager_page_split = true;
module_param(eager_page_split, bool, 0644);
/* Enable/disable SMT_RSB bug mitigation */
-bool __read_mostly mitigate_smt_rsb;
+static bool __read_mostly mitigate_smt_rsb;
module_param(mitigate_smt_rsb, bool, 0444);
/*
@@ -804,8 +804,8 @@ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
*/
if ((fault->error_code & PFERR_PRESENT_MASK) &&
!(fault->error_code & PFERR_RSVD_MASK))
- kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
- fault_mmu->root.hpa);
+ kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
+ KVM_MMU_ROOT_CURRENT);
fault_mmu->inject_page_fault(vcpu, fault);
}
@@ -843,7 +843,7 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
{
- if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE))
return true;
kvm_queue_exception(vcpu, UD_VECTOR);
@@ -908,6 +908,24 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
{
+ /*
+ * CR0.WP is incorporated into the MMU role, but only for non-nested,
+ * indirect shadow MMUs. If paging is disabled, no updates are needed
+ * as there are no permission bits to emulate. If TDP is enabled, the
+ * MMU's metadata needs to be updated, e.g. so that emulating guest
+ * translations does the right thing, but there's no need to unload the
+ * root as CR0.WP doesn't affect SPTEs.
+ */
+ if ((cr0 ^ old_cr0) == X86_CR0_WP) {
+ if (!(cr0 & X86_CR0_PG))
+ return;
+
+ if (tdp_enabled) {
+ kvm_init_mmu(vcpu);
+ return;
+ }
+ }
+
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
@@ -967,7 +985,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
return 1;
if (!(cr0 & X86_CR0_PG) &&
- (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
+ (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
return 1;
static_call(kvm_x86_set_cr0)(vcpu, cr0);
@@ -989,7 +1007,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return;
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
@@ -1003,7 +1021,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
if (static_cpu_has(X86_FEATURE_PKU) &&
vcpu->arch.pkru != vcpu->arch.host_pkru &&
((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
write_pkru(vcpu->arch.pkru);
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
}
@@ -1017,14 +1035,14 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (static_cpu_has(X86_FEATURE_PKU) &&
((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
vcpu->arch.pkru = rdpkru();
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
write_pkru(vcpu->arch.host_pkru);
}
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
@@ -1180,9 +1198,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
- return 1;
-
/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
return 1;
@@ -1229,7 +1244,7 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
* PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
* with PCIDE=0.
*/
- if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))
return;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
@@ -1244,9 +1259,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
bool skip_tlb_flush = false;
unsigned long pcid = 0;
#ifdef CONFIG_X86_64
- bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
-
- if (pcid_enabled) {
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) {
skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
cr3 &= ~X86_CR3_PCID_NOFLUSH;
pcid = cr3 & X86_CR3_PCID_MASK;
@@ -1545,39 +1558,41 @@ static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
static unsigned num_emulated_msrs;
/*
- * List of msr numbers which are used to expose MSR-based features that
- * can be used by a hypervisor to validate requested CPU features.
+ * List of MSRs that control the existence of MSR-based features, i.e. MSRs
+ * that are effectively CPUID leafs. VMX MSRs are also included in the set of
+ * feature MSRs, but are handled separately to allow expedited lookups.
*/
-static const u32 msr_based_features_all[] = {
- MSR_IA32_VMX_BASIC,
- MSR_IA32_VMX_TRUE_PINBASED_CTLS,
- MSR_IA32_VMX_PINBASED_CTLS,
- MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
- MSR_IA32_VMX_PROCBASED_CTLS,
- MSR_IA32_VMX_TRUE_EXIT_CTLS,
- MSR_IA32_VMX_EXIT_CTLS,
- MSR_IA32_VMX_TRUE_ENTRY_CTLS,
- MSR_IA32_VMX_ENTRY_CTLS,
- MSR_IA32_VMX_MISC,
- MSR_IA32_VMX_CR0_FIXED0,
- MSR_IA32_VMX_CR0_FIXED1,
- MSR_IA32_VMX_CR4_FIXED0,
- MSR_IA32_VMX_CR4_FIXED1,
- MSR_IA32_VMX_VMCS_ENUM,
- MSR_IA32_VMX_PROCBASED_CTLS2,
- MSR_IA32_VMX_EPT_VPID_CAP,
- MSR_IA32_VMX_VMFUNC,
-
+static const u32 msr_based_features_all_except_vmx[] = {
MSR_AMD64_DE_CFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_PERF_CAPABILITIES,
};
-static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
+ (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
static unsigned int num_msr_based_features;
/*
+ * All feature MSRs except uCode revID, which tracks the currently loaded uCode
+ * patch, are immutable once the vCPU model is defined.
+ *
+ * Returns true if @msr lies in the emulated VMX MSR range
+ * [KVM_FIRST_EMULATED_VMX_MSR, KVM_LAST_EMULATED_VMX_MSR] or is any entry of
+ * msr_based_features_all_except_vmx[] other than MSR_IA32_UCODE_REV. Returns
+ * false for MSR_IA32_UCODE_REV and for any MSR found in neither set.
+ */
+static bool kvm_is_immutable_feature_msr(u32 msr)
+{
+ int i;
+
+ if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
+ return true;
+
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
+ if (msr == msr_based_features_all_except_vmx[i])
+ return msr != MSR_IA32_UCODE_REV;
+ }
+
+ return false;
+}
+
+/*
* Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
* does not yet virtualize. These include:
* 10 - MISC_PACKAGE_CTRLS
@@ -2194,6 +2209,22 @@ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
+ u64 val;
+
+ /*
+ * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does
+ * not support modifying the guest vCPU model on the fly, e.g. changing
+ * the nVMX capabilities while L2 is running is nonsensical. Ignore
+ * writes of the same value, e.g. to allow userspace to blindly stuff
+ * all MSRs when emulating RESET.
+ */
+ if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) {
+ if (do_get_msr(vcpu, index, &val) || *data != val)
+ return -EINVAL;
+
+ return 0;
+ }
+
return kvm_set_msr_ignored_check(vcpu, index, *data, true);
}
@@ -3616,9 +3647,40 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~kvm_caps.supported_perf_cap)
return 1;
+ /*
+ * Note, this is not just a performance optimization! KVM
+ * disallows changing feature MSRs after the vCPU has run; PMU
+ * refresh will bug the VM if called after the vCPU has run.
+ */
+ if (vcpu->arch.perf_capabilities == data)
+ break;
+
vcpu->arch.perf_capabilities = data;
kvm_pmu_refresh(vcpu);
- return 0;
+ break;
+ case MSR_IA32_PRED_CMD:
+ if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
+ return 1;
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+ break;
+ case MSR_IA32_FLUSH_CMD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
+ return 1;
+
+ if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ break;
case MSR_EFER:
return set_efer(vcpu, msr_info);
case MSR_K7_HWCR:
@@ -4534,9 +4596,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = 0;
break;
case KVM_CAP_XSAVE2: {
- u64 guest_perm = xstate_get_guest_group_perm();
-
- r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false);
+ r = xstate_required_size(kvm_get_filtered_xcr0(), false);
if (r < sizeof(struct kvm_xsave))
r = sizeof(struct kvm_xsave);
break;
@@ -5036,7 +5096,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
return 0;
if (mce->status & MCI_STATUS_UC) {
if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
- !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
+ !kvm_is_cr4_bit_set(vcpu, X86_CR4_MCE)) {
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return 0;
}
@@ -5128,7 +5188,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
- events->nmi.pending = vcpu->arch.nmi_pending != 0;
+ events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
/* events->sipi_vector is never valid when reporting to user space */
@@ -5215,8 +5275,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
- if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
- vcpu->arch.nmi_pending = events->nmi.pending;
+ if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
+ vcpu->arch.nmi_pending = 0;
+ atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
+ kvm_make_request(KVM_REQ_NMI, vcpu);
+ }
static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
@@ -6024,11 +6087,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
-static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
-{
- return kvm->arch.n_max_mmu_pages;
-}
-
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
struct kvm_pic *pic = kvm->arch.vpic;
@@ -6675,8 +6733,7 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
return 0;
}
-long kvm_arch_vm_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
@@ -6714,9 +6771,6 @@ set_identity_unlock:
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
break;
- case KVM_GET_NR_MMU_PAGES:
- r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
- break;
case KVM_CREATE_IRQCHIP: {
mutex_lock(&kvm->lock);
@@ -7021,6 +7075,18 @@ out:
return r;
}
+static void kvm_probe_feature_msr(u32 msr_index)
+{
+ struct kvm_msr_entry msr = {
+ .index = msr_index,
+ };
+
+ if (kvm_get_msr_feature(&msr))
+ return;
+
+ msr_based_features[num_msr_based_features++] = msr_index;
+}
+
static void kvm_probe_msr_to_save(u32 msr_index)
{
u32 dummy[2];
@@ -7096,7 +7162,7 @@ static void kvm_probe_msr_to_save(u32 msr_index)
msrs_to_save[num_msrs_to_save++] = msr_index;
}
-static void kvm_init_msr_list(void)
+static void kvm_init_msr_lists(void)
{
unsigned i;
@@ -7122,15 +7188,11 @@ static void kvm_init_msr_list(void)
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
}
- for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
- struct kvm_msr_entry msr;
+ for (i = KVM_FIRST_EMULATED_VMX_MSR; i <= KVM_LAST_EMULATED_VMX_MSR; i++)
+ kvm_probe_feature_msr(i);
- msr.index = msr_based_features_all[i];
- if (kvm_get_msr_feature(&msr))
- continue;
-
- msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
- }
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++)
+ kvm_probe_feature_msr(msr_based_features_all_except_vmx[i]);
}
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
@@ -8466,7 +8528,6 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
}
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- bool write_fault_to_shadow_pgtable,
int emulation_type)
{
gpa_t gpa = cr2_or_gpa;
@@ -8537,7 +8598,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* be fixed by unprotecting shadow page and it should
* be reported to userspace.
*/
- return !write_fault_to_shadow_pgtable;
+ return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
}
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@ -8785,20 +8846,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int r;
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
bool writeback = true;
- bool write_fault_to_spt;
if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
return 1;
vcpu->arch.l1tf_flush_l1d = true;
- /*
- * Clear write_fault_to_shadow_pgtable here to ensure it is
- * never reused.
- */
- write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
- vcpu->arch.write_fault_to_shadow_pgtable = false;
-
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
kvm_clear_exception_queue(vcpu);
@@ -8819,7 +8872,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
return 1;
}
if (reexecute_instruction(vcpu, cr2_or_gpa,
- write_fault_to_spt,
emulation_type))
return 1;
@@ -8898,8 +8950,7 @@ restart:
return 1;
if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
- emulation_type))
+ if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type))
return 1;
return handle_emulation_failure(vcpu, emulation_type);
@@ -9477,7 +9528,7 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
kvm_caps.max_guest_tsc_khz = max;
}
kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
- kvm_init_msr_list();
+ kvm_init_msr_lists();
return 0;
out_unwind_ops:
@@ -9808,7 +9859,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
vcpu->run->hypercall.args[0] = gpa;
vcpu->run->hypercall.args[1] = npages;
vcpu->run->hypercall.args[2] = attrs;
- vcpu->run->hypercall.longmode = op_64_bit;
+ vcpu->run->hypercall.flags = 0;
+ if (op_64_bit)
+ vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
+
+ WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
vcpu->arch.complete_userspace_io = complete_hypercall_exit;
return 0;
}
@@ -10170,19 +10225,46 @@ out:
static void process_nmi(struct kvm_vcpu *vcpu)
{
- unsigned limit = 2;
+ unsigned int limit;
/*
- * x86 is limited to one NMI running, and one NMI pending after it.
- * If an NMI is already in progress, limit further NMIs to just one.
- * Otherwise, allow two (and we'll inject the first one immediately).
+ * x86 is limited to one NMI pending, but because KVM can't react to
+ * incoming NMIs as quickly as bare metal, e.g. if the vCPU is
+ * scheduled out, KVM needs to play nice with two queued NMIs showing
+ * up at the same time. To handle this scenario, allow two NMIs to be
+ * (temporarily) pending so long as NMIs are not blocked and KVM is not
+ * waiting for a previous NMI injection to complete (which effectively
+ * blocks NMIs). KVM will immediately inject one of the two NMIs, and
+ * will request an NMI window to handle the second NMI.
*/
if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
limit = 1;
+ else
+ limit = 2;
+
+ /*
+ * Adjust the limit to account for pending virtual NMIs, which aren't
+ * tracked in vcpu->arch.nmi_pending.
+ */
+ if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
+ limit--;
vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ if (vcpu->arch.nmi_pending &&
+ (static_call(kvm_x86_set_vnmi_pending)(vcpu)))
+ vcpu->arch.nmi_pending--;
+
+ if (vcpu->arch.nmi_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+/* Return total number of NMIs pending injection to the VM */
+int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.nmi_pending +
+ static_call(kvm_x86_is_vnmi_pending)(vcpu);
}
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
@@ -13268,7 +13350,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
return 1;
}
- pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+ pcid_enabled = kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE);
switch (type) {
case INVPCID_TYPE_INDIV_ADDR:
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a8167b47b8c8..c544602d07a3 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -3,6 +3,7 @@
#define ARCH_X86_KVM_X86_H
#include <linux/kvm_host.h>
+#include <asm/fpu/xstate.h>
#include <asm/mce.h>
#include <asm/pvclock.h>
#include "kvm_cache_regs.h"
@@ -40,6 +41,14 @@ void kvm_spurious_fault(void);
failed; \
})
+/*
+ * The first...last VMX feature MSRs that are emulated by KVM. This may or may
+ * not cover all known VMX MSRs, as KVM doesn't emulate an MSR until there's an
+ * associated feature that KVM supports for nested virtualization.
+ */
+#define KVM_FIRST_EMULATED_VMX_MSR MSR_IA32_VMX_BASIC
+#define KVM_LAST_EMULATED_VMX_MSR MSR_IA32_VMX_VMFUNC
+
#define KVM_DEFAULT_PLE_GAP 128
#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
#define KVM_DEFAULT_PLE_WINDOW_GROW 2
@@ -83,6 +92,11 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.last_vmentry_cpu != -1;
+}
+
static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu)
{
return vcpu->arch.exception.pending ||
@@ -123,15 +137,15 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
static inline bool is_protmode(struct kvm_vcpu *vcpu)
{
- return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
+ return kvm_is_cr0_bit_set(vcpu, X86_CR0_PE);
}
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
+static inline bool is_long_mode(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
- return vcpu->arch.efer & EFER_LMA;
+ return !!(vcpu->arch.efer & EFER_LMA);
#else
- return 0;
+ return false;
#endif
}
@@ -171,19 +185,19 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
}
-static inline int is_pae(struct kvm_vcpu *vcpu)
+static inline bool is_pae(struct kvm_vcpu *vcpu)
{
- return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PAE);
}
-static inline int is_pse(struct kvm_vcpu *vcpu)
+static inline bool is_pse(struct kvm_vcpu *vcpu)
{
- return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PSE);
}
-static inline int is_paging(struct kvm_vcpu *vcpu)
+static inline bool is_paging(struct kvm_vcpu *vcpu)
{
- return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
+ return likely(kvm_is_cr0_bit_set(vcpu, X86_CR0_PG));
}
static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
@@ -193,7 +207,7 @@ static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
{
- return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 57 : 48;
}
static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
@@ -315,6 +329,34 @@ extern struct kvm_caps kvm_caps;
extern bool enable_pmu;
+/*
+ * Get a filtered version of KVM's supported XCR0 that strips out dynamic
+ * features that the current process doesn't (yet) have permission to use.
+ * This is intended to be used only when enumerating support to userspace,
+ * e.g. in KVM_GET_SUPPORTED_CPUID and KVM_CAP_XSAVE2. It does NOT need to be
+ * used to check/restrict guest behavior, as KVM rejects KVM_SET_CPUID{2} if
+ * userspace attempts to enable unpermitted features.
+ */
+static inline u64 kvm_get_filtered_xcr0(void)
+{
+ u64 permitted_xcr0 = kvm_caps.supported_xcr0;
+
+ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA);
+
+ if (permitted_xcr0 & XFEATURE_MASK_USER_DYNAMIC) {
+ permitted_xcr0 &= xstate_get_guest_group_perm();
+
+ /*
+ * Treat XTILE_CFG as unsupported if the current process isn't
+ * allowed to use XTILE_DATA, as attempting to set XTILE_CFG in
+ * XCR0 without setting XTILE_DATA is architecturally illegal.
+ */
+ if (!(permitted_xcr0 & XFEATURE_MASK_XTILE_DATA))
+ permitted_xcr0 &= ~XFEATURE_MASK_XTILE_CFG;
+ }
+ return permitted_xcr0;
+}
+
static inline bool kvm_mpx_supported(void)
{
return (kvm_caps.supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))