From 45c3af974ec6c22d7392e15eb5d755e3c0079bd1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 25 Nov 2018 18:45:35 +0100 Subject: KVM: x86: Trace changes to active TSC offset regardless if vCPU in guest-mode For some reason, kvm_x86_ops->write_l1_tsc_offset() skipped trace of change to active TSC offset in case vCPU is in guest-mode. This patch changes write_l1_tsc_offset() behavior to trace any change to active TSC offset to aid debugging. The VMX code is changed to look more similar to SVM, which is in my opinion nicer. Based on a patch by Liran Alon. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 9 +++++---- arch/x86/kvm/vmx.c | 34 +++++++++++++++++----------------- 2 files changed, 22 insertions(+), 21 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cc6467b35a85..d7fec9f4af43 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1456,10 +1456,11 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) g_tsc_offset = svm->vmcb->control.tsc_offset - svm->nested.hsave->control.tsc_offset; svm->nested.hsave->control.tsc_offset = offset; - } else - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - svm->vmcb->control.tsc_offset, - offset); + } + + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + svm->vmcb->control.tsc_offset - g_tsc_offset, + offset); svm->vmcb->control.tsc_offset = offset + g_tsc_offset; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 02edd9960e9d..9e2438e3c646 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3466,24 +3466,24 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { - u64 active_offset = offset; - if (is_guest_mode(vcpu)) { - /* - * We're here if L1 chose not to trap WRMSR to TSC. According - * to the spec, this should set L1's TSC; The offset that L1 - * set for L2 remains unchanged, and still needs to be added - * to the newly set TSC to get L2's TSC. - */ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING)) - active_offset += vmcs12->tsc_offset; - } else { - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - vmcs_read64(TSC_OFFSET), offset); - } + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u64 g_tsc_offset = 0; + + /* + * We're here if L1 chose not to trap WRMSR to TSC. According + * to the spec, this should set L1's TSC; The offset that L1 + * set for L2 remains unchanged, and still needs to be added + * to the newly set TSC to get L2's TSC. + */ + if (is_guest_mode(vcpu) && + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + g_tsc_offset = vmcs12->tsc_offset; - vmcs_write64(TSC_OFFSET, active_offset); - return active_offset; + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + vcpu->arch.tsc_offset - g_tsc_offset, + offset); + vmcs_write64(TSC_OFFSET, offset + g_tsc_offset); + return offset + g_tsc_offset; } /* -- cgit v1.2.3 From 88656040b0c0c09202fbcb461a8c33d2ec1c0c19 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 24 Sep 2018 11:05:43 -0700 Subject: KVM: nVMX: Unrestricted guest mode requires EPT As specified in Intel's SDM, do not allow the L1 hypervisor to launch an L2 guest with the VM-execution controls for "unrestricted guest" or "mode-based execute control for EPT" set and the VM-execution control for "enable EPT" clear. Note that the VM-execution control for "mode-based execute control for EPT" is not yet virtualized by kvm. Reported-by: Andrew Thornton Signed-off-by: Jim Mattson Reviewed-by: Peter Shier Reviewed-by: Sean Christopherson Reviewed-by: Wanpeng Li Reviewed-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index ade0f153947d..898c443eeed1 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -78,6 +78,7 @@ #define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 +#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9e2438e3c646..2806f79e7631 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -12308,6 +12308,24 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && + !nested_cpu_has_ept(vmcs12)) + return -EINVAL; + return 0; +} + +static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && + !nested_cpu_has_ept(vmcs12)) + return -EINVAL; + return 0; +} + static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -13036,6 +13054,12 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_pml_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; -- cgit v1.2.3 From 00df9181de2a116af2bc81cce7e3dda4e0431a2d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 23 Oct 2018 00:09:11 +0200 Subject: KVM/nVMX: Remove unneeded forward jump in nested_vmx_check_vmentry_hw asm There is no need to jump just after the jump insn itself. Also, make code similar to entering guest mode in vmx_vcpu_run. Signed-off-by: Uros Bizjak Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2806f79e7631..6aaf4079541a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11274,7 +11274,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "je 3f \n\t" "mov %%" _ASM_AX", %%cr2 \n\t" "3: \n\t" - /* Check if vmlaunch of vmresume is needed */ + /* Check if vmlaunch or vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ "mov %c[rax](%0), %%" _ASM_AX " \n\t" @@ -13298,15 +13298,13 @@ static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t" - /* Check if vmlaunch of vmresume is needed */ + /* Check if vmlaunch or vmresume is needed */ "cmpl $0, %c[launched](%0)\n\t" - "je 1f\n\t" - __ex("vmresume") "\n\t" - "jmp 2f\n\t" - "1: " __ex("vmlaunch") "\n\t" + "jne 1f\n\t" + __ex("vmlaunch") "\n\t" "jmp 2f\n\t" + "1: " __ex("vmresume") "\n\t" "2: " - /* Set vmx->fail accordingly */ "setbe %c[fail](%0)\n\t" -- cgit v1.2.3 From 14aa61d0a9eb3ddad06c3a0033f88b5fa7f05613 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Thu, 1 Nov 2018 01:21:58 -0400 Subject: nVMX x86: Check VMX-preemption timer controls on vmentry of L2 guests According to section "Checks on VMX Controls" in Intel SDM vol 3C, the following check needs to be enforced on vmentry of L2 guests: If the "activate VMX-preemption timer" VM-execution control is 0, the the "save VMX-preemption timer value" VM-exit control must also be 0. Signed-off-by: Krish Sadhukhan Reviewed-by: Mihai Carabas Reviewed-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6aaf4079541a..d78fe0abf9ac 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2084,6 +2084,12 @@ static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); } +static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; +} + static inline bool is_nmi(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -13051,6 +13057,10 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!nested_cpu_has_preemption_timer(vmcs12) && + nested_cpu_has_save_preemption_timer(vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_pml_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; -- cgit v1.2.3 From fca91f6d60b6ee53b8d43c8ad5bad153a758961c Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 15 Mar 2017 07:40:55 -0700 Subject: kvm: nVMX: Set VM instruction error for VMPTRLD of unbacked page It is never correct for a VMX instruction to fail with "invalid VMCS" if there is, in fact, a current VMCS. Reads from unbacked addresses return all 1's, which means that an unbacked VMCS will not have the correct VMCS revision ID (i.e. VMCS12_REVISION). Fixes: 63846663eac78 ("KVM: nVMX: Implement VMPTRLD") Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d78fe0abf9ac..c379d0bfdcba 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9318,9 +9318,17 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) struct vmcs12 *new_vmcs12; struct page *page; page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) - return nested_vmx_failInvalid(vcpu); - + if (is_error_page(page)) { + /* + * Reads from an unbacked page return all 1s, + * which means that the 32 bits located at the + * given physical address won't match the required + * VMCS12_REVISION identifier. + */ + nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + return kvm_skip_emulated_instruction(vcpu); + } new_vmcs12 = kmap(page); if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || (new_vmcs12->hdr.shadow_vmcs && -- cgit v1.2.3 From e5d83c74a5800c2a1fa3ba982c1c4b2b39ae6db2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 16 Feb 2017 10:40:56 +0100 Subject: kvm: make KVM_CAP_ENABLE_CAP_VM architecture agnostic The first such capability to be handled in virt/kvm/ will be manual dirty page reprotection. Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 13 +++++++++---- arch/powerpc/kvm/powerpc.c | 14 ++------------ arch/s390/kvm/kvm-s390.c | 11 +---------- arch/x86/kvm/x86.c | 14 ++------------ include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 25 +++++++++++++++++++++++++ 6 files changed, 41 insertions(+), 38 deletions(-) (limited to 'arch/x86/kvm') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index cd209f7730af..1071c10cf1c7 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1129,10 +1129,15 @@ documentation when it pops into existence). 4.37 KVM_ENABLE_CAP -Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM -Architectures: x86 (only KVM_CAP_ENABLE_CAP_VM), - mips (only KVM_CAP_ENABLE_CAP), ppc, s390 -Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM) +Capability: KVM_CAP_ENABLE_CAP +Architectures: mips, ppc, s390 +Type: vcpu ioctl +Parameters: struct kvm_enable_cap (in) +Returns: 0 on success; -1 on error + +Capability: KVM_CAP_ENABLE_CAP_VM +Architectures: all +Type: vcpu ioctl Parameters: struct kvm_enable_cap (in) Returns: 0 on success; -1 on error diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2869a299c4ed..b1ed31a17a8c 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -518,7 +518,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: - case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_ONE_REG: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: @@ -2084,8 +2083,8 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, } -static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, - struct kvm_enable_cap *cap) +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) { int r; @@ -2273,15 +2272,6 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } - case KVM_ENABLE_CAP: - { - struct kvm_enable_cap cap; - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - goto out; - r = kvm_vm_ioctl_enable_cap(kvm, &cap); - break; - } #ifdef CONFIG_SPAPR_TCE_IOMMU case KVM_CREATE_SPAPR_TCE_64: { struct kvm_create_spapr_tce_64 create_tce_64; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index fe24150ff666..16c300bdf2c8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -464,7 +464,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_CSS_SUPPORT: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: - case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_S390_IRQCHIP: case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: @@ -607,7 +606,7 @@ static void icpt_operexc_on_all_vcpus(struct kvm *kvm) } } -static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; @@ -1933,14 +1932,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_s390_inject_vm(kvm, &s390int); break; } - case KVM_ENABLE_CAP: { - struct kvm_enable_cap cap; - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - break; - r = kvm_vm_ioctl_enable_cap(kvm, &cap); - break; - } case KVM_CREATE_IRQCHIP: { struct kvm_irq_routing_entry routing; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d02937760c3b..714c5eb0c3bd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3008,7 +3008,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: - case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: @@ -4431,8 +4430,8 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, return 0; } -static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, - struct kvm_enable_cap *cap) +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) { int r; @@ -4765,15 +4764,6 @@ set_identity_unlock: r = 0; break; } - case KVM_ENABLE_CAP: { - struct kvm_enable_cap cap; - - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - goto out; - r = kvm_vm_ioctl_enable_cap(kvm, &cap); - break; - } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; if (kvm_x86_ops->mem_enc_op) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c926698040e0..54cc06dd7e6c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -765,6 +765,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status); +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap); long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2679e476b6c3..1d6b77162d7c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2948,6 +2948,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #endif case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: + case KVM_CAP_ENABLE_CAP_VM: return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -2971,6 +2972,21 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) return kvm_vm_ioctl_check_extension(kvm, arg); } +int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + return -EINVAL; +} + +static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + switch (cap->cap) { + default: + return kvm_vm_ioctl_enable_cap(kvm, cap); + } +} + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2984,6 +3000,15 @@ static long kvm_vm_ioctl(struct file *filp, case KVM_CREATE_VCPU: r = kvm_vm_ioctl_create_vcpu(kvm, arg); break; + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); + break; + } case KVM_SET_USER_MEMORY_REGION: { struct kvm_userspace_memory_region kvm_userspace_mem; -- cgit v1.2.3 From 8fe65a8299f9e1f40cb95308ab7b3c4ad80bf801 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Oct 2018 02:18:42 +0200 Subject: kvm: rename last argument to kvm_get_dirty_log_protect When manual dirty log reprotect will be enabled, kvm_get_dirty_log_protect's pointer argument will always be false on exit, because no TLB flush is needed until the manual re-protection operation. Rename it from "is_dirty" to "flush", which more accurately tells the caller what they have to do with it. Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 6 +++--- arch/x86/kvm/x86.c | 6 +++--- include/linux/kvm_host.h | 2 +- virt/kvm/arm/arm.c | 6 +++--- virt/kvm/kvm_main.c | 6 +++--- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 1fcc4d149054..3898e657952e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1004,14 +1004,14 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - bool is_dirty = false; + bool flush = false; int r; mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); + r = kvm_get_dirty_log_protect(kvm, log, &flush); - if (is_dirty) { + if (flush) { slots = kvm_memslots(kvm); memslot = id_to_memslot(slots, log->slot); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 714c5eb0c3bd..448f011aa317 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4393,7 +4393,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - bool is_dirty = false; + bool flush = false; int r; mutex_lock(&kvm->slots_lock); @@ -4404,14 +4404,14 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) if (kvm_x86_ops->flush_log_dirty) kvm_x86_ops->flush_log_dirty(kvm); - r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); + r = kvm_get_dirty_log_protect(kvm, log, &flush); /* * All the TLBs can be flushed out of mmu lock, see the comments in * kvm_mmu_slot_remove_write_access(). */ lockdep_assert_held(&kvm->slots_lock); - if (is_dirty) + if (flush) kvm_flush_remote_tlbs(kvm); mutex_unlock(&kvm->slots_lock); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 54cc06dd7e6c..8c56b2873b13 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -753,7 +753,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, int *is_dirty); int kvm_get_dirty_log_protect(struct kvm *kvm, - struct kvm_dirty_log *log, bool *is_dirty); + struct kvm_dirty_log *log, bool *flush); void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 23774970c9df..120a2663dab9 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1205,14 +1205,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - bool is_dirty = false; + bool flush = false; int r; mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); + r = kvm_get_dirty_log_protect(kvm, log, &flush); - if (is_dirty) + if (flush) kvm_flush_remote_tlbs(kvm); mutex_unlock(&kvm->slots_lock); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1d6b77162d7c..54f0fcfd431e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1154,7 +1154,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log); * */ int kvm_get_dirty_log_protect(struct kvm *kvm, - struct kvm_dirty_log *log, bool *is_dirty) + struct kvm_dirty_log *log, bool *flush) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1181,7 +1181,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, memset(dirty_bitmap_buffer, 0, n); spin_lock(&kvm->mmu_lock); - *is_dirty = false; + *flush = false; for (i = 0; i < n / sizeof(long); i++) { unsigned long mask; gfn_t offset; @@ -1189,7 +1189,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, if (!dirty_bitmap[i]) continue; - *is_dirty = true; + *flush = true; mask = xchg(&dirty_bitmap[i], 0); dirty_bitmap_buffer[i] = mask; -- cgit v1.2.3 From 2a31b9db153530df4aa02dac8c32837bf5f47019 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Oct 2018 02:36:47 +0200 Subject: kvm: introduce manual dirty log reprotect There are two problems with KVM_GET_DIRTY_LOG. First, and less important, it can take kvm->mmu_lock for an extended period of time. Second, its user can actually see many false positives in some cases. The latter is due to a benign race like this: 1. KVM_GET_DIRTY_LOG returns a set of dirty pages and write protects them. 2. The guest modifies the pages, causing them to be marked ditry. 3. Userspace actually copies the pages. 4. KVM_GET_DIRTY_LOG returns those pages as dirty again, even though they were not written to since (3). This is especially a problem for large guests, where the time between (1) and (3) can be substantial. This patch introduces a new capability which, when enabled, makes KVM_GET_DIRTY_LOG not write-protect the pages it returns. Instead, userspace has to explicitly clear the dirty log bits just before using the content of the page. The new KVM_CLEAR_DIRTY_LOG ioctl can also operate on a 64-page granularity rather than requiring to sync a full memslot; this way, the mmu_lock is taken for small amounts of time, and only a small amount of time will pass between write protection of pages and the sending of their content. Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 67 +++++++++++ arch/mips/kvm/mips.c | 23 ++++ arch/x86/kvm/x86.c | 27 +++++ include/linux/kvm_host.h | 5 + include/uapi/linux/kvm.h | 15 +++ tools/testing/selftests/kvm/Makefile | 2 + tools/testing/selftests/kvm/clear_dirty_log_test.c | 2 + tools/testing/selftests/kvm/dirty_log_test.c | 19 +++ tools/testing/selftests/kvm/include/kvm_util.h | 2 + tools/testing/selftests/kvm/lib/kvm_util.c | 13 ++ virt/kvm/arm/arm.c | 16 +++ virt/kvm/kvm_main.c | 132 ++++++++++++++++++--- 12 files changed, 306 insertions(+), 17 deletions(-) create mode 100644 tools/testing/selftests/kvm/clear_dirty_log_test.c (limited to 'arch/x86/kvm') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 1071c10cf1c7..f2c345f7b630 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -305,6 +305,9 @@ the address space for which you want to return the dirty bitmap. They must be less than the value that KVM_CHECK_EXTENSION returns for the KVM_CAP_MULTI_ADDRESS_SPACE capability. +The bits in the dirty bitmap are cleared before the ioctl returns, unless +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is enabled. For more information, +see the description of the capability. 4.9 KVM_SET_MEMORY_ALIAS @@ -3758,6 +3761,46 @@ Coalesced pio is based on coalesced mmio. There is little difference between coalesced mmio and pio except that coalesced pio records accesses to I/O ports. +4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl) + +Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_dirty_log (in) +Returns: 0 on success, -1 on error + +/* for KVM_CLEAR_DIRTY_LOG */ +struct kvm_clear_dirty_log { + __u32 slot; + __u32 num_pages; + __u64 first_page; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; +}; + +The ioctl clears the dirty status of pages in a memory slot, according to +the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap +field. Bit 0 of the bitmap corresponds to page "first_page" in the +memory slot, and num_pages is the size in bits of the input bitmap. +Both first_page and num_pages must be a multiple of 64. For each bit +that is set in the input bitmap, the corresponding page is marked "clean" +in KVM's dirty bitmap, and dirty tracking is re-enabled for that page +(for example via write-protection, or by clearing the dirty bit in +a page table entry). + +If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies +the address space for which you want to return the dirty bitmap. +They must be less than the value that KVM_CHECK_EXTENSION returns for +the KVM_CAP_MULTI_ADDRESS_SPACE capability. + +This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT +is enabled; for more information, see the description of the capability. +However, it can always be used as long as KVM_CHECK_EXTENSION confirms +that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is present. + + 5. The kvm_run structure ------------------------ @@ -4652,6 +4695,30 @@ and injected exceptions. * For the new DR6 bits, note that bit 16 is set iff the #DB exception will clear DR6.RTM. +7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT + +Architectures: all +Parameters: args[0] whether feature should be enabled or not + +With this capability enabled, KVM_GET_DIRTY_LOG will not automatically +clear and write-protect all pages that are returned as dirty. +Rather, userspace will have to do this operation separately using +KVM_CLEAR_DIRTY_LOG. + +At the cost of a slightly more complicated operation, this provides better +scalability and responsiveness for two reasons. First, +KVM_CLEAR_DIRTY_LOG ioctl can operate on a 64-page granularity rather +than requiring to sync a full memslot; this ensures that KVM does not +take spinlocks for an extended period of time. Second, in some cases a +large amount of time can pass between a call to KVM_GET_DIRTY_LOG and +userspace actually using the data in the page. Pages can be modified +during this time, which is inefficint for both the guest and userspace: +the guest will incur a higher penalty due to write protection faults, +while userspace can see false reports of dirty pages. Manual reprotection +helps reducing this time, improving guest performance and reducing the +number of dirty log false positives. + + 8. Other capabilities. ---------------------- diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 3898e657952e..3734cd58895e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1023,6 +1023,29 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return r; } +int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + bool flush = false; + int r; + + mutex_lock(&kvm->slots_lock); + + r = kvm_clear_dirty_log_protect(kvm, log, &flush); + + if (flush) { + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, log->slot); + + /* Let implementation handle TLB/GVA invalidation */ + kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot); + } + + mutex_unlock(&kvm->slots_lock); + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { long r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 448f011aa317..6af846c54660 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4418,6 +4418,33 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return r; } +int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) +{ + bool flush = false; + int r; + + mutex_lock(&kvm->slots_lock); + + /* + * Flush potentially hardware-cached dirty pages to dirty_bitmap. + */ + if (kvm_x86_ops->flush_log_dirty) + kvm_x86_ops->flush_log_dirty(kvm); + + r = kvm_clear_dirty_log_protect(kvm, log, &flush); + + /* + * All the TLBs can be flushed out of mmu lock, see the comments in + * kvm_mmu_slot_remove_write_access(). + */ + lockdep_assert_held(&kvm->slots_lock); + if (flush) + kvm_flush_remote_tlbs(kvm); + + mutex_unlock(&kvm->slots_lock); + return r; +} + int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, bool line_status) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8c56b2873b13..e065aeaae29e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -449,6 +449,7 @@ struct kvm { #endif long tlbs_dirty; struct list_head devices; + bool manual_dirty_log_protect; struct dentry *debugfs_dentry; struct kvm_stat_data **debugfs_stat_data; struct srcu_struct srcu; @@ -754,6 +755,8 @@ int kvm_get_dirty_log(struct kvm *kvm, int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log, bool *flush); +int kvm_clear_dirty_log_protect(struct kvm *kvm, + struct kvm_clear_dirty_log *log, bool *flush); void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -762,6 +765,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); +int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, + struct kvm_clear_dirty_log *log); int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2b7a652c9fa4..9fe35f1ac938 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -492,6 +492,17 @@ struct kvm_dirty_log { }; }; +/* for KVM_CLEAR_DIRTY_LOG */ +struct kvm_clear_dirty_log { + __u32 slot; + __u32 num_pages; + __u64 first_page; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding2; + }; +}; + /* for KVM_SET_SIGNAL_MASK */ struct kvm_signal_mask { __u32 len; @@ -975,6 +986,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163 #define KVM_CAP_EXCEPTION_PAYLOAD 164 #define KVM_CAP_ARM_VM_IPA_SIZE 165 +#define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 #ifdef KVM_CAP_IRQ_ROUTING @@ -1421,6 +1433,9 @@ struct kvm_enc_region { #define KVM_GET_NESTED_STATE _IOWR(KVMIO, 0xbe, struct kvm_nested_state) #define KVM_SET_NESTED_STATE _IOW(KVMIO, 0xbf, struct kvm_nested_state) +/* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT */ +#define KVM_CLEAR_DIRTY_LOG _IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 52bfe5e76907..caaa0d5eba92 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -16,8 +16,10 @@ TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/state_test TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += dirty_log_test +TEST_GEN_PROGS_x86_64 += clear_dirty_log_test TEST_GEN_PROGS_aarch64 += dirty_log_test +TEST_GEN_PROGS_aarch64 += clear_dirty_log_test TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) diff --git a/tools/testing/selftests/kvm/clear_dirty_log_test.c b/tools/testing/selftests/kvm/clear_dirty_log_test.c new file mode 100644 index 000000000000..749336937d37 --- /dev/null +++ b/tools/testing/selftests/kvm/clear_dirty_log_test.c @@ -0,0 +1,2 @@ +#define USE_CLEAR_DIRTY_LOG +#include "dirty_log_test.c" diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index aeff95a91b15..4629c7ccfa28 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -275,6 +275,14 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code); +#ifdef USE_CLEAR_DIRTY_LOG + struct kvm_enable_cap cap = {}; + + cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT; + cap.args[0] = 1; + vm_enable_cap(vm, &cap); +#endif + /* Add an extra memory slot for testing dirty logging */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, guest_test_mem, @@ -316,6 +324,10 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, /* Give the vcpu thread some time to dirty some pages */ usleep(interval * 1000); kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); +#ifdef USE_CLEAR_DIRTY_LOG + kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0, + DIV_ROUND_UP(host_num_pages, 64) * 64); +#endif vm_dirty_log_verify(bmap); iteration++; sync_global_to_guest(vm, iteration); @@ -392,6 +404,13 @@ int main(int argc, char *argv[]) unsigned int mode; int opt, i; +#ifdef USE_CLEAR_DIRTY_LOG + if (!kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT)) { + fprintf(stderr, "KVM_CLEAR_DIRTY_LOG not available, skipping tests\n"); + exit(KSFT_SKIP); + } +#endif + while ((opt = getopt(argc, argv, "hi:I:o:tm:")) != -1) { switch (opt) { case 'i': diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index a4e59e3b4826..c51bfaba017a 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -58,6 +58,8 @@ void kvm_vm_free(struct kvm_vm *vmp); void kvm_vm_restart(struct kvm_vm *vmp, int perm); void kvm_vm_release(struct kvm_vm *vmp); void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log); +void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, + uint64_t first_page, uint32_t num_pages); int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, size_t len); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 1b41e71283d5..c9e94d6503af 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -231,6 +231,19 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) strerror(-ret)); } +void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, + uint64_t first_page, uint32_t num_pages) +{ + struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot, + .first_page = first_page, + .num_pages = num_pages }; + int ret; + + ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args); + TEST_ASSERT(ret == 0, "%s: KVM_CLEAR_DIRTY_LOG failed: %s", + strerror(-ret)); +} + /* * Userspace Memory Region Find * diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 120a2663dab9..e91adf77d99a 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1219,6 +1219,22 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return r; } +int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) +{ + bool flush = false; + int r; + + mutex_lock(&kvm->slots_lock); + + r = kvm_clear_dirty_log_protect(kvm, log, &flush); + + if (flush) + kvm_flush_remote_tlbs(kvm); + + mutex_unlock(&kvm->slots_lock); + return r; +} + static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 54f0fcfd431e..0041947b7390 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1133,7 +1133,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log); #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT /** * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages - * are dirty write protect them for next write. + * and reenable dirty page tracking for the corresponding pages. * @kvm: pointer to kvm instance * @log: slot id and address to which we copy the log * @is_dirty: flag set if any page is dirty @@ -1176,37 +1176,114 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, return -ENOENT; n = kvm_dirty_bitmap_bytes(memslot); + *flush = false; + if (kvm->manual_dirty_log_protect) { + /* + * Unlike kvm_get_dirty_log, we always return false in *flush, + * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There + * is some code duplication between this function and + * kvm_get_dirty_log, but hopefully all architecture + * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log + * can be eliminated. + */ + dirty_bitmap_buffer = dirty_bitmap; + } else { + dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); + memset(dirty_bitmap_buffer, 0, n); - dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); - memset(dirty_bitmap_buffer, 0, n); + spin_lock(&kvm->mmu_lock); + for (i = 0; i < n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; - spin_lock(&kvm->mmu_lock); + if (!dirty_bitmap[i]) + continue; + + *flush = true; + mask = xchg(&dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; + + if (mask) { + offset = i * BITS_PER_LONG; + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, + offset, mask); + } + } + spin_unlock(&kvm->mmu_lock); + } + + if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); + +/** + * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap + * and reenable dirty page tracking for the corresponding pages. + * @kvm: pointer to kvm instance + * @log: slot id and address from which to fetch the bitmap of dirty pages + */ +int kvm_clear_dirty_log_protect(struct kvm *kvm, + struct kvm_clear_dirty_log *log, bool *flush) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int as_id, id, n; + gfn_t offset; + unsigned long i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + + as_id = log->slot >> 16; + id = (u16)log->slot; + if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + return -EINVAL; + + if ((log->first_page & 63) || (log->num_pages & 63)) + return -EINVAL; + + slots = __kvm_memslots(kvm, as_id); + memslot = id_to_memslot(slots, id); + + dirty_bitmap = memslot->dirty_bitmap; + if (!dirty_bitmap) + return -ENOENT; + + n = kvm_dirty_bitmap_bytes(memslot); *flush = false; - for (i = 0; i < n / sizeof(long); i++) { - unsigned long mask; - gfn_t offset; + dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); + if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) + return -EFAULT; - if (!dirty_bitmap[i]) + spin_lock(&kvm->mmu_lock); + for (offset = log->first_page, + i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--; + i++, offset += BITS_PER_LONG) { + unsigned long mask = *dirty_bitmap_buffer++; + atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; + if (!mask) continue; - *flush = true; - - mask = xchg(&dirty_bitmap[i], 0); - dirty_bitmap_buffer[i] = mask; + mask &= atomic_long_fetch_andnot(mask, p); + /* + * mask contains the bits that really have been cleared. This + * never includes any bits beyond the length of the memslot (if + * the length is not aligned to 64 pages), therefore it is not + * a problem if userspace sets them in log->dirty_bitmap. + */ if (mask) { - offset = i * BITS_PER_LONG; + *flush = true; kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } } - spin_unlock(&kvm->mmu_lock); - if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) - return -EFAULT; + return 0; } -EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); +EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect); #endif bool kvm_largepages_enabled(void) @@ -2949,6 +3026,9 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: +#endif return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -2982,6 +3062,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) { switch (cap->cap) { +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: + if (cap->flags || (cap->args[0] & ~1)) + return -EINVAL; + kvm->manual_dirty_log_protect = cap->args[0]; + return 0; +#endif default: return kvm_vm_ioctl_enable_cap(kvm, cap); } @@ -3029,6 +3116,17 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CLEAR_DIRTY_LOG: { + struct kvm_clear_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, argp, sizeof(log))) + goto out; + r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); + break; + } +#endif #ifdef CONFIG_KVM_MMIO case KVM_REGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; -- cgit v1.2.3 From 1b3ab5ad1b8ad99bae76ec583809c5f5a31c707c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:52:51 -0800 Subject: KVM: nVMX: Free the VMREAD/VMWRITE bitmaps if alloc_kvm_area() fails Fixes: 34a1cd60d17f ("kvm: x86: vmx: move some vmx setting from vmx_init() to hardware_setup()") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c379d0bfdcba..3ec47b7a94d6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8037,13 +8037,16 @@ static __init int hardware_setup(void) kvm_mce_cap_supported |= MCG_LMCE_P; - return alloc_kvm_area(); + r = alloc_kvm_area(); + if (r) + goto out; + return 0; out: for (i = 0; i < VMX_BITMAP_NR; i++) free_page((unsigned long)vmx_bitmap[i]); - return r; + return r; } static __exit void hardware_unsetup(void) -- cgit v1.2.3 From dfae3c03b89fd5547b1adee857b10dc6f1c66132 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:52:52 -0800 Subject: KVM: nVMX: Allocate and configure VM{READ,WRITE} bitmaps iff enable_shadow_vmcs ...and make enable_shadow_vmcs depend on nested. Aside from the obvious memory savings, this will allow moving the relevant code out of vmx.c in the future, e.g. to a nested specific file. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3ec47b7a94d6..49c5b155e07d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4862,6 +4862,9 @@ static void init_vmcs_shadow_fields(void) { int i, j; + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); + for (i = j = 0; i < max_shadow_read_only_fields; i++) { u16 field = shadow_read_only_fields[i]; if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && @@ -7904,19 +7907,8 @@ static __init int hardware_setup(void) for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); - for (i = 0; i < VMX_BITMAP_NR; i++) { - vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_bitmap[i]) - goto out; - } - - memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); - memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - - if (setup_vmcs_config(&vmcs_config) < 0) { - r = -EIO; - goto out; - } + if (setup_vmcs_config(&vmcs_config) < 0) + return -EIO; if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); @@ -8027,10 +8019,18 @@ static __init int hardware_setup(void) kvm_x86_ops->cancel_hv_timer = NULL; } - if (!cpu_has_vmx_shadow_vmcs()) + if (!cpu_has_vmx_shadow_vmcs() || !nested) enable_shadow_vmcs = 0; - if (enable_shadow_vmcs) + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) { + vmx_bitmap[i] = (unsigned long *) + __get_free_page(GFP_KERNEL); + if (!vmx_bitmap[i]) + goto out; + } + init_vmcs_shadow_fields(); + } kvm_set_posted_intr_wakeup_handler(wakeup_handler); nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv); @@ -8043,9 +8043,10 @@ static __init int hardware_setup(void) return 0; out: - for (i = 0; i < VMX_BITMAP_NR; i++) - free_page((unsigned long)vmx_bitmap[i]); - + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); + } return r; } @@ -8053,8 +8054,10 @@ static __exit void hardware_unsetup(void) { int i; - for (i = 0; i < VMX_BITMAP_NR; i++) - free_page((unsigned long)vmx_bitmap[i]); + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); + } free_kvm_area(); } -- cgit v1.2.3 From 199b118ab3d528371f6e3ac9f6001a0f3fd11d04 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:52:53 -0800 Subject: KVM: VMX: Alphabetize the includes in vmx.c ...to prepare for the creation of a "vmx" subdirectory that will contain a variety of headers. Clean things up now to avoid making a bigger mess in the future. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 49c5b155e07d..589230c923e2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -16,49 +16,48 @@ * */ -#include "irq.h" -#include "mmu.h" -#include "cpuid.h" -#include "lapic.h" -#include "hyperv.h" - +#include +#include +#include +#include #include #include -#include -#include -#include -#include #include #include -#include +#include +#include +#include #include #include -#include -#include -#include -#include "kvm_cache_regs.h" -#include "x86.h" +#include +#include #include #include -#include +#include #include -#include -#include -#include #include -#include -#include -#include -#include +#include #include +#include +#include +#include #include -#include #include +#include +#include +#include -#include "trace.h" +#include "cpuid.h" +#include "hyperv.h" +#include "irq.h" +#include "kvm_cache_regs.h" +#include "lapic.h" +#include "mmu.h" #include "pmu.h" +#include "trace.h" #include "vmx_evmcs.h" +#include "x86.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) #define __ex_clear(x, reg) \ -- cgit v1.2.3 From 8ba2e525ecd7428e25d80f37c533612d62f2dc26 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:52:54 -0800 Subject: KVM: x86: Add requisite includes to kvm_cache_regs.h Until this point vmx.c has been the only consumer and included the file after many others. Prepare for multiple consumers, i.e. the shattering of vmx.c Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm_cache_regs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 9619dcc2b325..f8f56a93358b 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -2,6 +2,8 @@ #ifndef ASM_KVM_CACHE_REGS_H #define ASM_KVM_CACHE_REGS_H +#include + #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ -- cgit v1.2.3 From 3592cda6bc27fd6e73f73a6e793cbd0c09a07a36 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:52:55 -0800 Subject: KVM: x86: Add requisite includes to hyperv.h Until this point vmx.c has been the only consumer and included the file after many others. Prepare for multiple consumers, i.e. the shattering of vmx.c Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 0e66c12ed2c3..9c21c3479899 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -24,6 +24,8 @@ #ifndef __ARCH_X86_KVM_HYPERV_H__ #define __ARCH_X86_KVM_HYPERV_H__ +#include + static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) { return &vcpu->arch.hyperv; -- cgit v1.2.3 From a821bab2d1ee869e04b218b198837bf07f2d27c1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:52:56 -0800 Subject: KVM: VMX: Move VMX specific files to a "vmx" subdirectory ...to prepare for shattering vmx.c into multiple files without having to prepend "vmx_" to all new files. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/pmu_intel.c | 358 - arch/x86/kvm/vmx.c | 15295 --------------------------------- arch/x86/kvm/vmx/pmu_intel.c | 358 + arch/x86/kvm/vmx/vmx.c | 15295 +++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx_evmcs.h | 324 + arch/x86/kvm/vmx/vmx_shadow_fields.h | 74 + arch/x86/kvm/vmx_evmcs.h | 324 - arch/x86/kvm/vmx_shadow_fields.h | 74 - 9 files changed, 16052 insertions(+), 16052 deletions(-) delete mode 100644 arch/x86/kvm/pmu_intel.c delete mode 100644 arch/x86/kvm/vmx.c create mode 100644 arch/x86/kvm/vmx/pmu_intel.c create mode 100644 arch/x86/kvm/vmx/vmx.c create mode 100644 arch/x86/kvm/vmx/vmx_evmcs.h create mode 100644 arch/x86/kvm/vmx/vmx_shadow_fields.h delete mode 100644 arch/x86/kvm/vmx_evmcs.h delete mode 100644 arch/x86/kvm/vmx_shadow_fields.h (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index dc4f2fdf5e57..13fd54de5449 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -16,7 +16,7 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-intel-y += vmx.o pmu_intel.o +kvm-intel-y += vmx/vmx.o vmx/pmu_intel.o kvm-amd-y += svm.o pmu_amd.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c deleted file mode 100644 index 5ab4a364348e..000000000000 --- a/arch/x86/kvm/pmu_intel.c +++ /dev/null @@ -1,358 +0,0 @@ -/* - * KVM PMU support for Intel CPUs - * - * Copyright 2011 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Avi Kivity - * Gleb Natapov - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ -#include -#include -#include -#include -#include "x86.h" -#include "cpuid.h" -#include "lapic.h" -#include "pmu.h" - -static struct kvm_event_hw_type_mapping intel_arch_events[] = { - /* Index must match CPUID 0x0A.EBX bit vector */ - [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, - [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, - [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, - [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, -}; - -/* mapping between fixed pmc index and intel_arch_events array */ -static int fixed_pmc_events[] = {1, 0, 7}; - -static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) -{ - int i; - - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - u8 new_ctrl = fixed_ctrl_field(data, i); - u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); - struct kvm_pmc *pmc; - - pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); - - if (old_ctrl == new_ctrl) - continue; - - reprogram_fixed_counter(pmc, new_ctrl, i); - } - - pmu->fixed_ctr_ctrl = data; -} - -/* function is called when global control register has been updated. */ -static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) -{ - int bit; - u64 diff = pmu->global_ctrl ^ data; - - pmu->global_ctrl = data; - - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - reprogram_counter(pmu, bit); -} - -static unsigned intel_find_arch_event(struct kvm_pmu *pmu, - u8 event_select, - u8 unit_mask) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) - if (intel_arch_events[i].eventsel == event_select - && intel_arch_events[i].unit_mask == unit_mask - && (pmu->available_event_types & (1 << i))) - break; - - if (i == ARRAY_SIZE(intel_arch_events)) - return PERF_COUNT_HW_MAX; - - return intel_arch_events[i].event_type; -} - -static unsigned intel_find_fixed_event(int idx) -{ - if (idx >= ARRAY_SIZE(fixed_pmc_events)) - return PERF_COUNT_HW_MAX; - - return intel_arch_events[fixed_pmc_events[idx]].event_type; -} - -/* check if a PMC is enabled by comparing it with globl_ctrl bits. */ -static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - -static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) -{ - if (pmc_idx < INTEL_PMC_IDX_FIXED) - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, - MSR_P6_EVNTSEL0); - else { - u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; - - return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); - } -} - -/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - bool fixed = idx & (1u << 30); - - idx &= ~(3u << 30); - - return (!fixed && idx >= pmu->nr_arch_gp_counters) || - (fixed && idx >= pmu->nr_arch_fixed_counters); -} - -static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, - unsigned idx) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - bool fixed = idx & (1u << 30); - struct kvm_pmc *counters; - - idx &= ~(3u << 30); - if (!fixed && idx >= pmu->nr_arch_gp_counters) - return NULL; - if (fixed && idx >= pmu->nr_arch_fixed_counters) - return NULL; - counters = fixed ? pmu->fixed_counters : pmu->gp_counters; - - return &counters[idx]; -} - -static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - int ret; - - switch (msr) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - case MSR_CORE_PERF_GLOBAL_STATUS: - case MSR_CORE_PERF_GLOBAL_CTRL: - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - ret = pmu->version > 1; - break; - default: - ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || - get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || - get_fixed_pmc(pmu, msr); - break; - } - - return ret; -} - -static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc; - - switch (msr) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - *data = pmu->fixed_ctr_ctrl; - return 0; - case MSR_CORE_PERF_GLOBAL_STATUS: - *data = pmu->global_status; - return 0; - case MSR_CORE_PERF_GLOBAL_CTRL: - *data = pmu->global_ctrl; - return 0; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - *data = pmu->global_ovf_ctrl; - return 0; - default: - if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || - (pmc = get_fixed_pmc(pmu, msr))) { - *data = pmc_read_counter(pmc); - return 0; - } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { - *data = pmc->eventsel; - return 0; - } - } - - return 1; -} - -static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc; - u32 msr = msr_info->index; - u64 data = msr_info->data; - - switch (msr) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (pmu->fixed_ctr_ctrl == data) - return 0; - if (!(data & 0xfffffffffffff444ull)) { - reprogram_fixed_counters(pmu, data); - return 0; - } - break; - case MSR_CORE_PERF_GLOBAL_STATUS: - if (msr_info->host_initiated) { - pmu->global_status = data; - return 0; - } - break; /* RO MSR */ - case MSR_CORE_PERF_GLOBAL_CTRL: - if (pmu->global_ctrl == data) - return 0; - if (!(data & pmu->global_ctrl_mask)) { - global_ctrl_changed(pmu, data); - return 0; - } - break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { - if (!msr_info->host_initiated) - pmu->global_status &= ~data; - pmu->global_ovf_ctrl = data; - return 0; - } - break; - default: - if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || - (pmc = get_fixed_pmc(pmu, msr))) { - if (!msr_info->host_initiated) - data = (s64)(s32)data; - pmc->counter += data - pmc_read_counter(pmc); - return 0; - } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { - if (data == pmc->eventsel) - return 0; - if (!(data & pmu->reserved_bits)) { - reprogram_gp_counter(pmc, data); - return 0; - } - } - } - - return 1; -} - -static void intel_pmu_refresh(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_cpuid_entry2 *entry; - union cpuid10_eax eax; - union cpuid10_edx edx; - - pmu->nr_arch_gp_counters = 0; - pmu->nr_arch_fixed_counters = 0; - pmu->counter_bitmask[KVM_PMC_GP] = 0; - pmu->counter_bitmask[KVM_PMC_FIXED] = 0; - pmu->version = 0; - pmu->reserved_bits = 0xffffffff00200000ull; - - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry) - return; - eax.full = entry->eax; - edx.full = entry->edx; - - pmu->version = eax.split.version_id; - if (!pmu->version) - return; - - pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - INTEL_PMC_MAX_GENERIC); - pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; - pmu->available_event_types = ~entry->ebx & - ((1ull << eax.split.mask_length) - 1); - - if (pmu->version == 1) { - pmu->nr_arch_fixed_counters = 0; - } else { - pmu->nr_arch_fixed_counters = - min_t(int, edx.split.num_counters_fixed, - INTEL_PMC_MAX_FIXED); - pmu->counter_bitmask[KVM_PMC_FIXED] = - ((u64)1 << edx.split.bit_width_fixed) - 1; - } - - pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); - pmu->global_ctrl_mask = ~pmu->global_ctrl; - - entry = kvm_find_cpuid_entry(vcpu, 7, 0); - if (entry && - (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && - (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) - pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; -} - -static void intel_pmu_init(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { - pmu->gp_counters[i].type = KVM_PMC_GP; - pmu->gp_counters[i].vcpu = vcpu; - pmu->gp_counters[i].idx = i; - } - - for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { - pmu->fixed_counters[i].type = KVM_PMC_FIXED; - pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; - } -} - -static void intel_pmu_reset(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - int i; - - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { - struct kvm_pmc *pmc = &pmu->gp_counters[i]; - - pmc_stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; - } - - for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) - pmc_stop_counter(&pmu->fixed_counters[i]); - - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = - pmu->global_ovf_ctrl = 0; -} - -struct kvm_pmu_ops intel_pmu_ops = { - .find_arch_event = intel_find_arch_event, - .find_fixed_event = intel_find_fixed_event, - .pmc_is_enabled = intel_pmc_is_enabled, - .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, - .msr_idx_to_pmc = intel_msr_idx_to_pmc, - .is_valid_msr_idx = intel_is_valid_msr_idx, - .is_valid_msr = intel_is_valid_msr, - .get_msr = intel_pmu_get_msr, - .set_msr = intel_pmu_set_msr, - .refresh = intel_pmu_refresh, - .init = intel_pmu_init, - .reset = intel_pmu_reset, -}; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c deleted file mode 100644 index 589230c923e2..000000000000 --- a/arch/x86/kvm/vmx.c +++ /dev/null @@ -1,15295 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Avi Kivity - * Yaniv Kamay - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cpuid.h" -#include "hyperv.h" -#include "irq.h" -#include "kvm_cache_regs.h" -#include "lapic.h" -#include "mmu.h" -#include "pmu.h" -#include "trace.h" -#include "vmx_evmcs.h" -#include "x86.h" - -#define __ex(x) __kvm_handle_fault_on_reboot(x) -#define __ex_clear(x, reg) \ - ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) - -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); - -static const struct x86_cpu_id vmx_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_VMX), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); - -static bool __read_mostly enable_vpid = 1; -module_param_named(vpid, enable_vpid, bool, 0444); - -static bool __read_mostly enable_vnmi = 1; -module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); - -static bool __read_mostly flexpriority_enabled = 1; -module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); - -static bool __read_mostly enable_ept = 1; -module_param_named(ept, enable_ept, bool, S_IRUGO); - -static bool __read_mostly enable_unrestricted_guest = 1; -module_param_named(unrestricted_guest, - enable_unrestricted_guest, bool, S_IRUGO); - -static bool __read_mostly enable_ept_ad_bits = 1; -module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); - -static bool __read_mostly emulate_invalid_guest_state = true; -module_param(emulate_invalid_guest_state, bool, S_IRUGO); - -static bool __read_mostly fasteoi = 1; -module_param(fasteoi, bool, S_IRUGO); - -static bool __read_mostly enable_apicv = 1; -module_param(enable_apicv, bool, S_IRUGO); - -static bool __read_mostly enable_shadow_vmcs = 1; -module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); -/* - * If nested=1, nested virtualization is supported, i.e., guests may use - * VMX and be a hypervisor for its own guests. If nested=0, guests may not - * use VMX instructions. - */ -static bool __read_mostly nested = 1; -module_param(nested, bool, S_IRUGO); - -static bool __read_mostly nested_early_check = 0; -module_param(nested_early_check, bool, S_IRUGO); - -static u64 __read_mostly host_xss; - -static bool __read_mostly enable_pml = 1; -module_param_named(pml, enable_pml, bool, S_IRUGO); - -#define MSR_TYPE_R 1 -#define MSR_TYPE_W 2 -#define MSR_TYPE_RW 3 - -#define MSR_BITMAP_MODE_X2APIC 1 -#define MSR_BITMAP_MODE_X2APIC_APICV 2 - -#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL - -/* Guest_tsc -> host_tsc conversion requires 64-bit division. */ -static int __read_mostly cpu_preemption_timer_multi; -static bool __read_mostly enable_preemption_timer = 1; -#ifdef CONFIG_X86_64 -module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); -#endif - -#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ - X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) -#define KVM_CR4_GUEST_OWNED_BITS \ - (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) - -#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - -#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) - -#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 - -/* - * Hyper-V requires all of these, so mark them as supported even though - * they are just treated the same as all-context. - */ -#define VMX_VPID_EXTENT_SUPPORTED_MASK \ - (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ - VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ - VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ - VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) - -/* - * These 2 parameters are used to config the controls for Pause-Loop Exiting: - * ple_gap: upper bound on the amount of time between two successive - * executions of PAUSE in a loop. Also indicate if ple enabled. - * According to test, this time is usually smaller than 128 cycles. - * ple_window: upper bound on the amount of time a guest is allowed to execute - * in a PAUSE loop. Tests indicate that most spinlocks are held for - * less than 2^12 cycles - * Time is measured based on a counter that runs at the same rate as the TSC, - * refer SDM volume 3b section 21.6.13 & 22.1.3. - */ -static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; -module_param(ple_gap, uint, 0444); - -static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; -module_param(ple_window, uint, 0444); - -/* Default doubles per-vcpu window every exit. */ -static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; -module_param(ple_window_grow, uint, 0444); - -/* Default resets per-vcpu window every exit to ple_window. */ -static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; -module_param(ple_window_shrink, uint, 0444); - -/* Default is to compute the maximum so we can never overflow. */ -static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; -module_param(ple_window_max, uint, 0444); - -extern const ulong vmx_return; -extern const ulong vmx_early_consistency_check_return; - -static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); -static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); -static DEFINE_MUTEX(vmx_l1d_flush_mutex); - -/* Storage for pre module init parameter parsing */ -static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; - -static const struct { - const char *option; - bool for_parse; -} vmentry_l1d_param[] = { - [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, - [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, - [VMENTER_L1D_FLUSH_COND] = {"cond", true}, - [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, - [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, - [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, -}; - -#define L1D_CACHE_ORDER 4 -static void *vmx_l1d_flush_pages; - -static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) -{ - struct page *page; - unsigned int i; - - if (!enable_ept) { - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; - return 0; - } - - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { - u64 msr; - - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); - if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; - return 0; - } - } - - /* If set to auto use the default l1tf mitigation method */ - if (l1tf == VMENTER_L1D_FLUSH_AUTO) { - switch (l1tf_mitigation) { - case L1TF_MITIGATION_OFF: - l1tf = VMENTER_L1D_FLUSH_NEVER; - break; - case L1TF_MITIGATION_FLUSH_NOWARN: - case L1TF_MITIGATION_FLUSH: - case L1TF_MITIGATION_FLUSH_NOSMT: - l1tf = VMENTER_L1D_FLUSH_COND; - break; - case L1TF_MITIGATION_FULL: - case L1TF_MITIGATION_FULL_FORCE: - l1tf = VMENTER_L1D_FLUSH_ALWAYS; - break; - } - } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { - l1tf = VMENTER_L1D_FLUSH_ALWAYS; - } - - if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && - !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { - page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); - if (!page) - return -ENOMEM; - vmx_l1d_flush_pages = page_address(page); - - /* - * Initialize each page with a different pattern in - * order to protect against KSM in the nested - * virtualization case. - */ - for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { - memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, - PAGE_SIZE); - } - } - - l1tf_vmx_mitigation = l1tf; - - if (l1tf != VMENTER_L1D_FLUSH_NEVER) - static_branch_enable(&vmx_l1d_should_flush); - else - static_branch_disable(&vmx_l1d_should_flush); - - if (l1tf == VMENTER_L1D_FLUSH_COND) - static_branch_enable(&vmx_l1d_flush_cond); - else - static_branch_disable(&vmx_l1d_flush_cond); - return 0; -} - -static int vmentry_l1d_flush_parse(const char *s) -{ - unsigned int i; - - if (s) { - for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { - if (vmentry_l1d_param[i].for_parse && - sysfs_streq(s, vmentry_l1d_param[i].option)) - return i; - } - } - return -EINVAL; -} - -static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) -{ - int l1tf, ret; - - l1tf = vmentry_l1d_flush_parse(s); - if (l1tf < 0) - return l1tf; - - if (!boot_cpu_has(X86_BUG_L1TF)) - return 0; - - /* - * Has vmx_init() run already? If not then this is the pre init - * parameter parsing. In that case just store the value and let - * vmx_init() do the proper setup after enable_ept has been - * established. - */ - if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { - vmentry_l1d_flush_param = l1tf; - return 0; - } - - mutex_lock(&vmx_l1d_flush_mutex); - ret = vmx_setup_l1d_flush(l1tf); - mutex_unlock(&vmx_l1d_flush_mutex); - return ret; -} - -static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) -{ - if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) - return sprintf(s, "???\n"); - - return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); -} - -static const struct kernel_param_ops vmentry_l1d_flush_ops = { - .set = vmentry_l1d_flush_set, - .get = vmentry_l1d_flush_get, -}; -module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); - -enum ept_pointers_status { - EPT_POINTERS_CHECK = 0, - EPT_POINTERS_MATCH = 1, - EPT_POINTERS_MISMATCH = 2 -}; - -struct kvm_vmx { - struct kvm kvm; - - unsigned int tss_addr; - bool ept_identity_pagetable_done; - gpa_t ept_identity_map_addr; - - enum ept_pointers_status ept_pointers_match; - spinlock_t ept_pointer_lock; -}; - -#define NR_AUTOLOAD_MSRS 8 - -struct vmcs_hdr { - u32 revision_id:31; - u32 shadow_vmcs:1; -}; - -struct vmcs { - struct vmcs_hdr hdr; - u32 abort; - char data[0]; -}; - -/* - * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT - * and whose values change infrequently, but are not constant. I.e. this is - * used as a write-through cache of the corresponding VMCS fields. - */ -struct vmcs_host_state { - unsigned long cr3; /* May not match real cr3 */ - unsigned long cr4; /* May not match real cr4 */ - unsigned long gs_base; - unsigned long fs_base; - - u16 fs_sel, gs_sel, ldt_sel; -#ifdef CONFIG_X86_64 - u16 ds_sel, es_sel; -#endif -}; - -/* - * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also - * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs - * loaded on this CPU (so we can clear them if the CPU goes down). - */ -struct loaded_vmcs { - struct vmcs *vmcs; - struct vmcs *shadow_vmcs; - int cpu; - bool launched; - bool nmi_known_unmasked; - bool hv_timer_armed; - /* Support for vnmi-less CPUs */ - int soft_vnmi_blocked; - ktime_t entry_time; - s64 vnmi_blocked_time; - unsigned long *msr_bitmap; - struct list_head loaded_vmcss_on_cpu_link; - struct vmcs_host_state host_state; -}; - -struct shared_msr_entry { - unsigned index; - u64 data; - u64 mask; -}; - -/* - * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a - * single nested guest (L2), hence the name vmcs12. Any VMX implementation has - * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is - * stored in guest memory specified by VMPTRLD, but is opaque to the guest, - * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. - * More than one of these structures may exist, if L1 runs multiple L2 guests. - * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the - * underlying hardware which will be used to run L2. - * This structure is packed to ensure that its layout is identical across - * machines (necessary for live migration). - * - * IMPORTANT: Changing the layout of existing fields in this structure - * will break save/restore compatibility with older kvm releases. When - * adding new fields, either use space in the reserved padding* arrays - * or add the new fields to the end of the structure. - */ -typedef u64 natural_width; -struct __packed vmcs12 { - /* According to the Intel spec, a VMCS region must start with the - * following two fields. Then follow implementation-specific data. - */ - struct vmcs_hdr hdr; - u32 abort; - - u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ - u32 padding[7]; /* room for future expansion */ - - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - u64 tsc_offset; - u64 virtual_apic_page_addr; - u64 apic_access_addr; - u64 posted_intr_desc_addr; - u64 ept_pointer; - u64 eoi_exit_bitmap0; - u64 eoi_exit_bitmap1; - u64 eoi_exit_bitmap2; - u64 eoi_exit_bitmap3; - u64 xss_exit_bitmap; - u64 guest_physical_address; - u64 vmcs_link_pointer; - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; - u64 guest_ia32_perf_global_ctrl; - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - u64 guest_bndcfgs; - u64 host_ia32_pat; - u64 host_ia32_efer; - u64 host_ia32_perf_global_ctrl; - u64 vmread_bitmap; - u64 vmwrite_bitmap; - u64 vm_function_control; - u64 eptp_list_address; - u64 pml_address; - u64 padding64[3]; /* room for future expansion */ - /* - * To allow migration of L1 (complete with its L2 guests) between - * machines of different natural widths (32 or 64 bit), we cannot have - * unsigned long fields with no explict size. We use u64 (aliased - * natural_width) instead. Luckily, x86 is little-endian. - */ - natural_width cr0_guest_host_mask; - natural_width cr4_guest_host_mask; - natural_width cr0_read_shadow; - natural_width cr4_read_shadow; - natural_width cr3_target_value0; - natural_width cr3_target_value1; - natural_width cr3_target_value2; - natural_width cr3_target_value3; - natural_width exit_qualification; - natural_width guest_linear_address; - natural_width guest_cr0; - natural_width guest_cr3; - natural_width guest_cr4; - natural_width guest_es_base; - natural_width guest_cs_base; - natural_width guest_ss_base; - natural_width guest_ds_base; - natural_width guest_fs_base; - natural_width guest_gs_base; - natural_width guest_ldtr_base; - natural_width guest_tr_base; - natural_width guest_gdtr_base; - natural_width guest_idtr_base; - natural_width guest_dr7; - natural_width guest_rsp; - natural_width guest_rip; - natural_width guest_rflags; - natural_width guest_pending_dbg_exceptions; - natural_width guest_sysenter_esp; - natural_width guest_sysenter_eip; - natural_width host_cr0; - natural_width host_cr3; - natural_width host_cr4; - natural_width host_fs_base; - natural_width host_gs_base; - natural_width host_tr_base; - natural_width host_gdtr_base; - natural_width host_idtr_base; - natural_width host_ia32_sysenter_esp; - natural_width host_ia32_sysenter_eip; - natural_width host_rsp; - natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ - u32 pin_based_vm_exec_control; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - u32 cr3_target_count; - u32 vm_exit_controls; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_controls; - u32 vm_entry_msr_load_count; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - u32 secondary_vm_exec_control; - u32 vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 guest_idtr_limit; - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - u32 guest_interruptibility_info; - u32 guest_activity_state; - u32 guest_sysenter_cs; - u32 host_ia32_sysenter_cs; - u32 vmx_preemption_timer_value; - u32 padding32[7]; /* room for future expansion */ - u16 virtual_processor_id; - u16 posted_intr_nv; - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; - u16 guest_intr_status; - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; - u16 guest_pml_index; -}; - -/* - * For save/restore compatibility, the vmcs12 field offsets must not change. - */ -#define CHECK_OFFSET(field, loc) \ - BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ - "Offset of " #field " in struct vmcs12 has changed.") - -static inline void vmx_check_vmcs12_offsets(void) { - CHECK_OFFSET(hdr, 0); - CHECK_OFFSET(abort, 4); - CHECK_OFFSET(launch_state, 8); - CHECK_OFFSET(io_bitmap_a, 40); - CHECK_OFFSET(io_bitmap_b, 48); - CHECK_OFFSET(msr_bitmap, 56); - CHECK_OFFSET(vm_exit_msr_store_addr, 64); - CHECK_OFFSET(vm_exit_msr_load_addr, 72); - CHECK_OFFSET(vm_entry_msr_load_addr, 80); - CHECK_OFFSET(tsc_offset, 88); - CHECK_OFFSET(virtual_apic_page_addr, 96); - CHECK_OFFSET(apic_access_addr, 104); - CHECK_OFFSET(posted_intr_desc_addr, 112); - CHECK_OFFSET(ept_pointer, 120); - CHECK_OFFSET(eoi_exit_bitmap0, 128); - CHECK_OFFSET(eoi_exit_bitmap1, 136); - CHECK_OFFSET(eoi_exit_bitmap2, 144); - CHECK_OFFSET(eoi_exit_bitmap3, 152); - CHECK_OFFSET(xss_exit_bitmap, 160); - CHECK_OFFSET(guest_physical_address, 168); - CHECK_OFFSET(vmcs_link_pointer, 176); - CHECK_OFFSET(guest_ia32_debugctl, 184); - CHECK_OFFSET(guest_ia32_pat, 192); - CHECK_OFFSET(guest_ia32_efer, 200); - CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208); - CHECK_OFFSET(guest_pdptr0, 216); - CHECK_OFFSET(guest_pdptr1, 224); - CHECK_OFFSET(guest_pdptr2, 232); - CHECK_OFFSET(guest_pdptr3, 240); - CHECK_OFFSET(guest_bndcfgs, 248); - CHECK_OFFSET(host_ia32_pat, 256); - CHECK_OFFSET(host_ia32_efer, 264); - CHECK_OFFSET(host_ia32_perf_global_ctrl, 272); - CHECK_OFFSET(vmread_bitmap, 280); - CHECK_OFFSET(vmwrite_bitmap, 288); - CHECK_OFFSET(vm_function_control, 296); - CHECK_OFFSET(eptp_list_address, 304); - CHECK_OFFSET(pml_address, 312); - CHECK_OFFSET(cr0_guest_host_mask, 344); - CHECK_OFFSET(cr4_guest_host_mask, 352); - CHECK_OFFSET(cr0_read_shadow, 360); - CHECK_OFFSET(cr4_read_shadow, 368); - CHECK_OFFSET(cr3_target_value0, 376); - CHECK_OFFSET(cr3_target_value1, 384); - CHECK_OFFSET(cr3_target_value2, 392); - CHECK_OFFSET(cr3_target_value3, 400); - CHECK_OFFSET(exit_qualification, 408); - CHECK_OFFSET(guest_linear_address, 416); - CHECK_OFFSET(guest_cr0, 424); - CHECK_OFFSET(guest_cr3, 432); - CHECK_OFFSET(guest_cr4, 440); - CHECK_OFFSET(guest_es_base, 448); - CHECK_OFFSET(guest_cs_base, 456); - CHECK_OFFSET(guest_ss_base, 464); - CHECK_OFFSET(guest_ds_base, 472); - CHECK_OFFSET(guest_fs_base, 480); - CHECK_OFFSET(guest_gs_base, 488); - CHECK_OFFSET(guest_ldtr_base, 496); - CHECK_OFFSET(guest_tr_base, 504); - CHECK_OFFSET(guest_gdtr_base, 512); - CHECK_OFFSET(guest_idtr_base, 520); - CHECK_OFFSET(guest_dr7, 528); - CHECK_OFFSET(guest_rsp, 536); - CHECK_OFFSET(guest_rip, 544); - CHECK_OFFSET(guest_rflags, 552); - CHECK_OFFSET(guest_pending_dbg_exceptions, 560); - CHECK_OFFSET(guest_sysenter_esp, 568); - CHECK_OFFSET(guest_sysenter_eip, 576); - CHECK_OFFSET(host_cr0, 584); - CHECK_OFFSET(host_cr3, 592); - CHECK_OFFSET(host_cr4, 600); - CHECK_OFFSET(host_fs_base, 608); - CHECK_OFFSET(host_gs_base, 616); - CHECK_OFFSET(host_tr_base, 624); - CHECK_OFFSET(host_gdtr_base, 632); - CHECK_OFFSET(host_idtr_base, 640); - CHECK_OFFSET(host_ia32_sysenter_esp, 648); - CHECK_OFFSET(host_ia32_sysenter_eip, 656); - CHECK_OFFSET(host_rsp, 664); - CHECK_OFFSET(host_rip, 672); - CHECK_OFFSET(pin_based_vm_exec_control, 744); - CHECK_OFFSET(cpu_based_vm_exec_control, 748); - CHECK_OFFSET(exception_bitmap, 752); - CHECK_OFFSET(page_fault_error_code_mask, 756); - CHECK_OFFSET(page_fault_error_code_match, 760); - CHECK_OFFSET(cr3_target_count, 764); - CHECK_OFFSET(vm_exit_controls, 768); - CHECK_OFFSET(vm_exit_msr_store_count, 772); - CHECK_OFFSET(vm_exit_msr_load_count, 776); - CHECK_OFFSET(vm_entry_controls, 780); - CHECK_OFFSET(vm_entry_msr_load_count, 784); - CHECK_OFFSET(vm_entry_intr_info_field, 788); - CHECK_OFFSET(vm_entry_exception_error_code, 792); - CHECK_OFFSET(vm_entry_instruction_len, 796); - CHECK_OFFSET(tpr_threshold, 800); - CHECK_OFFSET(secondary_vm_exec_control, 804); - CHECK_OFFSET(vm_instruction_error, 808); - CHECK_OFFSET(vm_exit_reason, 812); - CHECK_OFFSET(vm_exit_intr_info, 816); - CHECK_OFFSET(vm_exit_intr_error_code, 820); - CHECK_OFFSET(idt_vectoring_info_field, 824); - CHECK_OFFSET(idt_vectoring_error_code, 828); - CHECK_OFFSET(vm_exit_instruction_len, 832); - CHECK_OFFSET(vmx_instruction_info, 836); - CHECK_OFFSET(guest_es_limit, 840); - CHECK_OFFSET(guest_cs_limit, 844); - CHECK_OFFSET(guest_ss_limit, 848); - CHECK_OFFSET(guest_ds_limit, 852); - CHECK_OFFSET(guest_fs_limit, 856); - CHECK_OFFSET(guest_gs_limit, 860); - CHECK_OFFSET(guest_ldtr_limit, 864); - CHECK_OFFSET(guest_tr_limit, 868); - CHECK_OFFSET(guest_gdtr_limit, 872); - CHECK_OFFSET(guest_idtr_limit, 876); - CHECK_OFFSET(guest_es_ar_bytes, 880); - CHECK_OFFSET(guest_cs_ar_bytes, 884); - CHECK_OFFSET(guest_ss_ar_bytes, 888); - CHECK_OFFSET(guest_ds_ar_bytes, 892); - CHECK_OFFSET(guest_fs_ar_bytes, 896); - CHECK_OFFSET(guest_gs_ar_bytes, 900); - CHECK_OFFSET(guest_ldtr_ar_bytes, 904); - CHECK_OFFSET(guest_tr_ar_bytes, 908); - CHECK_OFFSET(guest_interruptibility_info, 912); - CHECK_OFFSET(guest_activity_state, 916); - CHECK_OFFSET(guest_sysenter_cs, 920); - CHECK_OFFSET(host_ia32_sysenter_cs, 924); - CHECK_OFFSET(vmx_preemption_timer_value, 928); - CHECK_OFFSET(virtual_processor_id, 960); - CHECK_OFFSET(posted_intr_nv, 962); - CHECK_OFFSET(guest_es_selector, 964); - CHECK_OFFSET(guest_cs_selector, 966); - CHECK_OFFSET(guest_ss_selector, 968); - CHECK_OFFSET(guest_ds_selector, 970); - CHECK_OFFSET(guest_fs_selector, 972); - CHECK_OFFSET(guest_gs_selector, 974); - CHECK_OFFSET(guest_ldtr_selector, 976); - CHECK_OFFSET(guest_tr_selector, 978); - CHECK_OFFSET(guest_intr_status, 980); - CHECK_OFFSET(host_es_selector, 982); - CHECK_OFFSET(host_cs_selector, 984); - CHECK_OFFSET(host_ss_selector, 986); - CHECK_OFFSET(host_ds_selector, 988); - CHECK_OFFSET(host_fs_selector, 990); - CHECK_OFFSET(host_gs_selector, 992); - CHECK_OFFSET(host_tr_selector, 994); - CHECK_OFFSET(guest_pml_index, 996); -} - -/* - * VMCS12_REVISION is an arbitrary id that should be changed if the content or - * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and - * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. - * - * IMPORTANT: Changing this value will break save/restore compatibility with - * older kvm releases. - */ -#define VMCS12_REVISION 0x11e57ed0 - -/* - * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region - * and any VMCS region. Although only sizeof(struct vmcs12) are used by the - * current implementation, 4K are reserved to avoid future complications. - */ -#define VMCS12_SIZE 0x1000 - -/* - * VMCS12_MAX_FIELD_INDEX is the highest index value used in any - * supported VMCS12 field encoding. - */ -#define VMCS12_MAX_FIELD_INDEX 0x17 - -struct nested_vmx_msrs { - /* - * We only store the "true" versions of the VMX capability MSRs. We - * generate the "non-true" versions by setting the must-be-1 bits - * according to the SDM. - */ - u32 procbased_ctls_low; - u32 procbased_ctls_high; - u32 secondary_ctls_low; - u32 secondary_ctls_high; - u32 pinbased_ctls_low; - u32 pinbased_ctls_high; - u32 exit_ctls_low; - u32 exit_ctls_high; - u32 entry_ctls_low; - u32 entry_ctls_high; - u32 misc_low; - u32 misc_high; - u32 ept_caps; - u32 vpid_caps; - u64 basic; - u64 cr0_fixed0; - u64 cr0_fixed1; - u64 cr4_fixed0; - u64 cr4_fixed1; - u64 vmcs_enum; - u64 vmfunc_controls; -}; - -/* - * The nested_vmx structure is part of vcpu_vmx, and holds information we need - * for correct emulation of VMX (i.e., nested VMX) on this vcpu. - */ -struct nested_vmx { - /* Has the level1 guest done vmxon? */ - bool vmxon; - gpa_t vmxon_ptr; - bool pml_full; - - /* The guest-physical address of the current VMCS L1 keeps for L2 */ - gpa_t current_vmptr; - /* - * Cache of the guest's VMCS, existing outside of guest memory. - * Loaded from guest memory during VMPTRLD. Flushed to guest - * memory during VMCLEAR and VMPTRLD. - */ - struct vmcs12 *cached_vmcs12; - /* - * Cache of the guest's shadow VMCS, existing outside of guest - * memory. Loaded from guest memory during VM entry. Flushed - * to guest memory during VM exit. - */ - struct vmcs12 *cached_shadow_vmcs12; - /* - * Indicates if the shadow vmcs or enlightened vmcs must be updated - * with the data held by struct vmcs12. - */ - bool need_vmcs12_sync; - bool dirty_vmcs12; - - /* - * vmcs02 has been initialized, i.e. state that is constant for - * vmcs02 has been written to the backing VMCS. Initialization - * is delayed until L1 actually attempts to run a nested VM. - */ - bool vmcs02_initialized; - - bool change_vmcs01_virtual_apic_mode; - - /* - * Enlightened VMCS has been enabled. It does not mean that L1 has to - * use it. However, VMX features available to L1 will be limited based - * on what the enlightened VMCS supports. - */ - bool enlightened_vmcs_enabled; - - /* L2 must run next, and mustn't decide to exit to L1. */ - bool nested_run_pending; - - struct loaded_vmcs vmcs02; - - /* - * Guest pages referred to in the vmcs02 with host-physical - * pointers, so we must keep them pinned while L2 runs. - */ - struct page *apic_access_page; - struct page *virtual_apic_page; - struct page *pi_desc_page; - struct pi_desc *pi_desc; - bool pi_pending; - u16 posted_intr_nv; - - struct hrtimer preemption_timer; - bool preemption_timer_expired; - - /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ - u64 vmcs01_debugctl; - u64 vmcs01_guest_bndcfgs; - - u16 vpid02; - u16 last_vpid; - - struct nested_vmx_msrs msrs; - - /* SMM related state */ - struct { - /* in VMX operation on SMM entry? */ - bool vmxon; - /* in guest mode on SMM entry? */ - bool guest_mode; - } smm; - - gpa_t hv_evmcs_vmptr; - struct page *hv_evmcs_page; - struct hv_enlightened_vmcs *hv_evmcs; -}; - -#define POSTED_INTR_ON 0 -#define POSTED_INTR_SN 1 - -/* Posted-Interrupt Descriptor */ -struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ - union { - struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ - u8 nv; - /* bit 287:280 - Reserved */ - u8 rsvd_2; - /* bit 319:288 - Notification Destination */ - u32 ndst; - }; - u64 control; - }; - u32 rsvd[6]; -} __aligned(64); - -static bool pi_test_and_set_on(struct pi_desc *pi_desc) -{ - return test_and_set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static bool pi_test_and_clear_on(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) -{ - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); -} - -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - return clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_set_sn(struct pi_desc *pi_desc) -{ - return set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_on(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_on(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_sn(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -struct vmx_msrs { - unsigned int nr; - struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; -}; - -struct vcpu_vmx { - struct kvm_vcpu vcpu; - unsigned long host_rsp; - u8 fail; - u8 msr_bitmap_mode; - u32 exit_intr_info; - u32 idt_vectoring_info; - ulong rflags; - struct shared_msr_entry *guest_msrs; - int nmsrs; - int save_nmsrs; - bool guest_msrs_dirty; - unsigned long host_idt_base; -#ifdef CONFIG_X86_64 - u64 msr_host_kernel_gs_base; - u64 msr_guest_kernel_gs_base; -#endif - - u64 arch_capabilities; - u64 spec_ctrl; - - u32 vm_entry_controls_shadow; - u32 vm_exit_controls_shadow; - u32 secondary_exec_control; - - /* - * loaded_vmcs points to the VMCS currently used in this vcpu. For a - * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. loaded_cpu_state points - * to the VMCS whose state is loaded into the CPU registers that only - * need to be switched when transitioning to/from the kernel; a NULL - * value indicates that host state is loaded. - */ - struct loaded_vmcs vmcs01; - struct loaded_vmcs *loaded_vmcs; - struct loaded_vmcs *loaded_cpu_state; - bool __launched; /* temporary, used in vmx_vcpu_run */ - struct msr_autoload { - struct vmx_msrs guest; - struct vmx_msrs host; - } msr_autoload; - - struct { - int vm86_active; - ulong save_rflags; - struct kvm_segment segs[8]; - } rmode; - struct { - u32 bitmask; /* 4 bits per segment (1 bit per field) */ - struct kvm_save_segment { - u16 selector; - unsigned long base; - u32 limit; - u32 ar; - } seg[8]; - } segment_cache; - int vpid; - bool emulation_required; - - u32 exit_reason; - - /* Posted interrupt descriptor */ - struct pi_desc pi_desc; - - /* Support for a guest hypervisor (nested VMX) */ - struct nested_vmx nested; - - /* Dynamic PLE window. */ - int ple_window; - bool ple_window_dirty; - - bool req_immediate_exit; - - /* Support for PML */ -#define PML_ENTITY_NUM 512 - struct page *pml_pg; - - /* apic deadline value in host tsc */ - u64 hv_deadline_tsc; - - u64 current_tsc_ratio; - - u32 host_pkru; - - unsigned long host_debugctlmsr; - - /* - * Only bits masked by msr_ia32_feature_control_valid_bits can be set in - * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included - * in msr_ia32_feature_control_valid_bits. - */ - u64 msr_ia32_feature_control; - u64 msr_ia32_feature_control_valid_bits; - u64 ept_pointer; -}; - -enum segment_cache_field { - SEG_FIELD_SEL = 0, - SEG_FIELD_BASE = 1, - SEG_FIELD_LIMIT = 2, - SEG_FIELD_AR = 3, - - SEG_FIELD_NR = 4 -}; - -static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) -{ - return container_of(kvm, struct kvm_vmx, kvm); -} - -static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) -{ - return container_of(vcpu, struct vcpu_vmx, vcpu); -} - -static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) -{ - return &(to_vmx(vcpu)->pi_desc); -} - -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) -#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) -#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) -#define FIELD64(number, name) \ - FIELD(number, name), \ - [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) - - -static u16 shadow_read_only_fields[] = { -#define SHADOW_FIELD_RO(x) x, -#include "vmx_shadow_fields.h" -}; -static int max_shadow_read_only_fields = - ARRAY_SIZE(shadow_read_only_fields); - -static u16 shadow_read_write_fields[] = { -#define SHADOW_FIELD_RW(x) x, -#include "vmx_shadow_fields.h" -}; -static int max_shadow_read_write_fields = - ARRAY_SIZE(shadow_read_write_fields); - -static const unsigned short vmcs_field_to_offset_table[] = { - FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), - FIELD(POSTED_INTR_NV, posted_intr_nv), - FIELD(GUEST_ES_SELECTOR, guest_es_selector), - FIELD(GUEST_CS_SELECTOR, guest_cs_selector), - FIELD(GUEST_SS_SELECTOR, guest_ss_selector), - FIELD(GUEST_DS_SELECTOR, guest_ds_selector), - FIELD(GUEST_FS_SELECTOR, guest_fs_selector), - FIELD(GUEST_GS_SELECTOR, guest_gs_selector), - FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), - FIELD(GUEST_TR_SELECTOR, guest_tr_selector), - FIELD(GUEST_INTR_STATUS, guest_intr_status), - FIELD(GUEST_PML_INDEX, guest_pml_index), - FIELD(HOST_ES_SELECTOR, host_es_selector), - FIELD(HOST_CS_SELECTOR, host_cs_selector), - FIELD(HOST_SS_SELECTOR, host_ss_selector), - FIELD(HOST_DS_SELECTOR, host_ds_selector), - FIELD(HOST_FS_SELECTOR, host_fs_selector), - FIELD(HOST_GS_SELECTOR, host_gs_selector), - FIELD(HOST_TR_SELECTOR, host_tr_selector), - FIELD64(IO_BITMAP_A, io_bitmap_a), - FIELD64(IO_BITMAP_B, io_bitmap_b), - FIELD64(MSR_BITMAP, msr_bitmap), - FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), - FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), - FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), - FIELD64(PML_ADDRESS, pml_address), - FIELD64(TSC_OFFSET, tsc_offset), - FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), - FIELD64(APIC_ACCESS_ADDR, apic_access_addr), - FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), - FIELD64(VM_FUNCTION_CONTROL, vm_function_control), - FIELD64(EPT_POINTER, ept_pointer), - FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), - FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), - FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), - FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), - FIELD64(EPTP_LIST_ADDRESS, eptp_list_address), - FIELD64(VMREAD_BITMAP, vmread_bitmap), - FIELD64(VMWRITE_BITMAP, vmwrite_bitmap), - FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), - FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), - FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), - FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), - FIELD64(GUEST_IA32_PAT, guest_ia32_pat), - FIELD64(GUEST_IA32_EFER, guest_ia32_efer), - FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), - FIELD64(GUEST_PDPTR0, guest_pdptr0), - FIELD64(GUEST_PDPTR1, guest_pdptr1), - FIELD64(GUEST_PDPTR2, guest_pdptr2), - FIELD64(GUEST_PDPTR3, guest_pdptr3), - FIELD64(GUEST_BNDCFGS, guest_bndcfgs), - FIELD64(HOST_IA32_PAT, host_ia32_pat), - FIELD64(HOST_IA32_EFER, host_ia32_efer), - FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), - FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), - FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), - FIELD(EXCEPTION_BITMAP, exception_bitmap), - FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), - FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), - FIELD(CR3_TARGET_COUNT, cr3_target_count), - FIELD(VM_EXIT_CONTROLS, vm_exit_controls), - FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), - FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), - FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), - FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), - FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), - FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), - FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), - FIELD(TPR_THRESHOLD, tpr_threshold), - FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), - FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), - FIELD(VM_EXIT_REASON, vm_exit_reason), - FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), - FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), - FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), - FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), - FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), - FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), - FIELD(GUEST_ES_LIMIT, guest_es_limit), - FIELD(GUEST_CS_LIMIT, guest_cs_limit), - FIELD(GUEST_SS_LIMIT, guest_ss_limit), - FIELD(GUEST_DS_LIMIT, guest_ds_limit), - FIELD(GUEST_FS_LIMIT, guest_fs_limit), - FIELD(GUEST_GS_LIMIT, guest_gs_limit), - FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), - FIELD(GUEST_TR_LIMIT, guest_tr_limit), - FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), - FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), - FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), - FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), - FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), - FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), - FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), - FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), - FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), - FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), - FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), - FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), - FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), - FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), - FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), - FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), - FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), - FIELD(CR0_READ_SHADOW, cr0_read_shadow), - FIELD(CR4_READ_SHADOW, cr4_read_shadow), - FIELD(CR3_TARGET_VALUE0, cr3_target_value0), - FIELD(CR3_TARGET_VALUE1, cr3_target_value1), - FIELD(CR3_TARGET_VALUE2, cr3_target_value2), - FIELD(CR3_TARGET_VALUE3, cr3_target_value3), - FIELD(EXIT_QUALIFICATION, exit_qualification), - FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), - FIELD(GUEST_CR0, guest_cr0), - FIELD(GUEST_CR3, guest_cr3), - FIELD(GUEST_CR4, guest_cr4), - FIELD(GUEST_ES_BASE, guest_es_base), - FIELD(GUEST_CS_BASE, guest_cs_base), - FIELD(GUEST_SS_BASE, guest_ss_base), - FIELD(GUEST_DS_BASE, guest_ds_base), - FIELD(GUEST_FS_BASE, guest_fs_base), - FIELD(GUEST_GS_BASE, guest_gs_base), - FIELD(GUEST_LDTR_BASE, guest_ldtr_base), - FIELD(GUEST_TR_BASE, guest_tr_base), - FIELD(GUEST_GDTR_BASE, guest_gdtr_base), - FIELD(GUEST_IDTR_BASE, guest_idtr_base), - FIELD(GUEST_DR7, guest_dr7), - FIELD(GUEST_RSP, guest_rsp), - FIELD(GUEST_RIP, guest_rip), - FIELD(GUEST_RFLAGS, guest_rflags), - FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), - FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), - FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), - FIELD(HOST_CR0, host_cr0), - FIELD(HOST_CR3, host_cr3), - FIELD(HOST_CR4, host_cr4), - FIELD(HOST_FS_BASE, host_fs_base), - FIELD(HOST_GS_BASE, host_gs_base), - FIELD(HOST_TR_BASE, host_tr_base), - FIELD(HOST_GDTR_BASE, host_gdtr_base), - FIELD(HOST_IDTR_BASE, host_idtr_base), - FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), - FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), - FIELD(HOST_RSP, host_rsp), - FIELD(HOST_RIP, host_rip), -}; - -static inline short vmcs_field_to_offset(unsigned long field) -{ - const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table); - unsigned short offset; - unsigned index; - - if (field >> 15) - return -ENOENT; - - index = ROL16(field, 6); - if (index >= size) - return -ENOENT; - - index = array_index_nospec(index, size); - offset = vmcs_field_to_offset_table[index]; - if (offset == 0) - return -ENOENT; - return offset; -} - -static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.cached_vmcs12; -} - -static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.cached_shadow_vmcs12; -} - -static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); -static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); -static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); -static bool vmx_xsaves_supported(void); -static void vmx_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); -static void vmx_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); -static bool guest_state_valid(struct kvm_vcpu *vcpu); -static u32 vmx_segment_access_rights(struct kvm_segment *var); -static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); -static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); -static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); -static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, - u16 error_code); -static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type); - -static DEFINE_PER_CPU(struct vmcs *, vmxarea); -static DEFINE_PER_CPU(struct vmcs *, current_vmcs); -/* - * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed - * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. - */ -static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); - -/* - * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we - * can find which vCPU should be waken up. - */ -static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); -static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); - -enum { - VMX_VMREAD_BITMAP, - VMX_VMWRITE_BITMAP, - VMX_BITMAP_NR -}; - -static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; - -#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) -#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) - -static bool cpu_has_load_ia32_efer; -static bool cpu_has_load_perf_global_ctrl; - -static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); -static DEFINE_SPINLOCK(vmx_vpid_lock); - -static struct vmcs_config { - int size; - int order; - u32 basic_cap; - u32 revision_id; - u32 pin_based_exec_ctrl; - u32 cpu_based_exec_ctrl; - u32 cpu_based_2nd_exec_ctrl; - u32 vmexit_ctrl; - u32 vmentry_ctrl; - struct nested_vmx_msrs nested; -} vmcs_config; - -static struct vmx_capability { - u32 ept; - u32 vpid; -} vmx_capability; - -#define VMX_SEGMENT_FIELD(seg) \ - [VCPU_SREG_##seg] = { \ - .selector = GUEST_##seg##_SELECTOR, \ - .base = GUEST_##seg##_BASE, \ - .limit = GUEST_##seg##_LIMIT, \ - .ar_bytes = GUEST_##seg##_AR_BYTES, \ - } - -static const struct kvm_vmx_segment_field { - unsigned selector; - unsigned base; - unsigned limit; - unsigned ar_bytes; -} kvm_vmx_segment_fields[] = { - VMX_SEGMENT_FIELD(CS), - VMX_SEGMENT_FIELD(DS), - VMX_SEGMENT_FIELD(ES), - VMX_SEGMENT_FIELD(FS), - VMX_SEGMENT_FIELD(GS), - VMX_SEGMENT_FIELD(SS), - VMX_SEGMENT_FIELD(TR), - VMX_SEGMENT_FIELD(LDTR), -}; - -static u64 host_efer; - -static void ept_save_pdptrs(struct kvm_vcpu *vcpu); - -/* - * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it - * away by decrementing the array size. - */ -static const u32 vmx_msr_index[] = { -#ifdef CONFIG_X86_64 - MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, -#endif - MSR_EFER, MSR_TSC_AUX, MSR_STAR, -}; - -DEFINE_STATIC_KEY_FALSE(enable_evmcs); - -#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) - -#define KVM_EVMCS_VERSION 1 - -/* - * Enlightened VMCSv1 doesn't support these: - * - * POSTED_INTR_NV = 0x00000002, - * GUEST_INTR_STATUS = 0x00000810, - * APIC_ACCESS_ADDR = 0x00002014, - * POSTED_INTR_DESC_ADDR = 0x00002016, - * EOI_EXIT_BITMAP0 = 0x0000201c, - * EOI_EXIT_BITMAP1 = 0x0000201e, - * EOI_EXIT_BITMAP2 = 0x00002020, - * EOI_EXIT_BITMAP3 = 0x00002022, - * GUEST_PML_INDEX = 0x00000812, - * PML_ADDRESS = 0x0000200e, - * VM_FUNCTION_CONTROL = 0x00002018, - * EPTP_LIST_ADDRESS = 0x00002024, - * VMREAD_BITMAP = 0x00002026, - * VMWRITE_BITMAP = 0x00002028, - * - * TSC_MULTIPLIER = 0x00002032, - * PLE_GAP = 0x00004020, - * PLE_WINDOW = 0x00004022, - * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, - * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, - * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, - * - * Currently unsupported in KVM: - * GUEST_IA32_RTIT_CTL = 0x00002814, - */ -#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ - PIN_BASED_VMX_PREEMPTION_TIMER) -#define EVMCS1_UNSUPPORTED_2NDEXEC \ - (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ - SECONDARY_EXEC_APIC_REGISTER_VIRT | \ - SECONDARY_EXEC_ENABLE_PML | \ - SECONDARY_EXEC_ENABLE_VMFUNC | \ - SECONDARY_EXEC_SHADOW_VMCS | \ - SECONDARY_EXEC_TSC_SCALING | \ - SECONDARY_EXEC_PAUSE_LOOP_EXITING) -#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) -#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) -#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) - -#if IS_ENABLED(CONFIG_HYPERV) -static bool __read_mostly enlightened_vmcs = true; -module_param(enlightened_vmcs, bool, 0444); - -static inline void evmcs_write64(unsigned long field, u64 value) -{ - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u64 *)((char *)current_evmcs + offset) = value; - - current_evmcs->hv_clean_fields &= ~clean_field; -} - -static inline void evmcs_write32(unsigned long field, u32 value) -{ - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u32 *)((char *)current_evmcs + offset) = value; - current_evmcs->hv_clean_fields &= ~clean_field; -} - -static inline void evmcs_write16(unsigned long field, u16 value) -{ - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u16 *)((char *)current_evmcs + offset) = value; - current_evmcs->hv_clean_fields &= ~clean_field; -} - -static inline u64 evmcs_read64(unsigned long field) -{ - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u64 *)((char *)current_evmcs + offset); -} - -static inline u32 evmcs_read32(unsigned long field) -{ - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u32 *)((char *)current_evmcs + offset); -} - -static inline u16 evmcs_read16(unsigned long field) -{ - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u16 *)((char *)current_evmcs + offset); -} - -static inline void evmcs_touch_msr_bitmap(void) -{ - if (unlikely(!current_evmcs)) - return; - - if (current_evmcs->hv_enlightenments_control.msr_bitmap) - current_evmcs->hv_clean_fields &= - ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; -} - -static void evmcs_load(u64 phys_addr) -{ - struct hv_vp_assist_page *vp_ap = - hv_get_vp_assist_page(smp_processor_id()); - - vp_ap->current_nested_vmcs = phys_addr; - vp_ap->enlighten_vmentry = 1; -} - -static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) -{ - vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; - - vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; - vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; - -} - -/* check_ept_pointer() should be under protection of ept_pointer_lock. */ -static void check_ept_pointer_match(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - u64 tmp_eptp = INVALID_PAGE; - int i; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!VALID_PAGE(tmp_eptp)) { - tmp_eptp = to_vmx(vcpu)->ept_pointer; - } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { - to_kvm_vmx(kvm)->ept_pointers_match - = EPT_POINTERS_MISMATCH; - return; - } - } - - to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; -} - -static int vmx_hv_remote_flush_tlb(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int ret = -ENOTSUPP, i; - - spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); - - if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) - check_ept_pointer_match(kvm); - - /* - * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the - * base of EPT PML4 table, strip off EPT configuration information. - */ - if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { - kvm_for_each_vcpu(i, vcpu, kvm) - ret |= hyperv_flush_guest_mapping( - to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer & PAGE_MASK); - } else { - ret = hyperv_flush_guest_mapping( - to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK); - } - - spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); - return ret; -} -#else /* !IS_ENABLED(CONFIG_HYPERV) */ -static inline void evmcs_write64(unsigned long field, u64 value) {} -static inline void evmcs_write32(unsigned long field, u32 value) {} -static inline void evmcs_write16(unsigned long field, u16 value) {} -static inline u64 evmcs_read64(unsigned long field) { return 0; } -static inline u32 evmcs_read32(unsigned long field) { return 0; } -static inline u16 evmcs_read16(unsigned long field) { return 0; } -static inline void evmcs_load(u64 phys_addr) {} -static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} -static inline void evmcs_touch_msr_bitmap(void) {} -#endif /* IS_ENABLED(CONFIG_HYPERV) */ - -static int nested_enable_evmcs(struct kvm_vcpu *vcpu, - uint16_t *vmcs_version) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * vmcs_version represents the range of supported Enlightened VMCS - * versions: lower 8 bits is the minimal version, higher 8 bits is the - * maximum supported version. KVM supports versions from 1 to - * KVM_EVMCS_VERSION. - */ - if (vmcs_version) - *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1; - - /* We don't support disabling the feature for simplicity. */ - if (vmx->nested.enlightened_vmcs_enabled) - return 0; - - vmx->nested.enlightened_vmcs_enabled = true; - - vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; - vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; - vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; - vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; - vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC; - - return 0; -} - -static inline bool is_exception_n(u32 intr_info, u8 vector) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK); -} - -static inline bool is_debug(u32 intr_info) -{ - return is_exception_n(intr_info, DB_VECTOR); -} - -static inline bool is_breakpoint(u32 intr_info) -{ - return is_exception_n(intr_info, BP_VECTOR); -} - -static inline bool is_page_fault(u32 intr_info) -{ - return is_exception_n(intr_info, PF_VECTOR); -} - -static inline bool is_invalid_opcode(u32 intr_info) -{ - return is_exception_n(intr_info, UD_VECTOR); -} - -static inline bool is_gp_fault(u32 intr_info) -{ - return is_exception_n(intr_info, GP_VECTOR); -} - -static inline bool is_machine_check(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); -} - -/* Undocumented: icebp/int1 */ -static inline bool is_icebp(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK); -} - -static inline bool cpu_has_vmx_msr_bitmap(void) -{ - return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; -} - -static inline bool cpu_has_vmx_tpr_shadow(void) -{ - return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; -} - -static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) -{ - return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); -} - -static inline bool cpu_has_secondary_exec_ctrls(void) -{ - return vmcs_config.cpu_based_exec_ctrl & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; -} - -static inline bool cpu_has_vmx_virtualize_apic_accesses(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; -} - -static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; -} - -static inline bool cpu_has_vmx_apic_register_virt(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_APIC_REGISTER_VIRT; -} - -static inline bool cpu_has_vmx_virtual_intr_delivery(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; -} - -static inline bool cpu_has_vmx_encls_vmexit(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENCLS_EXITING; -} - -/* - * Comment's format: document - errata name - stepping - processor name. - * Refer from - * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp - */ -static u32 vmx_preemption_cpu_tfms[] = { -/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ -0x000206E6, -/* 323056.pdf - AAX65 - C2 - Xeon L3406 */ -/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ -/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ -0x00020652, -/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ -0x00020655, -/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ -/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ -/* - * 320767.pdf - AAP86 - B1 - - * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile - */ -0x000106E5, -/* 321333.pdf - AAM126 - C0 - Xeon 3500 */ -0x000106A0, -/* 321333.pdf - AAM126 - C1 - Xeon 3500 */ -0x000106A1, -/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ -0x000106A4, - /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ - /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ - /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ -0x000106A5, -}; - -static inline bool cpu_has_broken_vmx_preemption_timer(void) -{ - u32 eax = cpuid_eax(0x00000001), i; - - /* Clear the reserved bits */ - eax &= ~(0x3U << 14 | 0xfU << 28); - for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) - if (eax == vmx_preemption_cpu_tfms[i]) - return true; - - return false; -} - -static inline bool cpu_has_vmx_preemption_timer(void) -{ - return vmcs_config.pin_based_exec_ctrl & - PIN_BASED_VMX_PREEMPTION_TIMER; -} - -static inline bool cpu_has_vmx_posted_intr(void) -{ - return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && - vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; -} - -static inline bool cpu_has_vmx_apicv(void) -{ - return cpu_has_vmx_apic_register_virt() && - cpu_has_vmx_virtual_intr_delivery() && - cpu_has_vmx_posted_intr(); -} - -static inline bool cpu_has_vmx_flexpriority(void) -{ - return cpu_has_vmx_tpr_shadow() && - cpu_has_vmx_virtualize_apic_accesses(); -} - -static inline bool cpu_has_vmx_ept_execute_only(void) -{ - return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; -} - -static inline bool cpu_has_vmx_ept_2m_page(void) -{ - return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; -} - -static inline bool cpu_has_vmx_ept_1g_page(void) -{ - return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; -} - -static inline bool cpu_has_vmx_ept_4levels(void) -{ - return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; -} - -static inline bool cpu_has_vmx_ept_mt_wb(void) -{ - return vmx_capability.ept & VMX_EPTP_WB_BIT; -} - -static inline bool cpu_has_vmx_ept_5levels(void) -{ - return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; -} - -static inline bool cpu_has_vmx_ept_ad_bits(void) -{ - return vmx_capability.ept & VMX_EPT_AD_BIT; -} - -static inline bool cpu_has_vmx_invept_context(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; -} - -static inline bool cpu_has_vmx_invept_global(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; -} - -static inline bool cpu_has_vmx_invvpid_individual_addr(void) -{ - return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT; -} - -static inline bool cpu_has_vmx_invvpid_single(void) -{ - return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; -} - -static inline bool cpu_has_vmx_invvpid_global(void) -{ - return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; -} - -static inline bool cpu_has_vmx_invvpid(void) -{ - return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; -} - -static inline bool cpu_has_vmx_ept(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_EPT; -} - -static inline bool cpu_has_vmx_unrestricted_guest(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_UNRESTRICTED_GUEST; -} - -static inline bool cpu_has_vmx_ple(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_PAUSE_LOOP_EXITING; -} - -static inline bool cpu_has_vmx_basic_inout(void) -{ - return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); -} - -static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) -{ - return flexpriority_enabled && lapic_in_kernel(vcpu); -} - -static inline bool cpu_has_vmx_vpid(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_VPID; -} - -static inline bool cpu_has_vmx_rdtscp(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDTSCP; -} - -static inline bool cpu_has_vmx_invpcid(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_INVPCID; -} - -static inline bool cpu_has_virtual_nmis(void) -{ - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; -} - -static inline bool cpu_has_vmx_wbinvd_exit(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_WBINVD_EXITING; -} - -static inline bool cpu_has_vmx_shadow_vmcs(void) -{ - u64 vmx_msr; - rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); - /* check if the cpu supports writing r/o exit information fields */ - if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) - return false; - - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_SHADOW_VMCS; -} - -static inline bool cpu_has_vmx_pml(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; -} - -static inline bool cpu_has_vmx_tsc_scaling(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_TSC_SCALING; -} - -static inline bool cpu_has_vmx_vmfunc(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_VMFUNC; -} - -static bool vmx_umip_emulated(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_DESC; -} - -static inline bool report_flexpriority(void) -{ - return flexpriority_enabled; -} - -static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) -{ - return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low); -} - -/* - * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE - * to modify any valid field of the VMCS, or are the VM-exit - * information fields read-only? - */ -static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.msrs.misc_low & - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; -} - -static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS; -} - -static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.msrs.procbased_ctls_high & - CPU_BASED_MONITOR_TRAP_FLAG; -} - -static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & - SECONDARY_EXEC_SHADOW_VMCS; -} - -static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) -{ - return vmcs12->cpu_based_vm_exec_control & bit; -} - -static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) -{ - return (vmcs12->cpu_based_vm_exec_control & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && - (vmcs12->secondary_vm_exec_control & bit); -} - -static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER; -} - -static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING; -} - -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; -} - -static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); -} - -static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); -} - -static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML); -} - -static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); -} - -static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); -} - -static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); -} - -static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); -} - -static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; -} - -static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); -} - -static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) -{ - return nested_cpu_has_vmfunc(vmcs12) && - (vmcs12->vm_function_control & - VMX_VMFUNC_EPTP_SWITCHING); -} - -static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); -} - -static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12) -{ - return vmcs12->vm_exit_controls & - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; -} - -static inline bool is_nmi(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); -} - -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, - u32 exit_intr_info, - unsigned long exit_qualification); - -static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) -{ - int i; - - for (i = 0; i < vmx->nmsrs; ++i) - if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) - return i; - return -1; -} - -static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) -{ - struct { - u64 vpid : 16; - u64 rsvd : 48; - u64 gva; - } operand = { vpid, 0, gva }; - bool error; - - asm volatile (__ex("invvpid %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(ext), "m"(operand)); - BUG_ON(error); -} - -static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) -{ - struct { - u64 eptp, gpa; - } operand = {eptp, gpa}; - bool error; - - asm volatile (__ex("invept %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(ext), "m"(operand)); - BUG_ON(error); -} - -static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) -{ - int i; - - i = __find_msr_index(vmx, msr); - if (i >= 0) - return &vmx->guest_msrs[i]; - return NULL; -} - -static void vmcs_clear(struct vmcs *vmcs) -{ - u64 phys_addr = __pa(vmcs); - bool error; - - asm volatile (__ex("vmclear %1") CC_SET(na) - : CC_OUT(na) (error) : "m"(phys_addr)); - if (unlikely(error)) - printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", - vmcs, phys_addr); -} - -static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) -{ - vmcs_clear(loaded_vmcs->vmcs); - if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) - vmcs_clear(loaded_vmcs->shadow_vmcs); - loaded_vmcs->cpu = -1; - loaded_vmcs->launched = 0; -} - -static void vmcs_load(struct vmcs *vmcs) -{ - u64 phys_addr = __pa(vmcs); - bool error; - - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_load(phys_addr); - - asm volatile (__ex("vmptrld %1") CC_SET(na) - : CC_OUT(na) (error) : "m"(phys_addr)); - if (unlikely(error)) - printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", - vmcs, phys_addr); -} - -#ifdef CONFIG_KEXEC_CORE -/* - * This bitmap is used to indicate whether the vmclear - * operation is enabled on all cpus. All disabled by - * default. - */ -static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; - -static inline void crash_enable_local_vmclear(int cpu) -{ - cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline void crash_disable_local_vmclear(int cpu) -{ - cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline int crash_local_vmclear_enabled(int cpu) -{ - return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static void crash_vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v; - - if (!crash_local_vmclear_enabled(cpu)) - return; - - list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - vmcs_clear(v->vmcs); -} -#else -static inline void crash_enable_local_vmclear(int cpu) { } -static inline void crash_disable_local_vmclear(int cpu) { } -#endif /* CONFIG_KEXEC_CORE */ - -static void __loaded_vmcs_clear(void *arg) -{ - struct loaded_vmcs *loaded_vmcs = arg; - int cpu = raw_smp_processor_id(); - - if (loaded_vmcs->cpu != cpu) - return; /* vcpu migration can race with cpu offline */ - if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) - per_cpu(current_vmcs, cpu) = NULL; - crash_disable_local_vmclear(cpu); - list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); - - /* - * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link - * is before setting loaded_vmcs->vcpu to -1 which is done in - * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist - * then adds the vmcs into percpu list before it is deleted. - */ - smp_wmb(); - - loaded_vmcs_init(loaded_vmcs); - crash_enable_local_vmclear(cpu); -} - -static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) -{ - int cpu = loaded_vmcs->cpu; - - if (cpu != -1) - smp_call_function_single(cpu, - __loaded_vmcs_clear, loaded_vmcs, 1); -} - -static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) -{ - if (vpid == 0) - return true; - - if (cpu_has_vmx_invvpid_individual_addr()) { - __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); - return true; - } - - return false; -} - -static inline void vpid_sync_vcpu_single(int vpid) -{ - if (vpid == 0) - return; - - if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); -} - -static inline void vpid_sync_vcpu_global(void) -{ - if (cpu_has_vmx_invvpid_global()) - __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); -} - -static inline void vpid_sync_context(int vpid) -{ - if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_single(vpid); - else - vpid_sync_vcpu_global(); -} - -static inline void ept_sync_global(void) -{ - __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); -} - -static inline void ept_sync_context(u64 eptp) -{ - if (cpu_has_vmx_invept_context()) - __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); - else - ept_sync_global(); -} - -static __always_inline void vmcs_check16(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, - "16-bit accessor invalid for 64-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "16-bit accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "16-bit accessor invalid for 32-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "16-bit accessor invalid for natural width field"); -} - -static __always_inline void vmcs_check32(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "32-bit accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "32-bit accessor invalid for natural width field"); -} - -static __always_inline void vmcs_check64(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "64-bit accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "64-bit accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "64-bit accessor invalid for 32-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "64-bit accessor invalid for natural width field"); -} - -static __always_inline void vmcs_checkl(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "Natural width accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, - "Natural width accessor invalid for 64-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "Natural width accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "Natural width accessor invalid for 32-bit field"); -} - -static __always_inline unsigned long __vmcs_readl(unsigned long field) -{ - unsigned long value; - - asm volatile (__ex_clear("vmread %1, %0", "%k0") - : "=r"(value) : "r"(field)); - return value; -} - -static __always_inline u16 vmcs_read16(unsigned long field) -{ - vmcs_check16(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_read16(field); - return __vmcs_readl(field); -} - -static __always_inline u32 vmcs_read32(unsigned long field) -{ - vmcs_check32(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_read32(field); - return __vmcs_readl(field); -} - -static __always_inline u64 vmcs_read64(unsigned long field) -{ - vmcs_check64(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_read64(field); -#ifdef CONFIG_X86_64 - return __vmcs_readl(field); -#else - return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32); -#endif -} - -static __always_inline unsigned long vmcs_readl(unsigned long field) -{ - vmcs_checkl(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_read64(field); - return __vmcs_readl(field); -} - -static noinline void vmwrite_error(unsigned long field, unsigned long value) -{ - printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", - field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); - dump_stack(); -} - -static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) -{ - bool error; - - asm volatile (__ex("vmwrite %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(field), "rm"(value)); - if (unlikely(error)) - vmwrite_error(field, value); -} - -static __always_inline void vmcs_write16(unsigned long field, u16 value) -{ - vmcs_check16(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write16(field, value); - - __vmcs_writel(field, value); -} - -static __always_inline void vmcs_write32(unsigned long field, u32 value) -{ - vmcs_check32(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write32(field, value); - - __vmcs_writel(field, value); -} - -static __always_inline void vmcs_write64(unsigned long field, u64 value) -{ - vmcs_check64(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write64(field, value); - - __vmcs_writel(field, value); -#ifndef CONFIG_X86_64 - asm volatile (""); - __vmcs_writel(field+1, value >> 32); -#endif -} - -static __always_inline void vmcs_writel(unsigned long field, unsigned long value) -{ - vmcs_checkl(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write64(field, value); - - __vmcs_writel(field, value); -} - -static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, - "vmcs_clear_bits does not support 64-bit fields"); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write32(field, evmcs_read32(field) & ~mask); - - __vmcs_writel(field, __vmcs_readl(field) & ~mask); -} - -static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, - "vmcs_set_bits does not support 64-bit fields"); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write32(field, evmcs_read32(field) | mask); - - __vmcs_writel(field, __vmcs_readl(field) | mask); -} - -static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) -{ - vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); -} - -static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VM_ENTRY_CONTROLS, val); - vmx->vm_entry_controls_shadow = val; -} - -static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) -{ - if (vmx->vm_entry_controls_shadow != val) - vm_entry_controls_init(vmx, val); -} - -static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) -{ - return vmx->vm_entry_controls_shadow; -} - - -static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); -} - -static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); -} - -static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) -{ - vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); -} - -static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VM_EXIT_CONTROLS, val); - vmx->vm_exit_controls_shadow = val; -} - -static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) -{ - if (vmx->vm_exit_controls_shadow != val) - vm_exit_controls_init(vmx, val); -} - -static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) -{ - return vmx->vm_exit_controls_shadow; -} - - -static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); -} - -static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); -} - -static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) -{ - vmx->segment_cache.bitmask = 0; -} - -static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, - unsigned field) -{ - bool ret; - u32 mask = 1 << (seg * SEG_FIELD_NR + field); - - if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { - vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); - vmx->segment_cache.bitmask = 0; - } - ret = vmx->segment_cache.bitmask & mask; - vmx->segment_cache.bitmask |= mask; - return ret; -} - -static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) -{ - u16 *p = &vmx->segment_cache.seg[seg].selector; - - if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) - *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); - return *p; -} - -static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) -{ - ulong *p = &vmx->segment_cache.seg[seg].base; - - if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) - *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); - return *p; -} - -static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) -{ - u32 *p = &vmx->segment_cache.seg[seg].limit; - - if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) - *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); - return *p; -} - -static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) -{ - u32 *p = &vmx->segment_cache.seg[seg].ar; - - if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) - *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); - return *p; -} - -static void update_exception_bitmap(struct kvm_vcpu *vcpu) -{ - u32 eb; - - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | - (1u << DB_VECTOR) | (1u << AC_VECTOR); - /* - * Guest access to VMware backdoor ports could legitimately - * trigger #GP because of TSS I/O permission bitmap. - * We intercept those #GP and allow access to them anyway - * as VMware does. - */ - if (enable_vmware_backdoor) - eb |= (1u << GP_VECTOR); - if ((vcpu->guest_debug & - (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == - (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) - eb |= 1u << BP_VECTOR; - if (to_vmx(vcpu)->rmode.vm86_active) - eb = ~0; - if (enable_ept) - eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ - - /* When we are running a nested L2 guest and L1 specified for it a - * certain exception bitmap, we must trap the same exceptions and pass - * them to L1. When running L2, we will only handle the exceptions - * specified above if L1 did not want them. - */ - if (is_guest_mode(vcpu)) - eb |= get_vmcs12(vcpu)->exception_bitmap; - - vmcs_write32(EXCEPTION_BITMAP, eb); -} - -/* - * Check if MSR is intercepted for currently loaded MSR bitmap. - */ -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) -{ - unsigned long *msr_bitmap; - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return true; - - msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; - - if (msr <= 0x1fff) { - return !!test_bit(msr, msr_bitmap + 0x800 / f); - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - return !!test_bit(msr, msr_bitmap + 0xc00 / f); - } - - return true; -} - -/* - * Check if MSR is intercepted for L01 MSR bitmap. - */ -static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) -{ - unsigned long *msr_bitmap; - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return true; - - msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; - - if (msr <= 0x1fff) { - return !!test_bit(msr, msr_bitmap + 0x800 / f); - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - return !!test_bit(msr, msr_bitmap + 0xc00 / f); - } - - return true; -} - -static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, - unsigned long entry, unsigned long exit) -{ - vm_entry_controls_clearbit(vmx, entry); - vm_exit_controls_clearbit(vmx, exit); -} - -static int find_msr(struct vmx_msrs *m, unsigned int msr) -{ - unsigned int i; - - for (i = 0; i < m->nr; ++i) { - if (m->val[i].index == msr) - return i; - } - return -ENOENT; -} - -static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) -{ - int i; - struct msr_autoload *m = &vmx->msr_autoload; - - switch (msr) { - case MSR_EFER: - if (cpu_has_load_ia32_efer) { - clear_atomic_switch_msr_special(vmx, - VM_ENTRY_LOAD_IA32_EFER, - VM_EXIT_LOAD_IA32_EFER); - return; - } - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - if (cpu_has_load_perf_global_ctrl) { - clear_atomic_switch_msr_special(vmx, - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); - return; - } - break; - } - i = find_msr(&m->guest, msr); - if (i < 0) - goto skip_guest; - --m->guest.nr; - m->guest.val[i] = m->guest.val[m->guest.nr]; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); - -skip_guest: - i = find_msr(&m->host, msr); - if (i < 0) - return; - - --m->host.nr; - m->host.val[i] = m->host.val[m->host.nr]; - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); -} - -static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, - unsigned long entry, unsigned long exit, - unsigned long guest_val_vmcs, unsigned long host_val_vmcs, - u64 guest_val, u64 host_val) -{ - vmcs_write64(guest_val_vmcs, guest_val); - if (host_val_vmcs != HOST_IA32_EFER) - vmcs_write64(host_val_vmcs, host_val); - vm_entry_controls_setbit(vmx, entry); - vm_exit_controls_setbit(vmx, exit); -} - -static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, - u64 guest_val, u64 host_val, bool entry_only) -{ - int i, j = 0; - struct msr_autoload *m = &vmx->msr_autoload; - - switch (msr) { - case MSR_EFER: - if (cpu_has_load_ia32_efer) { - add_atomic_switch_msr_special(vmx, - VM_ENTRY_LOAD_IA32_EFER, - VM_EXIT_LOAD_IA32_EFER, - GUEST_IA32_EFER, - HOST_IA32_EFER, - guest_val, host_val); - return; - } - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - if (cpu_has_load_perf_global_ctrl) { - add_atomic_switch_msr_special(vmx, - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, - GUEST_IA32_PERF_GLOBAL_CTRL, - HOST_IA32_PERF_GLOBAL_CTRL, - guest_val, host_val); - return; - } - break; - case MSR_IA32_PEBS_ENABLE: - /* PEBS needs a quiescent period after being disabled (to write - * a record). Disabling PEBS through VMX MSR swapping doesn't - * provide that period, so a CPU could write host's record into - * guest's memory. - */ - wrmsrl(MSR_IA32_PEBS_ENABLE, 0); - } - - i = find_msr(&m->guest, msr); - if (!entry_only) - j = find_msr(&m->host, msr); - - if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) { - printk_once(KERN_WARNING "Not enough msr switch entries. " - "Can't add msr %x\n", msr); - return; - } - if (i < 0) { - i = m->guest.nr++; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); - } - m->guest.val[i].index = msr; - m->guest.val[i].value = guest_val; - - if (entry_only) - return; - - if (j < 0) { - j = m->host.nr++; - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); - } - m->host.val[j].index = msr; - m->host.val[j].value = host_val; -} - -static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) -{ - u64 guest_efer = vmx->vcpu.arch.efer; - u64 ignore_bits = 0; - - if (!enable_ept) { - /* - * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing - * host CPUID is more efficient than testing guest CPUID - * or CR4. Host SMEP is anyway a requirement for guest SMEP. - */ - if (boot_cpu_has(X86_FEATURE_SMEP)) - guest_efer |= EFER_NX; - else if (!(guest_efer & EFER_NX)) - ignore_bits |= EFER_NX; - } - - /* - * LMA and LME handled by hardware; SCE meaningless outside long mode. - */ - ignore_bits |= EFER_SCE; -#ifdef CONFIG_X86_64 - ignore_bits |= EFER_LMA | EFER_LME; - /* SCE is meaningful only in long mode on Intel */ - if (guest_efer & EFER_LMA) - ignore_bits &= ~(u64)EFER_SCE; -#endif - - /* - * On EPT, we can't emulate NX, so we must switch EFER atomically. - * On CPUs that support "load IA32_EFER", always switch EFER - * atomically, since it's faster than switching it manually. - */ - if (cpu_has_load_ia32_efer || - (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { - if (!(guest_efer & EFER_LMA)) - guest_efer &= ~EFER_LME; - if (guest_efer != host_efer) - add_atomic_switch_msr(vmx, MSR_EFER, - guest_efer, host_efer, false); - else - clear_atomic_switch_msr(vmx, MSR_EFER); - return false; - } else { - clear_atomic_switch_msr(vmx, MSR_EFER); - - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; - - vmx->guest_msrs[efer_offset].data = guest_efer; - vmx->guest_msrs[efer_offset].mask = ~ignore_bits; - - return true; - } -} - -#ifdef CONFIG_X86_32 -/* - * On 32-bit kernels, VM exits still load the FS and GS bases from the - * VMCS rather than the segment table. KVM uses this helper to figure - * out the current bases to poke them into the VMCS before entry. - */ -static unsigned long segment_base(u16 selector) -{ - struct desc_struct *table; - unsigned long v; - - if (!(selector & ~SEGMENT_RPL_MASK)) - return 0; - - table = get_current_gdt_ro(); - - if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { - u16 ldt_selector = kvm_read_ldt(); - - if (!(ldt_selector & ~SEGMENT_RPL_MASK)) - return 0; - - table = (struct desc_struct *)segment_base(ldt_selector); - } - v = get_desc_base(&table[selector >> 3]); - return v; -} -#endif - -static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs_host_state *host_state; -#ifdef CONFIG_X86_64 - int cpu = raw_smp_processor_id(); -#endif - unsigned long fs_base, gs_base; - u16 fs_sel, gs_sel; - int i; - - vmx->req_immediate_exit = false; - - /* - * Note that guest MSRs to be saved/restored can also be changed - * when guest state is loaded. This happens when guest transitions - * to/from long-mode by setting MSR_EFER.LMA. - */ - if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) { - vmx->guest_msrs_dirty = false; - for (i = 0; i < vmx->save_nmsrs; ++i) - kvm_set_shared_msr(vmx->guest_msrs[i].index, - vmx->guest_msrs[i].data, - vmx->guest_msrs[i].mask); - - } - - if (vmx->loaded_cpu_state) - return; - - vmx->loaded_cpu_state = vmx->loaded_vmcs; - host_state = &vmx->loaded_cpu_state->host_state; - - /* - * Set host fs and gs selectors. Unfortunately, 22.2.3 does not - * allow segment selectors with cpl > 0 or ti == 1. - */ - host_state->ldt_sel = kvm_read_ldt(); - -#ifdef CONFIG_X86_64 - savesegment(ds, host_state->ds_sel); - savesegment(es, host_state->es_sel); - - gs_base = cpu_kernelmode_gs_base(cpu); - if (likely(is_64bit_mm(current->mm))) { - save_fsgs_for_kvm(); - fs_sel = current->thread.fsindex; - gs_sel = current->thread.gsindex; - fs_base = current->thread.fsbase; - vmx->msr_host_kernel_gs_base = current->thread.gsbase; - } else { - savesegment(fs, fs_sel); - savesegment(gs, gs_sel); - fs_base = read_msr(MSR_FS_BASE); - vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); - } - - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); -#else - savesegment(fs, fs_sel); - savesegment(gs, gs_sel); - fs_base = segment_base(fs_sel); - gs_base = segment_base(gs_sel); -#endif - - if (unlikely(fs_sel != host_state->fs_sel)) { - if (!(fs_sel & 7)) - vmcs_write16(HOST_FS_SELECTOR, fs_sel); - else - vmcs_write16(HOST_FS_SELECTOR, 0); - host_state->fs_sel = fs_sel; - } - if (unlikely(gs_sel != host_state->gs_sel)) { - if (!(gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, gs_sel); - else - vmcs_write16(HOST_GS_SELECTOR, 0); - host_state->gs_sel = gs_sel; - } - if (unlikely(fs_base != host_state->fs_base)) { - vmcs_writel(HOST_FS_BASE, fs_base); - host_state->fs_base = fs_base; - } - if (unlikely(gs_base != host_state->gs_base)) { - vmcs_writel(HOST_GS_BASE, gs_base); - host_state->gs_base = gs_base; - } -} - -static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) -{ - struct vmcs_host_state *host_state; - - if (!vmx->loaded_cpu_state) - return; - - WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); - host_state = &vmx->loaded_cpu_state->host_state; - - ++vmx->vcpu.stat.host_state_reload; - vmx->loaded_cpu_state = NULL; - -#ifdef CONFIG_X86_64 - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); -#endif - if (host_state->ldt_sel || (host_state->gs_sel & 7)) { - kvm_load_ldt(host_state->ldt_sel); -#ifdef CONFIG_X86_64 - load_gs_index(host_state->gs_sel); -#else - loadsegment(gs, host_state->gs_sel); -#endif - } - if (host_state->fs_sel & 7) - loadsegment(fs, host_state->fs_sel); -#ifdef CONFIG_X86_64 - if (unlikely(host_state->ds_sel | host_state->es_sel)) { - loadsegment(ds, host_state->ds_sel); - loadsegment(es, host_state->es_sel); - } -#endif - invalidate_tss_limit(); -#ifdef CONFIG_X86_64 - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); -#endif - load_fixmap_gdt(raw_smp_processor_id()); -} - -#ifdef CONFIG_X86_64 -static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) -{ - preempt_disable(); - if (vmx->loaded_cpu_state) - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); - preempt_enable(); - return vmx->msr_guest_kernel_gs_base; -} - -static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) -{ - preempt_disable(); - if (vmx->loaded_cpu_state) - wrmsrl(MSR_KERNEL_GS_BASE, data); - preempt_enable(); - vmx->msr_guest_kernel_gs_base = data; -} -#endif - -static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - /* - * In case of hot-plug or hot-unplug, we may have to undo - * vmx_vcpu_pi_put even if there is no assigned device. And we - * always keep PI.NDST up to date for simplicity: it makes the - * code easier, and CPU migration is not a fast path. - */ - if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) - return; - - /* - * First handle the simple case where no cmpxchg is necessary; just - * allow posting non-urgent interrupts. - * - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block will do it for us and the wakeup_handler - * expects the VCPU to be on the blocked_vcpu_list that matches - * PI.NDST. - */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || - vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - return; - } - - /* The full case. */ - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - new.sn = 0; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); -} - -static void decache_tsc_multiplier(struct vcpu_vmx *vmx) -{ - vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; - vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); -} - -/* - * Switches to specified vcpu, until a matching vcpu_put(), but assumes - * vcpu mutex is already taken. - */ -static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - bool already_loaded = vmx->loaded_vmcs->cpu == cpu; - - if (!already_loaded) { - loaded_vmcs_clear(vmx->loaded_vmcs); - local_irq_disable(); - crash_disable_local_vmclear(cpu); - - /* - * Read loaded_vmcs->cpu should be before fetching - * loaded_vmcs->loaded_vmcss_on_cpu_link. - * See the comments in __loaded_vmcs_clear(). - */ - smp_rmb(); - - list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, - &per_cpu(loaded_vmcss_on_cpu, cpu)); - crash_enable_local_vmclear(cpu); - local_irq_enable(); - } - - if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { - per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; - vmcs_load(vmx->loaded_vmcs->vmcs); - indirect_branch_prediction_barrier(); - } - - if (!already_loaded) { - void *gdt = get_current_gdt_ro(); - unsigned long sysenter_esp; - - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - - /* - * Linux uses per-cpu TSS and GDT, so set these when switching - * processors. See 22.2.4. - */ - vmcs_writel(HOST_TR_BASE, - (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); - vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ - - /* - * VM exits change the host TR limit to 0x67 after a VM - * exit. This is okay, since 0x67 covers everything except - * the IO bitmap and have have code to handle the IO bitmap - * being lost after a VM exit. - */ - BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67); - - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); - vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ - - vmx->loaded_vmcs->cpu = cpu; - } - - /* Setup TSC multiplier */ - if (kvm_has_tsc_control && - vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) - decache_tsc_multiplier(vmx); - - vmx_vcpu_pi_load(vcpu, cpu); - vmx->host_pkru = read_pkru(); - vmx->host_debugctlmsr = get_debugctlmsr(); -} - -static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); -} - -static void vmx_vcpu_put(struct kvm_vcpu *vcpu) -{ - vmx_vcpu_pi_put(vcpu); - - vmx_prepare_switch_to_host(to_vmx(vcpu)); -} - -static bool emulation_required(struct kvm_vcpu *vcpu) -{ - return emulate_invalid_guest_state && !guest_state_valid(vcpu); -} - -static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); - -/* - * Return the cr0 value that a nested guest would read. This is a combination - * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by - * its hypervisor (cr0_read_shadow). - */ -static inline unsigned long nested_read_cr0(struct vmcs12 *fields) -{ - return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | - (fields->cr0_read_shadow & fields->cr0_guest_host_mask); -} -static inline unsigned long nested_read_cr4(struct vmcs12 *fields) -{ - return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | - (fields->cr4_read_shadow & fields->cr4_guest_host_mask); -} - -static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) -{ - unsigned long rflags, save_rflags; - - if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { - __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) { - rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; - save_rflags = to_vmx(vcpu)->rmode.save_rflags; - rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; - } - to_vmx(vcpu)->rflags = rflags; - } - return to_vmx(vcpu)->rflags; -} - -static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - unsigned long old_rflags = vmx_get_rflags(vcpu); - - __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - to_vmx(vcpu)->rflags = rflags; - if (to_vmx(vcpu)->rmode.vm86_active) { - to_vmx(vcpu)->rmode.save_rflags = rflags; - rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; - } - vmcs_writel(GUEST_RFLAGS, rflags); - - if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM) - to_vmx(vcpu)->emulation_required = emulation_required(vcpu); -} - -static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) -{ - u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - int ret = 0; - - if (interruptibility & GUEST_INTR_STATE_STI) - ret |= KVM_X86_SHADOW_INT_STI; - if (interruptibility & GUEST_INTR_STATE_MOV_SS) - ret |= KVM_X86_SHADOW_INT_MOV_SS; - - return ret; -} - -static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) -{ - u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - u32 interruptibility = interruptibility_old; - - interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); - - if (mask & KVM_X86_SHADOW_INT_MOV_SS) - interruptibility |= GUEST_INTR_STATE_MOV_SS; - else if (mask & KVM_X86_SHADOW_INT_STI) - interruptibility |= GUEST_INTR_STATE_STI; - - if ((interruptibility != interruptibility_old)) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); -} - -static void skip_emulated_instruction(struct kvm_vcpu *vcpu) -{ - unsigned long rip; - - rip = kvm_rip_read(vcpu); - rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - kvm_rip_write(vcpu, rip); - - /* skipping an emulated instruction also counts */ - vmx_set_interrupt_shadow(vcpu, 0); -} - -static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, - unsigned long exit_qual) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned int nr = vcpu->arch.exception.nr; - u32 intr_info = nr | INTR_INFO_VALID_MASK; - - if (vcpu->arch.exception.has_error_code) { - vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; - intr_info |= INTR_INFO_DELIVER_CODE_MASK; - } - - if (kvm_exception_is_soft(nr)) - intr_info |= INTR_TYPE_SOFT_EXCEPTION; - else - intr_info |= INTR_TYPE_HARD_EXCEPTION; - - if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && - vmx_get_nmi_mask(vcpu)) - intr_info |= INTR_INFO_UNBLOCK_NMI; - - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); -} - -/* - * KVM wants to inject page-faults which it got to the guest. This function - * checks whether in a nested guest, we need to inject them to L1 or L2. - */ -static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned int nr = vcpu->arch.exception.nr; - bool has_payload = vcpu->arch.exception.has_payload; - unsigned long payload = vcpu->arch.exception.payload; - - if (nr == PF_VECTOR) { - if (vcpu->arch.exception.nested_apf) { - *exit_qual = vcpu->arch.apf.nested_apf_token; - return 1; - } - if (nested_vmx_is_page_fault_vmexit(vmcs12, - vcpu->arch.exception.error_code)) { - *exit_qual = has_payload ? payload : vcpu->arch.cr2; - return 1; - } - } else if (vmcs12->exception_bitmap & (1u << nr)) { - if (nr == DB_VECTOR) { - if (!has_payload) { - payload = vcpu->arch.dr6; - payload &= ~(DR6_FIXED_1 | DR6_BT); - payload ^= DR6_RTM; - } - *exit_qual = payload; - } else - *exit_qual = 0; - return 1; - } - - return 0; -} - -static void vmx_clear_hlt(struct kvm_vcpu *vcpu) -{ - /* - * Ensure that we clear the HLT state in the VMCS. We don't need to - * explicitly skip the instruction because if the HLT state is set, - * then the instruction is already executing and RIP has already been - * advanced. - */ - if (kvm_hlt_in_guest(vcpu->kvm) && - vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) - vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); -} - -static void vmx_queue_exception(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned nr = vcpu->arch.exception.nr; - bool has_error_code = vcpu->arch.exception.has_error_code; - u32 error_code = vcpu->arch.exception.error_code; - u32 intr_info = nr | INTR_INFO_VALID_MASK; - - kvm_deliver_exception_payload(vcpu); - - if (has_error_code) { - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); - intr_info |= INTR_INFO_DELIVER_CODE_MASK; - } - - if (vmx->rmode.vm86_active) { - int inc_eip = 0; - if (kvm_exception_is_soft(nr)) - inc_eip = vcpu->arch.event_exit_inst_len; - if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return; - } - - WARN_ON_ONCE(vmx->emulation_required); - - if (kvm_exception_is_soft(nr)) { - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmx->vcpu.arch.event_exit_inst_len); - intr_info |= INTR_TYPE_SOFT_EXCEPTION; - } else - intr_info |= INTR_TYPE_HARD_EXCEPTION; - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); - - vmx_clear_hlt(vcpu); -} - -static bool vmx_rdtscp_supported(void) -{ - return cpu_has_vmx_rdtscp(); -} - -static bool vmx_invpcid_supported(void) -{ - return cpu_has_vmx_invpcid(); -} - -/* - * Swap MSR entry in host/guest MSR entry array. - */ -static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) -{ - struct shared_msr_entry tmp; - - tmp = vmx->guest_msrs[to]; - vmx->guest_msrs[to] = vmx->guest_msrs[from]; - vmx->guest_msrs[from] = tmp; -} - -/* - * Set up the vmcs to automatically save and restore system - * msrs. Don't touch the 64-bit msrs if the guest is in legacy - * mode, as fiddling with msrs is very expensive. - */ -static void setup_msrs(struct vcpu_vmx *vmx) -{ - int save_nmsrs, index; - - save_nmsrs = 0; -#ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) { - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_LSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_CSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) - move_msr_up(vmx, index, save_nmsrs++); - /* - * MSR_STAR is only needed on long mode guests, and only - * if efer.sce is enabled. - */ - index = __find_msr_index(vmx, MSR_STAR); - if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) - move_msr_up(vmx, index, save_nmsrs++); - } -#endif - index = __find_msr_index(vmx, MSR_EFER); - if (index >= 0 && update_transition_efer(vmx, index)) - move_msr_up(vmx, index, save_nmsrs++); - - vmx->save_nmsrs = save_nmsrs; - vmx->guest_msrs_dirty = true; - - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(&vmx->vcpu); -} - -static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) - return vcpu->arch.tsc_offset - vmcs12->tsc_offset; - - return vcpu->arch.tsc_offset; -} - -static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u64 g_tsc_offset = 0; - - /* - * We're here if L1 chose not to trap WRMSR to TSC. According - * to the spec, this should set L1's TSC; The offset that L1 - * set for L2 remains unchanged, and still needs to be added - * to the newly set TSC to get L2's TSC. - */ - if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) - g_tsc_offset = vmcs12->tsc_offset; - - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - vcpu->arch.tsc_offset - g_tsc_offset, - offset); - vmcs_write64(TSC_OFFSET, offset + g_tsc_offset); - return offset + g_tsc_offset; -} - -/* - * nested_vmx_allowed() checks whether a guest should be allowed to use VMX - * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for - * all guests if the "nested" module option is off, and can also be disabled - * for a single guest by disabling its VMX cpuid bit. - */ -static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) -{ - return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); -} - -/* - * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be - * returned for the various VMX controls MSRs when nested VMX is enabled. - * The same values should also be used to verify that vmcs12 control fields are - * valid during nested entry from L1 to L2. - * Each of these control msrs has a low and high 32-bit half: A low bit is on - * if the corresponding bit in the (32-bit) control field *must* be on, and a - * bit in the high half is on if the corresponding bit in the control field - * may be on. See also vmx_control_verify(). - */ -static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) -{ - if (!nested) { - memset(msrs, 0, sizeof(*msrs)); - return; - } - - /* - * Note that as a general rule, the high half of the MSRs (bits in - * the control fields which may be 1) should be initialized by the - * intersection of the underlying hardware's MSR (i.e., features which - * can be supported) and the list of features we want to expose - - * because they are known to be properly supported in our code. - * Also, usually, the low half of the MSRs (bits which must be 1) can - * be set to 0, meaning that L1 may turn off any of these bits. The - * reason is that if one of these bits is necessary, it will appear - * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control - * fields of vmcs01 and vmcs02, will turn these bits off - and - * nested_vmx_exit_reflected() will not pass related exits to L1. - * These rules have exceptions below. - */ - - /* pin-based controls */ - rdmsr(MSR_IA32_VMX_PINBASED_CTLS, - msrs->pinbased_ctls_low, - msrs->pinbased_ctls_high); - msrs->pinbased_ctls_low |= - PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - msrs->pinbased_ctls_high &= - PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | - PIN_BASED_VIRTUAL_NMIS | - (apicv ? PIN_BASED_POSTED_INTR : 0); - msrs->pinbased_ctls_high |= - PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | - PIN_BASED_VMX_PREEMPTION_TIMER; - - /* exit controls */ - rdmsr(MSR_IA32_VMX_EXIT_CTLS, - msrs->exit_ctls_low, - msrs->exit_ctls_high); - msrs->exit_ctls_low = - VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - - msrs->exit_ctls_high &= -#ifdef CONFIG_X86_64 - VM_EXIT_HOST_ADDR_SPACE_SIZE | -#endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; - msrs->exit_ctls_high |= - VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | - VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; - - /* We support free control of debug control saving. */ - msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; - - /* entry controls */ - rdmsr(MSR_IA32_VMX_ENTRY_CTLS, - msrs->entry_ctls_low, - msrs->entry_ctls_high); - msrs->entry_ctls_low = - VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - msrs->entry_ctls_high &= -#ifdef CONFIG_X86_64 - VM_ENTRY_IA32E_MODE | -#endif - VM_ENTRY_LOAD_IA32_PAT; - msrs->entry_ctls_high |= - (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); - - /* We support free control of debug control loading. */ - msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; - - /* cpu-based controls */ - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, - msrs->procbased_ctls_low, - msrs->procbased_ctls_high); - msrs->procbased_ctls_low = - CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - msrs->procbased_ctls_high &= - CPU_BASED_VIRTUAL_INTR_PENDING | - CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | - CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | -#ifdef CONFIG_X86_64 - CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | -#endif - CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | - CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | - CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | - CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | - CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; - /* - * We can allow some features even when not supported by the - * hardware. For example, L1 can specify an MSR bitmap - and we - * can use it to avoid exits to L1 - even when L0 runs L2 - * without MSR bitmaps. - */ - msrs->procbased_ctls_high |= - CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | - CPU_BASED_USE_MSR_BITMAPS; - - /* We support free control of CR3 access interception. */ - msrs->procbased_ctls_low &= - ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); - - /* - * secondary cpu-based controls. Do not include those that - * depend on CPUID bits, they are added later by vmx_cpuid_update. - */ - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - msrs->secondary_ctls_low, - msrs->secondary_ctls_high); - msrs->secondary_ctls_low = 0; - msrs->secondary_ctls_high &= - SECONDARY_EXEC_DESC | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_WBINVD_EXITING; - - /* - * We can emulate "VMCS shadowing," even if the hardware - * doesn't support it. - */ - msrs->secondary_ctls_high |= - SECONDARY_EXEC_SHADOW_VMCS; - - if (enable_ept) { - /* nested EPT: emulate EPT also to L1 */ - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_EPT; - msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; - if (cpu_has_vmx_ept_execute_only()) - msrs->ept_caps |= - VMX_EPT_EXECUTE_ONLY_BIT; - msrs->ept_caps &= vmx_capability.ept; - msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | - VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | - VMX_EPT_1GB_PAGE_BIT; - if (enable_ept_ad_bits) { - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_PML; - msrs->ept_caps |= VMX_EPT_AD_BIT; - } - } - - if (cpu_has_vmx_vmfunc()) { - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_VMFUNC; - /* - * Advertise EPTP switching unconditionally - * since we emulate it - */ - if (enable_ept) - msrs->vmfunc_controls = - VMX_VMFUNC_EPTP_SWITCHING; - } - - /* - * Old versions of KVM use the single-context version without - * checking for support, so declare that it is supported even - * though it is treated as global context. The alternative is - * not failing the single-context invvpid, and it is worse. - */ - if (enable_vpid) { - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_VPID; - msrs->vpid_caps = VMX_VPID_INVVPID_BIT | - VMX_VPID_EXTENT_SUPPORTED_MASK; - } - - if (enable_unrestricted_guest) - msrs->secondary_ctls_high |= - SECONDARY_EXEC_UNRESTRICTED_GUEST; - - if (flexpriority_enabled) - msrs->secondary_ctls_high |= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - - /* miscellaneous data */ - rdmsr(MSR_IA32_VMX_MISC, - msrs->misc_low, - msrs->misc_high); - msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; - msrs->misc_low |= - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | - VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | - VMX_MISC_ACTIVITY_HLT; - msrs->misc_high = 0; - - /* - * This MSR reports some information about VMX support. We - * should return information about the VMX we emulate for the - * guest, and the VMCS structure we give it - not about the - * VMX support of the underlying hardware. - */ - msrs->basic = - VMCS12_REVISION | - VMX_BASIC_TRUE_CTLS | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); - - if (cpu_has_vmx_basic_inout()) - msrs->basic |= VMX_BASIC_INOUT; - - /* - * These MSRs specify bits which the guest must keep fixed on - * while L1 is in VMXON mode (in L1's root mode, or running an L2). - * We picked the standard core2 setting. - */ -#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) -#define VMXON_CR4_ALWAYSON X86_CR4_VMXE - msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; - msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; - - /* These MSRs specify bits which the guest must keep fixed off. */ - rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); - rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); - - /* highest index: VMX_PREEMPTION_TIMER_VALUE */ - msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; -} - -/* - * if fixed0[i] == 1: val[i] must be 1 - * if fixed1[i] == 0: val[i] must be 0 - */ -static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) -{ - return ((val & fixed1) | fixed0) == val; -} - -static inline bool vmx_control_verify(u32 control, u32 low, u32 high) -{ - return fixed_bits_valid(control, low, high); -} - -static inline u64 vmx_control_msr(u32 low, u32 high) -{ - return low | ((u64)high << 32); -} - -static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) -{ - superset &= mask; - subset &= mask; - - return (superset | subset) == superset; -} - -static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) -{ - const u64 feature_and_reserved = - /* feature (except bit 48; see below) */ - BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | - /* reserved */ - BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); - u64 vmx_basic = vmx->nested.msrs.basic; - - if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) - return -EINVAL; - - /* - * KVM does not emulate a version of VMX that constrains physical - * addresses of VMX structures (e.g. VMCS) to 32-bits. - */ - if (data & BIT_ULL(48)) - return -EINVAL; - - if (vmx_basic_vmcs_revision_id(vmx_basic) != - vmx_basic_vmcs_revision_id(data)) - return -EINVAL; - - if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) - return -EINVAL; - - vmx->nested.msrs.basic = data; - return 0; -} - -static int -vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) -{ - u64 supported; - u32 *lowp, *highp; - - switch (msr_index) { - case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - lowp = &vmx->nested.msrs.pinbased_ctls_low; - highp = &vmx->nested.msrs.pinbased_ctls_high; - break; - case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - lowp = &vmx->nested.msrs.procbased_ctls_low; - highp = &vmx->nested.msrs.procbased_ctls_high; - break; - case MSR_IA32_VMX_TRUE_EXIT_CTLS: - lowp = &vmx->nested.msrs.exit_ctls_low; - highp = &vmx->nested.msrs.exit_ctls_high; - break; - case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - lowp = &vmx->nested.msrs.entry_ctls_low; - highp = &vmx->nested.msrs.entry_ctls_high; - break; - case MSR_IA32_VMX_PROCBASED_CTLS2: - lowp = &vmx->nested.msrs.secondary_ctls_low; - highp = &vmx->nested.msrs.secondary_ctls_high; - break; - default: - BUG(); - } - - supported = vmx_control_msr(*lowp, *highp); - - /* Check must-be-1 bits are still 1. */ - if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) - return -EINVAL; - - /* Check must-be-0 bits are still 0. */ - if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) - return -EINVAL; - - *lowp = data; - *highp = data >> 32; - return 0; -} - -static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) -{ - const u64 feature_and_reserved_bits = - /* feature */ - BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | - BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | - /* reserved */ - GENMASK_ULL(13, 9) | BIT_ULL(31); - u64 vmx_misc; - - vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, - vmx->nested.msrs.misc_high); - - if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) - return -EINVAL; - - if ((vmx->nested.msrs.pinbased_ctls_high & - PIN_BASED_VMX_PREEMPTION_TIMER) && - vmx_misc_preemption_timer_rate(data) != - vmx_misc_preemption_timer_rate(vmx_misc)) - return -EINVAL; - - if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) - return -EINVAL; - - if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) - return -EINVAL; - - if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) - return -EINVAL; - - vmx->nested.msrs.misc_low = data; - vmx->nested.msrs.misc_high = data >> 32; - - /* - * If L1 has read-only VM-exit information fields, use the - * less permissive vmx_vmwrite_bitmap to specify write - * permissions for the shadow VMCS. - */ - if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) - vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); - - return 0; -} - -static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) -{ - u64 vmx_ept_vpid_cap; - - vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, - vmx->nested.msrs.vpid_caps); - - /* Every bit is either reserved or a feature bit. */ - if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) - return -EINVAL; - - vmx->nested.msrs.ept_caps = data; - vmx->nested.msrs.vpid_caps = data >> 32; - return 0; -} - -static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) -{ - u64 *msr; - - switch (msr_index) { - case MSR_IA32_VMX_CR0_FIXED0: - msr = &vmx->nested.msrs.cr0_fixed0; - break; - case MSR_IA32_VMX_CR4_FIXED0: - msr = &vmx->nested.msrs.cr4_fixed0; - break; - default: - BUG(); - } - - /* - * 1 bits (which indicates bits which "must-be-1" during VMX operation) - * must be 1 in the restored value. - */ - if (!is_bitwise_subset(data, *msr, -1ULL)) - return -EINVAL; - - *msr = data; - return 0; -} - -/* - * Called when userspace is restoring VMX MSRs. - * - * Returns 0 on success, non-0 otherwise. - */ -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * Don't allow changes to the VMX capability MSRs while the vCPU - * is in VMX operation. - */ - if (vmx->nested.vmxon) - return -EBUSY; - - switch (msr_index) { - case MSR_IA32_VMX_BASIC: - return vmx_restore_vmx_basic(vmx, data); - case MSR_IA32_VMX_PINBASED_CTLS: - case MSR_IA32_VMX_PROCBASED_CTLS: - case MSR_IA32_VMX_EXIT_CTLS: - case MSR_IA32_VMX_ENTRY_CTLS: - /* - * The "non-true" VMX capability MSRs are generated from the - * "true" MSRs, so we do not support restoring them directly. - * - * If userspace wants to emulate VMX_BASIC[55]=0, userspace - * should restore the "true" MSRs with the must-be-1 bits - * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND - * DEFAULT SETTINGS". - */ - return -EINVAL; - case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - case MSR_IA32_VMX_TRUE_EXIT_CTLS: - case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - case MSR_IA32_VMX_PROCBASED_CTLS2: - return vmx_restore_control_msr(vmx, msr_index, data); - case MSR_IA32_VMX_MISC: - return vmx_restore_vmx_misc(vmx, data); - case MSR_IA32_VMX_CR0_FIXED0: - case MSR_IA32_VMX_CR4_FIXED0: - return vmx_restore_fixed0_msr(vmx, msr_index, data); - case MSR_IA32_VMX_CR0_FIXED1: - case MSR_IA32_VMX_CR4_FIXED1: - /* - * These MSRs are generated based on the vCPU's CPUID, so we - * do not support restoring them directly. - */ - return -EINVAL; - case MSR_IA32_VMX_EPT_VPID_CAP: - return vmx_restore_vmx_ept_vpid_cap(vmx, data); - case MSR_IA32_VMX_VMCS_ENUM: - vmx->nested.msrs.vmcs_enum = data; - return 0; - default: - /* - * The rest of the VMX capability MSRs do not support restore. - */ - return -EINVAL; - } -} - -/* Returns 0 on success, non-0 otherwise. */ -static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) -{ - switch (msr_index) { - case MSR_IA32_VMX_BASIC: - *pdata = msrs->basic; - break; - case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - case MSR_IA32_VMX_PINBASED_CTLS: - *pdata = vmx_control_msr( - msrs->pinbased_ctls_low, - msrs->pinbased_ctls_high); - if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) - *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - break; - case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - case MSR_IA32_VMX_PROCBASED_CTLS: - *pdata = vmx_control_msr( - msrs->procbased_ctls_low, - msrs->procbased_ctls_high); - if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) - *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - break; - case MSR_IA32_VMX_TRUE_EXIT_CTLS: - case MSR_IA32_VMX_EXIT_CTLS: - *pdata = vmx_control_msr( - msrs->exit_ctls_low, - msrs->exit_ctls_high); - if (msr_index == MSR_IA32_VMX_EXIT_CTLS) - *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - break; - case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - case MSR_IA32_VMX_ENTRY_CTLS: - *pdata = vmx_control_msr( - msrs->entry_ctls_low, - msrs->entry_ctls_high); - if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) - *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - break; - case MSR_IA32_VMX_MISC: - *pdata = vmx_control_msr( - msrs->misc_low, - msrs->misc_high); - break; - case MSR_IA32_VMX_CR0_FIXED0: - *pdata = msrs->cr0_fixed0; - break; - case MSR_IA32_VMX_CR0_FIXED1: - *pdata = msrs->cr0_fixed1; - break; - case MSR_IA32_VMX_CR4_FIXED0: - *pdata = msrs->cr4_fixed0; - break; - case MSR_IA32_VMX_CR4_FIXED1: - *pdata = msrs->cr4_fixed1; - break; - case MSR_IA32_VMX_VMCS_ENUM: - *pdata = msrs->vmcs_enum; - break; - case MSR_IA32_VMX_PROCBASED_CTLS2: - *pdata = vmx_control_msr( - msrs->secondary_ctls_low, - msrs->secondary_ctls_high); - break; - case MSR_IA32_VMX_EPT_VPID_CAP: - *pdata = msrs->ept_caps | - ((u64)msrs->vpid_caps << 32); - break; - case MSR_IA32_VMX_VMFUNC: - *pdata = msrs->vmfunc_controls; - break; - default: - return 1; - } - - return 0; -} - -static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, - uint64_t val) -{ - uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; - - return !(val & ~valid_bits); -} - -static int vmx_get_msr_feature(struct kvm_msr_entry *msr) -{ - switch (msr->index) { - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - if (!nested) - return 1; - return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); - default: - return 1; - } - - return 0; -} - -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; - - switch (msr_info->index) { -#ifdef CONFIG_X86_64 - case MSR_FS_BASE: - msr_info->data = vmcs_readl(GUEST_FS_BASE); - break; - case MSR_GS_BASE: - msr_info->data = vmcs_readl(GUEST_GS_BASE); - break; - case MSR_KERNEL_GS_BASE: - msr_info->data = vmx_read_guest_kernel_gs_base(vmx); - break; -#endif - case MSR_EFER: - return kvm_get_msr_common(vcpu, msr_info); - case MSR_IA32_SPEC_CTRL: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) - return 1; - - msr_info->data = to_vmx(vcpu)->spec_ctrl; - break; - case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) - return 1; - msr_info->data = to_vmx(vcpu)->arch_capabilities; - break; - case MSR_IA32_SYSENTER_CS: - msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); - break; - case MSR_IA32_SYSENTER_EIP: - msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); - break; - case MSR_IA32_SYSENTER_ESP: - msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); - break; - case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported() || - (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) - return 1; - msr_info->data = vmcs_read64(GUEST_BNDCFGS); - break; - case MSR_IA32_MCG_EXT_CTL: - if (!msr_info->host_initiated && - !(vmx->msr_ia32_feature_control & - FEATURE_CONTROL_LMCE)) - return 1; - msr_info->data = vcpu->arch.mcg_ext_ctl; - break; - case MSR_IA32_FEATURE_CONTROL: - msr_info->data = vmx->msr_ia32_feature_control; - break; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - if (!nested_vmx_allowed(vcpu)) - return 1; - return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, - &msr_info->data); - case MSR_IA32_XSS: - if (!vmx_xsaves_supported()) - return 1; - msr_info->data = vcpu->arch.ia32_xss; - break; - case MSR_TSC_AUX: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) - return 1; - /* Otherwise falls through */ - default: - msr = find_msr_entry(vmx, msr_info->index); - if (msr) { - msr_info->data = msr->data; - break; - } - return kvm_get_msr_common(vcpu, msr_info); - } - - return 0; -} - -static void vmx_leave_nested(struct kvm_vcpu *vcpu); - -/* - * Writes msr value into into the appropriate "register". - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; - int ret = 0; - u32 msr_index = msr_info->index; - u64 data = msr_info->data; - - switch (msr_index) { - case MSR_EFER: - ret = kvm_set_msr_common(vcpu, msr_info); - break; -#ifdef CONFIG_X86_64 - case MSR_FS_BASE: - vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_FS_BASE, data); - break; - case MSR_GS_BASE: - vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_GS_BASE, data); - break; - case MSR_KERNEL_GS_BASE: - vmx_write_guest_kernel_gs_base(vmx, data); - break; -#endif - case MSR_IA32_SYSENTER_CS: - vmcs_write32(GUEST_SYSENTER_CS, data); - break; - case MSR_IA32_SYSENTER_EIP: - vmcs_writel(GUEST_SYSENTER_EIP, data); - break; - case MSR_IA32_SYSENTER_ESP: - vmcs_writel(GUEST_SYSENTER_ESP, data); - break; - case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported() || - (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) - return 1; - if (is_noncanonical_address(data & PAGE_MASK, vcpu) || - (data & MSR_IA32_BNDCFGS_RSVD)) - return 1; - vmcs_write64(GUEST_BNDCFGS, data); - break; - case MSR_IA32_SPEC_CTRL: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) - return 1; - - /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) - return 1; - - vmx->spec_ctrl = data; - - if (!data) - break; - - /* - * For non-nested: - * When it's written (to non-zero) for the first time, pass - * it through. - * - * For nested: - * The handling of the MSR bitmap for L2 guests is done in - * nested_vmx_merge_msr_bitmap. We should not touch the - * vmcs02.msr_bitmap here since it gets completely overwritten - * in the merging. We update the vmcs01 here for L1 as well - * since it will end up touching the MSR anyway now. - */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, - MSR_IA32_SPEC_CTRL, - MSR_TYPE_RW); - break; - case MSR_IA32_PRED_CMD: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) - return 1; - - if (data & ~PRED_CMD_IBPB) - return 1; - - if (!data) - break; - - wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - - /* - * For non-nested: - * When it's written (to non-zero) for the first time, pass - * it through. - * - * For nested: - * The handling of the MSR bitmap for L2 guests is done in - * nested_vmx_merge_msr_bitmap. We should not touch the - * vmcs02.msr_bitmap here since it gets completely overwritten - * in the merging. - */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, - MSR_TYPE_W); - break; - case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated) - return 1; - vmx->arch_capabilities = data; - break; - case MSR_IA32_CR_PAT: - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) - return 1; - vmcs_write64(GUEST_IA32_PAT, data); - vcpu->arch.pat = data; - break; - } - ret = kvm_set_msr_common(vcpu, msr_info); - break; - case MSR_IA32_TSC_ADJUST: - ret = kvm_set_msr_common(vcpu, msr_info); - break; - case MSR_IA32_MCG_EXT_CTL: - if ((!msr_info->host_initiated && - !(to_vmx(vcpu)->msr_ia32_feature_control & - FEATURE_CONTROL_LMCE)) || - (data & ~MCG_EXT_CTL_LMCE_EN)) - return 1; - vcpu->arch.mcg_ext_ctl = data; - break; - case MSR_IA32_FEATURE_CONTROL: - if (!vmx_feature_control_msr_valid(vcpu, data) || - (to_vmx(vcpu)->msr_ia32_feature_control & - FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) - return 1; - vmx->msr_ia32_feature_control = data; - if (msr_info->host_initiated && data == 0) - vmx_leave_nested(vcpu); - break; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - if (!msr_info->host_initiated) - return 1; /* they are read-only */ - if (!nested_vmx_allowed(vcpu)) - return 1; - return vmx_set_vmx_msr(vcpu, msr_index, data); - case MSR_IA32_XSS: - if (!vmx_xsaves_supported()) - return 1; - /* - * The only supported bit as of Skylake is bit 8, but - * it is not supported on KVM. - */ - if (data != 0) - return 1; - vcpu->arch.ia32_xss = data; - if (vcpu->arch.ia32_xss != host_xss) - add_atomic_switch_msr(vmx, MSR_IA32_XSS, - vcpu->arch.ia32_xss, host_xss, false); - else - clear_atomic_switch_msr(vmx, MSR_IA32_XSS); - break; - case MSR_TSC_AUX: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) - return 1; - /* Check reserved bit, higher 32 bits should be zero */ - if ((data >> 32) != 0) - return 1; - /* Otherwise falls through */ - default: - msr = find_msr_entry(vmx, msr_index); - if (msr) { - u64 old_msr_data = msr->data; - msr->data = data; - if (msr - vmx->guest_msrs < vmx->save_nmsrs) { - preempt_disable(); - ret = kvm_set_shared_msr(msr->index, msr->data, - msr->mask); - preempt_enable(); - if (ret) - msr->data = old_msr_data; - } - break; - } - ret = kvm_set_msr_common(vcpu, msr_info); - } - - return ret; -} - -static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) -{ - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); - switch (reg) { - case VCPU_REGS_RSP: - vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); - break; - case VCPU_REGS_RIP: - vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); - break; - case VCPU_EXREG_PDPTR: - if (enable_ept) - ept_save_pdptrs(vcpu); - break; - default: - break; - } -} - -static __init int cpu_has_kvm_support(void) -{ - return cpu_has_vmx(); -} - -static __init int vmx_disabled_by_bios(void) -{ - u64 msr; - - rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - if (msr & FEATURE_CONTROL_LOCKED) { - /* launched w/ TXT and VMX disabled */ - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) - && tboot_enabled()) - return 1; - /* launched w/o TXT and VMX only enabled w/ TXT */ - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) - && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) - && !tboot_enabled()) { - printk(KERN_WARNING "kvm: disable TXT in the BIOS or " - "activate TXT before enabling KVM\n"); - return 1; - } - /* launched w/o TXT and VMX disabled */ - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) - && !tboot_enabled()) - return 1; - } - - return 0; -} - -static void kvm_cpu_vmxon(u64 addr) -{ - cr4_set_bits(X86_CR4_VMXE); - intel_pt_handle_vmx(1); - - asm volatile ("vmxon %0" : : "m"(addr)); -} - -static int hardware_enable(void) -{ - int cpu = raw_smp_processor_id(); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - u64 old, test_bits; - - if (cr4_read_shadow() & X86_CR4_VMXE) - return -EBUSY; - - /* - * This can happen if we hot-added a CPU but failed to allocate - * VP assist page for it. - */ - if (static_branch_unlikely(&enable_evmcs) && - !hv_get_vp_assist_page(cpu)) - return -EFAULT; - - INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - - /* - * Now we can enable the vmclear operation in kdump - * since the loaded_vmcss_on_cpu list on this cpu - * has been initialized. - * - * Though the cpu is not in VMX operation now, there - * is no problem to enable the vmclear operation - * for the loaded_vmcss_on_cpu list is empty! - */ - crash_enable_local_vmclear(cpu); - - rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - - test_bits = FEATURE_CONTROL_LOCKED; - test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - if (tboot_enabled()) - test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; - - if ((old & test_bits) != test_bits) { - /* enable and lock */ - wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); - } - kvm_cpu_vmxon(phys_addr); - if (enable_ept) - ept_sync_global(); - - return 0; -} - -static void vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v, *n; - - list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - __loaded_vmcs_clear(v); -} - - -/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() - * tricks. - */ -static void kvm_cpu_vmxoff(void) -{ - asm volatile (__ex("vmxoff")); - - intel_pt_handle_vmx(0); - cr4_clear_bits(X86_CR4_VMXE); -} - -static void hardware_disable(void) -{ - vmclear_local_loaded_vmcss(); - kvm_cpu_vmxoff(); -} - -static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, - u32 msr, u32 *result) -{ - u32 vmx_msr_low, vmx_msr_high; - u32 ctl = ctl_min | ctl_opt; - - rdmsr(msr, vmx_msr_low, vmx_msr_high); - - ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ - ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ - - /* Ensure minimum (required) set of control bits are supported. */ - if (ctl_min & ~ctl) - return -EIO; - - *result = ctl; - return 0; -} - -static __init bool allow_1_setting(u32 msr, u32 ctl) -{ - u32 vmx_msr_low, vmx_msr_high; - - rdmsr(msr, vmx_msr_low, vmx_msr_high); - return vmx_msr_high & ctl; -} - -static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) -{ - u32 vmx_msr_low, vmx_msr_high; - u32 min, opt, min2, opt2; - u32 _pin_based_exec_control = 0; - u32 _cpu_based_exec_control = 0; - u32 _cpu_based_2nd_exec_control = 0; - u32 _vmexit_control = 0; - u32 _vmentry_control = 0; - - memset(vmcs_conf, 0, sizeof(*vmcs_conf)); - min = CPU_BASED_HLT_EXITING | -#ifdef CONFIG_X86_64 - CPU_BASED_CR8_LOAD_EXITING | - CPU_BASED_CR8_STORE_EXITING | -#endif - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_UNCOND_IO_EXITING | - CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING | - CPU_BASED_INVLPG_EXITING | - CPU_BASED_RDPMC_EXITING; - - opt = CPU_BASED_TPR_SHADOW | - CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, - &_cpu_based_exec_control) < 0) - return -EIO; -#ifdef CONFIG_X86_64 - if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) - _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & - ~CPU_BASED_CR8_STORE_EXITING; -#endif - if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { - min2 = 0; - opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_ENABLE_VPID | - SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING | - SECONDARY_EXEC_DESC | - SECONDARY_EXEC_RDTSCP | - SECONDARY_EXEC_ENABLE_INVPCID | - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_SHADOW_VMCS | - SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_RDSEED_EXITING | - SECONDARY_EXEC_RDRAND_EXITING | - SECONDARY_EXEC_ENABLE_PML | - SECONDARY_EXEC_TSC_SCALING | - SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_ENCLS_EXITING; - if (adjust_vmx_controls(min2, opt2, - MSR_IA32_VMX_PROCBASED_CTLS2, - &_cpu_based_2nd_exec_control) < 0) - return -EIO; - } -#ifndef CONFIG_X86_64 - if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; -#endif - - if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) - _cpu_based_2nd_exec_control &= ~( - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - - rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, - &vmx_capability.ept, &vmx_capability.vpid); - - if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { - /* CR3 accesses and invlpg don't need to cause VM Exits when EPT - enabled */ - _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_INVLPG_EXITING); - } else if (vmx_capability.ept) { - vmx_capability.ept = 0; - pr_warn_once("EPT CAP should not exist if not support " - "1-setting enable EPT VM-execution control\n"); - } - if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && - vmx_capability.vpid) { - vmx_capability.vpid = 0; - pr_warn_once("VPID CAP should not exist if not support " - "1-setting enable VPID VM-execution control\n"); - } - - min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; -#ifdef CONFIG_X86_64 - min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; -#endif - opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_CLEAR_BNDCFGS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, - &_vmexit_control) < 0) - return -EIO; - - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | - PIN_BASED_VMX_PREEMPTION_TIMER; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, - &_pin_based_exec_control) < 0) - return -EIO; - - if (cpu_has_broken_vmx_preemption_timer()) - _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) - _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; - - min = VM_ENTRY_LOAD_DEBUG_CONTROLS; - opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, - &_vmentry_control) < 0) - return -EIO; - - rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); - - /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ - if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) - return -EIO; - -#ifdef CONFIG_X86_64 - /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ - if (vmx_msr_high & (1u<<16)) - return -EIO; -#endif - - /* Require Write-Back (WB) memory type for VMCS accesses. */ - if (((vmx_msr_high >> 18) & 15) != 6) - return -EIO; - - vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_conf->size); - vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; - - vmcs_conf->revision_id = vmx_msr_low; - - vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; - vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; - vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; - vmcs_conf->vmexit_ctrl = _vmexit_control; - vmcs_conf->vmentry_ctrl = _vmentry_control; - - if (static_branch_unlikely(&enable_evmcs)) - evmcs_sanitize_exec_ctrls(vmcs_conf); - - cpu_has_load_ia32_efer = - allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, - VM_ENTRY_LOAD_IA32_EFER) - && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, - VM_EXIT_LOAD_IA32_EFER); - - cpu_has_load_perf_global_ctrl = - allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) - && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); - - /* - * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL - * but due to errata below it can't be used. Workaround is to use - * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. - * - * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] - * - * AAK155 (model 26) - * AAP115 (model 30) - * AAT100 (model 37) - * BC86,AAY89,BD102 (model 44) - * BA97 (model 46) - * - */ - if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { - switch (boot_cpu_data.x86_model) { - case 26: - case 30: - case 37: - case 44: - case 46: - cpu_has_load_perf_global_ctrl = false; - printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " - "does not work properly. Using workaround\n"); - break; - default: - break; - } - } - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - - return 0; -} - -static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) -{ - int node = cpu_to_node(cpu); - struct page *pages; - struct vmcs *vmcs; - - pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); - if (!pages) - return NULL; - vmcs = page_address(pages); - memset(vmcs, 0, vmcs_config.size); - - /* KVM supports Enlightened VMCS v1 only */ - if (static_branch_unlikely(&enable_evmcs)) - vmcs->hdr.revision_id = KVM_EVMCS_VERSION; - else - vmcs->hdr.revision_id = vmcs_config.revision_id; - - if (shadow) - vmcs->hdr.shadow_vmcs = 1; - return vmcs; -} - -static void free_vmcs(struct vmcs *vmcs) -{ - free_pages((unsigned long)vmcs, vmcs_config.order); -} - -/* - * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded - */ -static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) -{ - if (!loaded_vmcs->vmcs) - return; - loaded_vmcs_clear(loaded_vmcs); - free_vmcs(loaded_vmcs->vmcs); - loaded_vmcs->vmcs = NULL; - if (loaded_vmcs->msr_bitmap) - free_page((unsigned long)loaded_vmcs->msr_bitmap); - WARN_ON(loaded_vmcs->shadow_vmcs != NULL); -} - -static struct vmcs *alloc_vmcs(bool shadow) -{ - return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); -} - -static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) -{ - loaded_vmcs->vmcs = alloc_vmcs(false); - if (!loaded_vmcs->vmcs) - return -ENOMEM; - - loaded_vmcs->shadow_vmcs = NULL; - loaded_vmcs_init(loaded_vmcs); - - if (cpu_has_vmx_msr_bitmap()) { - loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!loaded_vmcs->msr_bitmap) - goto out_vmcs; - memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); - - if (IS_ENABLED(CONFIG_HYPERV) && - static_branch_unlikely(&enable_evmcs) && - (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { - struct hv_enlightened_vmcs *evmcs = - (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs; - - evmcs->hv_enlightenments_control.msr_bitmap = 1; - } - } - - memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); - - return 0; - -out_vmcs: - free_loaded_vmcs(loaded_vmcs); - return -ENOMEM; -} - -static void free_kvm_area(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - free_vmcs(per_cpu(vmxarea, cpu)); - per_cpu(vmxarea, cpu) = NULL; - } -} - -enum vmcs_field_width { - VMCS_FIELD_WIDTH_U16 = 0, - VMCS_FIELD_WIDTH_U64 = 1, - VMCS_FIELD_WIDTH_U32 = 2, - VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3 -}; - -static inline int vmcs_field_width(unsigned long field) -{ - if (0x1 & field) /* the *_HIGH fields are all 32 bit */ - return VMCS_FIELD_WIDTH_U32; - return (field >> 13) & 0x3 ; -} - -static inline int vmcs_field_readonly(unsigned long field) -{ - return (((field >> 10) & 0x3) == 1); -} - -static void init_vmcs_shadow_fields(void) -{ - int i, j; - - memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); - memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - - for (i = j = 0; i < max_shadow_read_only_fields; i++) { - u16 field = shadow_read_only_fields[i]; - if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && - (i + 1 == max_shadow_read_only_fields || - shadow_read_only_fields[i + 1] != field + 1)) - pr_err("Missing field from shadow_read_only_field %x\n", - field + 1); - - clear_bit(field, vmx_vmread_bitmap); -#ifdef CONFIG_X86_64 - if (field & 1) - continue; -#endif - if (j < i) - shadow_read_only_fields[j] = field; - j++; - } - max_shadow_read_only_fields = j; - - for (i = j = 0; i < max_shadow_read_write_fields; i++) { - u16 field = shadow_read_write_fields[i]; - if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && - (i + 1 == max_shadow_read_write_fields || - shadow_read_write_fields[i + 1] != field + 1)) - pr_err("Missing field from shadow_read_write_field %x\n", - field + 1); - - /* - * PML and the preemption timer can be emulated, but the - * processor cannot vmwrite to fields that don't exist - * on bare metal. - */ - switch (field) { - case GUEST_PML_INDEX: - if (!cpu_has_vmx_pml()) - continue; - break; - case VMX_PREEMPTION_TIMER_VALUE: - if (!cpu_has_vmx_preemption_timer()) - continue; - break; - case GUEST_INTR_STATUS: - if (!cpu_has_vmx_apicv()) - continue; - break; - default: - break; - } - - clear_bit(field, vmx_vmwrite_bitmap); - clear_bit(field, vmx_vmread_bitmap); -#ifdef CONFIG_X86_64 - if (field & 1) - continue; -#endif - if (j < i) - shadow_read_write_fields[j] = field; - j++; - } - max_shadow_read_write_fields = j; -} - -static __init int alloc_kvm_area(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct vmcs *vmcs; - - vmcs = alloc_vmcs_cpu(false, cpu); - if (!vmcs) { - free_kvm_area(); - return -ENOMEM; - } - - /* - * When eVMCS is enabled, alloc_vmcs_cpu() sets - * vmcs->revision_id to KVM_EVMCS_VERSION instead of - * revision_id reported by MSR_IA32_VMX_BASIC. - * - * However, even though not explictly documented by - * TLFS, VMXArea passed as VMXON argument should - * still be marked with revision_id reported by - * physical CPU. - */ - if (static_branch_unlikely(&enable_evmcs)) - vmcs->hdr.revision_id = vmcs_config.revision_id; - - per_cpu(vmxarea, cpu) = vmcs; - } - return 0; -} - -static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, - struct kvm_segment *save) -{ - if (!emulate_invalid_guest_state) { - /* - * CS and SS RPL should be equal during guest entry according - * to VMX spec, but in reality it is not always so. Since vcpu - * is in the middle of the transition from real mode to - * protected mode it is safe to assume that RPL 0 is a good - * default value. - */ - if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) - save->selector &= ~SEGMENT_RPL_MASK; - save->dpl = save->selector & SEGMENT_RPL_MASK; - save->s = 1; - } - vmx_set_segment(vcpu, save, seg); -} - -static void enter_pmode(struct kvm_vcpu *vcpu) -{ - unsigned long flags; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * Update real mode segment cache. It may be not up-to-date if sement - * register was written while vcpu was in a guest mode. - */ - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); - - vmx->rmode.vm86_active = 0; - - vmx_segment_cache_clear(vmx); - - vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); - - flags = vmcs_readl(GUEST_RFLAGS); - flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; - flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; - vmcs_writel(GUEST_RFLAGS, flags); - - vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | - (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); - - update_exception_bitmap(vcpu); - - fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); - fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); - fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); - fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); - fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); -} - -static void fix_rmode_seg(int seg, struct kvm_segment *save) -{ - const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - struct kvm_segment var = *save; - - var.dpl = 0x3; - if (seg == VCPU_SREG_CS) - var.type = 0x3; - - if (!emulate_invalid_guest_state) { - var.selector = var.base >> 4; - var.base = var.base & 0xffff0; - var.limit = 0xffff; - var.g = 0; - var.db = 0; - var.present = 1; - var.s = 1; - var.l = 0; - var.unusable = 0; - var.type = 0x3; - var.avl = 0; - if (save->base & 0xf) - printk_once(KERN_WARNING "kvm: segment base is not " - "paragraph aligned when entering " - "protected mode (seg=%d)", seg); - } - - vmcs_write16(sf->selector, var.selector); - vmcs_writel(sf->base, var.base); - vmcs_write32(sf->limit, var.limit); - vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); -} - -static void enter_rmode(struct kvm_vcpu *vcpu) -{ - unsigned long flags; - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); - - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); - - vmx->rmode.vm86_active = 1; - - /* - * Very old userspace does not call KVM_SET_TSS_ADDR before entering - * vcpu. Warn the user that an update is overdue. - */ - if (!kvm_vmx->tss_addr) - printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " - "called before entering vcpu\n"); - - vmx_segment_cache_clear(vmx); - - vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); - vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); - - flags = vmcs_readl(GUEST_RFLAGS); - vmx->rmode.save_rflags = flags; - - flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; - - vmcs_writel(GUEST_RFLAGS, flags); - vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); - update_exception_bitmap(vcpu); - - fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); - fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); - fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - - kvm_mmu_reset_context(vcpu); -} - -static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); - - if (!msr) - return; - - vcpu->arch.efer = efer; - if (efer & EFER_LMA) { - vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); - msr->data = efer; - } else { - vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); - - msr->data = efer & ~EFER_LME; - } - setup_msrs(vmx); -} - -#ifdef CONFIG_X86_64 - -static void enter_lmode(struct kvm_vcpu *vcpu) -{ - u32 guest_tr_ar; - - vmx_segment_cache_clear(to_vmx(vcpu)); - - guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); - if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { - pr_debug_ratelimited("%s: tss fixup for long mode. \n", - __func__); - vmcs_write32(GUEST_TR_AR_BYTES, - (guest_tr_ar & ~VMX_AR_TYPE_MASK) - | VMX_AR_TYPE_BUSY_64_TSS); - } - vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); -} - -static void exit_lmode(struct kvm_vcpu *vcpu) -{ - vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); - vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); -} - -#endif - -static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, - bool invalidate_gpa) -{ - if (enable_ept && (invalidate_gpa || !enable_vpid)) { - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return; - ept_sync_context(construct_eptp(vcpu, - vcpu->arch.mmu->root_hpa)); - } else { - vpid_sync_context(vpid); - } -} - -static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) -{ - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); -} - -static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) -{ - int vpid = to_vmx(vcpu)->vpid; - - if (!vpid_sync_vcpu_addr(vpid, addr)) - vpid_sync_context(vpid); - - /* - * If VPIDs are not supported or enabled, then the above is a no-op. - * But we don't really need a TLB flush in that case anyway, because - * each VM entry/exit includes an implicit flush when VPID is 0. - */ -} - -static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) -{ - ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; - - vcpu->arch.cr0 &= ~cr0_guest_owned_bits; - vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; -} - -static void vmx_decache_cr3(struct kvm_vcpu *vcpu) -{ - if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); -} - -static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) -{ - ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; - - vcpu->arch.cr4 &= ~cr4_guest_owned_bits; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; -} - -static void ept_load_pdptrs(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty)) - return; - - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); - } -} - -static void ept_save_pdptrs(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); - } - - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); -} - -static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high & - SECONDARY_EXEC_UNRESTRICTED_GUEST && - nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) - fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); - - return fixed_bits_valid(val, fixed0, fixed1); -} - -static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; - - return fixed_bits_valid(val, fixed0, fixed1); -} - -static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; - - return fixed_bits_valid(val, fixed0, fixed1); -} - -/* No difference in the restrictions on guest and host CR4 in VMX operation. */ -#define nested_guest_cr4_valid nested_cr4_valid -#define nested_host_cr4_valid nested_cr4_valid - -static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); - -static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, - unsigned long cr0, - struct kvm_vcpu *vcpu) -{ - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - vmx_decache_cr3(vcpu); - if (!(cr0 & X86_CR0_PG)) { - /* From paging/starting to nonpaging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | - (CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING)); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } else if (!is_paging(vcpu)) { - /* From nonpaging to paging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & - ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING)); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } - - if (!(cr0 & X86_CR0_WP)) - *hw_cr0 &= ~X86_CR0_WP; -} - -static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long hw_cr0; - - hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); - if (enable_unrestricted_guest) - hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; - else { - hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; - - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) - enter_pmode(vcpu); - - if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) - enter_rmode(vcpu); - } - -#ifdef CONFIG_X86_64 - if (vcpu->arch.efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) - enter_lmode(vcpu); - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) - exit_lmode(vcpu); - } -#endif - - if (enable_ept && !enable_unrestricted_guest) - ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); - - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, hw_cr0); - vcpu->arch.cr0 = cr0; - - /* depends on vcpu->arch.cr0 to be set to a new value */ - vmx->emulation_required = emulation_required(vcpu); -} - -static int get_ept_level(struct kvm_vcpu *vcpu) -{ - if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) - return 5; - return 4; -} - -static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) -{ - u64 eptp = VMX_EPTP_MT_WB; - - eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; - - if (enable_ept_ad_bits && - (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) - eptp |= VMX_EPTP_AD_ENABLE_BIT; - eptp |= (root_hpa & PAGE_MASK); - - return eptp; -} - -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - struct kvm *kvm = vcpu->kvm; - unsigned long guest_cr3; - u64 eptp; - - guest_cr3 = cr3; - if (enable_ept) { - eptp = construct_eptp(vcpu, cr3); - vmcs_write64(EPT_POINTER, eptp); - - if (kvm_x86_ops->tlb_remote_flush) { - spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); - to_vmx(vcpu)->ept_pointer = eptp; - to_kvm_vmx(kvm)->ept_pointers_match - = EPT_POINTERS_CHECK; - spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); - } - - if (enable_unrestricted_guest || is_paging(vcpu) || - is_guest_mode(vcpu)) - guest_cr3 = kvm_read_cr3(vcpu); - else - guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; - ept_load_pdptrs(vcpu); - } - - vmcs_writel(GUEST_CR3, guest_cr3); -} - -static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - /* - * Pass through host's Machine Check Enable value to hw_cr4, which - * is in force while we are in guest mode. Do not let guests control - * this bit, even if host CR4.MCE == 0. - */ - unsigned long hw_cr4; - - hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); - if (enable_unrestricted_guest) - hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; - else if (to_vmx(vcpu)->rmode.vm86_active) - hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; - else - hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; - - if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { - if (cr4 & X86_CR4_UMIP) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_DESC); - hw_cr4 &= ~X86_CR4_UMIP; - } else if (!is_guest_mode(vcpu) || - !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_DESC); - } - - if (cr4 & X86_CR4_VMXE) { - /* - * To use VMXON (and later other VMX instructions), a guest - * must first be able to turn on cr4.VMXE (see handle_vmon()). - * So basically the check on whether to allow nested VMX - * is here. We operate under the default treatment of SMM, - * so VMX cannot be enabled under SMM. - */ - if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) - return 1; - } - - if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) - return 1; - - vcpu->arch.cr4 = cr4; - - if (!enable_unrestricted_guest) { - if (enable_ept) { - if (!is_paging(vcpu)) { - hw_cr4 &= ~X86_CR4_PAE; - hw_cr4 |= X86_CR4_PSE; - } else if (!(cr4 & X86_CR4_PAE)) { - hw_cr4 &= ~X86_CR4_PAE; - } - } - - /* - * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in - * hardware. To emulate this behavior, SMEP/SMAP/PKU needs - * to be manually disabled when guest switches to non-paging - * mode. - * - * If !enable_unrestricted_guest, the CPU is always running - * with CR0.PG=1 and CR4 needs to be modified. - * If enable_unrestricted_guest, the CPU automatically - * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. - */ - if (!is_paging(vcpu)) - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); - } - - vmcs_writel(CR4_READ_SHADOW, cr4); - vmcs_writel(GUEST_CR4, hw_cr4); - return 0; -} - -static void vmx_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 ar; - - if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { - *var = vmx->rmode.segs[seg]; - if (seg == VCPU_SREG_TR - || var->selector == vmx_read_guest_seg_selector(vmx, seg)) - return; - var->base = vmx_read_guest_seg_base(vmx, seg); - var->selector = vmx_read_guest_seg_selector(vmx, seg); - return; - } - var->base = vmx_read_guest_seg_base(vmx, seg); - var->limit = vmx_read_guest_seg_limit(vmx, seg); - var->selector = vmx_read_guest_seg_selector(vmx, seg); - ar = vmx_read_guest_seg_ar(vmx, seg); - var->unusable = (ar >> 16) & 1; - var->type = ar & 15; - var->s = (ar >> 4) & 1; - var->dpl = (ar >> 5) & 3; - /* - * Some userspaces do not preserve unusable property. Since usable - * segment has to be present according to VMX spec we can use present - * property to amend userspace bug by making unusable segment always - * nonpresent. vmx_segment_access_rights() already marks nonpresent - * segment as unusable. - */ - var->present = !var->unusable; - var->avl = (ar >> 12) & 1; - var->l = (ar >> 13) & 1; - var->db = (ar >> 14) & 1; - var->g = (ar >> 15) & 1; -} - -static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_segment s; - - if (to_vmx(vcpu)->rmode.vm86_active) { - vmx_get_segment(vcpu, &s, seg); - return s.base; - } - return vmx_read_guest_seg_base(to_vmx(vcpu), seg); -} - -static int vmx_get_cpl(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (unlikely(vmx->rmode.vm86_active)) - return 0; - else { - int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); - return VMX_AR_DPL(ar); - } -} - -static u32 vmx_segment_access_rights(struct kvm_segment *var) -{ - u32 ar; - - if (var->unusable || !var->present) - ar = 1 << 16; - else { - ar = var->type & 15; - ar |= (var->s & 1) << 4; - ar |= (var->dpl & 3) << 5; - ar |= (var->present & 1) << 7; - ar |= (var->avl & 1) << 12; - ar |= (var->l & 1) << 13; - ar |= (var->db & 1) << 14; - ar |= (var->g & 1) << 15; - } - - return ar; -} - -static void vmx_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - vmx_segment_cache_clear(vmx); - - if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { - vmx->rmode.segs[seg] = *var; - if (seg == VCPU_SREG_TR) - vmcs_write16(sf->selector, var->selector); - else if (var->s) - fix_rmode_seg(seg, &vmx->rmode.segs[seg]); - goto out; - } - - vmcs_writel(sf->base, var->base); - vmcs_write32(sf->limit, var->limit); - vmcs_write16(sf->selector, var->selector); - - /* - * Fix the "Accessed" bit in AR field of segment registers for older - * qemu binaries. - * IA32 arch specifies that at the time of processor reset the - * "Accessed" bit in the AR field of segment registers is 1. And qemu - * is setting it to 0 in the userland code. This causes invalid guest - * state vmexit when "unrestricted guest" mode is turned on. - * Fix for this setup issue in cpu_reset is being pushed in the qemu - * tree. Newer qemu binaries with that qemu fix would not need this - * kvm hack. - */ - if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) - var->type |= 0x1; /* Accessed */ - - vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); - -out: - vmx->emulation_required = emulation_required(vcpu); -} - -static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); - - *db = (ar >> 14) & 1; - *l = (ar >> 13) & 1; -} - -static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - dt->size = vmcs_read32(GUEST_IDTR_LIMIT); - dt->address = vmcs_readl(GUEST_IDTR_BASE); -} - -static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - vmcs_write32(GUEST_IDTR_LIMIT, dt->size); - vmcs_writel(GUEST_IDTR_BASE, dt->address); -} - -static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - dt->size = vmcs_read32(GUEST_GDTR_LIMIT); - dt->address = vmcs_readl(GUEST_GDTR_BASE); -} - -static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - vmcs_write32(GUEST_GDTR_LIMIT, dt->size); - vmcs_writel(GUEST_GDTR_BASE, dt->address); -} - -static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_segment var; - u32 ar; - - vmx_get_segment(vcpu, &var, seg); - var.dpl = 0x3; - if (seg == VCPU_SREG_CS) - var.type = 0x3; - ar = vmx_segment_access_rights(&var); - - if (var.base != (var.selector << 4)) - return false; - if (var.limit != 0xffff) - return false; - if (ar != 0xf3) - return false; - - return true; -} - -static bool code_segment_valid(struct kvm_vcpu *vcpu) -{ - struct kvm_segment cs; - unsigned int cs_rpl; - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - cs_rpl = cs.selector & SEGMENT_RPL_MASK; - - if (cs.unusable) - return false; - if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) - return false; - if (!cs.s) - return false; - if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { - if (cs.dpl > cs_rpl) - return false; - } else { - if (cs.dpl != cs_rpl) - return false; - } - if (!cs.present) - return false; - - /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ - return true; -} - -static bool stack_segment_valid(struct kvm_vcpu *vcpu) -{ - struct kvm_segment ss; - unsigned int ss_rpl; - - vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); - ss_rpl = ss.selector & SEGMENT_RPL_MASK; - - if (ss.unusable) - return true; - if (ss.type != 3 && ss.type != 7) - return false; - if (!ss.s) - return false; - if (ss.dpl != ss_rpl) /* DPL != RPL */ - return false; - if (!ss.present) - return false; - - return true; -} - -static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_segment var; - unsigned int rpl; - - vmx_get_segment(vcpu, &var, seg); - rpl = var.selector & SEGMENT_RPL_MASK; - - if (var.unusable) - return true; - if (!var.s) - return false; - if (!var.present) - return false; - if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { - if (var.dpl < rpl) /* DPL < RPL */ - return false; - } - - /* TODO: Add other members to kvm_segment_field to allow checking for other access - * rights flags - */ - return true; -} - -static bool tr_valid(struct kvm_vcpu *vcpu) -{ - struct kvm_segment tr; - - vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); - - if (tr.unusable) - return false; - if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ - return false; - if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ - return false; - if (!tr.present) - return false; - - return true; -} - -static bool ldtr_valid(struct kvm_vcpu *vcpu) -{ - struct kvm_segment ldtr; - - vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); - - if (ldtr.unusable) - return true; - if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ - return false; - if (ldtr.type != 2) - return false; - if (!ldtr.present) - return false; - - return true; -} - -static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) -{ - struct kvm_segment cs, ss; - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); - - return ((cs.selector & SEGMENT_RPL_MASK) == - (ss.selector & SEGMENT_RPL_MASK)); -} - -/* - * Check if guest state is valid. Returns true if valid, false if - * not. - * We assume that registers are always usable - */ -static bool guest_state_valid(struct kvm_vcpu *vcpu) -{ - if (enable_unrestricted_guest) - return true; - - /* real mode guest state checks */ - if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { - if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) - return false; - } else { - /* protected mode guest state checks */ - if (!cs_ss_rpl_check(vcpu)) - return false; - if (!code_segment_valid(vcpu)) - return false; - if (!stack_segment_valid(vcpu)) - return false; - if (!data_segment_valid(vcpu, VCPU_SREG_DS)) - return false; - if (!data_segment_valid(vcpu, VCPU_SREG_ES)) - return false; - if (!data_segment_valid(vcpu, VCPU_SREG_FS)) - return false; - if (!data_segment_valid(vcpu, VCPU_SREG_GS)) - return false; - if (!tr_valid(vcpu)) - return false; - if (!ldtr_valid(vcpu)) - return false; - } - /* TODO: - * - Add checks on RIP - * - Add checks on RFLAGS - */ - - return true; -} - -static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); -} - -static int init_rmode_tss(struct kvm *kvm) -{ - gfn_t fn; - u16 data = 0; - int idx, r; - - idx = srcu_read_lock(&kvm->srcu); - fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT; - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; - data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - r = kvm_write_guest_page(kvm, fn++, &data, - TSS_IOPB_BASE_OFFSET, sizeof(u16)); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; - data = ~0; - r = kvm_write_guest_page(kvm, fn, &data, - RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, - sizeof(u8)); -out: - srcu_read_unlock(&kvm->srcu, idx); - return r; -} - -static int init_rmode_identity_map(struct kvm *kvm) -{ - struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); - int i, idx, r = 0; - kvm_pfn_t identity_map_pfn; - u32 tmp; - - /* Protect kvm_vmx->ept_identity_pagetable_done. */ - mutex_lock(&kvm->slots_lock); - - if (likely(kvm_vmx->ept_identity_pagetable_done)) - goto out2; - - if (!kvm_vmx->ept_identity_map_addr) - kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; - identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT; - - r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, - kvm_vmx->ept_identity_map_addr, PAGE_SIZE); - if (r < 0) - goto out2; - - idx = srcu_read_lock(&kvm->srcu); - r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); - if (r < 0) - goto out; - /* Set up identity-mapping pagetable for EPT in real mode */ - for (i = 0; i < PT32_ENT_PER_PAGE; i++) { - tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | - _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); - r = kvm_write_guest_page(kvm, identity_map_pfn, - &tmp, i * sizeof(tmp), sizeof(tmp)); - if (r < 0) - goto out; - } - kvm_vmx->ept_identity_pagetable_done = true; - -out: - srcu_read_unlock(&kvm->srcu, idx); - -out2: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static void seg_setup(int seg) -{ - const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - unsigned int ar; - - vmcs_write16(sf->selector, 0); - vmcs_writel(sf->base, 0); - vmcs_write32(sf->limit, 0xffff); - ar = 0x93; - if (seg == VCPU_SREG_CS) - ar |= 0x08; /* code segment */ - - vmcs_write32(sf->ar_bytes, ar); -} - -static int alloc_apic_access_page(struct kvm *kvm) -{ - struct page *page; - int r = 0; - - mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_page_done) - goto out; - r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); - if (r) - goto out; - - page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) { - r = -EFAULT; - goto out; - } - - /* - * Do not pin the page in memory, so that memory hot-unplug - * is able to migrate it. - */ - put_page(page); - kvm->arch.apic_access_page_done = true; -out: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static int allocate_vpid(void) -{ - int vpid; - - if (!enable_vpid) - return 0; - spin_lock(&vmx_vpid_lock); - vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); - if (vpid < VMX_NR_VPIDS) - __set_bit(vpid, vmx_vpid_bitmap); - else - vpid = 0; - spin_unlock(&vmx_vpid_lock); - return vpid; -} - -static void free_vpid(int vpid) -{ - if (!enable_vpid || vpid == 0) - return; - spin_lock(&vmx_vpid_lock); - __clear_bit(vpid, vmx_vpid_bitmap); - spin_unlock(&vmx_vpid_lock); -} - -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return; - - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __clear_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __clear_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __clear_bit(msr, msr_bitmap + 0x400 / f); - - if (type & MSR_TYPE_W) - /* write-high */ - __clear_bit(msr, msr_bitmap + 0xc00 / f); - - } -} - -static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return; - - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __set_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __set_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __set_bit(msr, msr_bitmap + 0x400 / f); - - if (type & MSR_TYPE_W) - /* write-high */ - __set_bit(msr, msr_bitmap + 0xc00 / f); - - } -} - -static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type, bool value) -{ - if (value) - vmx_enable_intercept_for_msr(msr_bitmap, msr, type); - else - vmx_disable_intercept_for_msr(msr_bitmap, msr, type); -} - -/* - * If a msr is allowed by L0, we should check whether it is allowed by L1. - * The corresponding bit will be cleared unless both of L0 and L1 allow it. - */ -static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, - unsigned long *msr_bitmap_nested, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) - /* read-low */ - __clear_bit(msr, msr_bitmap_nested + 0x000 / f); - - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) - /* write-low */ - __clear_bit(msr, msr_bitmap_nested + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) - /* read-high */ - __clear_bit(msr, msr_bitmap_nested + 0x400 / f); - - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) - /* write-high */ - __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); - - } -} - -static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) -{ - u8 mode = 0; - - if (cpu_has_secondary_exec_ctrls() && - (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { - mode |= MSR_BITMAP_MODE_X2APIC; - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) - mode |= MSR_BITMAP_MODE_X2APIC_APICV; - } - - return mode; -} - -#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) - -static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, - u8 mode) -{ - int msr; - - for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned word = msr / BITS_PER_LONG; - msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; - msr_bitmap[word + (0x800 / sizeof(long))] = ~0; - } - - if (mode & MSR_BITMAP_MODE_X2APIC) { - /* - * TPR reads and writes can be virtualized even if virtual interrupt - * delivery is not in use. - */ - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); - if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { - vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); - } - } -} - -static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; - u8 mode = vmx_msr_bitmap_mode(vcpu); - u8 changed = mode ^ vmx->msr_bitmap_mode; - - if (!changed) - return; - - if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); - - vmx->msr_bitmap_mode = mode; -} - -static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) -{ - return enable_apicv; -} - -static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - gfn_t gfn; - - /* - * Don't need to mark the APIC access page dirty; it is never - * written to by the CPU during APIC virtualization. - */ - - if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } - - if (nested_cpu_has_posted_intr(vmcs12)) { - gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } -} - - -static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int max_irr; - void *vapic_page; - u16 status; - - if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) - return; - - vmx->nested.pi_pending = false; - if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return; - - max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); - if (max_irr != 256) { - vapic_page = kmap(vmx->nested.virtual_apic_page); - __kvm_apic_update_irr(vmx->nested.pi_desc->pir, - vapic_page, &max_irr); - kunmap(vmx->nested.virtual_apic_page); - - status = vmcs_read16(GUEST_INTR_STATUS); - if ((u8)max_irr > ((u8)status & 0xff)) { - status &= ~0xff; - status |= (u8)max_irr; - vmcs_write16(GUEST_INTR_STATUS, status); - } - } - - nested_mark_vmcs12_pages_dirty(vcpu); -} - -static u8 vmx_get_rvi(void) -{ - return vmcs_read16(GUEST_INTR_STATUS) & 0xff; -} - -static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - void *vapic_page; - u32 vppr; - int rvi; - - if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || - !nested_cpu_has_vid(get_vmcs12(vcpu)) || - WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) - return false; - - rvi = vmx_get_rvi(); - - vapic_page = kmap(vmx->nested.virtual_apic_page); - vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); - kunmap(vmx->nested.virtual_apic_page); - - return ((rvi & 0xf0) > (vppr & 0xf0)); -} - -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - bool nested) -{ -#ifdef CONFIG_SMP - int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; - - if (vcpu->mode == IN_GUEST_MODE) { - /* - * The vector of interrupt to be delivered to vcpu had - * been set in PIR before this function. - * - * Following cases will be reached in this block, and - * we always send a notification event in all cases as - * explained below. - * - * Case 1: vcpu keeps in non-root mode. Sending a - * notification event posts the interrupt to vcpu. - * - * Case 2: vcpu exits to root mode and is still - * runnable. PIR will be synced to vIRR before the - * next vcpu entry. Sending a notification event in - * this case has no effect, as vcpu is not in root - * mode. - * - * Case 3: vcpu exits to root mode and is blocked. - * vcpu_block() has already synced PIR to vIRR and - * never blocks vcpu if vIRR is not cleared. Therefore, - * a blocked vcpu here does not wait for any requested - * interrupts in PIR, and sending a notification event - * which has no effect is safe here. - */ - - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return true; - } -#endif - return false; -} - -static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, - int vector) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (is_guest_mode(vcpu) && - vector == vmx->nested.posted_intr_nv) { - /* - * If a posted intr is not recognized by hardware, - * we will accomplish it in the next vmentry. - */ - vmx->nested.pi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); - /* the PIR and ON have been set by L1. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) - kvm_vcpu_kick(vcpu); - return 0; - } - return -1; -} -/* - * Send interrupt to vcpu via posted interrupt way. - * 1. If target vcpu is running(non-root mode), send posted interrupt - * notification to vcpu and hardware will sync PIR to vIRR atomically. - * 2. If target vcpu isn't running(root mode), kick it to pick up the - * interrupt from PIR in next vmentry. - */ -static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int r; - - r = vmx_deliver_nested_posted_interrupt(vcpu, vector); - if (!r) - return; - - if (pi_test_and_set_pir(vector, &vmx->pi_desc)) - return; - - /* If a previous notification has sent the IPI, nothing to do. */ - if (pi_test_and_set_on(&vmx->pi_desc)) - return; - - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) - kvm_vcpu_kick(vcpu); -} - -/* - * Set up the vmcs's constant host-state fields, i.e., host-state fields that - * will not change in the lifetime of the guest. - * Note that host-state that does change is set elsewhere. E.g., host-state - * that is set differently for each CPU is set in vmx_vcpu_load(), not here. - */ -static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) -{ - u32 low32, high32; - unsigned long tmpl; - struct desc_ptr dt; - unsigned long cr0, cr3, cr4; - - cr0 = read_cr0(); - WARN_ON(cr0 & X86_CR0_TS); - vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ - - /* - * Save the most likely value for this task's CR3 in the VMCS. - * We can't use __get_current_cr3_fast() because we're not atomic. - */ - cr3 = __read_cr3(); - vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ - vmx->loaded_vmcs->host_state.cr3 = cr3; - - /* Save the most likely value for this task's CR4 in the VMCS. */ - cr4 = cr4_read_shadow(); - vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ - vmx->loaded_vmcs->host_state.cr4 = cr4; - - vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ -#ifdef CONFIG_X86_64 - /* - * Load null selectors, so we can avoid reloading them in - * vmx_prepare_switch_to_host(), in case userspace uses - * the null selectors too (the expected case). - */ - vmcs_write16(HOST_DS_SELECTOR, 0); - vmcs_write16(HOST_ES_SELECTOR, 0); -#else - vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ -#endif - vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - - store_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ - vmx->host_idt_base = dt.address; - - vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ - - rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); - vmcs_write32(HOST_IA32_SYSENTER_CS, low32); - rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); - vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ - - if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { - rdmsr(MSR_IA32_CR_PAT, low32, high32); - vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); - } - - if (cpu_has_load_ia32_efer) - vmcs_write64(HOST_IA32_EFER, host_efer); -} - -static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) -{ - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; - if (enable_ept) - vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; - if (is_guest_mode(&vmx->vcpu)) - vmx->vcpu.arch.cr4_guest_owned_bits &= - ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; - vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); -} - -static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) -{ - u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; - - if (!kvm_vcpu_apicv_active(&vmx->vcpu)) - pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; - - if (!enable_vnmi) - pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; - - /* Enable the preemption timer dynamically */ - pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - return pin_based_exec_ctrl; -} - -static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - if (kvm_vcpu_apicv_active(vcpu)) - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - } - - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); -} - -static u32 vmx_exec_control(struct vcpu_vmx *vmx) -{ - u32 exec_control = vmcs_config.cpu_based_exec_ctrl; - - if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) - exec_control &= ~CPU_BASED_MOV_DR_EXITING; - - if (!cpu_need_tpr_shadow(&vmx->vcpu)) { - exec_control &= ~CPU_BASED_TPR_SHADOW; -#ifdef CONFIG_X86_64 - exec_control |= CPU_BASED_CR8_STORE_EXITING | - CPU_BASED_CR8_LOAD_EXITING; -#endif - } - if (!enable_ept) - exec_control |= CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_INVLPG_EXITING; - if (kvm_mwait_in_guest(vmx->vcpu.kvm)) - exec_control &= ~(CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING); - if (kvm_hlt_in_guest(vmx->vcpu.kvm)) - exec_control &= ~CPU_BASED_HLT_EXITING; - return exec_control; -} - -static bool vmx_rdrand_supported(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDRAND_EXITING; -} - -static bool vmx_rdseed_supported(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDSEED_EXITING; -} - -static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) -{ - struct kvm_vcpu *vcpu = &vmx->vcpu; - - u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - - if (!cpu_need_virtualize_apic_accesses(vcpu)) - exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - if (vmx->vpid == 0) - exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; - if (!enable_ept) { - exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; - enable_unrestricted_guest = 0; - } - if (!enable_unrestricted_guest) - exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; - if (kvm_pause_in_guest(vmx->vcpu.kvm)) - exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - if (!kvm_vcpu_apicv_active(vcpu)) - exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; - - /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, - * in vmx_set_cr4. */ - exec_control &= ~SECONDARY_EXEC_DESC; - - /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD - (handle_vmptrld). - We can NOT enable shadow_vmcs here because we don't have yet - a current VMCS12 - */ - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - - if (!enable_pml) - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - - if (vmx_xsaves_supported()) { - /* Exposing XSAVES only when XSAVE is exposed */ - bool xsaves_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); - - if (!xsaves_enabled) - exec_control &= ~SECONDARY_EXEC_XSAVES; - - if (nested) { - if (xsaves_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_XSAVES; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_XSAVES; - } - } - - if (vmx_rdtscp_supported()) { - bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); - if (!rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; - - if (nested) { - if (rdtscp_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDTSCP; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; - } - } - - if (vmx_invpcid_supported()) { - /* Exposing INVPCID only when PCID is exposed */ - bool invpcid_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && - guest_cpuid_has(vcpu, X86_FEATURE_PCID); - - if (!invpcid_enabled) { - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); - } - - if (nested) { - if (invpcid_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_INVPCID; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_ENABLE_INVPCID; - } - } - - if (vmx_rdrand_supported()) { - bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); - if (rdrand_enabled) - exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; - - if (nested) { - if (rdrand_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDRAND_EXITING; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDRAND_EXITING; - } - } - - if (vmx_rdseed_supported()) { - bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); - if (rdseed_enabled) - exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; - - if (nested) { - if (rdseed_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDSEED_EXITING; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDSEED_EXITING; - } - } - - vmx->secondary_exec_control = exec_control; -} - -static void ept_set_mmio_spte_mask(void) -{ - /* - * EPT Misconfigurations can be generated if the value of bits 2:0 - * of an EPT paging-structure entry is 110b (write/execute). - */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, - VMX_EPT_MISCONFIG_WX_VALUE); -} - -#define VMX_XSS_EXIT_BITMAP 0 -/* - * Sets up the vmcs for emulated real mode. - */ -static void vmx_vcpu_setup(struct vcpu_vmx *vmx) -{ - int i; - - if (enable_shadow_vmcs) { - /* - * At vCPU creation, "VMWRITE to any supported field - * in the VMCS" is supported, so use the more - * permissive vmx_vmread_bitmap to specify both read - * and write permissions for the shadow VMCS. - */ - vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); - vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap)); - } - if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); - - vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ - - /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); - vmx->hv_deadline_tsc = -1; - - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); - - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - vmx->secondary_exec_control); - } - - if (kvm_vcpu_apicv_active(&vmx->vcpu)) { - vmcs_write64(EOI_EXIT_BITMAP0, 0); - vmcs_write64(EOI_EXIT_BITMAP1, 0); - vmcs_write64(EOI_EXIT_BITMAP2, 0); - vmcs_write64(EOI_EXIT_BITMAP3, 0); - - vmcs_write16(GUEST_INTR_STATUS, 0); - - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); - } - - if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { - vmcs_write32(PLE_GAP, ple_gap); - vmx->ple_window = ple_window; - vmx->ple_window_dirty = true; - } - - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); - vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ - - vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ - vmx_set_constant_host_state(vmx); - vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ - vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ - - if (cpu_has_vmx_vmfunc()) - vmcs_write64(VM_FUNCTION_CONTROL, 0); - - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); - - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) - vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { - u32 index = vmx_msr_index[i]; - u32 data_low, data_high; - int j = vmx->nmsrs; - - if (rdmsr_safe(index, &data_low, &data_high) < 0) - continue; - if (wrmsr_safe(index, data_low, data_high) < 0) - continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; - vmx->guest_msrs[j].mask = -1ull; - ++vmx->nmsrs; - } - - vmx->arch_capabilities = kvm_get_arch_capabilities(); - - vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); - - /* 22.2.1, 20.8.1 */ - vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); - - vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; - vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); - - set_cr4_guest_host_mask(vmx); - - if (vmx_xsaves_supported()) - vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); - - if (enable_pml) { - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - } - - if (cpu_has_vmx_encls_vmexit()) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); -} - -static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct msr_data apic_base_msr; - u64 cr0; - - vmx->rmode.vm86_active = 0; - vmx->spec_ctrl = 0; - - vcpu->arch.microcode_version = 0x100000000ULL; - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); - kvm_set_cr8(vcpu, 0); - - if (!init_event) { - apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - apic_base_msr.data |= MSR_IA32_APICBASE_BSP; - apic_base_msr.host_initiated = true; - kvm_set_apic_base(vcpu, &apic_base_msr); - } - - vmx_segment_cache_clear(vmx); - - seg_setup(VCPU_SREG_CS); - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); - - seg_setup(VCPU_SREG_DS); - seg_setup(VCPU_SREG_ES); - seg_setup(VCPU_SREG_FS); - seg_setup(VCPU_SREG_GS); - seg_setup(VCPU_SREG_SS); - - vmcs_write16(GUEST_TR_SELECTOR, 0); - vmcs_writel(GUEST_TR_BASE, 0); - vmcs_write32(GUEST_TR_LIMIT, 0xffff); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); - - vmcs_write16(GUEST_LDTR_SELECTOR, 0); - vmcs_writel(GUEST_LDTR_BASE, 0); - vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); - vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - - if (!init_event) { - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - } - - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - kvm_rip_write(vcpu, 0xfff0); - - vmcs_writel(GUEST_GDTR_BASE, 0); - vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); - - vmcs_writel(GUEST_IDTR_BASE, 0); - vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); - - vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); - if (kvm_mpx_supported()) - vmcs_write64(GUEST_BNDCFGS, 0); - - setup_msrs(vmx); - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - - if (cpu_has_vmx_tpr_shadow() && !init_event) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (cpu_need_tpr_shadow(vcpu)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - __pa(vcpu->arch.apic->regs)); - vmcs_write32(TPR_THRESHOLD, 0); - } - - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - - if (vmx->vpid != 0) - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - - cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx->vcpu.arch.cr0 = cr0; - vmx_set_cr0(vcpu, cr0); /* enter rmode */ - vmx_set_cr4(vcpu, 0); - vmx_set_efer(vcpu, 0); - - update_exception_bitmap(vcpu); - - vpid_sync_context(vmx->vpid); - if (init_event) - vmx_clear_hlt(vcpu); -} - -/* - * In nested virtualization, check if L1 asked to exit on external interrupts. - * For most existing hypervisors, this will always return true. - */ -static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) -{ - return get_vmcs12(vcpu)->pin_based_vm_exec_control & - PIN_BASED_EXT_INTR_MASK; -} - -/* - * In nested virtualization, check if L1 has set - * VM_EXIT_ACK_INTR_ON_EXIT - */ -static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) -{ - return get_vmcs12(vcpu)->vm_exit_controls & - VM_EXIT_ACK_INTR_ON_EXIT; -} - -static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) -{ - return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); -} - -static void enable_irq_window(struct kvm_vcpu *vcpu) -{ - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_INTR_PENDING); -} - -static void enable_nmi_window(struct kvm_vcpu *vcpu) -{ - if (!enable_vnmi || - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { - enable_irq_window(vcpu); - return; - } - - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_NMI_PENDING); -} - -static void vmx_inject_irq(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - uint32_t intr; - int irq = vcpu->arch.interrupt.nr; - - trace_kvm_inj_virq(irq); - - ++vcpu->stat.irq_injections; - if (vmx->rmode.vm86_active) { - int inc_eip = 0; - if (vcpu->arch.interrupt.soft) - inc_eip = vcpu->arch.event_exit_inst_len; - if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return; - } - intr = irq | INTR_INFO_VALID_MASK; - if (vcpu->arch.interrupt.soft) { - intr |= INTR_TYPE_SOFT_INTR; - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmx->vcpu.arch.event_exit_inst_len); - } else - intr |= INTR_TYPE_EXT_INTR; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); - - vmx_clear_hlt(vcpu); -} - -static void vmx_inject_nmi(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!enable_vnmi) { - /* - * Tracking the NMI-blocked state in software is built upon - * finding the next open IRQ window. This, in turn, depends on - * well-behaving guests: They have to keep IRQs disabled at - * least as long as the NMI handler runs. Otherwise we may - * cause NMI nesting, maybe breaking the guest. But as this is - * highly unlikely, we can live with the residual risk. - */ - vmx->loaded_vmcs->soft_vnmi_blocked = 1; - vmx->loaded_vmcs->vnmi_blocked_time = 0; - } - - ++vcpu->stat.nmi_injections; - vmx->loaded_vmcs->nmi_known_unmasked = false; - - if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return; - } - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); - - vmx_clear_hlt(vcpu); -} - -static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - bool masked; - - if (!enable_vnmi) - return vmx->loaded_vmcs->soft_vnmi_blocked; - if (vmx->loaded_vmcs->nmi_known_unmasked) - return false; - masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; - vmx->loaded_vmcs->nmi_known_unmasked = !masked; - return masked; -} - -static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!enable_vnmi) { - if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { - vmx->loaded_vmcs->soft_vnmi_blocked = masked; - vmx->loaded_vmcs->vnmi_blocked_time = 0; - } - } else { - vmx->loaded_vmcs->nmi_known_unmasked = !masked; - if (masked) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - } -} - -static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) -{ - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - - if (!enable_vnmi && - to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) - return 0; - - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI - | GUEST_INTR_STATE_NMI)); -} - -static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - return (!to_vmx(vcpu)->nested.nested_run_pending && - vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); -} - -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - int ret; - - if (enable_unrestricted_guest) - return 0; - - ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, - PAGE_SIZE * 3); - if (ret) - return ret; - to_kvm_vmx(kvm)->tss_addr = addr; - return init_rmode_tss(kvm); -} - -static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) -{ - to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; - return 0; -} - -static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) -{ - switch (vec) { - case BP_VECTOR: - /* - * Update instruction length as we may reinject the exception - * from user space while in guest debugging mode. - */ - to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - return false; - /* fall through */ - case DB_VECTOR: - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return false; - /* fall through */ - case DE_VECTOR: - case OF_VECTOR: - case BR_VECTOR: - case UD_VECTOR: - case DF_VECTOR: - case SS_VECTOR: - case GP_VECTOR: - case MF_VECTOR: - return true; - break; - } - return false; -} - -static int handle_rmode_exception(struct kvm_vcpu *vcpu, - int vec, u32 err_code) -{ - /* - * Instruction with address size override prefix opcode 0x67 - * Cause the #SS fault with 0 error code in VM86 mode. - */ - if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { - if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) { - if (vcpu->arch.halt_request) { - vcpu->arch.halt_request = 0; - return kvm_vcpu_halt(vcpu); - } - return 1; - } - return 0; - } - - /* - * Forward all other exceptions that are valid in real mode. - * FIXME: Breaks guest debugging in real mode, needs to be fixed with - * the required debugging infrastructure rework. - */ - kvm_queue_exception(vcpu, vec); - return 1; -} - -/* - * Trigger machine check on the host. We assume all the MSRs are already set up - * by the CPU and that we still run on the same CPU as the MCE occurred on. - * We pass a fake environment to the machine check handler because we want - * the guest to be always treated like user space, no matter what context - * it used internally. - */ -static void kvm_machine_check(void) -{ -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) - struct pt_regs regs = { - .cs = 3, /* Fake ring 3 no matter what the guest ran on */ - .flags = X86_EFLAGS_IF, - }; - - do_machine_check(®s, 0); -#endif -} - -static int handle_machine_check(struct kvm_vcpu *vcpu) -{ - /* already handled by vcpu_run */ - return 1; -} - -static int handle_exception(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_run *kvm_run = vcpu->run; - u32 intr_info, ex_no, error_code; - unsigned long cr2, rip, dr6; - u32 vect_info; - enum emulation_result er; - - vect_info = vmx->idt_vectoring_info; - intr_info = vmx->exit_intr_info; - - if (is_machine_check(intr_info)) - return handle_machine_check(vcpu); - - if (is_nmi(intr_info)) - return 1; /* already handled by vmx_vcpu_run() */ - - if (is_invalid_opcode(intr_info)) - return handle_ud(vcpu); - - error_code = 0; - if (intr_info & INTR_INFO_DELIVER_CODE_MASK) - error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - - if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { - WARN_ON_ONCE(!enable_vmware_backdoor); - er = kvm_emulate_instruction(vcpu, - EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); - if (er == EMULATE_USER_EXIT) - return 0; - else if (er != EMULATE_DONE) - kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); - return 1; - } - - /* - * The #PF with PFEC.RSVD = 1 indicates the guest is accessing - * MMIO, it is better to report an internal error. - * See the comments in vmx_handle_exit. - */ - if ((vect_info & VECTORING_INFO_VALID_MASK) && - !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 3; - vcpu->run->internal.data[0] = vect_info; - vcpu->run->internal.data[1] = intr_info; - vcpu->run->internal.data[2] = error_code; - return 0; - } - - if (is_page_fault(intr_info)) { - cr2 = vmcs_readl(EXIT_QUALIFICATION); - /* EPT won't cause page fault directly */ - WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); - return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); - } - - ex_no = intr_info & INTR_INFO_VECTOR_MASK; - - if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) - return handle_rmode_exception(vcpu, ex_no, error_code); - - switch (ex_no) { - case AC_VECTOR: - kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); - return 1; - case DB_VECTOR: - dr6 = vmcs_readl(EXIT_QUALIFICATION); - if (!(vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= dr6 | DR6_RTM; - if (is_icebp(intr_info)) - skip_emulated_instruction(vcpu); - - kvm_queue_exception(vcpu, DB_VECTOR); - return 1; - } - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; - kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); - /* fall through */ - case BP_VECTOR: - /* - * Update instruction length as we may reinject #BP from - * user space while in guest debugging mode. Reading it for - * #DB as well causes no harm, it is not used in that case. - */ - vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - kvm_run->exit_reason = KVM_EXIT_DEBUG; - rip = kvm_rip_read(vcpu); - kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; - kvm_run->debug.arch.exception = ex_no; - break; - default: - kvm_run->exit_reason = KVM_EXIT_EXCEPTION; - kvm_run->ex.exception = ex_no; - kvm_run->ex.error_code = error_code; - break; - } - return 0; -} - -static int handle_external_interrupt(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.irq_exits; - return 1; -} - -static int handle_triple_fault(struct kvm_vcpu *vcpu) -{ - vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; - vcpu->mmio_needed = 0; - return 0; -} - -static int handle_io(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification; - int size, in, string; - unsigned port; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - string = (exit_qualification & 16) != 0; - - ++vcpu->stat.io_exits; - - if (string) - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; - - port = exit_qualification >> 16; - size = (exit_qualification & 7) + 1; - in = (exit_qualification & 8) != 0; - - return kvm_fast_pio(vcpu, size, port, in); -} - -static void -vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) -{ - /* - * Patch in the VMCALL instruction: - */ - hypercall[0] = 0x0f; - hypercall[1] = 0x01; - hypercall[2] = 0xc1; -} - -/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ -static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) -{ - if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned long orig_val = val; - - /* - * We get here when L2 changed cr0 in a way that did not change - * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), - * but did change L0 shadowed bits. So we first calculate the - * effective cr0 value that L1 would like to write into the - * hardware. It consists of the L2-owned bits from the new - * value combined with the L1-owned bits from L1's guest_cr0. - */ - val = (val & ~vmcs12->cr0_guest_host_mask) | - (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - - if (!nested_guest_cr0_valid(vcpu, val)) - return 1; - - if (kvm_set_cr0(vcpu, val)) - return 1; - vmcs_writel(CR0_READ_SHADOW, orig_val); - return 0; - } else { - if (to_vmx(vcpu)->nested.vmxon && - !nested_host_cr0_valid(vcpu, val)) - return 1; - - return kvm_set_cr0(vcpu, val); - } -} - -static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) -{ - if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned long orig_val = val; - - /* analogously to handle_set_cr0 */ - val = (val & ~vmcs12->cr4_guest_host_mask) | - (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); - if (kvm_set_cr4(vcpu, val)) - return 1; - vmcs_writel(CR4_READ_SHADOW, orig_val); - return 0; - } else - return kvm_set_cr4(vcpu, val); -} - -static int handle_desc(struct kvm_vcpu *vcpu) -{ - WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; -} - -static int handle_cr(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification, val; - int cr; - int reg; - int err; - int ret; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - cr = exit_qualification & 15; - reg = (exit_qualification >> 8) & 15; - switch ((exit_qualification >> 4) & 3) { - case 0: /* mov to cr */ - val = kvm_register_readl(vcpu, reg); - trace_kvm_cr_write(cr, val); - switch (cr) { - case 0: - err = handle_set_cr0(vcpu, val); - return kvm_complete_insn_gp(vcpu, err); - case 3: - WARN_ON_ONCE(enable_unrestricted_guest); - err = kvm_set_cr3(vcpu, val); - return kvm_complete_insn_gp(vcpu, err); - case 4: - err = handle_set_cr4(vcpu, val); - return kvm_complete_insn_gp(vcpu, err); - case 8: { - u8 cr8_prev = kvm_get_cr8(vcpu); - u8 cr8 = (u8)val; - err = kvm_set_cr8(vcpu, cr8); - ret = kvm_complete_insn_gp(vcpu, err); - if (lapic_in_kernel(vcpu)) - return ret; - if (cr8_prev <= cr8) - return ret; - /* - * TODO: we might be squashing a - * KVM_GUESTDBG_SINGLESTEP-triggered - * KVM_EXIT_DEBUG here. - */ - vcpu->run->exit_reason = KVM_EXIT_SET_TPR; - return 0; - } - } - break; - case 2: /* clts */ - WARN_ONCE(1, "Guest should always own CR0.TS"); - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - return kvm_skip_emulated_instruction(vcpu); - case 1: /*mov from cr*/ - switch (cr) { - case 3: - WARN_ON_ONCE(enable_unrestricted_guest); - val = kvm_read_cr3(vcpu); - kvm_register_write(vcpu, reg, val); - trace_kvm_cr_read(cr, val); - return kvm_skip_emulated_instruction(vcpu); - case 8: - val = kvm_get_cr8(vcpu); - kvm_register_write(vcpu, reg, val); - trace_kvm_cr_read(cr, val); - return kvm_skip_emulated_instruction(vcpu); - } - break; - case 3: /* lmsw */ - val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; - trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); - kvm_lmsw(vcpu, val); - - return kvm_skip_emulated_instruction(vcpu); - default: - break; - } - vcpu->run->exit_reason = 0; - vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", - (int)(exit_qualification >> 4) & 3, cr); - return 0; -} - -static int handle_dr(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification; - int dr, dr7, reg; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - dr = exit_qualification & DEBUG_REG_ACCESS_NUM; - - /* First, if DR does not exist, trigger UD */ - if (!kvm_require_dr(vcpu, dr)) - return 1; - - /* Do not handle if the CPL > 0, will trigger GP on re-entry */ - if (!kvm_require_cpl(vcpu, 0)) - return 1; - dr7 = vmcs_readl(GUEST_DR7); - if (dr7 & DR7_GD) { - /* - * As the vm-exit takes precedence over the debug trap, we - * need to emulate the latter, either for the host or the - * guest debugging itself. - */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; - vcpu->run->debug.arch.dr7 = dr7; - vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); - vcpu->run->debug.arch.exception = DB_VECTOR; - vcpu->run->exit_reason = KVM_EXIT_DEBUG; - return 0; - } else { - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BD | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); - return 1; - } - } - - if (vcpu->guest_debug == 0) { - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_MOV_DR_EXITING); - - /* - * No more DR vmexits; force a reload of the debug registers - * and reenter on this instruction. The next vmexit will - * retrieve the full state of the debug registers. - */ - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; - return 1; - } - - reg = DEBUG_REG_ACCESS_REG(exit_qualification); - if (exit_qualification & TYPE_MOV_FROM_DR) { - unsigned long val; - - if (kvm_get_dr(vcpu, dr, &val)) - return 1; - kvm_register_write(vcpu, reg, val); - } else - if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) - return 1; - - return kvm_skip_emulated_instruction(vcpu); -} - -static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.dr6; -} - -static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) -{ -} - -static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) -{ - get_debugreg(vcpu->arch.db[0], 0); - get_debugreg(vcpu->arch.db[1], 1); - get_debugreg(vcpu->arch.db[2], 2); - get_debugreg(vcpu->arch.db[3], 3); - get_debugreg(vcpu->arch.dr6, 6); - vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); - - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); -} - -static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) -{ - vmcs_writel(GUEST_DR7, val); -} - -static int handle_cpuid(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_cpuid(vcpu); -} - -static int handle_rdmsr(struct kvm_vcpu *vcpu) -{ - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - struct msr_data msr_info; - - msr_info.index = ecx; - msr_info.host_initiated = false; - if (vmx_get_msr(vcpu, &msr_info)) { - trace_kvm_msr_read_ex(ecx); - kvm_inject_gp(vcpu, 0); - return 1; - } - - trace_kvm_msr_read(ecx, msr_info.data); - - /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; - return kvm_skip_emulated_instruction(vcpu); -} - -static int handle_wrmsr(struct kvm_vcpu *vcpu) -{ - struct msr_data msr; - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) - | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - - msr.data = data; - msr.index = ecx; - msr.host_initiated = false; - if (kvm_set_msr(vcpu, &msr) != 0) { - trace_kvm_msr_write_ex(ecx, data); - kvm_inject_gp(vcpu, 0); - return 1; - } - - trace_kvm_msr_write(ecx, data); - return kvm_skip_emulated_instruction(vcpu); -} - -static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) -{ - kvm_apic_update_ppr(vcpu); - return 1; -} - -static int handle_interrupt_window(struct kvm_vcpu *vcpu) -{ - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_INTR_PENDING); - - kvm_make_request(KVM_REQ_EVENT, vcpu); - - ++vcpu->stat.irq_window_exits; - return 1; -} - -static int handle_halt(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_halt(vcpu); -} - -static int handle_vmcall(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_hypercall(vcpu); -} - -static int handle_invd(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; -} - -static int handle_invlpg(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - kvm_mmu_invlpg(vcpu, exit_qualification); - return kvm_skip_emulated_instruction(vcpu); -} - -static int handle_rdpmc(struct kvm_vcpu *vcpu) -{ - int err; - - err = kvm_rdpmc(vcpu); - return kvm_complete_insn_gp(vcpu, err); -} - -static int handle_wbinvd(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_wbinvd(vcpu); -} - -static int handle_xsetbv(struct kvm_vcpu *vcpu) -{ - u64 new_bv = kvm_read_edx_eax(vcpu); - u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); - - if (kvm_set_xcr(vcpu, index, new_bv) == 0) - return kvm_skip_emulated_instruction(vcpu); - return 1; -} - -static int handle_xsaves(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); - return 1; -} - -static int handle_xrstors(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); - return 1; -} - -static int handle_apic_access(struct kvm_vcpu *vcpu) -{ - if (likely(fasteoi)) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - int access_type, offset; - - access_type = exit_qualification & APIC_ACCESS_TYPE; - offset = exit_qualification & APIC_ACCESS_OFFSET; - /* - * Sane guest uses MOV to write EOI, with written value - * not cared. So make a short-circuit here by avoiding - * heavy instruction emulation. - */ - if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && - (offset == APIC_EOI)) { - kvm_lapic_set_eoi(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } - } - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; -} - -static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - int vector = exit_qualification & 0xff; - - /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ - kvm_apic_set_eoi_accelerated(vcpu, vector); - return 1; -} - -static int handle_apic_write(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 offset = exit_qualification & 0xfff; - - /* APIC-write VM exit is trap-like and thus no need to adjust IP */ - kvm_apic_write_nodecode(vcpu, offset); - return 1; -} - -static int handle_task_switch(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qualification; - bool has_error_code = false; - u32 error_code = 0; - u16 tss_selector; - int reason, type, idt_v, idt_index; - - idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); - idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); - type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - reason = (u32)exit_qualification >> 30; - if (reason == TASK_SWITCH_GATE && idt_v) { - switch (type) { - case INTR_TYPE_NMI_INTR: - vcpu->arch.nmi_injected = false; - vmx_set_nmi_mask(vcpu, true); - break; - case INTR_TYPE_EXT_INTR: - case INTR_TYPE_SOFT_INTR: - kvm_clear_interrupt_queue(vcpu); - break; - case INTR_TYPE_HARD_EXCEPTION: - if (vmx->idt_vectoring_info & - VECTORING_INFO_DELIVER_CODE_MASK) { - has_error_code = true; - error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); - } - /* fall through */ - case INTR_TYPE_SOFT_EXCEPTION: - kvm_clear_exception_queue(vcpu); - break; - default: - break; - } - } - tss_selector = exit_qualification; - - if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && - type != INTR_TYPE_EXT_INTR && - type != INTR_TYPE_NMI_INTR)) - skip_emulated_instruction(vcpu); - - if (kvm_task_switch(vcpu, tss_selector, - type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, - has_error_code, error_code) == EMULATE_FAIL) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } - - /* - * TODO: What about debug traps on tss switch? - * Are we supposed to inject them and update dr6? - */ - - return 1; -} - -static int handle_ept_violation(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification; - gpa_t gpa; - u64 error_code; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - /* - * EPT violation happened while executing iret from NMI, - * "blocked by NMI" bit has to be set before next VM entry. - * There are errata that may cause this bit to not be set: - * AAK134, BY25. - */ - if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - enable_vnmi && - (exit_qualification & INTR_INFO_UNBLOCK_NMI)) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); - - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - trace_kvm_page_fault(gpa, exit_qualification); - - /* Is it a read fault? */ - error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) - ? PFERR_USER_MASK : 0; - /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) - ? PFERR_WRITE_MASK : 0; - /* Is it a fetch fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) - ? PFERR_FETCH_MASK : 0; - /* ept page table entry is present? */ - error_code |= (exit_qualification & - (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | - EPT_VIOLATION_EXECUTABLE)) - ? PFERR_PRESENT_MASK : 0; - - error_code |= (exit_qualification & 0x100) != 0 ? - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; - - vcpu->arch.exit_qualification = exit_qualification; - return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); -} - -static int handle_ept_misconfig(struct kvm_vcpu *vcpu) -{ - gpa_t gpa; - - /* - * A nested guest cannot optimize MMIO vmexits, because we have an - * nGPA here instead of the required GPA. - */ - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - if (!is_guest_mode(vcpu) && - !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { - trace_kvm_fast_mmio(gpa); - /* - * Doing kvm_skip_emulated_instruction() depends on undefined - * behavior: Intel's manual doesn't mandate - * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG - * occurs and while on real hardware it was observed to be set, - * other hypervisors (namely Hyper-V) don't set it, we end up - * advancing IP with some random value. Disable fast mmio when - * running nested and keep it for real hardware in hope that - * VM_EXIT_INSTRUCTION_LEN will always be set correctly. - */ - if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) - return kvm_skip_emulated_instruction(vcpu); - else - return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == - EMULATE_DONE; - } - - return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); -} - -static int handle_nmi_window(struct kvm_vcpu *vcpu) -{ - WARN_ON_ONCE(!enable_vnmi); - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_NMI_PENDING); - ++vcpu->stat.nmi_window_exits; - kvm_make_request(KVM_REQ_EVENT, vcpu); - - return 1; -} - -static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - enum emulation_result err = EMULATE_DONE; - int ret = 1; - u32 cpu_exec_ctrl; - bool intr_window_requested; - unsigned count = 130; - - /* - * We should never reach the point where we are emulating L2 - * due to invalid guest state as that means we incorrectly - * allowed a nested VMEntry with an invalid vmcs12. - */ - WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); - - cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; - - while (vmx->emulation_required && count-- != 0) { - if (intr_window_requested && vmx_interrupt_allowed(vcpu)) - return handle_interrupt_window(&vmx->vcpu); - - if (kvm_test_request(KVM_REQ_EVENT, vcpu)) - return 1; - - err = kvm_emulate_instruction(vcpu, 0); - - if (err == EMULATE_USER_EXIT) { - ++vcpu->stat.mmio_exits; - ret = 0; - goto out; - } - - if (err != EMULATE_DONE) - goto emulation_error; - - if (vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending) - goto emulation_error; - - if (vcpu->arch.halt_request) { - vcpu->arch.halt_request = 0; - ret = kvm_vcpu_halt(vcpu); - goto out; - } - - if (signal_pending(current)) - goto out; - if (need_resched()) - schedule(); - } - -out: - return ret; - -emulation_error: - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; -} - -static void grow_ple_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int old = vmx->ple_window; - - vmx->ple_window = __grow_ple_window(old, ple_window, - ple_window_grow, - ple_window_max); - - if (vmx->ple_window != old) - vmx->ple_window_dirty = true; - - trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); -} - -static void shrink_ple_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int old = vmx->ple_window; - - vmx->ple_window = __shrink_ple_window(old, ple_window, - ple_window_shrink, - ple_window); - - if (vmx->ple_window != old) - vmx->ple_window_dirty = true; - - trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); -} - -/* - * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. - */ -static void wakeup_handler(void) -{ - struct kvm_vcpu *vcpu; - int cpu = smp_processor_id(); - - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (pi_test_on(pi_desc) == 1) - kvm_vcpu_kick(vcpu); - } - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); -} - -static void vmx_enable_tdp(void) -{ - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, - enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - VMX_EPT_RWX_MASK, 0ull); - - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); -} - -static __init int hardware_setup(void) -{ - unsigned long host_bndcfgs; - int r = -ENOMEM, i; - - rdmsrl_safe(MSR_EFER, &host_efer); - - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); - - if (setup_vmcs_config(&vmcs_config) < 0) - return -EIO; - - if (boot_cpu_has(X86_FEATURE_NX)) - kvm_enable_efer_bits(EFER_NX); - - if (boot_cpu_has(X86_FEATURE_MPX)) { - rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); - WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); - } - - if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || - !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) - enable_vpid = 0; - - if (!cpu_has_vmx_ept() || - !cpu_has_vmx_ept_4levels() || - !cpu_has_vmx_ept_mt_wb() || - !cpu_has_vmx_invept_global()) - enable_ept = 0; - - if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) - enable_ept_ad_bits = 0; - - if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) - enable_unrestricted_guest = 0; - - if (!cpu_has_vmx_flexpriority()) - flexpriority_enabled = 0; - - if (!cpu_has_virtual_nmis()) - enable_vnmi = 0; - - /* - * set_apic_access_page_addr() is used to reload apic access - * page upon invalidation. No need to do anything if not - * using the APIC_ACCESS_ADDR VMCS field. - */ - if (!flexpriority_enabled) - kvm_x86_ops->set_apic_access_page_addr = NULL; - - if (!cpu_has_vmx_tpr_shadow()) - kvm_x86_ops->update_cr8_intercept = NULL; - - if (enable_ept && !cpu_has_vmx_ept_2m_page()) - kvm_disable_largepages(); - -#if IS_ENABLED(CONFIG_HYPERV) - if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH - && enable_ept) - kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; -#endif - - if (!cpu_has_vmx_ple()) { - ple_gap = 0; - ple_window = 0; - ple_window_grow = 0; - ple_window_max = 0; - ple_window_shrink = 0; - } - - if (!cpu_has_vmx_apicv()) { - enable_apicv = 0; - kvm_x86_ops->sync_pir_to_irr = NULL; - } - - if (cpu_has_vmx_tsc_scaling()) { - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - } - - set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - - if (enable_ept) - vmx_enable_tdp(); - else - kvm_disable_tdp(); - - if (!nested) { - kvm_x86_ops->get_nested_state = NULL; - kvm_x86_ops->set_nested_state = NULL; - } - - /* - * Only enable PML when hardware supports PML feature, and both EPT - * and EPT A/D bit features are enabled -- PML depends on them to work. - */ - if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) - enable_pml = 0; - - if (!enable_pml) { - kvm_x86_ops->slot_enable_log_dirty = NULL; - kvm_x86_ops->slot_disable_log_dirty = NULL; - kvm_x86_ops->flush_log_dirty = NULL; - kvm_x86_ops->enable_log_dirty_pt_masked = NULL; - } - - if (!cpu_has_vmx_preemption_timer()) - kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; - - if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { - u64 vmx_msr; - - rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); - cpu_preemption_timer_multi = - vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; - } else { - kvm_x86_ops->set_hv_timer = NULL; - kvm_x86_ops->cancel_hv_timer = NULL; - } - - if (!cpu_has_vmx_shadow_vmcs() || !nested) - enable_shadow_vmcs = 0; - if (enable_shadow_vmcs) { - for (i = 0; i < VMX_BITMAP_NR; i++) { - vmx_bitmap[i] = (unsigned long *) - __get_free_page(GFP_KERNEL); - if (!vmx_bitmap[i]) - goto out; - } - - init_vmcs_shadow_fields(); - } - - kvm_set_posted_intr_wakeup_handler(wakeup_handler); - nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv); - - kvm_mce_cap_supported |= MCG_LMCE_P; - - r = alloc_kvm_area(); - if (r) - goto out; - return 0; - -out: - if (enable_shadow_vmcs) { - for (i = 0; i < VMX_BITMAP_NR; i++) - free_page((unsigned long)vmx_bitmap[i]); - } - return r; -} - -static __exit void hardware_unsetup(void) -{ - int i; - - if (enable_shadow_vmcs) { - for (i = 0; i < VMX_BITMAP_NR; i++) - free_page((unsigned long)vmx_bitmap[i]); - } - - free_kvm_area(); -} - -/* - * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE - * exiting, so only get here on cpu with PAUSE-Loop-Exiting. - */ -static int handle_pause(struct kvm_vcpu *vcpu) -{ - if (!kvm_pause_in_guest(vcpu->kvm)) - grow_ple_window(vcpu); - - /* - * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" - * VM-execution control is ignored if CPL > 0. OTOH, KVM - * never set PAUSE_EXITING and just set PLE if supported, - * so the vcpu must be CPL=0 if it gets a PAUSE exit. - */ - kvm_vcpu_on_spin(vcpu, true); - return kvm_skip_emulated_instruction(vcpu); -} - -static int handle_nop(struct kvm_vcpu *vcpu) -{ - return kvm_skip_emulated_instruction(vcpu); -} - -static int handle_mwait(struct kvm_vcpu *vcpu) -{ - printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); - return handle_nop(vcpu); -} - -static int handle_invalid_op(struct kvm_vcpu *vcpu) -{ - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; -} - -static int handle_monitor_trap(struct kvm_vcpu *vcpu) -{ - return 1; -} - -static int handle_monitor(struct kvm_vcpu *vcpu) -{ - printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); - return handle_nop(vcpu); -} - -/* - * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), - * set the success or error code of an emulated VMX instruction (as specified - * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated - * instruction. - */ -static int nested_vmx_succeed(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); - return kvm_skip_emulated_instruction(vcpu); -} - -static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_CF); - return kvm_skip_emulated_instruction(vcpu); -} - -static int nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * failValid writes the error number to the current VMCS, which - * can't be done if there isn't a current VMCS. - */ - if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) - return nested_vmx_failInvalid(vcpu); - - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_ZF); - get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; - /* - * We don't need to force a shadow sync because - * VM_INSTRUCTION_ERROR is not shadowed - */ - return kvm_skip_emulated_instruction(vcpu); -} - -static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) -{ - /* TODO: not to reset guest simply here. */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); -} - -static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) -{ - struct vcpu_vmx *vmx = - container_of(timer, struct vcpu_vmx, nested.preemption_timer); - - vmx->nested.preemption_timer_expired = true; - kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); - kvm_vcpu_kick(&vmx->vcpu); - - return HRTIMER_NORESTART; -} - -/* - * Decode the memory-address operand of a vmx instruction, as recorded on an - * exit caused by such an instruction (run by a guest hypervisor). - * On success, returns 0. When the operand is invalid, returns 1 and throws - * #UD or #GP. - */ -static int get_vmx_mem_address(struct kvm_vcpu *vcpu, - unsigned long exit_qualification, - u32 vmx_instruction_info, bool wr, gva_t *ret) -{ - gva_t off; - bool exn; - struct kvm_segment s; - - /* - * According to Vol. 3B, "Information for VM Exits Due to Instruction - * Execution", on an exit, vmx_instruction_info holds most of the - * addressing components of the operand. Only the displacement part - * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). - * For how an actual address is calculated from all these components, - * refer to Vol. 1, "Operand Addressing". - */ - int scaling = vmx_instruction_info & 3; - int addr_size = (vmx_instruction_info >> 7) & 7; - bool is_reg = vmx_instruction_info & (1u << 10); - int seg_reg = (vmx_instruction_info >> 15) & 7; - int index_reg = (vmx_instruction_info >> 18) & 0xf; - bool index_is_valid = !(vmx_instruction_info & (1u << 22)); - int base_reg = (vmx_instruction_info >> 23) & 0xf; - bool base_is_valid = !(vmx_instruction_info & (1u << 27)); - - if (is_reg) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - /* Addr = segment_base + offset */ - /* offset = base + [index * scale] + displacement */ - off = exit_qualification; /* holds the displacement */ - if (base_is_valid) - off += kvm_register_read(vcpu, base_reg); - if (index_is_valid) - off += kvm_register_read(vcpu, index_reg)< s.limit); - } - if (exn) { - kvm_queue_exception_e(vcpu, - seg_reg == VCPU_SREG_SS ? - SS_VECTOR : GP_VECTOR, - 0); - return 1; - } - - return 0; -} - -static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) -{ - gva_t gva; - struct x86_exception e; - - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) - return 1; - - if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - return 0; -} - -/* - * Allocate a shadow VMCS and associate it with the currently loaded - * VMCS, unless such a shadow VMCS already exists. The newly allocated - * VMCS is also VMCLEARed, so that it is ready for use. - */ -static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; - - /* - * We should allocate a shadow vmcs for vmcs01 only when L1 - * executes VMXON and free it when L1 executes VMXOFF. - * As it is invalid to execute VMXON twice, we shouldn't reach - * here when vmcs01 already have an allocated shadow vmcs. - */ - WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); - - if (!loaded_vmcs->shadow_vmcs) { - loaded_vmcs->shadow_vmcs = alloc_vmcs(true); - if (loaded_vmcs->shadow_vmcs) - vmcs_clear(loaded_vmcs->shadow_vmcs); - } - return loaded_vmcs->shadow_vmcs; -} - -static int enter_vmx_operation(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int r; - - r = alloc_loaded_vmcs(&vmx->nested.vmcs02); - if (r < 0) - goto out_vmcs02; - - vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); - if (!vmx->nested.cached_vmcs12) - goto out_cached_vmcs12; - - vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); - if (!vmx->nested.cached_shadow_vmcs12) - goto out_cached_shadow_vmcs12; - - if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) - goto out_shadow_vmcs; - - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; - - vmx->nested.vpid02 = allocate_vpid(); - - vmx->nested.vmcs02_initialized = false; - vmx->nested.vmxon = true; - return 0; - -out_shadow_vmcs: - kfree(vmx->nested.cached_shadow_vmcs12); - -out_cached_shadow_vmcs12: - kfree(vmx->nested.cached_vmcs12); - -out_cached_vmcs12: - free_loaded_vmcs(&vmx->nested.vmcs02); - -out_vmcs02: - return -ENOMEM; -} - -/* - * Emulate the VMXON instruction. - * Currently, we just remember that VMX is active, and do not save or even - * inspect the argument to VMXON (the so-called "VMXON pointer") because we - * do not currently need to store anything in that guest-allocated memory - * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their - * argument is different from the VMXON pointer (which the spec says they do). - */ -static int handle_vmon(struct kvm_vcpu *vcpu) -{ - int ret; - gpa_t vmptr; - struct page *page; - struct vcpu_vmx *vmx = to_vmx(vcpu); - const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED - | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - - /* - * The Intel VMX Instruction Reference lists a bunch of bits that are - * prerequisite to running VMXON, most notably cr4.VMXE must be set to - * 1 (see vmx_set_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. But most faulting conditions - * have already been checked by hardware, prior to the VM-exit for - * VMXON. We do test guest cr4.VMXE because processor CR4 always has - * that bit set to 1 in non-root mode. - */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - /* CPL=0 must be checked manually. */ - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - if (vmx->nested.vmxon) - return nested_vmx_failValid(vcpu, - VMXERR_VMXON_IN_VMX_ROOT_OPERATION); - - if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) - != VMXON_NEEDED_FEATURES) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - if (nested_vmx_get_vmptr(vcpu, &vmptr)) - return 1; - - /* - * SDM 3: 24.11.5 - * The first 4 bytes of VMXON region contain the supported - * VMCS revision identifier - * - * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; - * which replaces physical address width with 32 - */ - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) - return nested_vmx_failInvalid(vcpu); - - page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) - return nested_vmx_failInvalid(vcpu); - - if (*(u32 *)kmap(page) != VMCS12_REVISION) { - kunmap(page); - kvm_release_page_clean(page); - return nested_vmx_failInvalid(vcpu); - } - kunmap(page); - kvm_release_page_clean(page); - - vmx->nested.vmxon_ptr = vmptr; - ret = enter_vmx_operation(vcpu); - if (ret) - return ret; - - return nested_vmx_succeed(vcpu); -} - -/* - * Intel's VMX Instruction Reference specifies a common set of prerequisites - * for running VMX instructions (except VMXON, whose prerequisites are - * slightly different). It also specifies what exception to inject otherwise. - * Note that many of these exceptions have priority over VM exits, so they - * don't have to be checked again here. - */ -static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) -{ - if (!to_vmx(vcpu)->nested.vmxon) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 0; - } - - return 1; -} - -static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) -{ - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, -1ull); -} - -static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!vmx->nested.hv_evmcs) - return; - - kunmap(vmx->nested.hv_evmcs_page); - kvm_release_page_dirty(vmx->nested.hv_evmcs_page); - vmx->nested.hv_evmcs_vmptr = -1ull; - vmx->nested.hv_evmcs_page = NULL; - vmx->nested.hv_evmcs = NULL; -} - -static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vmx->nested.current_vmptr == -1ull) - return; - - if (enable_shadow_vmcs) { - /* copy to memory all shadowed fields in case - they were modified */ - copy_shadow_to_vmcs12(vmx); - vmx->nested.need_vmcs12_sync = false; - vmx_disable_shadow_vmcs(vmx); - } - vmx->nested.posted_intr_nv = -1; - - /* Flush VMCS12 to guest memory */ - kvm_vcpu_write_guest_page(vcpu, - vmx->nested.current_vmptr >> PAGE_SHIFT, - vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); - - kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); - - vmx->nested.current_vmptr = -1ull; -} - -/* - * Free whatever needs to be freed from vmx->nested when L1 goes down, or - * just stops using VMX. - */ -static void free_nested(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) - return; - - vmx->nested.vmxon = false; - vmx->nested.smm.vmxon = false; - free_vpid(vmx->nested.vpid02); - vmx->nested.posted_intr_nv = -1; - vmx->nested.current_vmptr = -1ull; - if (enable_shadow_vmcs) { - vmx_disable_shadow_vmcs(vmx); - vmcs_clear(vmx->vmcs01.shadow_vmcs); - free_vmcs(vmx->vmcs01.shadow_vmcs); - vmx->vmcs01.shadow_vmcs = NULL; - } - kfree(vmx->nested.cached_vmcs12); - kfree(vmx->nested.cached_shadow_vmcs12); - /* Unpin physical memory we referred to in the vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_dirty(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - if (vmx->nested.virtual_apic_page) { - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; - } - - kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); - - nested_release_evmcs(vcpu); - - free_loaded_vmcs(&vmx->nested.vmcs02); -} - -/* Emulate the VMXOFF instruction */ -static int handle_vmoff(struct kvm_vcpu *vcpu) -{ - if (!nested_vmx_check_permission(vcpu)) - return 1; - free_nested(vcpu); - return nested_vmx_succeed(vcpu); -} - -/* Emulate the VMCLEAR instruction */ -static int handle_vmclear(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 zero = 0; - gpa_t vmptr; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (nested_vmx_get_vmptr(vcpu, &vmptr)) - return 1; - - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) - return nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_INVALID_ADDRESS); - - if (vmptr == vmx->nested.vmxon_ptr) - return nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_VMXON_POINTER); - - if (vmx->nested.hv_evmcs_page) { - if (vmptr == vmx->nested.hv_evmcs_vmptr) - nested_release_evmcs(vcpu); - } else { - if (vmptr == vmx->nested.current_vmptr) - nested_release_vmcs12(vcpu); - - kvm_vcpu_write_guest(vcpu, - vmptr + offsetof(struct vmcs12, - launch_state), - &zero, sizeof(zero)); - } - - return nested_vmx_succeed(vcpu); -} - -static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); - -/* Emulate the VMLAUNCH instruction */ -static int handle_vmlaunch(struct kvm_vcpu *vcpu) -{ - return nested_vmx_run(vcpu, true); -} - -/* Emulate the VMRESUME instruction */ -static int handle_vmresume(struct kvm_vcpu *vcpu) -{ - - return nested_vmx_run(vcpu, false); -} - -/* - * Read a vmcs12 field. Since these can have varying lengths and we return - * one type, we chose the biggest type (u64) and zero-extend the return value - * to that size. Note that the caller, handle_vmread, might need to use only - * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of - * 64-bit fields are to be returned). - */ -static inline int vmcs12_read_any(struct vmcs12 *vmcs12, - unsigned long field, u64 *ret) -{ - short offset = vmcs_field_to_offset(field); - char *p; - - if (offset < 0) - return offset; - - p = (char *)vmcs12 + offset; - - switch (vmcs_field_width(field)) { - case VMCS_FIELD_WIDTH_NATURAL_WIDTH: - *ret = *((natural_width *)p); - return 0; - case VMCS_FIELD_WIDTH_U16: - *ret = *((u16 *)p); - return 0; - case VMCS_FIELD_WIDTH_U32: - *ret = *((u32 *)p); - return 0; - case VMCS_FIELD_WIDTH_U64: - *ret = *((u64 *)p); - return 0; - default: - WARN_ON(1); - return -ENOENT; - } -} - - -static inline int vmcs12_write_any(struct vmcs12 *vmcs12, - unsigned long field, u64 field_value){ - short offset = vmcs_field_to_offset(field); - char *p = (char *)vmcs12 + offset; - if (offset < 0) - return offset; - - switch (vmcs_field_width(field)) { - case VMCS_FIELD_WIDTH_U16: - *(u16 *)p = field_value; - return 0; - case VMCS_FIELD_WIDTH_U32: - *(u32 *)p = field_value; - return 0; - case VMCS_FIELD_WIDTH_U64: - *(u64 *)p = field_value; - return 0; - case VMCS_FIELD_WIDTH_NATURAL_WIDTH: - *(natural_width *)p = field_value; - return 0; - default: - WARN_ON(1); - return -ENOENT; - } - -} - -static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) -{ - struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; - - /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ - vmcs12->tpr_threshold = evmcs->tpr_threshold; - vmcs12->guest_rip = evmcs->guest_rip; - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { - vmcs12->guest_rsp = evmcs->guest_rsp; - vmcs12->guest_rflags = evmcs->guest_rflags; - vmcs12->guest_interruptibility_info = - evmcs->guest_interruptibility_info; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { - vmcs12->cpu_based_vm_exec_control = - evmcs->cpu_based_vm_exec_control; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { - vmcs12->exception_bitmap = evmcs->exception_bitmap; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { - vmcs12->vm_entry_controls = evmcs->vm_entry_controls; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { - vmcs12->vm_entry_intr_info_field = - evmcs->vm_entry_intr_info_field; - vmcs12->vm_entry_exception_error_code = - evmcs->vm_entry_exception_error_code; - vmcs12->vm_entry_instruction_len = - evmcs->vm_entry_instruction_len; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { - vmcs12->host_ia32_pat = evmcs->host_ia32_pat; - vmcs12->host_ia32_efer = evmcs->host_ia32_efer; - vmcs12->host_cr0 = evmcs->host_cr0; - vmcs12->host_cr3 = evmcs->host_cr3; - vmcs12->host_cr4 = evmcs->host_cr4; - vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; - vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; - vmcs12->host_rip = evmcs->host_rip; - vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; - vmcs12->host_es_selector = evmcs->host_es_selector; - vmcs12->host_cs_selector = evmcs->host_cs_selector; - vmcs12->host_ss_selector = evmcs->host_ss_selector; - vmcs12->host_ds_selector = evmcs->host_ds_selector; - vmcs12->host_fs_selector = evmcs->host_fs_selector; - vmcs12->host_gs_selector = evmcs->host_gs_selector; - vmcs12->host_tr_selector = evmcs->host_tr_selector; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { - vmcs12->pin_based_vm_exec_control = - evmcs->pin_based_vm_exec_control; - vmcs12->vm_exit_controls = evmcs->vm_exit_controls; - vmcs12->secondary_vm_exec_control = - evmcs->secondary_vm_exec_control; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { - vmcs12->io_bitmap_a = evmcs->io_bitmap_a; - vmcs12->io_bitmap_b = evmcs->io_bitmap_b; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { - vmcs12->msr_bitmap = evmcs->msr_bitmap; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { - vmcs12->guest_es_base = evmcs->guest_es_base; - vmcs12->guest_cs_base = evmcs->guest_cs_base; - vmcs12->guest_ss_base = evmcs->guest_ss_base; - vmcs12->guest_ds_base = evmcs->guest_ds_base; - vmcs12->guest_fs_base = evmcs->guest_fs_base; - vmcs12->guest_gs_base = evmcs->guest_gs_base; - vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; - vmcs12->guest_tr_base = evmcs->guest_tr_base; - vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; - vmcs12->guest_idtr_base = evmcs->guest_idtr_base; - vmcs12->guest_es_limit = evmcs->guest_es_limit; - vmcs12->guest_cs_limit = evmcs->guest_cs_limit; - vmcs12->guest_ss_limit = evmcs->guest_ss_limit; - vmcs12->guest_ds_limit = evmcs->guest_ds_limit; - vmcs12->guest_fs_limit = evmcs->guest_fs_limit; - vmcs12->guest_gs_limit = evmcs->guest_gs_limit; - vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; - vmcs12->guest_tr_limit = evmcs->guest_tr_limit; - vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; - vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; - vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; - vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; - vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; - vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; - vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; - vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; - vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; - vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; - vmcs12->guest_es_selector = evmcs->guest_es_selector; - vmcs12->guest_cs_selector = evmcs->guest_cs_selector; - vmcs12->guest_ss_selector = evmcs->guest_ss_selector; - vmcs12->guest_ds_selector = evmcs->guest_ds_selector; - vmcs12->guest_fs_selector = evmcs->guest_fs_selector; - vmcs12->guest_gs_selector = evmcs->guest_gs_selector; - vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; - vmcs12->guest_tr_selector = evmcs->guest_tr_selector; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { - vmcs12->tsc_offset = evmcs->tsc_offset; - vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; - vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { - vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; - vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; - vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; - vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; - vmcs12->guest_cr0 = evmcs->guest_cr0; - vmcs12->guest_cr3 = evmcs->guest_cr3; - vmcs12->guest_cr4 = evmcs->guest_cr4; - vmcs12->guest_dr7 = evmcs->guest_dr7; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { - vmcs12->host_fs_base = evmcs->host_fs_base; - vmcs12->host_gs_base = evmcs->host_gs_base; - vmcs12->host_tr_base = evmcs->host_tr_base; - vmcs12->host_gdtr_base = evmcs->host_gdtr_base; - vmcs12->host_idtr_base = evmcs->host_idtr_base; - vmcs12->host_rsp = evmcs->host_rsp; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { - vmcs12->ept_pointer = evmcs->ept_pointer; - vmcs12->virtual_processor_id = evmcs->virtual_processor_id; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { - vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; - vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; - vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; - vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; - vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; - vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; - vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; - vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; - vmcs12->guest_pending_dbg_exceptions = - evmcs->guest_pending_dbg_exceptions; - vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; - vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; - vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; - vmcs12->guest_activity_state = evmcs->guest_activity_state; - vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; - } - - /* - * Not used? - * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; - * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; - * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; - * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; - * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; - * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; - * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; - * vmcs12->page_fault_error_code_mask = - * evmcs->page_fault_error_code_mask; - * vmcs12->page_fault_error_code_match = - * evmcs->page_fault_error_code_match; - * vmcs12->cr3_target_count = evmcs->cr3_target_count; - * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; - * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; - * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; - */ - - /* - * Read only fields: - * vmcs12->guest_physical_address = evmcs->guest_physical_address; - * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; - * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; - * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; - * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; - * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; - * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; - * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; - * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; - * vmcs12->exit_qualification = evmcs->exit_qualification; - * vmcs12->guest_linear_address = evmcs->guest_linear_address; - * - * Not present in struct vmcs12: - * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; - * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; - * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; - * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; - */ - - return 0; -} - -static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) -{ - struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; - - /* - * Should not be changed by KVM: - * - * evmcs->host_es_selector = vmcs12->host_es_selector; - * evmcs->host_cs_selector = vmcs12->host_cs_selector; - * evmcs->host_ss_selector = vmcs12->host_ss_selector; - * evmcs->host_ds_selector = vmcs12->host_ds_selector; - * evmcs->host_fs_selector = vmcs12->host_fs_selector; - * evmcs->host_gs_selector = vmcs12->host_gs_selector; - * evmcs->host_tr_selector = vmcs12->host_tr_selector; - * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; - * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; - * evmcs->host_cr0 = vmcs12->host_cr0; - * evmcs->host_cr3 = vmcs12->host_cr3; - * evmcs->host_cr4 = vmcs12->host_cr4; - * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; - * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; - * evmcs->host_rip = vmcs12->host_rip; - * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; - * evmcs->host_fs_base = vmcs12->host_fs_base; - * evmcs->host_gs_base = vmcs12->host_gs_base; - * evmcs->host_tr_base = vmcs12->host_tr_base; - * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; - * evmcs->host_idtr_base = vmcs12->host_idtr_base; - * evmcs->host_rsp = vmcs12->host_rsp; - * sync_vmcs12() doesn't read these: - * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; - * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; - * evmcs->msr_bitmap = vmcs12->msr_bitmap; - * evmcs->ept_pointer = vmcs12->ept_pointer; - * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; - * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; - * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; - * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; - * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; - * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; - * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; - * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; - * evmcs->tpr_threshold = vmcs12->tpr_threshold; - * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; - * evmcs->exception_bitmap = vmcs12->exception_bitmap; - * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; - * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; - * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; - * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; - * evmcs->page_fault_error_code_mask = - * vmcs12->page_fault_error_code_mask; - * evmcs->page_fault_error_code_match = - * vmcs12->page_fault_error_code_match; - * evmcs->cr3_target_count = vmcs12->cr3_target_count; - * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; - * evmcs->tsc_offset = vmcs12->tsc_offset; - * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; - * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; - * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; - * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; - * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; - * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; - * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; - * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; - * - * Not present in struct vmcs12: - * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; - * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; - * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; - * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; - */ - - evmcs->guest_es_selector = vmcs12->guest_es_selector; - evmcs->guest_cs_selector = vmcs12->guest_cs_selector; - evmcs->guest_ss_selector = vmcs12->guest_ss_selector; - evmcs->guest_ds_selector = vmcs12->guest_ds_selector; - evmcs->guest_fs_selector = vmcs12->guest_fs_selector; - evmcs->guest_gs_selector = vmcs12->guest_gs_selector; - evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; - evmcs->guest_tr_selector = vmcs12->guest_tr_selector; - - evmcs->guest_es_limit = vmcs12->guest_es_limit; - evmcs->guest_cs_limit = vmcs12->guest_cs_limit; - evmcs->guest_ss_limit = vmcs12->guest_ss_limit; - evmcs->guest_ds_limit = vmcs12->guest_ds_limit; - evmcs->guest_fs_limit = vmcs12->guest_fs_limit; - evmcs->guest_gs_limit = vmcs12->guest_gs_limit; - evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; - evmcs->guest_tr_limit = vmcs12->guest_tr_limit; - evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; - evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; - - evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; - evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; - evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; - evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; - evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; - evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; - evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; - evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; - - evmcs->guest_es_base = vmcs12->guest_es_base; - evmcs->guest_cs_base = vmcs12->guest_cs_base; - evmcs->guest_ss_base = vmcs12->guest_ss_base; - evmcs->guest_ds_base = vmcs12->guest_ds_base; - evmcs->guest_fs_base = vmcs12->guest_fs_base; - evmcs->guest_gs_base = vmcs12->guest_gs_base; - evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; - evmcs->guest_tr_base = vmcs12->guest_tr_base; - evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; - evmcs->guest_idtr_base = vmcs12->guest_idtr_base; - - evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; - evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; - - evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; - evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; - evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; - evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; - - evmcs->guest_pending_dbg_exceptions = - vmcs12->guest_pending_dbg_exceptions; - evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; - evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; - - evmcs->guest_activity_state = vmcs12->guest_activity_state; - evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; - - evmcs->guest_cr0 = vmcs12->guest_cr0; - evmcs->guest_cr3 = vmcs12->guest_cr3; - evmcs->guest_cr4 = vmcs12->guest_cr4; - evmcs->guest_dr7 = vmcs12->guest_dr7; - - evmcs->guest_physical_address = vmcs12->guest_physical_address; - - evmcs->vm_instruction_error = vmcs12->vm_instruction_error; - evmcs->vm_exit_reason = vmcs12->vm_exit_reason; - evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; - evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; - evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; - evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; - evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; - evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; - - evmcs->exit_qualification = vmcs12->exit_qualification; - - evmcs->guest_linear_address = vmcs12->guest_linear_address; - evmcs->guest_rsp = vmcs12->guest_rsp; - evmcs->guest_rflags = vmcs12->guest_rflags; - - evmcs->guest_interruptibility_info = - vmcs12->guest_interruptibility_info; - evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; - evmcs->vm_entry_controls = vmcs12->vm_entry_controls; - evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; - evmcs->vm_entry_exception_error_code = - vmcs12->vm_entry_exception_error_code; - evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; - - evmcs->guest_rip = vmcs12->guest_rip; - - evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; - - return 0; -} - -/* - * Copy the writable VMCS shadow fields back to the VMCS12, in case - * they have been modified by the L1 guest. Note that the "read-only" - * VM-exit information fields are actually writable if the vCPU is - * configured to support "VMWRITE to any supported field in the VMCS." - */ -static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) -{ - const u16 *fields[] = { - shadow_read_write_fields, - shadow_read_only_fields - }; - const int max_fields[] = { - max_shadow_read_write_fields, - max_shadow_read_only_fields - }; - int i, q; - unsigned long field; - u64 field_value; - struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; - - preempt_disable(); - - vmcs_load(shadow_vmcs); - - for (q = 0; q < ARRAY_SIZE(fields); q++) { - for (i = 0; i < max_fields[q]; i++) { - field = fields[q][i]; - field_value = __vmcs_readl(field); - vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); - } - /* - * Skip the VM-exit information fields if they are read-only. - */ - if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) - break; - } - - vmcs_clear(shadow_vmcs); - vmcs_load(vmx->loaded_vmcs->vmcs); - - preempt_enable(); -} - -static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) -{ - const u16 *fields[] = { - shadow_read_write_fields, - shadow_read_only_fields - }; - const int max_fields[] = { - max_shadow_read_write_fields, - max_shadow_read_only_fields - }; - int i, q; - unsigned long field; - u64 field_value = 0; - struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; - - vmcs_load(shadow_vmcs); - - for (q = 0; q < ARRAY_SIZE(fields); q++) { - for (i = 0; i < max_fields[q]; i++) { - field = fields[q][i]; - vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); - __vmcs_writel(field, field_value); - } - } - - vmcs_clear(shadow_vmcs); - vmcs_load(vmx->loaded_vmcs->vmcs); -} - -static int handle_vmread(struct kvm_vcpu *vcpu) -{ - unsigned long field; - u64 field_value; - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - gva_t gva = 0; - struct vmcs12 *vmcs12; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) - return nested_vmx_failInvalid(vcpu); - - if (!is_guest_mode(vcpu)) - vmcs12 = get_vmcs12(vcpu); - else { - /* - * When vmcs->vmcs_link_pointer is -1ull, any VMREAD - * to shadowed-field sets the ALU flags for VMfailInvalid. - */ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) - return nested_vmx_failInvalid(vcpu); - vmcs12 = get_shadow_vmcs12(vcpu); - } - - /* Decode instruction info and find the field to read */ - field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); - /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(vmcs12, field, &field_value) < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); - - /* - * Now copy part of this value to register or memory, as requested. - * Note that the number of bits actually copied is 32 or 64 depending - * on the guest's mode (32 or 64 bit), not on the given field's length. - */ - if (vmx_instruction_info & (1u << 10)) { - kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), - field_value); - } else { - if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, true, &gva)) - return 1; - /* _system ok, nested_vmx_check_permission has verified cpl=0 */ - kvm_write_guest_virt_system(vcpu, gva, &field_value, - (is_long_mode(vcpu) ? 8 : 4), NULL); - } - - return nested_vmx_succeed(vcpu); -} - - -static int handle_vmwrite(struct kvm_vcpu *vcpu) -{ - unsigned long field; - gva_t gva; - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - - /* The value to write might be 32 or 64 bits, depending on L1's long - * mode, and eventually we need to write that into a field of several - * possible lengths. The code below first zero-extends the value to 64 - * bit (field_value), and then copies only the appropriate number of - * bits into the vmcs12 field. - */ - u64 field_value = 0; - struct x86_exception e; - struct vmcs12 *vmcs12; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (vmx->nested.current_vmptr == -1ull) - return nested_vmx_failInvalid(vcpu); - - if (vmx_instruction_info & (1u << 10)) - field_value = kvm_register_readl(vcpu, - (((vmx_instruction_info) >> 3) & 0xf)); - else { - if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, false, &gva)) - return 1; - if (kvm_read_guest_virt(vcpu, gva, &field_value, - (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - } - - - field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); - /* - * If the vCPU supports "VMWRITE to any supported field in the - * VMCS," then the "read-only" fields are actually read/write. - */ - if (vmcs_field_readonly(field) && - !nested_cpu_has_vmwrite_any_field(vcpu)) - return nested_vmx_failValid(vcpu, - VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - - if (!is_guest_mode(vcpu)) - vmcs12 = get_vmcs12(vcpu); - else { - /* - * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE - * to shadowed-field sets the ALU flags for VMfailInvalid. - */ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) - return nested_vmx_failInvalid(vcpu); - vmcs12 = get_shadow_vmcs12(vcpu); - } - - if (vmcs12_write_any(vmcs12, field, field_value) < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); - - /* - * Do not track vmcs12 dirty-state if in guest-mode - * as we actually dirty shadow vmcs12 instead of vmcs12. - */ - if (!is_guest_mode(vcpu)) { - switch (field) { -#define SHADOW_FIELD_RW(x) case x: -#include "vmx_shadow_fields.h" - /* - * The fields that can be updated by L1 without a vmexit are - * always updated in the vmcs02, the others go down the slow - * path of prepare_vmcs02. - */ - break; - default: - vmx->nested.dirty_vmcs12 = true; - break; - } - } - - return nested_vmx_succeed(vcpu); -} - -static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) -{ - vmx->nested.current_vmptr = vmptr; - if (enable_shadow_vmcs) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, - __pa(vmx->vmcs01.shadow_vmcs)); - vmx->nested.need_vmcs12_sync = true; - } - vmx->nested.dirty_vmcs12 = true; -} - -/* Emulate the VMPTRLD instruction */ -static int handle_vmptrld(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - gpa_t vmptr; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (nested_vmx_get_vmptr(vcpu, &vmptr)) - return 1; - - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) - return nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_INVALID_ADDRESS); - - if (vmptr == vmx->nested.vmxon_ptr) - return nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_VMXON_POINTER); - - /* Forbid normal VMPTRLD if Enlightened version was used */ - if (vmx->nested.hv_evmcs) - return 1; - - if (vmx->nested.current_vmptr != vmptr) { - struct vmcs12 *new_vmcs12; - struct page *page; - page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) { - /* - * Reads from an unbacked page return all 1s, - * which means that the 32 bits located at the - * given physical address won't match the required - * VMCS12_REVISION identifier. - */ - nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); - return kvm_skip_emulated_instruction(vcpu); - } - new_vmcs12 = kmap(page); - if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || - (new_vmcs12->hdr.shadow_vmcs && - !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { - kunmap(page); - kvm_release_page_clean(page); - return nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); - } - - nested_release_vmcs12(vcpu); - - /* - * Load VMCS12 from guest memory since it is not already - * cached. - */ - memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); - kunmap(page); - kvm_release_page_clean(page); - - set_current_vmptr(vmx, vmptr); - } - - return nested_vmx_succeed(vcpu); -} - -/* - * This is an equivalent of the nested hypervisor executing the vmptrld - * instruction. - */ -static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, - bool from_launch) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct hv_vp_assist_page assist_page; - - if (likely(!vmx->nested.enlightened_vmcs_enabled)) - return 1; - - if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) - return 1; - - if (unlikely(!assist_page.enlighten_vmentry)) - return 1; - - if (unlikely(assist_page.current_nested_vmcs != - vmx->nested.hv_evmcs_vmptr)) { - - if (!vmx->nested.hv_evmcs) - vmx->nested.current_vmptr = -1ull; - - nested_release_evmcs(vcpu); - - vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( - vcpu, assist_page.current_nested_vmcs); - - if (unlikely(is_error_page(vmx->nested.hv_evmcs_page))) - return 0; - - vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); - - /* - * Currently, KVM only supports eVMCS version 1 - * (== KVM_EVMCS_VERSION) and thus we expect guest to set this - * value to first u32 field of eVMCS which should specify eVMCS - * VersionNumber. - * - * Guest should be aware of supported eVMCS versions by host by - * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is - * expected to set this CPUID leaf according to the value - * returned in vmcs_version from nested_enable_evmcs(). - * - * However, it turns out that Microsoft Hyper-V fails to comply - * to their own invented interface: When Hyper-V use eVMCS, it - * just sets first u32 field of eVMCS to revision_id specified - * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number - * which is one of the supported versions specified in - * CPUID.0x4000000A.EAX[0:15]. - * - * To overcome Hyper-V bug, we accept here either a supported - * eVMCS version or VMCS12 revision_id as valid values for first - * u32 field of eVMCS. - */ - if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && - (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { - nested_release_evmcs(vcpu); - return 0; - } - - vmx->nested.dirty_vmcs12 = true; - /* - * As we keep L2 state for one guest only 'hv_clean_fields' mask - * can't be used when we switch between them. Reset it here for - * simplicity. - */ - vmx->nested.hv_evmcs->hv_clean_fields &= - ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs; - - /* - * Unlike normal vmcs12, enlightened vmcs12 is not fully - * reloaded from guest's memory (read only fields, fields not - * present in struct hv_enlightened_vmcs, ...). Make sure there - * are no leftovers. - */ - if (from_launch) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - memset(vmcs12, 0, sizeof(*vmcs12)); - vmcs12->hdr.revision_id = VMCS12_REVISION; - } - - } - return 1; -} - -/* Emulate the VMPTRST instruction */ -static int handle_vmptrst(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); - u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); - gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; - struct x86_exception e; - gva_t gva; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) - return 1; - - if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) - return 1; - /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ - if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, - sizeof(gpa_t), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - return nested_vmx_succeed(vcpu); -} - -/* Emulate the INVEPT instruction */ -static int handle_invept(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 vmx_instruction_info, types; - unsigned long type; - gva_t gva; - struct x86_exception e; - struct { - u64 eptp, gpa; - } operand; - - if (!(vmx->nested.msrs.secondary_ctls_high & - SECONDARY_EXEC_ENABLE_EPT) || - !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - - types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; - - if (type >= 32 || !(types & (1 << type))) - return nested_vmx_failValid(vcpu, - VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - - /* According to the Intel VMX instruction reference, the memory - * operand is read even if it isn't needed (e.g., for type==global) - */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, false, &gva)) - return 1; - if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - switch (type) { - case VMX_EPT_EXTENT_GLOBAL: - /* - * TODO: track mappings and invalidate - * single context requests appropriately - */ - case VMX_EPT_EXTENT_CONTEXT: - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - break; - default: - BUG_ON(1); - break; - } - - return nested_vmx_succeed(vcpu); -} - -static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; -} - -static int handle_invvpid(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 vmx_instruction_info; - unsigned long type, types; - gva_t gva; - struct x86_exception e; - struct { - u64 vpid; - u64 gla; - } operand; - u16 vpid02; - - if (!(vmx->nested.msrs.secondary_ctls_high & - SECONDARY_EXEC_ENABLE_VPID) || - !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - - types = (vmx->nested.msrs.vpid_caps & - VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; - - if (type >= 32 || !(types & (1 << type))) - return nested_vmx_failValid(vcpu, - VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - - /* according to the intel vmx instruction reference, the memory - * operand is read even if it isn't needed (e.g., for type==global) - */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, false, &gva)) - return 1; - if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - if (operand.vpid >> 16) - return nested_vmx_failValid(vcpu, - VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - - vpid02 = nested_get_vpid02(vcpu); - switch (type) { - case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: - if (!operand.vpid || - is_noncanonical_address(operand.gla, vcpu)) - return nested_vmx_failValid(vcpu, - VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - if (cpu_has_vmx_invvpid_individual_addr()) { - __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, - vpid02, operand.gla); - } else - __vmx_flush_tlb(vcpu, vpid02, false); - break; - case VMX_VPID_EXTENT_SINGLE_CONTEXT: - case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: - if (!operand.vpid) - return nested_vmx_failValid(vcpu, - VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - __vmx_flush_tlb(vcpu, vpid02, false); - break; - case VMX_VPID_EXTENT_ALL_CONTEXT: - __vmx_flush_tlb(vcpu, vpid02, false); - break; - default: - WARN_ON_ONCE(1); - return kvm_skip_emulated_instruction(vcpu); - } - - return nested_vmx_succeed(vcpu); -} - -static int handle_invpcid(struct kvm_vcpu *vcpu) -{ - u32 vmx_instruction_info; - unsigned long type; - bool pcid_enabled; - gva_t gva; - struct x86_exception e; - unsigned i; - unsigned long roots_to_free = 0; - struct { - u64 pcid; - u64 gla; - } operand; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - - if (type > 3) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - /* According to the Intel instruction reference, the memory operand - * is read even if it isn't needed (e.g., for type==all) - */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, false, &gva)) - return 1; - - if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - if (operand.pcid >> 12 != 0) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - - switch (type) { - case INVPCID_TYPE_INDIV_ADDR: - if ((!pcid_enabled && (operand.pcid != 0)) || - is_noncanonical_address(operand.gla, vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); - return kvm_skip_emulated_instruction(vcpu); - - case INVPCID_TYPE_SINGLE_CTXT: - if (!pcid_enabled && (operand.pcid != 0)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - if (kvm_get_active_pcid(vcpu) == operand.pcid) { - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } - - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) - == operand.pcid) - roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); - - kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); - /* - * If neither the current cr3 nor any of the prev_roots use the - * given PCID, then nothing needs to be done here because a - * resync will happen anyway before switching to any other CR3. - */ - - return kvm_skip_emulated_instruction(vcpu); - - case INVPCID_TYPE_ALL_NON_GLOBAL: - /* - * Currently, KVM doesn't mark global entries in the shadow - * page tables, so a non-global flush just degenerates to a - * global flush. If needed, we could optimize this later by - * keeping track of global entries in shadow page tables. - */ - - /* fall-through */ - case INVPCID_TYPE_ALL_INCL_GLOBAL: - kvm_mmu_unload(vcpu); - return kvm_skip_emulated_instruction(vcpu); - - default: - BUG(); /* We have already checked above that type <= 3 */ - } -} - -static int handle_pml_full(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification; - - trace_kvm_pml_full(vcpu->vcpu_id); - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - /* - * PML buffer FULL happened while executing iret from NMI, - * "blocked by NMI" bit has to be set before next VM entry. - */ - if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - enable_vnmi && - (exit_qualification & INTR_INFO_UNBLOCK_NMI)) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - - /* - * PML buffer already flushed at beginning of VMEXIT. Nothing to do - * here.., and there's no userspace involvement needed for PML. - */ - return 1; -} - -static int handle_preemption_timer(struct kvm_vcpu *vcpu) -{ - if (!to_vmx(vcpu)->req_immediate_exit) - kvm_lapic_expired_hv_timer(vcpu); - return 1; -} - -static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int maxphyaddr = cpuid_maxphyaddr(vcpu); - - /* Check for memory type validity */ - switch (address & VMX_EPTP_MT_MASK) { - case VMX_EPTP_MT_UC: - if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)) - return false; - break; - case VMX_EPTP_MT_WB: - if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)) - return false; - break; - default: - return false; - } - - /* only 4 levels page-walk length are valid */ - if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4) - return false; - - /* Reserved bits should not be set */ - if (address >> maxphyaddr || ((address >> 7) & 0x1f)) - return false; - - /* AD, if set, should be supported */ - if (address & VMX_EPTP_AD_ENABLE_BIT) { - if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)) - return false; - } - - return true; -} - -static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 address; - bool accessed_dirty; - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - - if (!nested_cpu_has_eptp_switching(vmcs12) || - !nested_cpu_has_ept(vmcs12)) - return 1; - - if (index >= VMFUNC_EPTP_ENTRIES) - return 1; - - - if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, - &address, index * 8, 8)) - return 1; - - accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); - - /* - * If the (L2) guest does a vmfunc to the currently - * active ept pointer, we don't have to do anything else - */ - if (vmcs12->ept_pointer != address) { - if (!valid_ept_address(vcpu, address)) - return 1; - - kvm_mmu_unload(vcpu); - mmu->ept_ad = accessed_dirty; - mmu->mmu_role.base.ad_disabled = !accessed_dirty; - vmcs12->ept_pointer = address; - /* - * TODO: Check what's the correct approach in case - * mmu reload fails. Currently, we just let the next - * reload potentially fail - */ - kvm_mmu_reload(vcpu); - } - - return 0; -} - -static int handle_vmfunc(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12; - u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; - - /* - * VMFUNC is only supported for nested guests, but we always enable the - * secondary control for simplicity; for non-nested mode, fake that we - * didn't by injecting #UD. - */ - if (!is_guest_mode(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - vmcs12 = get_vmcs12(vcpu); - if ((vmcs12->vm_function_control & (1 << function)) == 0) - goto fail; - - switch (function) { - case 0: - if (nested_vmx_eptp_switching(vcpu, vmcs12)) - goto fail; - break; - default: - goto fail; - } - return kvm_skip_emulated_instruction(vcpu); - -fail: - nested_vmx_vmexit(vcpu, vmx->exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); - return 1; -} - -static int handle_encls(struct kvm_vcpu *vcpu) -{ - /* - * SGX virtualization is not yet supported. There is no software - * enable bit for SGX, so we have to trap ENCLS and inject a #UD - * to prevent the guest from executing ENCLS. - */ - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; -} - -/* - * The exit handlers return 1 if the exit was handled fully and guest execution - * may resume. Otherwise they set the kvm_run parameter to indicate what needs - * to be done to userspace and return 0. - */ -static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { - [EXIT_REASON_EXCEPTION_NMI] = handle_exception, - [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, - [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, - [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, - [EXIT_REASON_IO_INSTRUCTION] = handle_io, - [EXIT_REASON_CR_ACCESS] = handle_cr, - [EXIT_REASON_DR_ACCESS] = handle_dr, - [EXIT_REASON_CPUID] = handle_cpuid, - [EXIT_REASON_MSR_READ] = handle_rdmsr, - [EXIT_REASON_MSR_WRITE] = handle_wrmsr, - [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, - [EXIT_REASON_HLT] = handle_halt, - [EXIT_REASON_INVD] = handle_invd, - [EXIT_REASON_INVLPG] = handle_invlpg, - [EXIT_REASON_RDPMC] = handle_rdpmc, - [EXIT_REASON_VMCALL] = handle_vmcall, - [EXIT_REASON_VMCLEAR] = handle_vmclear, - [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, - [EXIT_REASON_VMPTRLD] = handle_vmptrld, - [EXIT_REASON_VMPTRST] = handle_vmptrst, - [EXIT_REASON_VMREAD] = handle_vmread, - [EXIT_REASON_VMRESUME] = handle_vmresume, - [EXIT_REASON_VMWRITE] = handle_vmwrite, - [EXIT_REASON_VMOFF] = handle_vmoff, - [EXIT_REASON_VMON] = handle_vmon, - [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, - [EXIT_REASON_APIC_ACCESS] = handle_apic_access, - [EXIT_REASON_APIC_WRITE] = handle_apic_write, - [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, - [EXIT_REASON_WBINVD] = handle_wbinvd, - [EXIT_REASON_XSETBV] = handle_xsetbv, - [EXIT_REASON_TASK_SWITCH] = handle_task_switch, - [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, - [EXIT_REASON_GDTR_IDTR] = handle_desc, - [EXIT_REASON_LDTR_TR] = handle_desc, - [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, - [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, - [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, - [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, - [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, - [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, - [EXIT_REASON_INVEPT] = handle_invept, - [EXIT_REASON_INVVPID] = handle_invvpid, - [EXIT_REASON_RDRAND] = handle_invalid_op, - [EXIT_REASON_RDSEED] = handle_invalid_op, - [EXIT_REASON_XSAVES] = handle_xsaves, - [EXIT_REASON_XRSTORS] = handle_xrstors, - [EXIT_REASON_PML_FULL] = handle_pml_full, - [EXIT_REASON_INVPCID] = handle_invpcid, - [EXIT_REASON_VMFUNC] = handle_vmfunc, - [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, - [EXIT_REASON_ENCLS] = handle_encls, -}; - -static const int kvm_vmx_max_exit_handlers = - ARRAY_SIZE(kvm_vmx_exit_handlers); - -static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - unsigned long exit_qualification; - gpa_t bitmap, last_bitmap; - unsigned int port; - int size; - u8 b; - - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - port = exit_qualification >> 16; - size = (exit_qualification & 7) + 1; - - last_bitmap = (gpa_t)-1; - b = -1; - - while (size > 0) { - if (port < 0x8000) - bitmap = vmcs12->io_bitmap_a; - else if (port < 0x10000) - bitmap = vmcs12->io_bitmap_b; - else - return true; - bitmap += (port & 0x7fff) / 8; - - if (last_bitmap != bitmap) - if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) - return true; - if (b & (1 << (port & 7))) - return true; - - port++; - size--; - last_bitmap = bitmap; - } - - return false; -} - -/* - * Return 1 if we should exit from L2 to L1 to handle an MSR access access, - * rather than handle it ourselves in L0. I.e., check whether L1 expressed - * disinterest in the current event (read or write a specific MSR) by using an - * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. - */ -static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, u32 exit_reason) -{ - u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; - gpa_t bitmap; - - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) - return true; - - /* - * The MSR_BITMAP page is divided into four 1024-byte bitmaps, - * for the four combinations of read/write and low/high MSR numbers. - * First we need to figure out which of the four to use: - */ - bitmap = vmcs12->msr_bitmap; - if (exit_reason == EXIT_REASON_MSR_WRITE) - bitmap += 2048; - if (msr_index >= 0xc0000000) { - msr_index -= 0xc0000000; - bitmap += 1024; - } - - /* Then read the msr_index'th bit from this bitmap: */ - if (msr_index < 1024*8) { - unsigned char b; - if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) - return true; - return 1 & (b >> (msr_index & 7)); - } else - return true; /* let L1 handle the wrong parameter */ -} - -/* - * Return 1 if we should exit from L2 to L1 to handle a CR access exit, - * rather than handle it ourselves in L0. I.e., check if L1 wanted to - * intercept (via guest_host_mask etc.) the current event. - */ -static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - int cr = exit_qualification & 15; - int reg; - unsigned long val; - - switch ((exit_qualification >> 4) & 3) { - case 0: /* mov to cr */ - reg = (exit_qualification >> 8) & 15; - val = kvm_register_readl(vcpu, reg); - switch (cr) { - case 0: - if (vmcs12->cr0_guest_host_mask & - (val ^ vmcs12->cr0_read_shadow)) - return true; - break; - case 3: - if ((vmcs12->cr3_target_count >= 1 && - vmcs12->cr3_target_value0 == val) || - (vmcs12->cr3_target_count >= 2 && - vmcs12->cr3_target_value1 == val) || - (vmcs12->cr3_target_count >= 3 && - vmcs12->cr3_target_value2 == val) || - (vmcs12->cr3_target_count >= 4 && - vmcs12->cr3_target_value3 == val)) - return false; - if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) - return true; - break; - case 4: - if (vmcs12->cr4_guest_host_mask & - (vmcs12->cr4_read_shadow ^ val)) - return true; - break; - case 8: - if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) - return true; - break; - } - break; - case 2: /* clts */ - if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && - (vmcs12->cr0_read_shadow & X86_CR0_TS)) - return true; - break; - case 1: /* mov from cr */ - switch (cr) { - case 3: - if (vmcs12->cpu_based_vm_exec_control & - CPU_BASED_CR3_STORE_EXITING) - return true; - break; - case 8: - if (vmcs12->cpu_based_vm_exec_control & - CPU_BASED_CR8_STORE_EXITING) - return true; - break; - } - break; - case 3: /* lmsw */ - /* - * lmsw can change bits 1..3 of cr0, and only set bit 0 of - * cr0. Other attempted changes are ignored, with no exit. - */ - val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; - if (vmcs12->cr0_guest_host_mask & 0xe & - (val ^ vmcs12->cr0_read_shadow)) - return true; - if ((vmcs12->cr0_guest_host_mask & 0x1) && - !(vmcs12->cr0_read_shadow & 0x1) && - (val & 0x1)) - return true; - break; - } - return false; -} - -static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, gpa_t bitmap) -{ - u32 vmx_instruction_info; - unsigned long field; - u8 b; - - if (!nested_cpu_has_shadow_vmcs(vmcs12)) - return true; - - /* Decode instruction info and find the field to access */ - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); - - /* Out-of-range fields always cause a VM exit from L2 to L1 */ - if (field >> 15) - return true; - - if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) - return true; - - return 1 & (b >> (field & 7)); -} - -/* - * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we - * should handle it ourselves in L0 (and then continue L2). Only call this - * when in is_guest_mode (L2). - */ -static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) -{ - u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (vmx->nested.nested_run_pending) - return false; - - if (unlikely(vmx->fail)) { - pr_info_ratelimited("%s failed vm entry %x\n", __func__, - vmcs_read32(VM_INSTRUCTION_ERROR)); - return true; - } - - /* - * The host physical addresses of some pages of guest memory - * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC - * Page). The CPU may write to these pages via their host - * physical address while L2 is running, bypassing any - * address-translation-based dirty tracking (e.g. EPT write - * protection). - * - * Mark them dirty on every exit from L2 to prevent them from - * getting out of sync with dirty tracking. - */ - nested_mark_vmcs12_pages_dirty(vcpu); - - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, - vmcs_readl(EXIT_QUALIFICATION), - vmx->idt_vectoring_info, - intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); - - switch (exit_reason) { - case EXIT_REASON_EXCEPTION_NMI: - if (is_nmi(intr_info)) - return false; - else if (is_page_fault(intr_info)) - return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; - else if (is_debug(intr_info) && - vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return false; - else if (is_breakpoint(intr_info) && - vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - return false; - return vmcs12->exception_bitmap & - (1u << (intr_info & INTR_INFO_VECTOR_MASK)); - case EXIT_REASON_EXTERNAL_INTERRUPT: - return false; - case EXIT_REASON_TRIPLE_FAULT: - return true; - case EXIT_REASON_PENDING_INTERRUPT: - return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); - case EXIT_REASON_NMI_WINDOW: - return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); - case EXIT_REASON_TASK_SWITCH: - return true; - case EXIT_REASON_CPUID: - return true; - case EXIT_REASON_HLT: - return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); - case EXIT_REASON_INVD: - return true; - case EXIT_REASON_INVLPG: - return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); - case EXIT_REASON_RDPMC: - return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); - case EXIT_REASON_RDRAND: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); - case EXIT_REASON_RDSEED: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); - case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: - return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); - case EXIT_REASON_VMREAD: - return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, - vmcs12->vmread_bitmap); - case EXIT_REASON_VMWRITE: - return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, - vmcs12->vmwrite_bitmap); - case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: - case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: - case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: - case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: - case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: - /* - * VMX instructions trap unconditionally. This allows L1 to - * emulate them for its L2 guest, i.e., allows 3-level nesting! - */ - return true; - case EXIT_REASON_CR_ACCESS: - return nested_vmx_exit_handled_cr(vcpu, vmcs12); - case EXIT_REASON_DR_ACCESS: - return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); - case EXIT_REASON_IO_INSTRUCTION: - return nested_vmx_exit_handled_io(vcpu, vmcs12); - case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); - case EXIT_REASON_MSR_READ: - case EXIT_REASON_MSR_WRITE: - return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); - case EXIT_REASON_INVALID_STATE: - return true; - case EXIT_REASON_MWAIT_INSTRUCTION: - return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); - case EXIT_REASON_MONITOR_TRAP_FLAG: - return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); - case EXIT_REASON_MONITOR_INSTRUCTION: - return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); - case EXIT_REASON_PAUSE_INSTRUCTION: - return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || - nested_cpu_has2(vmcs12, - SECONDARY_EXEC_PAUSE_LOOP_EXITING); - case EXIT_REASON_MCE_DURING_VMENTRY: - return false; - case EXIT_REASON_TPR_BELOW_THRESHOLD: - return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); - case EXIT_REASON_APIC_ACCESS: - case EXIT_REASON_APIC_WRITE: - case EXIT_REASON_EOI_INDUCED: - /* - * The controls for "virtualize APIC accesses," "APIC- - * register virtualization," and "virtual-interrupt - * delivery" only come from vmcs12. - */ - return true; - case EXIT_REASON_EPT_VIOLATION: - /* - * L0 always deals with the EPT violation. If nested EPT is - * used, and the nested mmu code discovers that the address is - * missing in the guest EPT table (EPT12), the EPT violation - * will be injected with nested_ept_inject_page_fault() - */ - return false; - case EXIT_REASON_EPT_MISCONFIG: - /* - * L2 never uses directly L1's EPT, but rather L0's own EPT - * table (shadow on EPT) or a merged EPT table that L0 built - * (EPT on EPT). So any problems with the structure of the - * table is L0's fault. - */ - return false; - case EXIT_REASON_INVPCID: - return - nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && - nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); - case EXIT_REASON_WBINVD: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); - case EXIT_REASON_XSETBV: - return true; - case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: - /* - * This should never happen, since it is not possible to - * set XSS to a non-zero value---neither in L1 nor in L2. - * If if it were, XSS would have to be checked against - * the XSS exit bitmap in vmcs12. - */ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); - case EXIT_REASON_PREEMPTION_TIMER: - return false; - case EXIT_REASON_PML_FULL: - /* We emulate PML support to L1. */ - return false; - case EXIT_REASON_VMFUNC: - /* VM functions are emulated through L2->L0 vmexits. */ - return false; - case EXIT_REASON_ENCLS: - /* SGX is never exposed to L1 */ - return false; - default: - return true; - } -} - -static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason) -{ - u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - /* - * At this point, the exit interruption info in exit_intr_info - * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT - * we need to query the in-kernel LAPIC. - */ - WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT); - if ((exit_intr_info & - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs12->vm_exit_intr_error_code = - vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - } - - nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, - vmcs_readl(EXIT_QUALIFICATION)); - return 1; -} - -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) -{ - *info1 = vmcs_readl(EXIT_QUALIFICATION); - *info2 = vmcs_read32(VM_EXIT_INTR_INFO); -} - -static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) -{ - if (vmx->pml_pg) { - __free_page(vmx->pml_pg); - vmx->pml_pg = NULL; - } -} - -static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 *pml_buf; - u16 pml_idx; - - pml_idx = vmcs_read16(GUEST_PML_INDEX); - - /* Do nothing if PML buffer is empty */ - if (pml_idx == (PML_ENTITY_NUM - 1)) - return; - - /* PML index always points to next available PML buffer entity */ - if (pml_idx >= PML_ENTITY_NUM) - pml_idx = 0; - else - pml_idx++; - - pml_buf = page_address(vmx->pml_pg); - for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { - u64 gpa; - - gpa = pml_buf[pml_idx]; - WARN_ON(gpa & (PAGE_SIZE - 1)); - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); - } - - /* reset PML index */ - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); -} - -/* - * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. - * Called before reporting dirty_bitmap to userspace. - */ -static void kvm_flush_pml_buffers(struct kvm *kvm) -{ - int i; - struct kvm_vcpu *vcpu; - /* - * We only need to kick vcpu out of guest mode here, as PML buffer - * is flushed at beginning of all VMEXITs, and it's obvious that only - * vcpus running in guest are possible to have unflushed GPAs in PML - * buffer. - */ - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vcpu_kick(vcpu); -} - -static void vmx_dump_sel(char *name, uint32_t sel) -{ - pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", - name, vmcs_read16(sel), - vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), - vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), - vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); -} - -static void vmx_dump_dtsel(char *name, uint32_t limit) -{ - pr_err("%s limit=0x%08x, base=0x%016lx\n", - name, vmcs_read32(limit), - vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); -} - -static void dump_vmcs(void) -{ - u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); - u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); - u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); - u32 secondary_exec_control = 0; - unsigned long cr4 = vmcs_readl(GUEST_CR4); - u64 efer = vmcs_read64(GUEST_IA32_EFER); - int i, n; - - if (cpu_has_secondary_exec_ctrls()) - secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - - pr_err("*** Guest State ***\n"); - pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", - vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), - vmcs_readl(CR0_GUEST_HOST_MASK)); - pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", - cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); - pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); - if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && - (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) - { - pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", - vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); - pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", - vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); - } - pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", - vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); - pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", - vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); - pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", - vmcs_readl(GUEST_SYSENTER_ESP), - vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); - vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); - vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); - vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); - vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); - vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); - vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); - vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); - vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); - vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); - vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); - if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || - (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) - pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", - efer, vmcs_read64(GUEST_IA32_PAT)); - pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", - vmcs_read64(GUEST_IA32_DEBUGCTL), - vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); - if (cpu_has_load_perf_global_ctrl && - vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) - pr_err("PerfGlobCtl = 0x%016llx\n", - vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); - if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) - pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); - pr_err("Interruptibility = %08x ActivityState = %08x\n", - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), - vmcs_read32(GUEST_ACTIVITY_STATE)); - if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) - pr_err("InterruptStatus = %04x\n", - vmcs_read16(GUEST_INTR_STATUS)); - - pr_err("*** Host State ***\n"); - pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", - vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); - pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", - vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), - vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), - vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), - vmcs_read16(HOST_TR_SELECTOR)); - pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", - vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), - vmcs_readl(HOST_TR_BASE)); - pr_err("GDTBase=%016lx IDTBase=%016lx\n", - vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); - pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", - vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), - vmcs_readl(HOST_CR4)); - pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", - vmcs_readl(HOST_IA32_SYSENTER_ESP), - vmcs_read32(HOST_IA32_SYSENTER_CS), - vmcs_readl(HOST_IA32_SYSENTER_EIP)); - if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) - pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", - vmcs_read64(HOST_IA32_EFER), - vmcs_read64(HOST_IA32_PAT)); - if (cpu_has_load_perf_global_ctrl && - vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - pr_err("PerfGlobCtl = 0x%016llx\n", - vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); - - pr_err("*** Control State ***\n"); - pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", - pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); - pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); - pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", - vmcs_read32(EXCEPTION_BITMAP), - vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), - vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); - pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", - vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), - vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), - vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); - pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); - pr_err(" reason=%08x qualification=%016lx\n", - vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); - pr_err("IDTVectoring: info=%08x errcode=%08x\n", - vmcs_read32(IDT_VECTORING_INFO_FIELD), - vmcs_read32(IDT_VECTORING_ERROR_CODE)); - pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); - if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) - pr_err("TSC Multiplier = 0x%016llx\n", - vmcs_read64(TSC_MULTIPLIER)); - if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) - pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); - if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) - pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); - if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) - pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); - n = vmcs_read32(CR3_TARGET_COUNT); - for (i = 0; i + 1 < n; i += 4) - pr_err("CR3 target%u=%016lx target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), - i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); - if (i < n) - pr_err("CR3 target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); - if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) - pr_err("PLE Gap=%08x Window=%08x\n", - vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); - if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) - pr_err("Virtual processor ID = 0x%04x\n", - vmcs_read16(VIRTUAL_PROCESSOR_ID)); -} - -/* - * The guest has exited. See if we can fix it or if we need userspace - * assistance. - */ -static int vmx_handle_exit(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason = vmx->exit_reason; - u32 vectoring_info = vmx->idt_vectoring_info; - - trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); - - /* - * Flush logged GPAs PML buffer, this will make dirty_bitmap more - * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before - * querying dirty_bitmap, we only need to kick all vcpus out of guest - * mode as if vcpus is in root mode, the PML buffer must has been - * flushed already. - */ - if (enable_pml) - vmx_flush_pml_buffer(vcpu); - - /* If guest state is invalid, start emulating */ - if (vmx->emulation_required) - return handle_invalid_guest_state(vcpu); - - if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) - return nested_vmx_reflect_vmexit(vcpu, exit_reason); - - if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { - dump_vmcs(); - vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; - vcpu->run->fail_entry.hardware_entry_failure_reason - = exit_reason; - return 0; - } - - if (unlikely(vmx->fail)) { - vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; - vcpu->run->fail_entry.hardware_entry_failure_reason - = vmcs_read32(VM_INSTRUCTION_ERROR); - return 0; - } - - /* - * Note: - * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by - * delivery event since it indicates guest is accessing MMIO. - * The vm-exit can be triggered again after return to guest that - * will cause infinite loop. - */ - if ((vectoring_info & VECTORING_INFO_VALID_MASK) && - (exit_reason != EXIT_REASON_EXCEPTION_NMI && - exit_reason != EXIT_REASON_EPT_VIOLATION && - exit_reason != EXIT_REASON_PML_FULL && - exit_reason != EXIT_REASON_TASK_SWITCH)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; - vcpu->run->internal.ndata = 3; - vcpu->run->internal.data[0] = vectoring_info; - vcpu->run->internal.data[1] = exit_reason; - vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; - if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { - vcpu->run->internal.ndata++; - vcpu->run->internal.data[3] = - vmcs_read64(GUEST_PHYSICAL_ADDRESS); - } - return 0; - } - - if (unlikely(!enable_vnmi && - vmx->loaded_vmcs->soft_vnmi_blocked)) { - if (vmx_interrupt_allowed(vcpu)) { - vmx->loaded_vmcs->soft_vnmi_blocked = 0; - } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && - vcpu->arch.nmi_pending) { - /* - * This CPU don't support us in finding the end of an - * NMI-blocked window if the guest runs with IRQs - * disabled. So we pull the trigger after 1 s of - * futile waiting, but inform the user about this. - */ - printk(KERN_WARNING "%s: Breaking out of NMI-blocked " - "state on VCPU %d after 1 s timeout\n", - __func__, vcpu->vcpu_id); - vmx->loaded_vmcs->soft_vnmi_blocked = 0; - } - } - - if (exit_reason < kvm_vmx_max_exit_handlers - && kvm_vmx_exit_handlers[exit_reason]) - return kvm_vmx_exit_handlers[exit_reason](vcpu); - else { - vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", - exit_reason); - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } -} - -/* - * Software based L1D cache flush which is used when microcode providing - * the cache control MSR is not loaded. - * - * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to - * flush it is required to read in 64 KiB because the replacement algorithm - * is not exactly LRU. This could be sized at runtime via topology - * information but as all relevant affected CPUs have 32KiB L1D cache size - * there is no point in doing so. - */ -static void vmx_l1d_flush(struct kvm_vcpu *vcpu) -{ - int size = PAGE_SIZE << L1D_CACHE_ORDER; - - /* - * This code is only executed when the the flush mode is 'cond' or - * 'always' - */ - if (static_branch_likely(&vmx_l1d_flush_cond)) { - bool flush_l1d; - - /* - * Clear the per-vcpu flush bit, it gets set again - * either from vcpu_run() or from one of the unsafe - * VMEXIT handlers. - */ - flush_l1d = vcpu->arch.l1tf_flush_l1d; - vcpu->arch.l1tf_flush_l1d = false; - - /* - * Clear the per-cpu flush bit, it gets set again from - * the interrupt handlers. - */ - flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); - kvm_clear_cpu_l1tf_flush_l1d(); - - if (!flush_l1d) - return; - } - - vcpu->stat.l1d_flush++; - - if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); - return; - } - - asm volatile( - /* First ensure the pages are in the TLB */ - "xorl %%eax, %%eax\n" - ".Lpopulate_tlb:\n\t" - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" - "addl $4096, %%eax\n\t" - "cmpl %%eax, %[size]\n\t" - "jne .Lpopulate_tlb\n\t" - "xorl %%eax, %%eax\n\t" - "cpuid\n\t" - /* Now fill the cache */ - "xorl %%eax, %%eax\n" - ".Lfill_cache:\n" - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" - "addl $64, %%eax\n\t" - "cmpl %%eax, %[size]\n\t" - "jne .Lfill_cache\n\t" - "lfence\n" - :: [flush_pages] "r" (vmx_l1d_flush_pages), - [size] "r" (size) - : "eax", "ebx", "ecx", "edx"); -} - -static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (is_guest_mode(vcpu) && - nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) - return; - - if (irr == -1 || tpr < irr) { - vmcs_write32(TPR_THRESHOLD, 0); - return; - } - - vmcs_write32(TPR_THRESHOLD, irr); -} - -static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) -{ - u32 sec_exec_control; - - if (!lapic_in_kernel(vcpu)) - return; - - if (!flexpriority_enabled && - !cpu_has_vmx_virtualize_x2apic_mode()) - return; - - /* Postpone execution until vmcs01 is the current VMCS. */ - if (is_guest_mode(vcpu)) { - to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; - return; - } - - sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); - - switch (kvm_get_apic_mode(vcpu)) { - case LAPIC_MODE_INVALID: - WARN_ONCE(true, "Invalid local APIC state"); - case LAPIC_MODE_DISABLED: - break; - case LAPIC_MODE_XAPIC: - if (flexpriority_enabled) { - sec_exec_control |= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmx_flush_tlb(vcpu, true); - } - break; - case LAPIC_MODE_X2APIC: - if (cpu_has_vmx_virtualize_x2apic_mode()) - sec_exec_control |= - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; - break; - } - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); - - vmx_update_msr_bitmap(vcpu); -} - -static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) -{ - if (!is_guest_mode(vcpu)) { - vmcs_write64(APIC_ACCESS_ADDR, hpa); - vmx_flush_tlb(vcpu, true); - } -} - -static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) -{ - u16 status; - u8 old; - - if (max_isr == -1) - max_isr = 0; - - status = vmcs_read16(GUEST_INTR_STATUS); - old = status >> 8; - if (max_isr != old) { - status &= 0xff; - status |= max_isr << 8; - vmcs_write16(GUEST_INTR_STATUS, status); - } -} - -static void vmx_set_rvi(int vector) -{ - u16 status; - u8 old; - - if (vector == -1) - vector = 0; - - status = vmcs_read16(GUEST_INTR_STATUS); - old = (u8)status & 0xff; - if ((u8)vector != old) { - status &= ~0xff; - status |= (u8)vector; - vmcs_write16(GUEST_INTR_STATUS, status); - } -} - -static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) -{ - /* - * When running L2, updating RVI is only relevant when - * vmcs12 virtual-interrupt-delivery enabled. - * However, it can be enabled only when L1 also - * intercepts external-interrupts and in that case - * we should not update vmcs02 RVI but instead intercept - * interrupt. Therefore, do nothing when running L2. - */ - if (!is_guest_mode(vcpu)) - vmx_set_rvi(max_irr); -} - -static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int max_irr; - bool max_irr_updated; - - WARN_ON(!vcpu->arch.apicv_active); - if (pi_test_on(&vmx->pi_desc)) { - pi_clear_on(&vmx->pi_desc); - /* - * IOMMU can write to PIR.ON, so the barrier matters even on UP. - * But on x86 this is just a compiler barrier anyway. - */ - smp_mb__after_atomic(); - max_irr_updated = - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); - - /* - * If we are running L2 and L1 has a new pending interrupt - * which can be injected, we should re-evaluate - * what should be done with this new L1 interrupt. - * If L1 intercepts external-interrupts, we should - * exit from L2 to L1. Otherwise, interrupt should be - * delivered directly to L2. - */ - if (is_guest_mode(vcpu) && max_irr_updated) { - if (nested_exit_on_intr(vcpu)) - kvm_vcpu_exiting_guest_mode(vcpu); - else - kvm_make_request(KVM_REQ_EVENT, vcpu); - } - } else { - max_irr = kvm_lapic_find_highest_irr(vcpu); - } - vmx_hwapic_irr_update(vcpu, max_irr); - return max_irr; -} - -static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) -{ - u8 rvi = vmx_get_rvi(); - u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); - - return ((rvi & 0xf0) > (vppr & 0xf0)); -} - -static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) -{ - if (!kvm_vcpu_apicv_active(vcpu)) - return; - - vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); - vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); - vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); - vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); -} - -static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - pi_clear_on(&vmx->pi_desc); - memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); -} - -static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) -{ - u32 exit_intr_info = 0; - u16 basic_exit_reason = (u16)vmx->exit_reason; - - if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY - || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) - return; - - if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - vmx->exit_intr_info = exit_intr_info; - - /* if exit due to PF check for async PF */ - if (is_page_fault(exit_intr_info)) - vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); - - /* Handle machine checks before interrupts are enabled */ - if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || - is_machine_check(exit_intr_info)) - kvm_machine_check(); - - /* We need to handle NMIs before interrupts are enabled */ - if (is_nmi(exit_intr_info)) { - kvm_before_interrupt(&vmx->vcpu); - asm("int $2"); - kvm_after_interrupt(&vmx->vcpu); - } -} - -static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) -{ - u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) - == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { - unsigned int vector; - unsigned long entry; - gate_desc *desc; - struct vcpu_vmx *vmx = to_vmx(vcpu); -#ifdef CONFIG_X86_64 - unsigned long tmp; -#endif - - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - desc = (gate_desc *)vmx->host_idt_base + vector; - entry = gate_offset(desc); - asm volatile( -#ifdef CONFIG_X86_64 - "mov %%" _ASM_SP ", %[sp]\n\t" - "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" - "push $%c[ss]\n\t" - "push %[sp]\n\t" -#endif - "pushf\n\t" - __ASM_SIZE(push) " $%c[cs]\n\t" - CALL_NOSPEC - : -#ifdef CONFIG_X86_64 - [sp]"=&r"(tmp), -#endif - ASM_CALL_CONSTRAINT - : - THUNK_TARGET(entry), - [ss]"i"(__KERNEL_DS), - [cs]"i"(__KERNEL_CS) - ); - } -} -STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); - -static bool vmx_has_emulated_msr(int index) -{ - switch (index) { - case MSR_IA32_SMBASE: - /* - * We cannot do SMM unless we can run the guest in big - * real mode. - */ - return enable_unrestricted_guest || emulate_invalid_guest_state; - case MSR_AMD64_VIRT_SPEC_CTRL: - /* This is AMD only. */ - return false; - default: - return true; - } -} - -static bool vmx_mpx_supported(void) -{ - return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && - (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); -} - -static bool vmx_xsaves_supported(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_XSAVES; -} - -static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) -{ - u32 exit_intr_info; - bool unblock_nmi; - u8 vector; - bool idtv_info_valid; - - idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - - if (enable_vnmi) { - if (vmx->loaded_vmcs->nmi_known_unmasked) - return; - /* - * Can't use vmx->exit_intr_info since we're not sure what - * the exit reason is. - */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - /* - * SDM 3: 27.7.1.2 (September 2008) - * Re-set bit "block by NMI" before VM entry if vmexit caused by - * a guest IRET fault. - * SDM 3: 23.2.2 (September 2008) - * Bit 12 is undefined in any of the following cases: - * If the VM exit sets the valid bit in the IDT-vectoring - * information field. - * If the VM exit is due to a double fault. - */ - if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && - vector != DF_VECTOR && !idtv_info_valid) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmx->loaded_vmcs->nmi_known_unmasked = - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) - & GUEST_INTR_STATE_NMI); - } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) - vmx->loaded_vmcs->vnmi_blocked_time += - ktime_to_ns(ktime_sub(ktime_get(), - vmx->loaded_vmcs->entry_time)); -} - -static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, - u32 idt_vectoring_info, - int instr_len_field, - int error_code_field) -{ - u8 vector; - int type; - bool idtv_info_valid; - - idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; - - vcpu->arch.nmi_injected = false; - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); - - if (!idtv_info_valid) - return; - - kvm_make_request(KVM_REQ_EVENT, vcpu); - - vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; - type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; - - switch (type) { - case INTR_TYPE_NMI_INTR: - vcpu->arch.nmi_injected = true; - /* - * SDM 3: 27.7.1.2 (September 2008) - * Clear bit "block by NMI" before VM entry if a NMI - * delivery faulted. - */ - vmx_set_nmi_mask(vcpu, false); - break; - case INTR_TYPE_SOFT_EXCEPTION: - vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); - /* fall through */ - case INTR_TYPE_HARD_EXCEPTION: - if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(error_code_field); - kvm_requeue_exception_e(vcpu, vector, err); - } else - kvm_requeue_exception(vcpu, vector); - break; - case INTR_TYPE_SOFT_INTR: - vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); - /* fall through */ - case INTR_TYPE_EXT_INTR: - kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); - break; - default: - break; - } -} - -static void vmx_complete_interrupts(struct vcpu_vmx *vmx) -{ - __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, - VM_EXIT_INSTRUCTION_LEN, - IDT_VECTORING_ERROR_CODE); -} - -static void vmx_cancel_injection(struct kvm_vcpu *vcpu) -{ - __vmx_complete_interrupts(vcpu, - vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), - VM_ENTRY_INSTRUCTION_LEN, - VM_ENTRY_EXCEPTION_ERROR_CODE); - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); -} - -static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) -{ - int i, nr_msrs; - struct perf_guest_switch_msr *msrs; - - msrs = perf_guest_get_msrs(&nr_msrs); - - if (!msrs) - return; - - for (i = 0; i < nr_msrs; i++) - if (msrs[i].host == msrs[i].guest) - clear_atomic_switch_msr(vmx, msrs[i].msr); - else - add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, - msrs[i].host, false); -} - -static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); - if (!vmx->loaded_vmcs->hv_timer_armed) - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - vmx->loaded_vmcs->hv_timer_armed = true; -} - -static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tscl; - u32 delta_tsc; - - if (vmx->req_immediate_exit) { - vmx_arm_hv_timer(vmx, 0); - return; - } - - if (vmx->hv_deadline_tsc != -1) { - tscl = rdtsc(); - if (vmx->hv_deadline_tsc > tscl) - /* set_hv_timer ensures the delta fits in 32-bits */ - delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> - cpu_preemption_timer_multi); - else - delta_tsc = 0; - - vmx_arm_hv_timer(vmx, delta_tsc); - return; - } - - if (vmx->loaded_vmcs->hv_timer_armed) - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - vmx->loaded_vmcs->hv_timer_armed = false; -} - -static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4, evmcs_rsp; - - /* Record the guest's net vcpu time for enforced NMI injections. */ - if (unlikely(!enable_vnmi && - vmx->loaded_vmcs->soft_vnmi_blocked)) - vmx->loaded_vmcs->entry_time = ktime_get(); - - /* Don't enter VMX if guest state is invalid, let the exit handler - start emulation until we arrive back to a valid state */ - if (vmx->emulation_required) - return; - - if (vmx->ple_window_dirty) { - vmx->ple_window_dirty = false; - vmcs_write32(PLE_WINDOW, vmx->ple_window); - } - - if (vmx->nested.need_vmcs12_sync) { - /* - * hv_evmcs may end up being not mapped after migration (when - * L2 was running), map it here to make sure vmcs12 changes are - * properly reflected. - */ - if (vmx->nested.enlightened_vmcs_enabled && - !vmx->nested.hv_evmcs) - nested_vmx_handle_enlightened_vmptrld(vcpu, false); - - if (vmx->nested.hv_evmcs) { - copy_vmcs12_to_enlightened(vmx); - /* All fields are clean */ - vmx->nested.hv_evmcs->hv_clean_fields |= - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - } else { - copy_vmcs12_to_shadow(vmx); - } - vmx->nested.need_vmcs12_sync = false; - } - - if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) - vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); - - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { - vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->host_state.cr3 = cr3; - } - - cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { - vmcs_writel(HOST_CR4, cr4); - vmx->loaded_vmcs->host_state.cr4 = cr4; - } - - /* When single-stepping over STI and MOV SS, we must clear the - * corresponding interruptibility bits in the guest state. Otherwise - * vmentry fails as it then expects bit 14 (BS) in pending debug - * exceptions being set, but that's not correct for the guest debugging - * case. */ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vmx_set_interrupt_shadow(vcpu, 0); - - if (static_cpu_has(X86_FEATURE_PKU) && - kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && - vcpu->arch.pkru != vmx->host_pkru) - __write_pkru(vcpu->arch.pkru); - - atomic_switch_perf_msrs(vmx); - - vmx_update_hv_timer(vcpu); - - /* - * If this vCPU has touched SPEC_CTRL, restore the guest's value if - * it's non-zero. Since vmentry is serialising on affected CPUs, there - * is no need to worry about the conditional branch over the wrmsr - * being speculatively taken. - */ - x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); - - vmx->__launched = vmx->loaded_vmcs->launched; - - evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? - (unsigned long)¤t_evmcs->host_rsp : 0; - - if (static_branch_unlikely(&vmx_l1d_should_flush)) - vmx_l1d_flush(vcpu); - - asm( - /* Store host registers */ - "push %%" _ASM_DX "; push %%" _ASM_BP ";" - "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ - "push %%" _ASM_CX " \n\t" - "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" - "je 1f \n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" - /* Avoid VMWRITE when Enlightened VMCS is in use */ - "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" - "jz 2f \n\t" - "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" - "jmp 1f \n\t" - "2: \n\t" - __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "1: \n\t" - /* Reload cr2 if changed */ - "mov %c[cr2](%0), %%" _ASM_AX " \n\t" - "mov %%cr2, %%" _ASM_DX " \n\t" - "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" - "je 3f \n\t" - "mov %%" _ASM_AX", %%cr2 \n\t" - "3: \n\t" - /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%0) \n\t" - /* Load guest registers. Don't clobber flags. */ - "mov %c[rax](%0), %%" _ASM_AX " \n\t" - "mov %c[rbx](%0), %%" _ASM_BX " \n\t" - "mov %c[rdx](%0), %%" _ASM_DX " \n\t" - "mov %c[rsi](%0), %%" _ASM_SI " \n\t" - "mov %c[rdi](%0), %%" _ASM_DI " \n\t" - "mov %c[rbp](%0), %%" _ASM_BP " \n\t" -#ifdef CONFIG_X86_64 - "mov %c[r8](%0), %%r8 \n\t" - "mov %c[r9](%0), %%r9 \n\t" - "mov %c[r10](%0), %%r10 \n\t" - "mov %c[r11](%0), %%r11 \n\t" - "mov %c[r12](%0), %%r12 \n\t" - "mov %c[r13](%0), %%r13 \n\t" - "mov %c[r14](%0), %%r14 \n\t" - "mov %c[r15](%0), %%r15 \n\t" -#endif - "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ - - /* Enter guest mode */ - "jne 1f \n\t" - __ex("vmlaunch") "\n\t" - "jmp 2f \n\t" - "1: " __ex("vmresume") "\n\t" - "2: " - /* Save guest registers, load host registers, keep flags */ - "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" - "pop %0 \n\t" - "setbe %c[fail](%0)\n\t" - "mov %%" _ASM_AX ", %c[rax](%0) \n\t" - "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" - __ASM_SIZE(pop) " %c[rcx](%0) \n\t" - "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" - "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" - "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" - "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" -#ifdef CONFIG_X86_64 - "mov %%r8, %c[r8](%0) \n\t" - "mov %%r9, %c[r9](%0) \n\t" - "mov %%r10, %c[r10](%0) \n\t" - "mov %%r11, %c[r11](%0) \n\t" - "mov %%r12, %c[r12](%0) \n\t" - "mov %%r13, %c[r13](%0) \n\t" - "mov %%r14, %c[r14](%0) \n\t" - "mov %%r15, %c[r15](%0) \n\t" - /* - * Clear host registers marked as clobbered to prevent - * speculative use. - */ - "xor %%r8d, %%r8d \n\t" - "xor %%r9d, %%r9d \n\t" - "xor %%r10d, %%r10d \n\t" - "xor %%r11d, %%r11d \n\t" - "xor %%r12d, %%r12d \n\t" - "xor %%r13d, %%r13d \n\t" - "xor %%r14d, %%r14d \n\t" - "xor %%r15d, %%r15d \n\t" -#endif - "mov %%cr2, %%" _ASM_AX " \n\t" - "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" - - "xor %%eax, %%eax \n\t" - "xor %%ebx, %%ebx \n\t" - "xor %%esi, %%esi \n\t" - "xor %%edi, %%edi \n\t" - "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" - ".pushsection .rodata \n\t" - ".global vmx_return \n\t" - "vmx_return: " _ASM_PTR " 2b \n\t" - ".popsection" - : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), - [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), - [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), -#ifdef CONFIG_X86_64 - [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), -#endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), - [wordsize]"i"(sizeof(ulong)) - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rax", "rbx", "rdi" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "eax", "ebx", "edi" -#endif - ); - - /* - * We do not use IBRS in the kernel. If this vCPU has used the - * SPEC_CTRL MSR it may have left it on; save the value and - * turn it off. This is much more efficient than blindly adding - * it to the atomic save/restore list. Especially as the former - * (Saving guest MSRs on vmexit) doesn't even exist in KVM. - * - * For non-nested case: - * If the L01 MSR bitmap does not intercept the MSR, then we need to - * save it. - * - * For nested case: - * If the L02 MSR bitmap does not intercept the MSR, then we need to - * save it. - */ - if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) - vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - - x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); - - /* Eliminate branch target predictions from guest mode */ - vmexit_fill_RSB(); - - /* All fields are clean at this point */ - if (static_branch_unlikely(&enable_evmcs)) - current_evmcs->hv_clean_fields |= - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - - /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (vmx->host_debugctlmsr) - update_debugctlmsr(vmx->host_debugctlmsr); - -#ifndef CONFIG_X86_64 - /* - * The sysexit path does not restore ds/es, so we must set them to - * a reasonable value ourselves. - * - * We can't defer this to vmx_prepare_switch_to_host() since that - * function may be executed in interrupt context, which saves and - * restore segments around it, nullifying its effect. - */ - loadsegment(ds, __USER_DS); - loadsegment(es, __USER_DS); -#endif - - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) - | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_PDPTR) - | (1 << VCPU_EXREG_SEGMENTS) - | (1 << VCPU_EXREG_CR3)); - vcpu->arch.regs_dirty = 0; - - /* - * eager fpu is enabled if PKEY is supported and CR4 is switched - * back on host, so it is safe to read guest PKRU from current - * XSAVE. - */ - if (static_cpu_has(X86_FEATURE_PKU) && - kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { - vcpu->arch.pkru = __read_pkru(); - if (vcpu->arch.pkru != vmx->host_pkru) - __write_pkru(vmx->host_pkru); - } - - vmx->nested.nested_run_pending = 0; - vmx->idt_vectoring_info = 0; - - vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); - if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) - return; - - vmx->loaded_vmcs->launched = 1; - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - - vmx_complete_atomic_exit(vmx); - vmx_recover_nmi_blocking(vmx); - vmx_complete_interrupts(vmx); -} -STACK_FRAME_NON_STANDARD(vmx_vcpu_run); - -static struct kvm *vmx_vm_alloc(void) -{ - struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx)); - return &kvm_vmx->kvm; -} - -static void vmx_vm_free(struct kvm *kvm) -{ - vfree(to_kvm_vmx(kvm)); -} - -static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; - - if (vmx->loaded_vmcs == vmcs) - return; - - cpu = get_cpu(); - vmx_vcpu_put(vcpu); - vmx->loaded_vmcs = vmcs; - vmx_vcpu_load(vcpu, cpu); - put_cpu(); - - vm_entry_controls_reset_shadow(vmx); - vm_exit_controls_reset_shadow(vmx); - vmx_segment_cache_clear(vmx); -} - -/* - * Ensure that the current vmcs of the logical processor is the - * vmcs01 of the vcpu before calling free_nested(). - */ -static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); - free_nested(vcpu); - vcpu_put(vcpu); -} - -static void vmx_free_vcpu(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (enable_pml) - vmx_destroy_pml_buffer(vmx); - free_vpid(vmx->vpid); - leave_guest_mode(vcpu); - vmx_free_vcpu_nested(vcpu); - free_loaded_vmcs(vmx->loaded_vmcs); - kfree(vmx->guest_msrs); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vmx); -} - -static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) -{ - int err; - struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - unsigned long *msr_bitmap; - int cpu; - - if (!vmx) - return ERR_PTR(-ENOMEM); - - vmx->vpid = allocate_vpid(); - - err = kvm_vcpu_init(&vmx->vcpu, kvm, id); - if (err) - goto free_vcpu; - - err = -ENOMEM; - - /* - * If PML is turned on, failure on enabling PML just results in failure - * of creating the vcpu, therefore we can simplify PML logic (by - * avoiding dealing with cases, such as enabling PML partially on vcpus - * for the guest, etc. - */ - if (enable_pml) { - vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!vmx->pml_pg) - goto uninit_vcpu; - } - - vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) - > PAGE_SIZE); - - if (!vmx->guest_msrs) - goto free_pml; - - err = alloc_loaded_vmcs(&vmx->vmcs01); - if (err < 0) - goto free_msrs; - - msr_bitmap = vmx->vmcs01.msr_bitmap; - vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); - vmx->msr_bitmap_mode = 0; - - vmx->loaded_vmcs = &vmx->vmcs01; - cpu = get_cpu(); - vmx_vcpu_load(&vmx->vcpu, cpu); - vmx->vcpu.cpu = cpu; - vmx_vcpu_setup(vmx); - vmx_vcpu_put(&vmx->vcpu); - put_cpu(); - if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { - err = alloc_apic_access_page(kvm); - if (err) - goto free_vmcs; - } - - if (enable_ept && !enable_unrestricted_guest) { - err = init_rmode_identity_map(kvm); - if (err) - goto free_vmcs; - } - - if (nested) - nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, - kvm_vcpu_apicv_active(&vmx->vcpu)); - - vmx->nested.posted_intr_nv = -1; - vmx->nested.current_vmptr = -1ull; - - vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; - - /* - * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR - * or POSTED_INTR_WAKEUP_VECTOR. - */ - vmx->pi_desc.nv = POSTED_INTR_VECTOR; - vmx->pi_desc.sn = 1; - - return &vmx->vcpu; - -free_vmcs: - free_loaded_vmcs(vmx->loaded_vmcs); -free_msrs: - kfree(vmx->guest_msrs); -free_pml: - vmx_destroy_pml_buffer(vmx); -uninit_vcpu: - kvm_vcpu_uninit(&vmx->vcpu); -free_vcpu: - free_vpid(vmx->vpid); - kmem_cache_free(kvm_vcpu_cache, vmx); - return ERR_PTR(err); -} - -#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" -#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" - -static int vmx_vm_init(struct kvm *kvm) -{ - spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); - - if (!ple_gap) - kvm->arch.pause_in_guest = true; - - if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { - switch (l1tf_mitigation) { - case L1TF_MITIGATION_OFF: - case L1TF_MITIGATION_FLUSH_NOWARN: - /* 'I explicitly don't care' is set */ - break; - case L1TF_MITIGATION_FLUSH: - case L1TF_MITIGATION_FLUSH_NOSMT: - case L1TF_MITIGATION_FULL: - /* - * Warn upon starting the first VM in a potentially - * insecure environment. - */ - if (cpu_smt_control == CPU_SMT_ENABLED) - pr_warn_once(L1TF_MSG_SMT); - if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) - pr_warn_once(L1TF_MSG_L1D); - break; - case L1TF_MITIGATION_FULL_FORCE: - /* Flush is enforced */ - break; - } - } - return 0; -} - -static void __init vmx_check_processor_compat(void *rtn) -{ - struct vmcs_config vmcs_conf; - - *(int *)rtn = 0; - if (setup_vmcs_config(&vmcs_conf) < 0) - *(int *)rtn = -EIO; - nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv); - if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { - printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", - smp_processor_id()); - *(int *)rtn = -EIO; - } -} - -static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - u8 cache; - u64 ipat = 0; - - /* For VT-d and EPT combination - * 1. MMIO: always map as UC - * 2. EPT with VT-d: - * a. VT-d without snooping control feature: can't guarantee the - * result, try to trust guest. - * b. VT-d with snooping control feature: snooping control feature of - * VT-d engine can guarantee the cache correctness. Just set it - * to WB to keep consistent with host. So the same as item 3. - * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep - * consistent with host MTRR - */ - if (is_mmio) { - cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } - - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { - ipat = VMX_EPT_IPAT_BIT; - cache = MTRR_TYPE_WRBACK; - goto exit; - } - - if (kvm_read_cr0(vcpu) & X86_CR0_CD) { - ipat = VMX_EPT_IPAT_BIT; - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - cache = MTRR_TYPE_WRBACK; - else - cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } - - cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); - -exit: - return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; -} - -static int vmx_get_lpage_level(void) -{ - if (enable_ept && !cpu_has_vmx_ept_1g_page()) - return PT_DIRECTORY_LEVEL; - else - /* For shadow and EPT supported 1GB page */ - return PT_PDPE_LEVEL; -} - -static void vmcs_set_secondary_exec_control(u32 new_ctl) -{ - /* - * These bits in the secondary execution controls field - * are dynamic, the others are mostly based on the hypervisor - * architecture and the guest's CPUID. Do not touch the - * dynamic bits. - */ - u32 mask = - SECONDARY_EXEC_SHADOW_VMCS | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_DESC; - - u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - (new_ctl & ~mask) | (cur_ctl & mask)); -} - -/* - * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits - * (indicating "allowed-1") if they are supported in the guest's CPUID. - */ -static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_cpuid_entry2 *entry; - - vmx->nested.msrs.cr0_fixed1 = 0xffffffff; - vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; - -#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ - if (entry && (entry->_reg & (_cpuid_mask))) \ - vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ -} while (0) - - entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); - cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); - cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); - cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); - cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); - cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); - cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); - cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); - cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); - cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); - cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); - cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); - cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); - - entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); - cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); - cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); - cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); - cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); - cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); - -#undef cr4_fixed1_update -} - -static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (kvm_mpx_supported()) { - bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); - - if (mpx_enabled) { - vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; - } else { - vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; - } - } -} - -static void vmx_cpuid_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - vmcs_set_secondary_exec_control(vmx->secondary_exec_control); - } - - if (nested_vmx_allowed(vcpu)) - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= - FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - else - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= - ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - - if (nested_vmx_allowed(vcpu)) { - nested_vmx_cr_fixed1_bits_update(vcpu); - nested_vmx_entry_exit_ctls_update(vcpu); - } -} - -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -{ - if (func == 1 && nested) - entry->ecx |= bit(X86_FEATURE_VMX); -} - -static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, - struct x86_exception *fault) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason; - unsigned long exit_qualification = vcpu->arch.exit_qualification; - - if (vmx->nested.pml_full) { - exit_reason = EXIT_REASON_PML_FULL; - vmx->nested.pml_full = false; - exit_qualification &= INTR_INFO_UNBLOCK_NMI; - } else if (fault->error_code & PFERR_RSVD_MASK) - exit_reason = EXIT_REASON_EPT_MISCONFIG; - else - exit_reason = EXIT_REASON_EPT_VIOLATION; - - nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); - vmcs12->guest_physical_address = fault->address; -} - -static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) -{ - return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT; -} - -/* Callbacks for nested_ept_init_mmu_context: */ - -static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) -{ - /* return the page table to be shadowed - in our case, EPT12 */ - return get_vmcs12(vcpu)->ept_pointer; -} - -static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) -{ - WARN_ON(mmu_is_nested(vcpu)); - - vcpu->arch.mmu = &vcpu->arch.guest_mmu; - kvm_init_shadow_ept_mmu(vcpu, - to_vmx(vcpu)->nested.msrs.ept_caps & - VMX_EPT_EXECUTE_ONLY_BIT, - nested_ept_ad_enabled(vcpu), - nested_ept_get_cr3(vcpu)); - vcpu->arch.mmu->set_cr3 = vmx_set_cr3; - vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; - vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; - vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; - - vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; -} - -static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) -{ - vcpu->arch.mmu = &vcpu->arch.root_mmu; - vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; -} - -static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, - u16 error_code) -{ - bool inequality, bit; - - bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; - inequality = - (error_code & vmcs12->page_fault_error_code_mask) != - vmcs12->page_fault_error_code_match; - return inequality ^ bit; -} - -static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, - struct x86_exception *fault) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - WARN_ON(!is_guest_mode(vcpu)); - - if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && - !to_vmx(vcpu)->nested.nested_run_pending) { - vmcs12->vm_exit_intr_error_code = fault->error_code; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | - INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, - fault->address); - } else { - kvm_inject_page_fault(vcpu, fault); - } -} - -static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12); - -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct page *page; - u64 hpa; - - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - /* - * Translate L1 physical address to host physical - * address for vmcs02. Keep the page pinned, so this - * physical address remains valid. We keep a reference - * to it so we can release it later. - */ - if (vmx->nested.apic_access_page) { /* shouldn't happen */ - kvm_release_page_dirty(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); - /* - * If translation failed, no matter: This feature asks - * to exit when accessing the given address, and if it - * can never be accessed, this feature won't do - * anything anyway. - */ - if (!is_error_page(page)) { - vmx->nested.apic_access_page = page; - hpa = page_to_phys(vmx->nested.apic_access_page); - vmcs_write64(APIC_ACCESS_ADDR, hpa); - } else { - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); - } - } - - if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - if (vmx->nested.virtual_apic_page) { /* shouldn't happen */ - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr); - - /* - * If translation failed, VM entry will fail because - * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. - * Failing the vm entry is _not_ what the processor - * does but it's basically the only possibility we - * have. We could still enter the guest if CR8 load - * exits are enabled, CR8 store exits are enabled, and - * virtualize APIC access is disabled; in this case - * the processor would never use the TPR shadow and we - * could simply clear the bit from the execution - * control. But such a configuration is useless, so - * let's keep the code simple. - */ - if (!is_error_page(page)) { - vmx->nested.virtual_apic_page = page; - hpa = page_to_phys(vmx->nested.virtual_apic_page); - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); - } - } - - if (nested_cpu_has_posted_intr(vmcs12)) { - if (vmx->nested.pi_desc_page) { /* shouldn't happen */ - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr); - if (is_error_page(page)) - return; - vmx->nested.pi_desc_page = page; - vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page); - vmx->nested.pi_desc = - (struct pi_desc *)((void *)vmx->nested.pi_desc + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - vmcs_write64(POSTED_INTR_DESC_ADDR, - page_to_phys(vmx->nested.pi_desc_page) + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - } - if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_USE_MSR_BITMAPS); - else - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_USE_MSR_BITMAPS); -} - -static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) -{ - u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * A timer value of zero is architecturally guaranteed to cause - * a VMExit prior to executing any instructions in the guest. - */ - if (preemption_timeout == 0) { - vmx_preemption_timer_fn(&vmx->nested.preemption_timer); - return; - } - - if (vcpu->arch.virtual_tsc_khz == 0) - return; - - preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; - preemption_timeout *= 1000000; - do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); - hrtimer_start(&vmx->nested.preemption_timer, - ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); -} - -static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - return 0; - - if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || - !page_address_valid(vcpu, vmcs12->io_bitmap_b)) - return -EINVAL; - - return 0; -} - -static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) - return 0; - - if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) - return -EINVAL; - - return 0; -} - -static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) - return 0; - - if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)) - return -EINVAL; - - return 0; -} - -/* - * Merge L0's and L1's MSR bitmap, return false to indicate that - * we do not use the hardware. - */ -static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - int msr; - struct page *page; - unsigned long *msr_bitmap_l1; - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; - /* - * pred_cmd & spec_ctrl are trying to verify two things: - * - * 1. L0 gave a permission to L1 to actually passthrough the MSR. This - * ensures that we do not accidentally generate an L02 MSR bitmap - * from the L12 MSR bitmap that is too permissive. - * 2. That L1 or L2s have actually used the MSR. This avoids - * unnecessarily merging of the bitmap if the MSR is unused. This - * works properly because we only update the L01 MSR bitmap lazily. - * So even if L0 should pass L1 these MSRs, the L01 bitmap is only - * updated to reflect this when L1 (or its L2s) actually write to - * the MSR. - */ - bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); - bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); - - /* Nothing to do if the MSR bitmap is not in use. */ - if (!cpu_has_vmx_msr_bitmap() || - !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) - return false; - - if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && - !pred_cmd && !spec_ctrl) - return false; - - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); - if (is_error_page(page)) - return false; - - msr_bitmap_l1 = (unsigned long *)kmap(page); - if (nested_cpu_has_apic_reg_virt(vmcs12)) { - /* - * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it - * just lets the processor take the value from the virtual-APIC page; - * take those 256 bits directly from the L1 bitmap. - */ - for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned word = msr / BITS_PER_LONG; - msr_bitmap_l0[word] = msr_bitmap_l1[word]; - msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; - } - } else { - for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned word = msr / BITS_PER_LONG; - msr_bitmap_l0[word] = ~0; - msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; - } - } - - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - X2APIC_MSR(APIC_TASKPRI), - MSR_TYPE_W); - - if (nested_cpu_has_vid(vmcs12)) { - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - X2APIC_MSR(APIC_EOI), - MSR_TYPE_W); - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - X2APIC_MSR(APIC_SELF_IPI), - MSR_TYPE_W); - } - - if (spec_ctrl) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_SPEC_CTRL, - MSR_TYPE_R | MSR_TYPE_W); - - if (pred_cmd) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_PRED_CMD, - MSR_TYPE_W); - - kunmap(page); - kvm_release_page_clean(page); - - return true; -} - -static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - struct vmcs12 *shadow; - struct page *page; - - if (!nested_cpu_has_shadow_vmcs(vmcs12) || - vmcs12->vmcs_link_pointer == -1ull) - return; - - shadow = get_shadow_vmcs12(vcpu); - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); - - memcpy(shadow, kmap(page), VMCS12_SIZE); - - kunmap(page); - kvm_release_page_clean(page); -} - -static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!nested_cpu_has_shadow_vmcs(vmcs12) || - vmcs12->vmcs_link_pointer == -1ull) - return; - - kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, - get_shadow_vmcs12(vcpu), VMCS12_SIZE); -} - -static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && - !page_address_valid(vcpu, vmcs12->apic_access_addr)) - return -EINVAL; - else - return 0; -} - -static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && - !nested_cpu_has_apic_reg_virt(vmcs12) && - !nested_cpu_has_vid(vmcs12) && - !nested_cpu_has_posted_intr(vmcs12)) - return 0; - - /* - * If virtualize x2apic mode is enabled, - * virtualize apic access must be disabled. - */ - if (nested_cpu_has_virt_x2apic_mode(vmcs12) && - nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - return -EINVAL; - - /* - * If virtual interrupt delivery is enabled, - * we must exit on external interrupts. - */ - if (nested_cpu_has_vid(vmcs12) && - !nested_exit_on_intr(vcpu)) - return -EINVAL; - - /* - * bits 15:8 should be zero in posted_intr_nv, - * the descriptor address has been already checked - * in nested_get_vmcs12_pages. - * - * bits 5:0 of posted_intr_desc_addr should be zero. - */ - if (nested_cpu_has_posted_intr(vmcs12) && - (!nested_cpu_has_vid(vmcs12) || - !nested_exit_intr_ack_set(vcpu) || - (vmcs12->posted_intr_nv & 0xff00) || - (vmcs12->posted_intr_desc_addr & 0x3f) || - (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))) - return -EINVAL; - - /* tpr shadow is needed by all apicv features. */ - if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) - return -EINVAL; - - return 0; -} - -static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, - unsigned long count_field, - unsigned long addr_field) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - int maxphyaddr; - u64 count, addr; - - if (vmcs12_read_any(vmcs12, count_field, &count) || - vmcs12_read_any(vmcs12, addr_field, &addr)) { - WARN_ON(1); - return -EINVAL; - } - if (count == 0) - return 0; - maxphyaddr = cpuid_maxphyaddr(vcpu); - if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || - (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { - pr_debug_ratelimited( - "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", - addr_field, maxphyaddr, count, addr); - return -EINVAL; - } - return 0; -} - -static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (vmcs12->vm_exit_msr_load_count == 0 && - vmcs12->vm_exit_msr_store_count == 0 && - vmcs12->vm_entry_msr_load_count == 0) - return 0; /* Fast path */ - if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, - VM_EXIT_MSR_LOAD_ADDR) || - nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, - VM_EXIT_MSR_STORE_ADDR) || - nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, - VM_ENTRY_MSR_LOAD_ADDR)) - return -EINVAL; - return 0; -} - -static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has_pml(vmcs12)) - return 0; - - if (!nested_cpu_has_ept(vmcs12) || - !page_address_valid(vcpu, vmcs12->pml_address)) - return -EINVAL; - - return 0; -} - -static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && - !nested_cpu_has_ept(vmcs12)) - return -EINVAL; - return 0; -} - -static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && - !nested_cpu_has_ept(vmcs12)) - return -EINVAL; - return 0; -} - -static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has_shadow_vmcs(vmcs12)) - return 0; - - if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || - !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) - return -EINVAL; - - return 0; -} - -static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, - struct vmx_msr_entry *e) -{ - /* x2APIC MSR accesses are not allowed */ - if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) - return -EINVAL; - if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ - e->index == MSR_IA32_UCODE_REV) - return -EINVAL; - if (e->reserved != 0) - return -EINVAL; - return 0; -} - -static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, - struct vmx_msr_entry *e) -{ - if (e->index == MSR_FS_BASE || - e->index == MSR_GS_BASE || - e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ - nested_vmx_msr_check_common(vcpu, e)) - return -EINVAL; - return 0; -} - -static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, - struct vmx_msr_entry *e) -{ - if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ - nested_vmx_msr_check_common(vcpu, e)) - return -EINVAL; - return 0; -} - -/* - * Load guest's/host's msr at nested entry/exit. - * return 0 for success, entry index for failure. - */ -static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) -{ - u32 i; - struct vmx_msr_entry e; - struct msr_data msr; - - msr.host_initiated = false; - for (i = 0; i < count; i++) { - if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), - &e, sizeof(e))) { - pr_debug_ratelimited( - "%s cannot read MSR entry (%u, 0x%08llx)\n", - __func__, i, gpa + i * sizeof(e)); - goto fail; - } - if (nested_vmx_load_msr_check(vcpu, &e)) { - pr_debug_ratelimited( - "%s check failed (%u, 0x%x, 0x%x)\n", - __func__, i, e.index, e.reserved); - goto fail; - } - msr.index = e.index; - msr.data = e.value; - if (kvm_set_msr(vcpu, &msr)) { - pr_debug_ratelimited( - "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", - __func__, i, e.index, e.value); - goto fail; - } - } - return 0; -fail: - return i + 1; -} - -static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) -{ - u32 i; - struct vmx_msr_entry e; - - for (i = 0; i < count; i++) { - struct msr_data msr_info; - if (kvm_vcpu_read_guest(vcpu, - gpa + i * sizeof(e), - &e, 2 * sizeof(u32))) { - pr_debug_ratelimited( - "%s cannot read MSR entry (%u, 0x%08llx)\n", - __func__, i, gpa + i * sizeof(e)); - return -EINVAL; - } - if (nested_vmx_store_msr_check(vcpu, &e)) { - pr_debug_ratelimited( - "%s check failed (%u, 0x%x, 0x%x)\n", - __func__, i, e.index, e.reserved); - return -EINVAL; - } - msr_info.host_initiated = false; - msr_info.index = e.index; - if (kvm_get_msr(vcpu, &msr_info)) { - pr_debug_ratelimited( - "%s cannot read MSR (%u, 0x%x)\n", - __func__, i, e.index); - return -EINVAL; - } - if (kvm_vcpu_write_guest(vcpu, - gpa + i * sizeof(e) + - offsetof(struct vmx_msr_entry, value), - &msr_info.data, sizeof(msr_info.data))) { - pr_debug_ratelimited( - "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", - __func__, i, e.index, msr_info.data); - return -EINVAL; - } - } - return 0; -} - -static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - unsigned long invalid_mask; - - invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); - return (val & invalid_mask) == 0; -} - -/* - * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are - * emulating VM entry into a guest with EPT enabled. - * Returns 0 on success, 1 on failure. Invalid state exit qualification code - * is assigned to entry_failure_code on failure. - */ -static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, - u32 *entry_failure_code) -{ - if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { - if (!nested_cr3_valid(vcpu, cr3)) { - *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; - } - - /* - * If PAE paging and EPT are both on, CR3 is not used by the CPU and - * must not be dereferenced. - */ - if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && - !nested_ept) { - if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { - *entry_failure_code = ENTRY_FAIL_PDPTE; - return 1; - } - } - } - - if (!nested_ept) - kvm_mmu_new_cr3(vcpu, cr3, false); - - vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - - kvm_init_mmu(vcpu, false); - - return 0; -} - -/* - * Returns if KVM is able to config CPU to tag TLB entries - * populated by L2 differently than TLB entries populated - * by L1. - * - * If L1 uses EPT, then TLB entries are tagged with different EPTP. - * - * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged - * with different VPID (L1 entries are tagged with vmx->vpid - * while L2 entries are tagged with vmx->nested.vpid02). - */ -static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - return nested_cpu_has_ept(vmcs12) || - (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); -} - -static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) -{ - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) - return vmcs12->guest_ia32_efer; - else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) - return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); - else - return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); -} - -static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) -{ - /* - * If vmcs02 hasn't been initialized, set the constant vmcs02 state - * according to L0's settings (vmcs12 is irrelevant here). Host - * fields that come from L0 and are not constant, e.g. HOST_CR3, - * will be set as needed prior to VMLAUNCH/VMRESUME. - */ - if (vmx->nested.vmcs02_initialized) - return; - vmx->nested.vmcs02_initialized = true; - - /* - * We don't care what the EPTP value is we just need to guarantee - * it's valid so we don't get a false positive when doing early - * consistency checks. - */ - if (enable_ept && nested_early_check) - vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); - - /* All VMFUNCs are currently emulated through L0 vmexits. */ - if (cpu_has_vmx_vmfunc()) - vmcs_write64(VM_FUNCTION_CONTROL, 0); - - if (cpu_has_vmx_posted_intr()) - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); - - if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); - - if (enable_pml) - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - - /* - * Set the MSR load/store lists to match L0's settings. Only the - * addresses are constant (for vmcs02), the counts can change based - * on L2's behavior, e.g. switching to/from long mode. - */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); - - vmx_set_constant_host_state(vmx); -} - -static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx, - struct vmcs12 *vmcs12) -{ - prepare_vmcs02_constant_state(vmx); - - vmcs_write64(VMCS_LINK_POINTER, -1ull); - - if (enable_vpid) { - if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); - else - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - } -} - -static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) -{ - u32 exec_control, vmcs12_exec_ctrl; - u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); - - if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) - prepare_vmcs02_early_full(vmx, vmcs12); - - /* - * HOST_RSP is normally set correctly in vmx_vcpu_run() just before - * entry, but only if the current (host) sp changed from the value - * we wrote last (vmx->host_rsp). This cache is no longer relevant - * if we switch vmcs, and rather than hold a separate cache per vmcs, - * here we just force the write to happen on entry. host_rsp will - * also be written unconditionally by nested_vmx_check_vmentry_hw() - * if we are doing early consistency checks via hardware. - */ - vmx->host_rsp = 0; - - /* - * PIN CONTROLS - */ - exec_control = vmcs12->pin_based_vm_exec_control; - - /* Preemption timer setting is computed directly in vmx_vcpu_run. */ - exec_control |= vmcs_config.pin_based_exec_ctrl; - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - vmx->loaded_vmcs->hv_timer_armed = false; - - /* Posted interrupts setting is only taken from vmcs12. */ - if (nested_cpu_has_posted_intr(vmcs12)) { - vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - vmx->nested.pi_pending = false; - } else { - exec_control &= ~PIN_BASED_POSTED_INTR; - } - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); - - /* - * EXEC CONTROLS - */ - exec_control = vmx_exec_control(vmx); /* L0's desires */ - exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; - exec_control &= ~CPU_BASED_TPR_SHADOW; - exec_control |= vmcs12->cpu_based_vm_exec_control; - - /* - * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if - * nested_get_vmcs12_pages can't fix it up, the illegal value - * will result in a VM entry failure. - */ - if (exec_control & CPU_BASED_TPR_SHADOW) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); - vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); - } else { -#ifdef CONFIG_X86_64 - exec_control |= CPU_BASED_CR8_LOAD_EXITING | - CPU_BASED_CR8_STORE_EXITING; -#endif - } - - /* - * A vmexit (to either L1 hypervisor or L0 userspace) is always needed - * for I/O port accesses. - */ - exec_control &= ~CPU_BASED_USE_IO_BITMAPS; - exec_control |= CPU_BASED_UNCOND_IO_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); - - /* - * SECONDARY EXEC CONTROLS - */ - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmx->secondary_exec_control; - - /* Take the following fields only from vmcs12 */ - exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_ENABLE_INVPCID | - SECONDARY_EXEC_RDTSCP | - SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_ENABLE_VMFUNC); - if (nested_cpu_has(vmcs12, - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { - vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & - ~SECONDARY_EXEC_ENABLE_PML; - exec_control |= vmcs12_exec_ctrl; - } - - /* VMCS shadowing for L2 is emulated for now */ - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - - if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) - vmcs_write16(GUEST_INTR_STATUS, - vmcs12->guest_intr_status); - - /* - * Write an illegal value to APIC_ACCESS_ADDR. Later, - * nested_get_vmcs12_pages will either fix it up or - * remove the VM execution control. - */ - if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) - vmcs_write64(APIC_ACCESS_ADDR, -1ull); - - if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); - - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - } - - /* - * ENTRY CONTROLS - * - * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE - * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate - * on the related bits (if supported by the CPU) in the hope that - * we can avoid VMWrites during vmx_set_efer(). - */ - exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) & - ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; - if (cpu_has_load_ia32_efer) { - if (guest_efer & EFER_LMA) - exec_control |= VM_ENTRY_IA32E_MODE; - if (guest_efer != host_efer) - exec_control |= VM_ENTRY_LOAD_IA32_EFER; - } - vm_entry_controls_init(vmx, exec_control); - - /* - * EXIT CONTROLS - * - * L2->L1 exit controls are emulated - the hardware exit is to L0 so - * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER - * bits may be modified by vmx_set_efer() in prepare_vmcs02(). - */ - exec_control = vmcs_config.vmexit_ctrl; - if (cpu_has_load_ia32_efer && guest_efer != host_efer) - exec_control |= VM_EXIT_LOAD_IA32_EFER; - vm_exit_controls_init(vmx, exec_control); - - /* - * Conceptually we want to copy the PML address and index from - * vmcs01 here, and then back to vmcs01 on nested vmexit. But, - * since we always flush the log on each vmexit and never change - * the PML address (once set), this happens to be equivalent to - * simply resetting the index in vmcs02. - */ - if (enable_pml) - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - - /* - * Interrupt/Exception Fields - */ - if (vmx->nested.nested_run_pending) { - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - vmcs12->vm_entry_intr_info_field); - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs12->vm_entry_exception_error_code); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs12->vm_entry_instruction_len); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs12->guest_interruptibility_info); - vmx->loaded_vmcs->nmi_known_unmasked = - !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); - } else { - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); - } -} - -static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) -{ - struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; - - if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); - vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); - vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); - vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); - vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); - vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); - vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); - vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); - vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); - vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); - vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); - vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); - vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); - vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); - vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); - vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); - vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); - vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); - vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); - vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); - vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); - vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); - vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); - vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); - vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - } - - if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, - vmcs12->guest_pending_dbg_exceptions); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); - - /* - * L1 may access the L2's PDPTR, so save them to construct - * vmcs12 - */ - if (enable_ept) { - vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); - vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); - vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); - vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); - } - } - - if (nested_cpu_has_xsaves(vmcs12)) - vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); - - /* - * Whether page-faults are trapped is determined by a combination of - * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. - * If enable_ept, L0 doesn't care about page faults and we should - * set all of these to L1's desires. However, if !enable_ept, L0 does - * care about (at least some) page faults, and because it is not easy - * (if at all possible?) to merge L0 and L1's desires, we simply ask - * to exit on each and every L2 page fault. This is done by setting - * MASK=MATCH=0 and (see below) EB.PF=1. - * Note that below we don't need special code to set EB.PF beyond the - * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, - * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when - * !enable_ept, EB.PF is 1, so the "or" will always be 1. - */ - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, - enable_ept ? vmcs12->page_fault_error_code_mask : 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, - enable_ept ? vmcs12->page_fault_error_code_match : 0); - - if (cpu_has_vmx_apicv()) { - vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); - vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); - vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); - vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); - } - - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - - set_cr4_guest_host_mask(vmx); - - if (kvm_mpx_supported()) { - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); - else - vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); - } -} - -/* - * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested - * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it - * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 - * guest in a way that will both be appropriate to L1's requests, and our - * needs. In addition to modifying the active vmcs (which is vmcs02), this - * function also has additional necessary side-effects, like setting various - * vcpu->arch fields. - * Returns 0 on success, 1 on failure. Invalid state exit qualification code - * is assigned to entry_failure_code on failure. - */ -static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 *entry_failure_code) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; - - if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { - prepare_vmcs02_full(vmx, vmcs12); - vmx->nested.dirty_vmcs12 = false; - } - - /* - * First, the fields that are shadowed. This must be kept in sync - * with vmx_shadow_fields.h. - */ - if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { - vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); - vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); - } - - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { - kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); - } else { - kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); - } - vmx_set_rflags(vcpu, vmcs12->guest_rflags); - - vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); - - /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the - * bitwise-or of what L1 wants to trap for L2, and what we want to - * trap. Note that CR0.TS also needs updating - we do this later. - */ - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { - vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); - vcpu->arch.pat = vmcs12->guest_ia32_pat; - } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - } - - vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); - - if (enable_vpid) { - /* - * There is no direct mapping between vpid02 and vpid12, the - * vpid02 is per-vCPU for L0 and reused while the value of - * vpid12 is changed w/ one invvpid during nested vmentry. - * The vpid12 is allocated by L1 for L2, so it will not - * influence global bitmap(for vpid01 and vpid02 allocation) - * even if spawn a lot of nested vCPUs. - */ - if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { - if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { - vmx->nested.last_vpid = vmcs12->virtual_processor_id; - __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); - } - } else { - /* - * If L1 use EPT, then L0 needs to execute INVEPT on - * EPTP02 instead of EPTP01. Therefore, delay TLB - * flush until vmcs02->eptp is fully updated by - * KVM_REQ_LOAD_CR3. Note that this assumes - * KVM_REQ_TLB_FLUSH is evaluated after - * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). - */ - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } - } - - if (nested_cpu_has_ept(vmcs12)) - nested_ept_init_mmu_context(vcpu); - else if (nested_cpu_has2(vmcs12, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - vmx_flush_tlb(vcpu, true); - - /* - * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those - * bits which we consider mandatory enabled. - * The CR0_READ_SHADOW is what L2 should have expected to read given - * the specifications by L1; It's not enough to take - * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we - * have more bits than L1 expected. - */ - vmx_set_cr0(vcpu, vmcs12->guest_cr0); - vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); - - vmx_set_cr4(vcpu, vmcs12->guest_cr4); - vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); - - vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); - /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ - vmx_set_efer(vcpu, vcpu->arch.efer); - - /* - * Guest state is invalid and unrestricted guest is disabled, - * which means L1 attempted VMEntry to L2 with invalid state. - * Fail the VMEntry. - */ - if (vmx->emulation_required) { - *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; - } - - /* Shadow page tables on either EPT or shadow page tables. */ - if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), - entry_failure_code)) - return 1; - - if (!enable_ept) - vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; - - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); - kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); - return 0; -} - -static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has_nmi_exiting(vmcs12) && - nested_cpu_has_virtual_nmis(vmcs12)) - return -EINVAL; - - if (!nested_cpu_has_virtual_nmis(vmcs12) && - nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)) - return -EINVAL; - - return 0; -} - -static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - bool ia32e; - - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_apic_access_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (!nested_cpu_has_preemption_timer(vmcs12) && - nested_cpu_has_save_preemption_timer(vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_pml_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - vmx->nested.msrs.procbased_ctls_low, - vmx->nested.msrs.procbased_ctls_high) || - (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && - !vmx_control_verify(vmcs12->secondary_vm_exec_control, - vmx->nested.msrs.secondary_ctls_low, - vmx->nested.msrs.secondary_ctls_high)) || - !vmx_control_verify(vmcs12->pin_based_vm_exec_control, - vmx->nested.msrs.pinbased_ctls_low, - vmx->nested.msrs.pinbased_ctls_high) || - !vmx_control_verify(vmcs12->vm_exit_controls, - vmx->nested.msrs.exit_ctls_low, - vmx->nested.msrs.exit_ctls_high) || - !vmx_control_verify(vmcs12->vm_entry_controls, - vmx->nested.msrs.entry_ctls_low, - vmx->nested.msrs.entry_ctls_high)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_nmi_controls(vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_cpu_has_vmfunc(vmcs12)) { - if (vmcs12->vm_function_control & - ~vmx->nested.msrs.vmfunc_controls) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_cpu_has_eptp_switching(vmcs12)) { - if (!nested_cpu_has_ept(vmcs12) || - !page_address_valid(vcpu, vmcs12->eptp_list_address)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - } - } - - if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || - !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || - !nested_cr3_valid(vcpu, vmcs12->host_cr3)) - return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; - - /* - * If the load IA32_EFER VM-exit control is 1, bits reserved in the - * IA32_EFER MSR must be 0 in the field for that register. In addition, - * the values of the LMA and LME bits in the field must each be that of - * the host address-space size VM-exit control. - */ - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { - ia32e = (vmcs12->vm_exit_controls & - VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; - if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) - return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; - } - - /* - * From the Intel SDM, volume 3: - * Fields relevant to VM-entry event injection must be set properly. - * These fields are the VM-entry interruption-information field, the - * VM-entry exception error code, and the VM-entry instruction length. - */ - if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { - u32 intr_info = vmcs12->vm_entry_intr_info_field; - u8 vector = intr_info & INTR_INFO_VECTOR_MASK; - u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; - bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; - bool should_have_error_code; - bool urg = nested_cpu_has2(vmcs12, - SECONDARY_EXEC_UNRESTRICTED_GUEST); - bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; - - /* VM-entry interruption-info field: interruption type */ - if (intr_type == INTR_TYPE_RESERVED || - (intr_type == INTR_TYPE_OTHER_EVENT && - !nested_cpu_supports_monitor_trap_flag(vcpu))) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - /* VM-entry interruption-info field: vector */ - if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || - (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || - (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - /* VM-entry interruption-info field: deliver error code */ - should_have_error_code = - intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && - x86_exception_has_error_code(vector); - if (has_error_code != should_have_error_code) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - /* VM-entry exception error code */ - if (has_error_code && - vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - /* VM-entry interruption-info field: reserved bits */ - if (intr_info & INTR_INFO_RESVD_BITS_MASK) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - /* VM-entry instruction length */ - switch (intr_type) { - case INTR_TYPE_SOFT_EXCEPTION: - case INTR_TYPE_SOFT_INTR: - case INTR_TYPE_PRIV_SW_EXCEPTION: - if ((vmcs12->vm_entry_instruction_len > 15) || - (vmcs12->vm_entry_instruction_len == 0 && - !nested_cpu_has_zero_length_injection(vcpu))) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - } - } - - if (nested_cpu_has_ept(vmcs12) && - !valid_ept_address(vcpu, vmcs12->ept_pointer)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - return 0; -} - -static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - int r; - struct page *page; - struct vmcs12 *shadow; - - if (vmcs12->vmcs_link_pointer == -1ull) - return 0; - - if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) - return -EINVAL; - - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); - if (is_error_page(page)) - return -EINVAL; - - r = 0; - shadow = kmap(page); - if (shadow->hdr.revision_id != VMCS12_REVISION || - shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) - r = -EINVAL; - kunmap(page); - kvm_release_page_clean(page); - return r; -} - -static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 *exit_qual) -{ - bool ia32e; - - *exit_qual = ENTRY_FAIL_DEFAULT; - - if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || - !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) - return 1; - - if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { - *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; - return 1; - } - - /* - * If the load IA32_EFER VM-entry control is 1, the following checks - * are performed on the field for the IA32_EFER MSR: - * - Bits reserved in the IA32_EFER MSR must be 0. - * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of - * the IA-32e mode guest VM-exit control. It must also be identical - * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to - * CR0.PG) is 1. - */ - if (to_vmx(vcpu)->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { - ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; - if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || - ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || - ((vmcs12->guest_cr0 & X86_CR0_PG) && - ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) - return 1; - } - - if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && - (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || - (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) - return 1; - - return 0; -} - -static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4; - - if (!nested_early_check) - return 0; - - if (vmx->msr_autoload.host.nr) - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - if (vmx->msr_autoload.guest.nr) - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - - preempt_disable(); - - vmx_prepare_switch_to_guest(vcpu); - - /* - * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, - * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to - * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e. - * there is no need to preserve other bits or save/restore the field. - */ - vmcs_writel(GUEST_RFLAGS, 0); - - vmcs_writel(HOST_RIP, vmx_early_consistency_check_return); - - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { - vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->host_state.cr3 = cr3; - } - - cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { - vmcs_writel(HOST_CR4, cr4); - vmx->loaded_vmcs->host_state.cr4 = cr4; - } - - vmx->__launched = vmx->loaded_vmcs->launched; - - asm( - /* Set HOST_RSP */ - __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t" - - /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%0)\n\t" - "jne 1f\n\t" - __ex("vmlaunch") "\n\t" - "jmp 2f\n\t" - "1: " __ex("vmresume") "\n\t" - "2: " - /* Set vmx->fail accordingly */ - "setbe %c[fail](%0)\n\t" - - ".pushsection .rodata\n\t" - ".global vmx_early_consistency_check_return\n\t" - "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t" - ".popsection" - : - : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)) - : "rax", "cc", "memory" - ); - - vmcs_writel(HOST_RIP, vmx_return); - - preempt_enable(); - - if (vmx->msr_autoload.host.nr) - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); - if (vmx->msr_autoload.guest.nr) - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - - if (vmx->fail) { - WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != - VMXERR_ENTRY_INVALID_CONTROL_FIELD); - vmx->fail = 0; - return 1; - } - - /* - * VMExit clears RFLAGS.IF and DR7, even on a consistency check. - */ - local_irq_enable(); - if (hw_breakpoint_active()) - set_debugreg(__this_cpu_read(cpu_dr7), 7); - - /* - * A non-failing VMEntry means we somehow entered guest mode with - * an illegal RIP, and that's just the tip of the iceberg. There - * is no telling what memory has been modified or what state has - * been exposed to unknown code. Hitting this all but guarantees - * a (very critical) hardware issue. - */ - WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & - VMX_EXIT_REASONS_FAILED_VMENTRY)); - - return 0; -} -STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw); - -static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12); - -/* - * If from_vmentry is false, this is being called from state restore (either RSM - * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. -+ * -+ * Returns: -+ * 0 - success, i.e. proceed with actual VMEnter -+ * 1 - consistency check VMExit -+ * -1 - consistency check VMFail - */ -static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, - bool from_vmentry) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - bool evaluate_pending_interrupts; - u32 exit_reason = EXIT_REASON_INVALID_STATE; - u32 exit_qual; - - evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & - (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); - if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) - evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); - - if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - if (kvm_mpx_supported() && - !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) - vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); - - vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); - - prepare_vmcs02_early(vmx, vmcs12); - - if (from_vmentry) { - nested_get_vmcs12_pages(vcpu); - - if (nested_vmx_check_vmentry_hw(vcpu)) { - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - return -1; - } - - if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) - goto vmentry_fail_vmexit; - } - - enter_guest_mode(vcpu); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) - vcpu->arch.tsc_offset += vmcs12->tsc_offset; - - if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) - goto vmentry_fail_vmexit_guest_mode; - - if (from_vmentry) { - exit_reason = EXIT_REASON_MSR_LOAD_FAIL; - exit_qual = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (exit_qual) - goto vmentry_fail_vmexit_guest_mode; - } else { - /* - * The MMU is not initialized to point at the right entities yet and - * "get pages" would need to read data from the guest (i.e. we will - * need to perform gpa to hpa translation). Request a call - * to nested_get_vmcs12_pages before the next VM-entry. The MSRs - * have already been set at vmentry time and should not be reset. - */ - kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); - } - - /* - * If L1 had a pending IRQ/NMI until it executed - * VMLAUNCH/VMRESUME which wasn't delivered because it was - * disallowed (e.g. interrupts disabled), L0 needs to - * evaluate if this pending event should cause an exit from L2 - * to L1 or delivered directly to L2 (e.g. In case L1 don't - * intercept EXTERNAL_INTERRUPT). - * - * Usually this would be handled by the processor noticing an - * IRQ/NMI window request, or checking RVI during evaluation of - * pending virtual interrupts. However, this setting was done - * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 - * to perform pending event evaluation by requesting a KVM_REQ_EVENT. - */ - if (unlikely(evaluate_pending_interrupts)) - kvm_make_request(KVM_REQ_EVENT, vcpu); - - /* - * Note no nested_vmx_succeed or nested_vmx_fail here. At this point - * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet - * returned as far as L1 is concerned. It will only return (and set - * the success flag) when L2 exits (see nested_vmx_vmexit()). - */ - return 0; - - /* - * A failed consistency check that leads to a VMExit during L1's - * VMEnter to L2 is a variation of a normal VMexit, as explained in - * 26.7 "VM-entry failures during or after loading guest state". - */ -vmentry_fail_vmexit_guest_mode: - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) - vcpu->arch.tsc_offset -= vmcs12->tsc_offset; - leave_guest_mode(vcpu); - -vmentry_fail_vmexit: - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - - if (!from_vmentry) - return 1; - - load_vmcs12_host_state(vcpu, vmcs12); - vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; - vmcs12->exit_qualification = exit_qual; - if (enable_shadow_vmcs || vmx->nested.hv_evmcs) - vmx->nested.need_vmcs12_sync = true; - return 1; -} - -/* - * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 - * for running an L2 nested guest. - */ -static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) -{ - struct vmcs12 *vmcs12; - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); - int ret; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true)) - return 1; - - if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) - return nested_vmx_failInvalid(vcpu); - - vmcs12 = get_vmcs12(vcpu); - - /* - * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact - * that there *is* a valid VMCS pointer, RFLAGS.CF is set - * rather than RFLAGS.ZF, and no error number is stored to the - * VM-instruction error field. - */ - if (vmcs12->hdr.shadow_vmcs) - return nested_vmx_failInvalid(vcpu); - - if (vmx->nested.hv_evmcs) { - copy_enlightened_to_vmcs12(vmx); - /* Enlightened VMCS doesn't have launch state */ - vmcs12->launch_state = !launch; - } else if (enable_shadow_vmcs) { - copy_shadow_to_vmcs12(vmx); - } - - /* - * The nested entry process starts with enforcing various prerequisites - * on vmcs12 as required by the Intel SDM, and act appropriately when - * they fail: As the SDM explains, some conditions should cause the - * instruction to fail, while others will cause the instruction to seem - * to succeed, but return an EXIT_REASON_INVALID_STATE. - * To speed up the normal (success) code path, we should avoid checking - * for misconfigurations which will anyway be caught by the processor - * when using the merged vmcs02. - */ - if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) - return nested_vmx_failValid(vcpu, - VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); - - if (vmcs12->launch_state == launch) - return nested_vmx_failValid(vcpu, - launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS - : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - - ret = check_vmentry_prereqs(vcpu, vmcs12); - if (ret) - return nested_vmx_failValid(vcpu, ret); - - /* - * We're finally done with prerequisite checking, and can start with - * the nested entry. - */ - vmx->nested.nested_run_pending = 1; - ret = nested_vmx_enter_non_root_mode(vcpu, true); - vmx->nested.nested_run_pending = !ret; - if (ret > 0) - return 1; - else if (ret) - return nested_vmx_failValid(vcpu, - VMXERR_ENTRY_INVALID_CONTROL_FIELD); - - /* Hide L1D cache contents from the nested guest. */ - vmx->vcpu.arch.l1tf_flush_l1d = true; - - /* - * Must happen outside of nested_vmx_enter_non_root_mode() as it will - * also be used as part of restoring nVMX state for - * snapshot restore (migration). - * - * In this flow, it is assumed that vmcs12 cache was - * trasferred as part of captured nVMX state and should - * therefore not be read from guest memory (which may not - * exist on destination host yet). - */ - nested_cache_shadow_vmcs12(vcpu, vmcs12); - - /* - * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken - * by event injection, halt vcpu. - */ - if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && - !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) { - vmx->nested.nested_run_pending = 0; - return kvm_vcpu_halt(vcpu); - } - return 1; -} - -/* - * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date - * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK). - * This function returns the new value we should put in vmcs12.guest_cr0. - * It's not enough to just return the vmcs02 GUEST_CR0. Rather, - * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now - * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 - * didn't trap the bit, because if L1 did, so would L0). - * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have - * been modified by L2, and L1 knows it. So just leave the old value of - * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 - * isn't relevant, because if L0 traps this bit it can set it to anything. - * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have - * changed these bits, and therefore they need to be updated, but L0 - * didn't necessarily allow them to be changed in GUEST_CR0 - and rather - * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. - */ -static inline unsigned long -vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - return - /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | - /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | - /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | - vcpu->arch.cr0_guest_owned_bits)); -} - -static inline unsigned long -vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - return - /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | - /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | - /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | - vcpu->arch.cr4_guest_owned_bits)); -} - -static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - u32 idt_vectoring; - unsigned int nr; - - if (vcpu->arch.exception.injected) { - nr = vcpu->arch.exception.nr; - idt_vectoring = nr | VECTORING_INFO_VALID_MASK; - - if (kvm_exception_is_soft(nr)) { - vmcs12->vm_exit_instruction_len = - vcpu->arch.event_exit_inst_len; - idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; - } else - idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; - - if (vcpu->arch.exception.has_error_code) { - idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; - vmcs12->idt_vectoring_error_code = - vcpu->arch.exception.error_code; - } - - vmcs12->idt_vectoring_info_field = idt_vectoring; - } else if (vcpu->arch.nmi_injected) { - vmcs12->idt_vectoring_info_field = - INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; - } else if (vcpu->arch.interrupt.injected) { - nr = vcpu->arch.interrupt.nr; - idt_vectoring = nr | VECTORING_INFO_VALID_MASK; - - if (vcpu->arch.interrupt.soft) { - idt_vectoring |= INTR_TYPE_SOFT_INTR; - vmcs12->vm_entry_instruction_len = - vcpu->arch.event_exit_inst_len; - } else - idt_vectoring |= INTR_TYPE_EXT_INTR; - - vmcs12->idt_vectoring_info_field = idt_vectoring; - } -} - -static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qual; - bool block_nested_events = - vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); - - if (vcpu->arch.exception.pending && - nested_vmx_check_exception(vcpu, &exit_qual)) { - if (block_nested_events) - return -EBUSY; - nested_vmx_inject_exception_vmexit(vcpu, exit_qual); - return 0; - } - - if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && - vmx->nested.preemption_timer_expired) { - if (block_nested_events) - return -EBUSY; - nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); - return 0; - } - - if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { - if (block_nested_events) - return -EBUSY; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - NMI_VECTOR | INTR_TYPE_NMI_INTR | - INTR_INFO_VALID_MASK, 0); - /* - * The NMI-triggered VM exit counts as injection: - * clear this one and block further NMIs. - */ - vcpu->arch.nmi_pending = 0; - vmx_set_nmi_mask(vcpu, true); - return 0; - } - - if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && - nested_exit_on_intr(vcpu)) { - if (block_nested_events) - return -EBUSY; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); - return 0; - } - - vmx_complete_nested_posted_interrupt(vcpu); - return 0; -} - -static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) -{ - to_vmx(vcpu)->req_immediate_exit = true; -} - -static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) -{ - ktime_t remaining = - hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); - u64 value; - - if (ktime_to_ns(remaining) <= 0) - return 0; - - value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; - do_div(value, 1000000); - return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; -} - -/* - * Update the guest state fields of vmcs12 to reflect changes that - * occurred while L2 was running. (The "IA-32e mode guest" bit of the - * VM-entry controls is also updated, since this is really a guest - * state bit.) - */ -static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); - vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); - - vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); - vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); - - vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); - vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); - vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); - vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); - vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); - vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); - vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); - vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); - vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); - vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); - vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); - vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); - vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); - vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); - vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); - vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); - vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); - vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); - vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); - vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); - vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); - vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); - vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); - vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); - vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); - vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); - vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); - vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); - vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); - vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); - vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); - vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); - vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); - vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); - vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); - vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); - - vmcs12->guest_interruptibility_info = - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - vmcs12->guest_pending_dbg_exceptions = - vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); - if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) - vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; - else - vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; - - if (nested_cpu_has_preemption_timer(vmcs12)) { - if (vmcs12->vm_exit_controls & - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) - vmcs12->vmx_preemption_timer_value = - vmx_get_preemption_timer_value(vcpu); - hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - } - - /* - * In some cases (usually, nested EPT), L2 is allowed to change its - * own CR3 without exiting. If it has changed it, we must keep it. - * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined - * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. - * - * Additionally, restore L2's PDPTR to vmcs12. - */ - if (enable_ept) { - vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); - vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); - vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); - vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); - vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); - } - - vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); - - if (nested_cpu_has_vid(vmcs12)) - vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); - - vmcs12->vm_entry_controls = - (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | - (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); - - if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); - vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - } - - /* TODO: These cannot have changed unless we have MSR bitmaps and - * the relevant bit asks not to trap the change */ - if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) - vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); - if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) - vmcs12->guest_ia32_efer = vcpu->arch.efer; - vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); - vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); - vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); - if (kvm_mpx_supported()) - vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); -} - -/* - * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits - * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), - * and this function updates it to reflect the changes to the guest state while - * L2 was running (and perhaps made some exits which were handled directly by L0 - * without going back to L1), and to reflect the exit reason. - * Note that we do not have to copy here all VMCS fields, just those that - * could have changed by the L2 guest or the exit - i.e., the guest-state and - * exit-information fields only. Other fields are modified by L1 with VMWRITE, - * which already writes to vmcs12 directly. - */ -static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) -{ - /* update guest state fields: */ - sync_vmcs12(vcpu, vmcs12); - - /* update exit information fields: */ - - vmcs12->vm_exit_reason = exit_reason; - vmcs12->exit_qualification = exit_qualification; - vmcs12->vm_exit_intr_info = exit_intr_info; - - vmcs12->idt_vectoring_info_field = 0; - vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - - if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { - vmcs12->launch_state = 1; - - /* vm_entry_intr_info_field is cleared on exit. Emulate this - * instead of reading the real value. */ - vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; - - /* - * Transfer the event that L0 or L1 may wanted to inject into - * L2 to IDT_VECTORING_INFO_FIELD. - */ - vmcs12_save_pending_event(vcpu, vmcs12); - } - - /* - * Drop what we picked up for L2 via vmx_complete_interrupts. It is - * preserved above and would only end up incorrectly in L1. - */ - vcpu->arch.nmi_injected = false; - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); -} - -/* - * A part of what we need to when the nested L2 guest exits and we want to - * run its L1 parent, is to reset L1's guest state to the host state specified - * in vmcs12. - * This function is to be called not only on normal nested exit, but also on - * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry - * Failures During or After Loading Guest State"). - * This function should be called when the active VMCS is L1's (vmcs01). - */ -static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - struct kvm_segment seg; - u32 entry_failure_code; - - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) - vcpu->arch.efer = vmcs12->host_ia32_efer; - else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) - vcpu->arch.efer |= (EFER_LMA | EFER_LME); - else - vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); - vmx_set_efer(vcpu, vcpu->arch.efer); - - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); - kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); - vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); - vmx_set_interrupt_shadow(vcpu, 0); - - /* - * Note that calling vmx_set_cr0 is important, even if cr0 hasn't - * actually changed, because vmx_set_cr0 refers to efer set above. - * - * CR0_GUEST_HOST_MASK is already set in the original vmcs01 - * (KVM doesn't change it); - */ - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; - vmx_set_cr0(vcpu, vmcs12->host_cr0); - - /* Same as above - no reason to call set_cr4_guest_host_mask(). */ - vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); - vmx_set_cr4(vcpu, vmcs12->host_cr4); - - nested_ept_uninit_mmu_context(vcpu); - - /* - * Only PDPTE load can fail as the value of cr3 was checked on entry and - * couldn't have changed. - */ - if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) - nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); - - if (!enable_ept) - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; - - /* - * If vmcs01 doesn't use VPID, CPU flushes TLB on every - * VMEntry/VMExit. Thus, no need to flush TLB. - * - * If vmcs12 doesn't use VPID, L1 expects TLB to be - * flushed on every VMEntry/VMExit. - * - * Otherwise, we can preserve TLB entries as long as we are - * able to tag L1 TLB entries differently than L2 TLB entries. - * - * If vmcs12 uses EPT, we need to execute this flush on EPTP01 - * and therefore we request the TLB flush to happen only after VMCS EPTP - * has been set by KVM_REQ_LOAD_CR3. - */ - if (enable_vpid && - (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } - - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); - vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); - vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); - vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); - vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); - - /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ - if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) - vmcs_write64(GUEST_BNDCFGS, 0); - - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { - vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); - vcpu->arch.pat = vmcs12->host_ia32_pat; - } - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, - vmcs12->host_ia32_perf_global_ctrl); - - /* Set L1 segment info according to Intel SDM - 27.5.2 Loading Host Segment and Descriptor-Table Registers */ - seg = (struct kvm_segment) { - .base = 0, - .limit = 0xFFFFFFFF, - .selector = vmcs12->host_cs_selector, - .type = 11, - .present = 1, - .s = 1, - .g = 1 - }; - if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) - seg.l = 1; - else - seg.db = 1; - vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); - seg = (struct kvm_segment) { - .base = 0, - .limit = 0xFFFFFFFF, - .type = 3, - .present = 1, - .s = 1, - .db = 1, - .g = 1 - }; - seg.selector = vmcs12->host_ds_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); - seg.selector = vmcs12->host_es_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); - seg.selector = vmcs12->host_ss_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); - seg.selector = vmcs12->host_fs_selector; - seg.base = vmcs12->host_fs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); - seg.selector = vmcs12->host_gs_selector; - seg.base = vmcs12->host_gs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); - seg = (struct kvm_segment) { - .base = vmcs12->host_tr_base, - .limit = 0x67, - .selector = vmcs12->host_tr_selector, - .type = 11, - .present = 1 - }; - vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); - - kvm_set_dr(vcpu, 7, 0x400); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - - if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, - vmcs12->vm_exit_msr_load_count)) - nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); -} - -static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) -{ - struct shared_msr_entry *efer_msr; - unsigned int i; - - if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) - return vmcs_read64(GUEST_IA32_EFER); - - if (cpu_has_load_ia32_efer) - return host_efer; - - for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { - if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) - return vmx->msr_autoload.guest.val[i].value; - } - - efer_msr = find_msr_entry(vmx, MSR_EFER); - if (efer_msr) - return efer_msr->data; - - return host_efer; -} - -static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmx_msr_entry g, h; - struct msr_data msr; - gpa_t gpa; - u32 i, j; - - vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); - - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { - /* - * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set - * as vmcs01.GUEST_DR7 contains a userspace defined value - * and vcpu->arch.dr7 is not squirreled away before the - * nested VMENTER (not worth adding a variable in nested_vmx). - */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - kvm_set_dr(vcpu, 7, DR7_FIXED_1); - else - WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); - } - - /* - * Note that calling vmx_set_{efer,cr0,cr4} is important as they - * handle a variety of side effects to KVM's software model. - */ - vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); - - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; - vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); - - vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); - vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); - - nested_ept_uninit_mmu_context(vcpu); - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - - /* - * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs - * from vmcs01 (if necessary). The PDPTRs are not loaded on - * VMFail, like everything else we just need to ensure our - * software model is up-to-date. - */ - ept_save_pdptrs(vcpu); - - kvm_mmu_reset_context(vcpu); - - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - - /* - * This nasty bit of open coding is a compromise between blindly - * loading L1's MSRs using the exit load lists (incorrect emulation - * of VMFail), leaving the nested VM's MSRs in the software model - * (incorrect behavior) and snapshotting the modified MSRs (too - * expensive since the lists are unbound by hardware). For each - * MSR that was (prematurely) loaded from the nested VMEntry load - * list, reload it from the exit load list if it exists and differs - * from the guest value. The intent is to stuff host state as - * silently as possible, not to fully process the exit load list. - */ - msr.host_initiated = false; - for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { - gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); - if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { - pr_debug_ratelimited( - "%s read MSR index failed (%u, 0x%08llx)\n", - __func__, i, gpa); - goto vmabort; - } - - for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { - gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); - if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { - pr_debug_ratelimited( - "%s read MSR failed (%u, 0x%08llx)\n", - __func__, j, gpa); - goto vmabort; - } - if (h.index != g.index) - continue; - if (h.value == g.value) - break; - - if (nested_vmx_load_msr_check(vcpu, &h)) { - pr_debug_ratelimited( - "%s check failed (%u, 0x%x, 0x%x)\n", - __func__, j, h.index, h.reserved); - goto vmabort; - } - - msr.index = h.index; - msr.data = h.value; - if (kvm_set_msr(vcpu, &msr)) { - pr_debug_ratelimited( - "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", - __func__, j, h.index, h.value); - goto vmabort; - } - } - } - - return; - -vmabort: - nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); -} - -/* - * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 - * and modify vmcs12 to make it see what it would expect to see there if - * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) - */ -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, - u32 exit_intr_info, - unsigned long exit_qualification) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - /* trying to cancel vmlaunch/vmresume is a bug */ - WARN_ON_ONCE(vmx->nested.nested_run_pending); - - leave_guest_mode(vcpu); - - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) - vcpu->arch.tsc_offset -= vmcs12->tsc_offset; - - if (likely(!vmx->fail)) { - if (exit_reason == -1) - sync_vmcs12(vcpu, vmcs12); - else - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, - exit_qualification); - - /* - * Must happen outside of sync_vmcs12() as it will - * also be used to capture vmcs12 cache as part of - * capturing nVMX state for snapshot (migration). - * - * Otherwise, this flush will dirty guest memory at a - * point it is already assumed by user-space to be - * immutable. - */ - nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); - - if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, - vmcs12->vm_exit_msr_store_count)) - nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); - } else { - /* - * The only expected VM-instruction error is "VM entry with - * invalid control field(s)." Anything else indicates a - * problem with L0. And we should never get here with a - * VMFail of any type if early consistency checks are enabled. - */ - WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != - VMXERR_ENTRY_INVALID_CONTROL_FIELD); - WARN_ON_ONCE(nested_early_check); - } - - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - - /* Update any VMCS fields that might have changed while L2 ran */ - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); - - if (vmx->nested.change_vmcs01_virtual_apic_mode) { - vmx->nested.change_vmcs01_virtual_apic_mode = false; - vmx_set_virtual_apic_mode(vcpu); - } else if (!nested_cpu_has_ept(vmcs12) && - nested_cpu_has2(vmcs12, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - vmx_flush_tlb(vcpu, true); - } - - /* This is needed for same reason as it was needed in prepare_vmcs02 */ - vmx->host_rsp = 0; - - /* Unpin physical memory we referred to in vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_dirty(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - if (vmx->nested.virtual_apic_page) { - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; - } - - /* - * We are now running in L2, mmu_notifier will force to reload the - * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. - */ - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - - if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) - vmx->nested.need_vmcs12_sync = true; - - /* in case we halted in L2 */ - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - - if (likely(!vmx->fail)) { - /* - * TODO: SDM says that with acknowledge interrupt on - * exit, bit 31 of the VM-exit interrupt information - * (valid interrupt) is always set to 1 on - * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't - * need kvm_cpu_has_interrupt(). See the commit - * message for details. - */ - if (nested_exit_intr_ack_set(vcpu) && - exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - kvm_cpu_has_interrupt(vcpu)) { - int irq = kvm_cpu_get_interrupt(vcpu); - WARN_ON(irq < 0); - vmcs12->vm_exit_intr_info = irq | - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; - } - - if (exit_reason != -1) - trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, - vmcs12->exit_qualification, - vmcs12->idt_vectoring_info_field, - vmcs12->vm_exit_intr_info, - vmcs12->vm_exit_intr_error_code, - KVM_ISA_VMX); - - load_vmcs12_host_state(vcpu, vmcs12); - - return; - } - - /* - * After an early L2 VM-entry failure, we're now back - * in L1 which thinks it just finished a VMLAUNCH or - * VMRESUME instruction, so we need to set the failure - * flag and the VM-instruction error field of the VMCS - * accordingly, and skip the emulated instruction. - */ - (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - - /* - * Restore L1's host state to KVM's software model. We're here - * because a consistency check was caught by hardware, which - * means some amount of guest state has been propagated to KVM's - * model and needs to be unwound to the host's state. - */ - nested_vmx_restore_host_state(vcpu); - - vmx->fail = 0; -} - -/* - * Forcibly leave nested mode in order to be able to reset the VCPU later on. - */ -static void vmx_leave_nested(struct kvm_vcpu *vcpu) -{ - if (is_guest_mode(vcpu)) { - to_vmx(vcpu)->nested.nested_run_pending = 0; - nested_vmx_vmexit(vcpu, -1, 0, 0); - } - free_nested(vcpu); -} - -static int vmx_check_intercept(struct kvm_vcpu *vcpu, - struct x86_instruction_info *info, - enum x86_intercept_stage stage) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - - /* - * RDPID causes #UD if disabled through secondary execution controls. - * Because it is marked as EmulateOnUD, we need to intercept it here. - */ - if (info->intercept == x86_intercept_rdtscp && - !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { - ctxt->exception.vector = UD_VECTOR; - ctxt->exception.error_code_valid = false; - return X86EMUL_PROPAGATE_FAULT; - } - - /* TODO: check more intercepts... */ - return X86EMUL_CONTINUE; -} - -#ifdef CONFIG_X86_64 -/* (a << shift) / divisor, return 1 if overflow otherwise 0 */ -static inline int u64_shl_div_u64(u64 a, unsigned int shift, - u64 divisor, u64 *result) -{ - u64 low = a << shift, high = a >> (64 - shift); - - /* To avoid the overflow on divq */ - if (high >= divisor) - return 1; - - /* Low hold the result, high hold rem which is discarded */ - asm("divq %2\n\t" : "=a" (low), "=d" (high) : - "rm" (divisor), "0" (low), "1" (high)); - *result = low; - - return 0; -} - -static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) -{ - struct vcpu_vmx *vmx; - u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; - - if (kvm_mwait_in_guest(vcpu->kvm)) - return -EOPNOTSUPP; - - vmx = to_vmx(vcpu); - tscl = rdtsc(); - guest_tscl = kvm_read_l1_tsc(vcpu, tscl); - delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; - lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns); - - if (delta_tsc > lapic_timer_advance_cycles) - delta_tsc -= lapic_timer_advance_cycles; - else - delta_tsc = 0; - - /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && - u64_shl_div_u64(delta_tsc, - kvm_tsc_scaling_ratio_frac_bits, - vcpu->arch.tsc_scaling_ratio, - &delta_tsc)) - return -ERANGE; - - /* - * If the delta tsc can't fit in the 32 bit after the multi shift, - * we can't use the preemption timer. - * It's possible that it fits on later vmentries, but checking - * on every vmentry is costly so we just use an hrtimer. - */ - if (delta_tsc >> (cpu_preemption_timer_multi + 32)) - return -ERANGE; - - vmx->hv_deadline_tsc = tscl + delta_tsc; - return delta_tsc == 0; -} - -static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) -{ - to_vmx(vcpu)->hv_deadline_tsc = -1; -} -#endif - -static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - if (!kvm_pause_in_guest(vcpu->kvm)) - shrink_ple_window(vcpu); -} - -static void vmx_slot_enable_log_dirty(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - kvm_mmu_slot_leaf_clear_dirty(kvm, slot); - kvm_mmu_slot_largepage_remove_write_access(kvm, slot); -} - -static void vmx_slot_disable_log_dirty(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - kvm_mmu_slot_set_dirty(kvm, slot); -} - -static void vmx_flush_log_dirty(struct kvm *kvm) -{ - kvm_flush_pml_buffers(kvm); -} - -static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12; - struct vcpu_vmx *vmx = to_vmx(vcpu); - gpa_t gpa; - struct page *page = NULL; - u64 *pml_address; - - if (is_guest_mode(vcpu)) { - WARN_ON_ONCE(vmx->nested.pml_full); - - /* - * Check if PML is enabled for the nested guest. - * Whether eptp bit 6 is set is already checked - * as part of A/D emulation. - */ - vmcs12 = get_vmcs12(vcpu); - if (!nested_cpu_has_pml(vmcs12)) - return 0; - - if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { - vmx->nested.pml_full = true; - return 1; - } - - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; - - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address); - if (is_error_page(page)) - return 0; - - pml_address = kmap(page); - pml_address[vmcs12->guest_pml_index--] = gpa; - kunmap(page); - kvm_release_page_clean(page); - } - - return 0; -} - -static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *memslot, - gfn_t offset, unsigned long mask) -{ - kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); -} - -static void __pi_post_block(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - do { - old.control = new.control = pi_desc->control; - WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, - "Wakeup handler not enabled while the VCPU is blocked\n"); - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - - if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - vcpu->pre_pcpu = -1; - } -} - -/* - * This routine does the following things for vCPU which is going - * to be blocked if VT-d PI is enabled. - * - Store the vCPU to the wakeup list, so when interrupts happen - * we can find the right vCPU to wake up. - * - Change the Posted-interrupt descriptor as below: - * 'NDST' <-- vcpu->pre_pcpu - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR - * - If 'ON' is set during this process, which means at least one - * interrupt is posted for this vCPU, we cannot block it, in - * this case, return 1, otherwise, return 0. - * - */ -static int pi_pre_block(struct kvm_vcpu *vcpu) -{ - unsigned int dest; - struct pi_desc old, new; - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return 0; - - WARN_ON(irqs_disabled()); - local_irq_disable(); - if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { - vcpu->pre_pcpu = vcpu->cpu; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - } - - do { - old.control = new.control = pi_desc->control; - - WARN((pi_desc->sn == 1), - "Warning: SN field of posted-interrupts " - "is set before blocking\n"); - - /* - * Since vCPU can be preempted during this process, - * vcpu->cpu could be different with pre_pcpu, we - * need to set pre_pcpu as the destination of wakeup - * notification event, then we can find the right vCPU - * to wakeup in wakeup handler if interrupts happen - * when the vCPU is in blocked state. - */ - dest = cpu_physical_id(vcpu->pre_pcpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* set 'NV' to 'wakeup vector' */ - new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - - /* We should not block the vCPU if an interrupt is posted for it. */ - if (pi_test_on(pi_desc) == 1) - __pi_post_block(vcpu); - - local_irq_enable(); - return (vcpu->pre_pcpu == -1); -} - -static int vmx_pre_block(struct kvm_vcpu *vcpu) -{ - if (pi_pre_block(vcpu)) - return 1; - - if (kvm_lapic_hv_timer_in_use(vcpu)) - kvm_lapic_switch_to_sw_timer(vcpu); - - return 0; -} - -static void pi_post_block(struct kvm_vcpu *vcpu) -{ - if (vcpu->pre_pcpu == -1) - return; - - WARN_ON(irqs_disabled()); - local_irq_disable(); - __pi_post_block(vcpu); - local_irq_enable(); -} - -static void vmx_post_block(struct kvm_vcpu *vcpu) -{ - if (kvm_x86_ops->set_hv_timer) - kvm_lapic_switch_to_hv_timer(vcpu); - - pi_post_block(vcpu); -} - -/* - * vmx_update_pi_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu; - struct vcpu_data vcpu_info; - int idx, ret = 0; - - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) - return 0; - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - if (guest_irq >= irq_rt->nr_rt_entries || - hlist_empty(&irq_rt->map[guest_irq])) { - pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", - guest_irq, irq_rt->nr_rt_entries); - goto out; - } - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; - /* - * VT-d PI cannot support posting multicast/broadcast - * interrupts to a vCPU, we still use interrupt remapping - * for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - * - * We will support full lowest-priority interrupt later. - */ - - kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. - */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - - continue; - } - - vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); - vcpu_info.vector = irq.vector; - - trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, - vcpu_info.vector, vcpu_info.pi_desc_addr, set); - - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else - ret = irq_set_vcpu_affinity(host_irq, NULL); - - if (ret < 0) { - printk(KERN_INFO "%s: failed to update PI IRTE\n", - __func__); - goto out; - } - } - - ret = 0; -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - -static void vmx_setup_mce(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.mcg_cap & MCG_LMCE_P) - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= - FEATURE_CONTROL_LMCE; - else - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= - ~FEATURE_CONTROL_LMCE; -} - -static int vmx_smi_allowed(struct kvm_vcpu *vcpu) -{ - /* we need a nested vmexit to enter SMM, postpone if run is pending */ - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - return 1; -} - -static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - vmx->nested.smm.guest_mode = is_guest_mode(vcpu); - if (vmx->nested.smm.guest_mode) - nested_vmx_vmexit(vcpu, -1, 0, 0); - - vmx->nested.smm.vmxon = vmx->nested.vmxon; - vmx->nested.vmxon = false; - vmx_clear_hlt(vcpu); - return 0; -} - -static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int ret; - - if (vmx->nested.smm.vmxon) { - vmx->nested.vmxon = true; - vmx->nested.smm.vmxon = false; - } - - if (vmx->nested.smm.guest_mode) { - vcpu->arch.hflags &= ~HF_SMM_MASK; - ret = nested_vmx_enter_non_root_mode(vcpu, false); - vcpu->arch.hflags |= HF_SMM_MASK; - if (ret) - return ret; - - vmx->nested.smm.guest_mode = false; - } - return 0; -} - -static int enable_smi_window(struct kvm_vcpu *vcpu) -{ - return 0; -} - -static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * In case we do two consecutive get/set_nested_state()s while L2 was - * running hv_evmcs may end up not being mapped (we map it from - * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode() as we always - * have vmcs12 if it is true. - */ - return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull || - vmx->nested.hv_evmcs; -} - -static int vmx_get_nested_state(struct kvm_vcpu *vcpu, - struct kvm_nested_state __user *user_kvm_nested_state, - u32 user_data_size) -{ - struct vcpu_vmx *vmx; - struct vmcs12 *vmcs12; - struct kvm_nested_state kvm_state = { - .flags = 0, - .format = 0, - .size = sizeof(kvm_state), - .vmx.vmxon_pa = -1ull, - .vmx.vmcs_pa = -1ull, - }; - - if (!vcpu) - return kvm_state.size + 2 * VMCS12_SIZE; - - vmx = to_vmx(vcpu); - vmcs12 = get_vmcs12(vcpu); - - if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled) - kvm_state.flags |= KVM_STATE_NESTED_EVMCS; - - if (nested_vmx_allowed(vcpu) && - (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { - kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; - kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; - - if (vmx_has_valid_vmcs12(vcpu)) { - kvm_state.size += VMCS12_SIZE; - - if (is_guest_mode(vcpu) && - nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) - kvm_state.size += VMCS12_SIZE; - } - - if (vmx->nested.smm.vmxon) - kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; - - if (vmx->nested.smm.guest_mode) - kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; - - if (is_guest_mode(vcpu)) { - kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; - - if (vmx->nested.nested_run_pending) - kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; - } - } - - if (user_data_size < kvm_state.size) - goto out; - - if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) - return -EFAULT; - - if (!vmx_has_valid_vmcs12(vcpu)) - goto out; - - /* - * When running L2, the authoritative vmcs12 state is in the - * vmcs02. When running L1, the authoritative vmcs12 state is - * in the shadow or enlightened vmcs linked to vmcs01, unless - * need_vmcs12_sync is set, in which case, the authoritative - * vmcs12 state is in the vmcs12 already. - */ - if (is_guest_mode(vcpu)) { - sync_vmcs12(vcpu, vmcs12); - } else if (!vmx->nested.need_vmcs12_sync) { - if (vmx->nested.hv_evmcs) - copy_enlightened_to_vmcs12(vmx); - else if (enable_shadow_vmcs) - copy_shadow_to_vmcs12(vmx); - } - - if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) - return -EFAULT; - - if (nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { - if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, - get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) - return -EFAULT; - } - -out: - return kvm_state.size; -} - -static int vmx_set_nested_state(struct kvm_vcpu *vcpu, - struct kvm_nested_state __user *user_kvm_nested_state, - struct kvm_nested_state *kvm_state) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12; - u32 exit_qual; - int ret; - - if (kvm_state->format != 0) - return -EINVAL; - - if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) - nested_enable_evmcs(vcpu, NULL); - - if (!nested_vmx_allowed(vcpu)) - return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL; - - if (kvm_state->vmx.vmxon_pa == -1ull) { - if (kvm_state->vmx.smm.flags) - return -EINVAL; - - if (kvm_state->vmx.vmcs_pa != -1ull) - return -EINVAL; - - vmx_leave_nested(vcpu); - return 0; - } - - if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) - return -EINVAL; - - if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && - (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) - return -EINVAL; - - if (kvm_state->vmx.smm.flags & - ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) - return -EINVAL; - - /* - * SMM temporarily disables VMX, so we cannot be in guest mode, - * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags - * must be zero. - */ - if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) - return -EINVAL; - - if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && - !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) - return -EINVAL; - - vmx_leave_nested(vcpu); - if (kvm_state->vmx.vmxon_pa == -1ull) - return 0; - - vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; - ret = enter_vmx_operation(vcpu); - if (ret) - return ret; - - /* Empty 'VMXON' state is permitted */ - if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) - return 0; - - if (kvm_state->vmx.vmcs_pa != -1ull) { - if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || - !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) - return -EINVAL; - - set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); - } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { - /* - * Sync eVMCS upon entry as we may not have - * HV_X64_MSR_VP_ASSIST_PAGE set up yet. - */ - vmx->nested.need_vmcs12_sync = true; - } else { - return -EINVAL; - } - - if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { - vmx->nested.smm.vmxon = true; - vmx->nested.vmxon = false; - - if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) - vmx->nested.smm.guest_mode = true; - } - - vmcs12 = get_vmcs12(vcpu); - if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) - return -EFAULT; - - if (vmcs12->hdr.revision_id != VMCS12_REVISION) - return -EINVAL; - - if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) - return 0; - - vmx->nested.nested_run_pending = - !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); - - if (nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { - struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); - if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) - return -EINVAL; - - if (copy_from_user(shadow_vmcs12, - user_kvm_nested_state->data + VMCS12_SIZE, - sizeof(*vmcs12))) - return -EFAULT; - - if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || - !shadow_vmcs12->hdr.shadow_vmcs) - return -EINVAL; - } - - if (check_vmentry_prereqs(vcpu, vmcs12) || - check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) - return -EINVAL; - - vmx->nested.dirty_vmcs12 = true; - ret = nested_vmx_enter_non_root_mode(vcpu, false); - if (ret) - return -EINVAL; - - return 0; -} - -static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { - .cpu_has_kvm_support = cpu_has_kvm_support, - .disabled_by_bios = vmx_disabled_by_bios, - .hardware_setup = hardware_setup, - .hardware_unsetup = hardware_unsetup, - .check_processor_compatibility = vmx_check_processor_compat, - .hardware_enable = hardware_enable, - .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = report_flexpriority, - .has_emulated_msr = vmx_has_emulated_msr, - - .vm_init = vmx_vm_init, - .vm_alloc = vmx_vm_alloc, - .vm_free = vmx_vm_free, - - .vcpu_create = vmx_create_vcpu, - .vcpu_free = vmx_free_vcpu, - .vcpu_reset = vmx_vcpu_reset, - - .prepare_guest_switch = vmx_prepare_switch_to_guest, - .vcpu_load = vmx_vcpu_load, - .vcpu_put = vmx_vcpu_put, - - .update_bp_intercept = update_exception_bitmap, - .get_msr_feature = vmx_get_msr_feature, - .get_msr = vmx_get_msr, - .set_msr = vmx_set_msr, - .get_segment_base = vmx_get_segment_base, - .get_segment = vmx_get_segment, - .set_segment = vmx_set_segment, - .get_cpl = vmx_get_cpl, - .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, - .decache_cr3 = vmx_decache_cr3, - .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, - .set_cr0 = vmx_set_cr0, - .set_cr3 = vmx_set_cr3, - .set_cr4 = vmx_set_cr4, - .set_efer = vmx_set_efer, - .get_idt = vmx_get_idt, - .set_idt = vmx_set_idt, - .get_gdt = vmx_get_gdt, - .set_gdt = vmx_set_gdt, - .get_dr6 = vmx_get_dr6, - .set_dr6 = vmx_set_dr6, - .set_dr7 = vmx_set_dr7, - .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, - .cache_reg = vmx_cache_reg, - .get_rflags = vmx_get_rflags, - .set_rflags = vmx_set_rflags, - - .tlb_flush = vmx_flush_tlb, - .tlb_flush_gva = vmx_flush_tlb_gva, - - .run = vmx_vcpu_run, - .handle_exit = vmx_handle_exit, - .skip_emulated_instruction = skip_emulated_instruction, - .set_interrupt_shadow = vmx_set_interrupt_shadow, - .get_interrupt_shadow = vmx_get_interrupt_shadow, - .patch_hypercall = vmx_patch_hypercall, - .set_irq = vmx_inject_irq, - .set_nmi = vmx_inject_nmi, - .queue_exception = vmx_queue_exception, - .cancel_injection = vmx_cancel_injection, - .interrupt_allowed = vmx_interrupt_allowed, - .nmi_allowed = vmx_nmi_allowed, - .get_nmi_mask = vmx_get_nmi_mask, - .set_nmi_mask = vmx_set_nmi_mask, - .enable_nmi_window = enable_nmi_window, - .enable_irq_window = enable_irq_window, - .update_cr8_intercept = update_cr8_intercept, - .set_virtual_apic_mode = vmx_set_virtual_apic_mode, - .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .get_enable_apicv = vmx_get_enable_apicv, - .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, - .load_eoi_exitmap = vmx_load_eoi_exitmap, - .apicv_post_state_restore = vmx_apicv_post_state_restore, - .hwapic_irr_update = vmx_hwapic_irr_update, - .hwapic_isr_update = vmx_hwapic_isr_update, - .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, - .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_posted_interrupt = vmx_deliver_posted_interrupt, - - .set_tss_addr = vmx_set_tss_addr, - .set_identity_map_addr = vmx_set_identity_map_addr, - .get_tdp_level = get_ept_level, - .get_mt_mask = vmx_get_mt_mask, - - .get_exit_info = vmx_get_exit_info, - - .get_lpage_level = vmx_get_lpage_level, - - .cpuid_update = vmx_cpuid_update, - - .rdtscp_supported = vmx_rdtscp_supported, - .invpcid_supported = vmx_invpcid_supported, - - .set_supported_cpuid = vmx_set_supported_cpuid, - - .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - - .read_l1_tsc_offset = vmx_read_l1_tsc_offset, - .write_l1_tsc_offset = vmx_write_l1_tsc_offset, - - .set_tdp_cr3 = vmx_set_cr3, - - .check_intercept = vmx_check_intercept, - .handle_external_intr = vmx_handle_external_intr, - .mpx_supported = vmx_mpx_supported, - .xsaves_supported = vmx_xsaves_supported, - .umip_emulated = vmx_umip_emulated, - - .check_nested_events = vmx_check_nested_events, - .request_immediate_exit = vmx_request_immediate_exit, - - .sched_in = vmx_sched_in, - - .slot_enable_log_dirty = vmx_slot_enable_log_dirty, - .slot_disable_log_dirty = vmx_slot_disable_log_dirty, - .flush_log_dirty = vmx_flush_log_dirty, - .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, - .write_log_dirty = vmx_write_pml_buffer, - - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - - .pmu_ops = &intel_pmu_ops, - - .update_pi_irte = vmx_update_pi_irte, - -#ifdef CONFIG_X86_64 - .set_hv_timer = vmx_set_hv_timer, - .cancel_hv_timer = vmx_cancel_hv_timer, -#endif - - .setup_mce = vmx_setup_mce, - - .get_nested_state = vmx_get_nested_state, - .set_nested_state = vmx_set_nested_state, - .get_vmcs12_pages = nested_get_vmcs12_pages, - - .smi_allowed = vmx_smi_allowed, - .pre_enter_smm = vmx_pre_enter_smm, - .pre_leave_smm = vmx_pre_leave_smm, - .enable_smi_window = enable_smi_window, - - .nested_enable_evmcs = nested_enable_evmcs, -}; - -static void vmx_cleanup_l1d_flush(void) -{ - if (vmx_l1d_flush_pages) { - free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); - vmx_l1d_flush_pages = NULL; - } - /* Restore state so sysfs ignores VMX */ - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; -} - -static void vmx_exit(void) -{ -#ifdef CONFIG_KEXEC_CORE - RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); - synchronize_rcu(); -#endif - - kvm_exit(); - -#if IS_ENABLED(CONFIG_HYPERV) - if (static_branch_unlikely(&enable_evmcs)) { - int cpu; - struct hv_vp_assist_page *vp_ap; - /* - * Reset everything to support using non-enlightened VMCS - * access later (e.g. when we reload the module with - * enlightened_vmcs=0) - */ - for_each_online_cpu(cpu) { - vp_ap = hv_get_vp_assist_page(cpu); - - if (!vp_ap) - continue; - - vp_ap->current_nested_vmcs = 0; - vp_ap->enlighten_vmentry = 0; - } - - static_branch_disable(&enable_evmcs); - } -#endif - vmx_cleanup_l1d_flush(); -} -module_exit(vmx_exit); - -static int __init vmx_init(void) -{ - int r; - -#if IS_ENABLED(CONFIG_HYPERV) - /* - * Enlightened VMCS usage should be recommended and the host needs - * to support eVMCS v1 or above. We can also disable eVMCS support - * with module parameter. - */ - if (enlightened_vmcs && - ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && - (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= - KVM_EVMCS_VERSION) { - int cpu; - - /* Check that we have assist pages on all online CPUs */ - for_each_online_cpu(cpu) { - if (!hv_get_vp_assist_page(cpu)) { - enlightened_vmcs = false; - break; - } - } - - if (enlightened_vmcs) { - pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); - static_branch_enable(&enable_evmcs); - } - } else { - enlightened_vmcs = false; - } -#endif - - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); - if (r) - return r; - - /* - * Must be called after kvm_init() so enable_ept is properly set - * up. Hand the parameter mitigation value in which was stored in - * the pre module init parser. If no parameter was given, it will - * contain 'auto' which will be turned into the default 'cond' - * mitigation mode. - */ - if (boot_cpu_has(X86_BUG_L1TF)) { - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } - } - -#ifdef CONFIG_KEXEC_CORE - rcu_assign_pointer(crash_vmclear_loaded_vmcss, - crash_vmclear_local_loaded_vmcss); -#endif - vmx_check_vmcs12_offsets(); - - return 0; -} -module_init(vmx_init); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c new file mode 100644 index 000000000000..5ab4a364348e --- /dev/null +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -0,0 +1,358 @@ +/* + * KVM PMU support for Intel CPUs + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity + * Gleb Natapov + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ +#include +#include +#include +#include +#include "x86.h" +#include "cpuid.h" +#include "lapic.h" +#include "pmu.h" + +static struct kvm_event_hw_type_mapping intel_arch_events[] = { + /* Index must match CPUID 0x0A.EBX bit vector */ + [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, + [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, + [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, + [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, +}; + +/* mapping between fixed pmc index and intel_arch_events array */ +static int fixed_pmc_events[] = {1, 0, 7}; + +static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) +{ + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + u8 new_ctrl = fixed_ctrl_field(data, i); + u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); + struct kvm_pmc *pmc; + + pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + + if (old_ctrl == new_ctrl) + continue; + + reprogram_fixed_counter(pmc, new_ctrl, i); + } + + pmu->fixed_ctr_ctrl = data; +} + +/* function is called when global control register has been updated. */ +static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) +{ + int bit; + u64 diff = pmu->global_ctrl ^ data; + + pmu->global_ctrl = data; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + reprogram_counter(pmu, bit); +} + +static unsigned intel_find_arch_event(struct kvm_pmu *pmu, + u8 event_select, + u8 unit_mask) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) + if (intel_arch_events[i].eventsel == event_select + && intel_arch_events[i].unit_mask == unit_mask + && (pmu->available_event_types & (1 << i))) + break; + + if (i == ARRAY_SIZE(intel_arch_events)) + return PERF_COUNT_HW_MAX; + + return intel_arch_events[i].event_type; +} + +static unsigned intel_find_fixed_event(int idx) +{ + if (idx >= ARRAY_SIZE(fixed_pmc_events)) + return PERF_COUNT_HW_MAX; + + return intel_arch_events[fixed_pmc_events[idx]].event_type; +} + +/* check if a PMC is enabled by comparing it with globl_ctrl bits. */ +static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +} + +static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + if (pmc_idx < INTEL_PMC_IDX_FIXED) + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, + MSR_P6_EVNTSEL0); + else { + u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + + return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); + } +} + +/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ +static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + bool fixed = idx & (1u << 30); + + idx &= ~(3u << 30); + + return (!fixed && idx >= pmu->nr_arch_gp_counters) || + (fixed && idx >= pmu->nr_arch_fixed_counters); +} + +static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, + unsigned idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + bool fixed = idx & (1u << 30); + struct kvm_pmc *counters; + + idx &= ~(3u << 30); + if (!fixed && idx >= pmu->nr_arch_gp_counters) + return NULL; + if (fixed && idx >= pmu->nr_arch_fixed_counters) + return NULL; + counters = fixed ? pmu->fixed_counters : pmu->gp_counters; + + return &counters[idx]; +} + +static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int ret; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + ret = pmu->version > 1; + break; + default: + ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || + get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || + get_fixed_pmc(pmu, msr); + break; + } + + return ret; +} + +static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + *data = pmu->fixed_ctr_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_STATUS: + *data = pmu->global_status; + return 0; + case MSR_CORE_PERF_GLOBAL_CTRL: + *data = pmu->global_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + *data = pmu->global_ovf_ctrl; + return 0; + default: + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, msr))) { + *data = pmc_read_counter(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { + *data = pmc->eventsel; + return 0; + } + } + + return 1; +} + +static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 msr = msr_info->index; + u64 data = msr_info->data; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + if (pmu->fixed_ctr_ctrl == data) + return 0; + if (!(data & 0xfffffffffffff444ull)) { + reprogram_fixed_counters(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + if (msr_info->host_initiated) { + pmu->global_status = data; + return 0; + } + break; /* RO MSR */ + case MSR_CORE_PERF_GLOBAL_CTRL: + if (pmu->global_ctrl == data) + return 0; + if (!(data & pmu->global_ctrl_mask)) { + global_ctrl_changed(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { + if (!msr_info->host_initiated) + pmu->global_status &= ~data; + pmu->global_ovf_ctrl = data; + return 0; + } + break; + default: + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, msr))) { + if (!msr_info->host_initiated) + data = (s64)(s32)data; + pmc->counter += data - pmc_read_counter(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { + if (data == pmc->eventsel) + return 0; + if (!(data & pmu->reserved_bits)) { + reprogram_gp_counter(pmc, data); + return 0; + } + } + } + + return 1; +} + +static void intel_pmu_refresh(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_cpuid_entry2 *entry; + union cpuid10_eax eax; + union cpuid10_edx edx; + + pmu->nr_arch_gp_counters = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->counter_bitmask[KVM_PMC_GP] = 0; + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->version = 0; + pmu->reserved_bits = 0xffffffff00200000ull; + + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + if (!entry) + return; + eax.full = entry->eax; + edx.full = entry->edx; + + pmu->version = eax.split.version_id; + if (!pmu->version) + return; + + pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, + INTEL_PMC_MAX_GENERIC); + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; + pmu->available_event_types = ~entry->ebx & + ((1ull << eax.split.mask_length) - 1); + + if (pmu->version == 1) { + pmu->nr_arch_fixed_counters = 0; + } else { + pmu->nr_arch_fixed_counters = + min_t(int, edx.split.num_counters_fixed, + INTEL_PMC_MAX_FIXED); + pmu->counter_bitmask[KVM_PMC_FIXED] = + ((u64)1 << edx.split.bit_width_fixed) - 1; + } + + pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); + pmu->global_ctrl_mask = ~pmu->global_ctrl; + + entry = kvm_find_cpuid_entry(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) + pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; +} + +static void intel_pmu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + pmu->gp_counters[i].type = KVM_PMC_GP; + pmu->gp_counters[i].vcpu = vcpu; + pmu->gp_counters[i].idx = i; + } + + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { + pmu->fixed_counters[i].type = KVM_PMC_FIXED; + pmu->fixed_counters[i].vcpu = vcpu; + pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + } +} + +static void intel_pmu_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int i; + + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + struct kvm_pmc *pmc = &pmu->gp_counters[i]; + + pmc_stop_counter(pmc); + pmc->counter = pmc->eventsel = 0; + } + + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) + pmc_stop_counter(&pmu->fixed_counters[i]); + + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = + pmu->global_ovf_ctrl = 0; +} + +struct kvm_pmu_ops intel_pmu_ops = { + .find_arch_event = intel_find_arch_event, + .find_fixed_event = intel_find_fixed_event, + .pmc_is_enabled = intel_pmc_is_enabled, + .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, + .msr_idx_to_pmc = intel_msr_idx_to_pmc, + .is_valid_msr_idx = intel_is_valid_msr_idx, + .is_valid_msr = intel_is_valid_msr, + .get_msr = intel_pmu_get_msr, + .set_msr = intel_pmu_set_msr, + .refresh = intel_pmu_refresh, + .init = intel_pmu_init, + .reset = intel_pmu_reset, +}; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c new file mode 100644 index 000000000000..589230c923e2 --- /dev/null +++ b/arch/x86/kvm/vmx/vmx.c @@ -0,0 +1,15295 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity + * Yaniv Kamay + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpuid.h" +#include "hyperv.h" +#include "irq.h" +#include "kvm_cache_regs.h" +#include "lapic.h" +#include "mmu.h" +#include "pmu.h" +#include "trace.h" +#include "vmx_evmcs.h" +#include "x86.h" + +#define __ex(x) __kvm_handle_fault_on_reboot(x) +#define __ex_clear(x, reg) \ + ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +static const struct x86_cpu_id vmx_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_VMX), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); + +static bool __read_mostly enable_vpid = 1; +module_param_named(vpid, enable_vpid, bool, 0444); + +static bool __read_mostly enable_vnmi = 1; +module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); + +static bool __read_mostly flexpriority_enabled = 1; +module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); + +static bool __read_mostly enable_ept = 1; +module_param_named(ept, enable_ept, bool, S_IRUGO); + +static bool __read_mostly enable_unrestricted_guest = 1; +module_param_named(unrestricted_guest, + enable_unrestricted_guest, bool, S_IRUGO); + +static bool __read_mostly enable_ept_ad_bits = 1; +module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); + +static bool __read_mostly emulate_invalid_guest_state = true; +module_param(emulate_invalid_guest_state, bool, S_IRUGO); + +static bool __read_mostly fasteoi = 1; +module_param(fasteoi, bool, S_IRUGO); + +static bool __read_mostly enable_apicv = 1; +module_param(enable_apicv, bool, S_IRUGO); + +static bool __read_mostly enable_shadow_vmcs = 1; +module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); +/* + * If nested=1, nested virtualization is supported, i.e., guests may use + * VMX and be a hypervisor for its own guests. If nested=0, guests may not + * use VMX instructions. + */ +static bool __read_mostly nested = 1; +module_param(nested, bool, S_IRUGO); + +static bool __read_mostly nested_early_check = 0; +module_param(nested_early_check, bool, S_IRUGO); + +static u64 __read_mostly host_xss; + +static bool __read_mostly enable_pml = 1; +module_param_named(pml, enable_pml, bool, S_IRUGO); + +#define MSR_TYPE_R 1 +#define MSR_TYPE_W 2 +#define MSR_TYPE_RW 3 + +#define MSR_BITMAP_MODE_X2APIC 1 +#define MSR_BITMAP_MODE_X2APIC_APICV 2 + +#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL + +/* Guest_tsc -> host_tsc conversion requires 64-bit division. */ +static int __read_mostly cpu_preemption_timer_multi; +static bool __read_mostly enable_preemption_timer = 1; +#ifdef CONFIG_X86_64 +module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); +#endif + +#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ + X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) +#define KVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) + +#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) + +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + +/* + * Hyper-V requires all of these, so mark them as supported even though + * they are just treated the same as all-context. + */ +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + +/* + * These 2 parameters are used to config the controls for Pause-Loop Exiting: + * ple_gap: upper bound on the amount of time between two successive + * executions of PAUSE in a loop. Also indicate if ple enabled. + * According to test, this time is usually smaller than 128 cycles. + * ple_window: upper bound on the amount of time a guest is allowed to execute + * in a PAUSE loop. Tests indicate that most spinlocks are held for + * less than 2^12 cycles + * Time is measured based on a counter that runs at the same rate as the TSC, + * refer SDM volume 3b section 21.6.13 & 22.1.3. + */ +static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; +module_param(ple_gap, uint, 0444); + +static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; +module_param(ple_window, uint, 0444); + +/* Default doubles per-vcpu window every exit. */ +static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; +module_param(ple_window_grow, uint, 0444); + +/* Default resets per-vcpu window every exit to ple_window. */ +static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; +module_param(ple_window_shrink, uint, 0444); + +/* Default is to compute the maximum so we can never overflow. */ +static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +module_param(ple_window_max, uint, 0444); + +extern const ulong vmx_return; +extern const ulong vmx_early_consistency_check_return; + +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); +static DEFINE_MUTEX(vmx_l1d_flush_mutex); + +/* Storage for pre module init parameter parsing */ +static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; + +static const struct { + const char *option; + bool for_parse; +} vmentry_l1d_param[] = { + [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, + [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, + [VMENTER_L1D_FLUSH_COND] = {"cond", true}, + [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, + [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, + [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, +}; + +#define L1D_CACHE_ORDER 4 +static void *vmx_l1d_flush_pages; + +static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) +{ + struct page *page; + unsigned int i; + + if (!enable_ept) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; + return 0; + } + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { + u64 msr; + + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); + if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + } + + /* If set to auto use the default l1tf mitigation method */ + if (l1tf == VMENTER_L1D_FLUSH_AUTO) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + l1tf = VMENTER_L1D_FLUSH_NEVER; + break; + case L1TF_MITIGATION_FLUSH_NOWARN: + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + l1tf = VMENTER_L1D_FLUSH_COND; + break; + case L1TF_MITIGATION_FULL: + case L1TF_MITIGATION_FULL_FORCE: + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + break; + } + } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + } + + if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && + !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { + page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); + if (!page) + return -ENOMEM; + vmx_l1d_flush_pages = page_address(page); + + /* + * Initialize each page with a different pattern in + * order to protect against KSM in the nested + * virtualization case. + */ + for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { + memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, + PAGE_SIZE); + } + } + + l1tf_vmx_mitigation = l1tf; + + if (l1tf != VMENTER_L1D_FLUSH_NEVER) + static_branch_enable(&vmx_l1d_should_flush); + else + static_branch_disable(&vmx_l1d_should_flush); + + if (l1tf == VMENTER_L1D_FLUSH_COND) + static_branch_enable(&vmx_l1d_flush_cond); + else + static_branch_disable(&vmx_l1d_flush_cond); + return 0; +} + +static int vmentry_l1d_flush_parse(const char *s) +{ + unsigned int i; + + if (s) { + for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { + if (vmentry_l1d_param[i].for_parse && + sysfs_streq(s, vmentry_l1d_param[i].option)) + return i; + } + } + return -EINVAL; +} + +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) +{ + int l1tf, ret; + + l1tf = vmentry_l1d_flush_parse(s); + if (l1tf < 0) + return l1tf; + + if (!boot_cpu_has(X86_BUG_L1TF)) + return 0; + + /* + * Has vmx_init() run already? If not then this is the pre init + * parameter parsing. In that case just store the value and let + * vmx_init() do the proper setup after enable_ept has been + * established. + */ + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { + vmentry_l1d_flush_param = l1tf; + return 0; + } + + mutex_lock(&vmx_l1d_flush_mutex); + ret = vmx_setup_l1d_flush(l1tf); + mutex_unlock(&vmx_l1d_flush_mutex); + return ret; +} + +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) +{ + if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) + return sprintf(s, "???\n"); + + return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); +} + +static const struct kernel_param_ops vmentry_l1d_flush_ops = { + .set = vmentry_l1d_flush_set, + .get = vmentry_l1d_flush_get, +}; +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); + +enum ept_pointers_status { + EPT_POINTERS_CHECK = 0, + EPT_POINTERS_MATCH = 1, + EPT_POINTERS_MISMATCH = 2 +}; + +struct kvm_vmx { + struct kvm kvm; + + unsigned int tss_addr; + bool ept_identity_pagetable_done; + gpa_t ept_identity_map_addr; + + enum ept_pointers_status ept_pointers_match; + spinlock_t ept_pointer_lock; +}; + +#define NR_AUTOLOAD_MSRS 8 + +struct vmcs_hdr { + u32 revision_id:31; + u32 shadow_vmcs:1; +}; + +struct vmcs { + struct vmcs_hdr hdr; + u32 abort; + char data[0]; +}; + +/* + * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT + * and whose values change infrequently, but are not constant. I.e. this is + * used as a write-through cache of the corresponding VMCS fields. + */ +struct vmcs_host_state { + unsigned long cr3; /* May not match real cr3 */ + unsigned long cr4; /* May not match real cr4 */ + unsigned long gs_base; + unsigned long fs_base; + + u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif +}; + +/* + * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also + * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs + * loaded on this CPU (so we can clear them if the CPU goes down). + */ +struct loaded_vmcs { + struct vmcs *vmcs; + struct vmcs *shadow_vmcs; + int cpu; + bool launched; + bool nmi_known_unmasked; + bool hv_timer_armed; + /* Support for vnmi-less CPUs */ + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; + unsigned long *msr_bitmap; + struct list_head loaded_vmcss_on_cpu_link; + struct vmcs_host_state host_state; +}; + +struct shared_msr_entry { + unsigned index; + u64 data; + u64 mask; +}; + +/* + * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a + * single nested guest (L2), hence the name vmcs12. Any VMX implementation has + * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is + * stored in guest memory specified by VMPTRLD, but is opaque to the guest, + * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. + * More than one of these structures may exist, if L1 runs multiple L2 guests. + * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the + * underlying hardware which will be used to run L2. + * This structure is packed to ensure that its layout is identical across + * machines (necessary for live migration). + * + * IMPORTANT: Changing the layout of existing fields in this structure + * will break save/restore compatibility with older kvm releases. When + * adding new fields, either use space in the reserved padding* arrays + * or add the new fields to the end of the structure. + */ +typedef u64 natural_width; +struct __packed vmcs12 { + /* According to the Intel spec, a VMCS region must start with the + * following two fields. Then follow implementation-specific data. + */ + struct vmcs_hdr hdr; + u32 abort; + + u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ + u32 padding[7]; /* room for future expansion */ + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 apic_access_addr; + u64 posted_intr_desc_addr; + u64 ept_pointer; + u64 eoi_exit_bitmap0; + u64 eoi_exit_bitmap1; + u64 eoi_exit_bitmap2; + u64 eoi_exit_bitmap3; + u64 xss_exit_bitmap; + u64 guest_physical_address; + u64 vmcs_link_pointer; + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + u64 guest_ia32_perf_global_ctrl; + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + u64 guest_bndcfgs; + u64 host_ia32_pat; + u64 host_ia32_efer; + u64 host_ia32_perf_global_ctrl; + u64 vmread_bitmap; + u64 vmwrite_bitmap; + u64 vm_function_control; + u64 eptp_list_address; + u64 pml_address; + u64 padding64[3]; /* room for future expansion */ + /* + * To allow migration of L1 (complete with its L2 guests) between + * machines of different natural widths (32 or 64 bit), we cannot have + * unsigned long fields with no explict size. We use u64 (aliased + * natural_width) instead. Luckily, x86 is little-endian. + */ + natural_width cr0_guest_host_mask; + natural_width cr4_guest_host_mask; + natural_width cr0_read_shadow; + natural_width cr4_read_shadow; + natural_width cr3_target_value0; + natural_width cr3_target_value1; + natural_width cr3_target_value2; + natural_width cr3_target_value3; + natural_width exit_qualification; + natural_width guest_linear_address; + natural_width guest_cr0; + natural_width guest_cr3; + natural_width guest_cr4; + natural_width guest_es_base; + natural_width guest_cs_base; + natural_width guest_ss_base; + natural_width guest_ds_base; + natural_width guest_fs_base; + natural_width guest_gs_base; + natural_width guest_ldtr_base; + natural_width guest_tr_base; + natural_width guest_gdtr_base; + natural_width guest_idtr_base; + natural_width guest_dr7; + natural_width guest_rsp; + natural_width guest_rip; + natural_width guest_rflags; + natural_width guest_pending_dbg_exceptions; + natural_width guest_sysenter_esp; + natural_width guest_sysenter_eip; + natural_width host_cr0; + natural_width host_cr3; + natural_width host_cr4; + natural_width host_fs_base; + natural_width host_gs_base; + natural_width host_tr_base; + natural_width host_gdtr_base; + natural_width host_idtr_base; + natural_width host_ia32_sysenter_esp; + natural_width host_ia32_sysenter_eip; + natural_width host_rsp; + natural_width host_rip; + natural_width paddingl[8]; /* room for future expansion */ + u32 pin_based_vm_exec_control; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + u32 cr3_target_count; + u32 vm_exit_controls; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_controls; + u32 vm_entry_msr_load_count; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + u32 secondary_vm_exec_control; + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + u32 guest_interruptibility_info; + u32 guest_activity_state; + u32 guest_sysenter_cs; + u32 host_ia32_sysenter_cs; + u32 vmx_preemption_timer_value; + u32 padding32[7]; /* room for future expansion */ + u16 virtual_processor_id; + u16 posted_intr_nv; + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + u16 guest_intr_status; + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + u16 guest_pml_index; +}; + +/* + * For save/restore compatibility, the vmcs12 field offsets must not change. + */ +#define CHECK_OFFSET(field, loc) \ + BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ + "Offset of " #field " in struct vmcs12 has changed.") + +static inline void vmx_check_vmcs12_offsets(void) { + CHECK_OFFSET(hdr, 0); + CHECK_OFFSET(abort, 4); + CHECK_OFFSET(launch_state, 8); + CHECK_OFFSET(io_bitmap_a, 40); + CHECK_OFFSET(io_bitmap_b, 48); + CHECK_OFFSET(msr_bitmap, 56); + CHECK_OFFSET(vm_exit_msr_store_addr, 64); + CHECK_OFFSET(vm_exit_msr_load_addr, 72); + CHECK_OFFSET(vm_entry_msr_load_addr, 80); + CHECK_OFFSET(tsc_offset, 88); + CHECK_OFFSET(virtual_apic_page_addr, 96); + CHECK_OFFSET(apic_access_addr, 104); + CHECK_OFFSET(posted_intr_desc_addr, 112); + CHECK_OFFSET(ept_pointer, 120); + CHECK_OFFSET(eoi_exit_bitmap0, 128); + CHECK_OFFSET(eoi_exit_bitmap1, 136); + CHECK_OFFSET(eoi_exit_bitmap2, 144); + CHECK_OFFSET(eoi_exit_bitmap3, 152); + CHECK_OFFSET(xss_exit_bitmap, 160); + CHECK_OFFSET(guest_physical_address, 168); + CHECK_OFFSET(vmcs_link_pointer, 176); + CHECK_OFFSET(guest_ia32_debugctl, 184); + CHECK_OFFSET(guest_ia32_pat, 192); + CHECK_OFFSET(guest_ia32_efer, 200); + CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208); + CHECK_OFFSET(guest_pdptr0, 216); + CHECK_OFFSET(guest_pdptr1, 224); + CHECK_OFFSET(guest_pdptr2, 232); + CHECK_OFFSET(guest_pdptr3, 240); + CHECK_OFFSET(guest_bndcfgs, 248); + CHECK_OFFSET(host_ia32_pat, 256); + CHECK_OFFSET(host_ia32_efer, 264); + CHECK_OFFSET(host_ia32_perf_global_ctrl, 272); + CHECK_OFFSET(vmread_bitmap, 280); + CHECK_OFFSET(vmwrite_bitmap, 288); + CHECK_OFFSET(vm_function_control, 296); + CHECK_OFFSET(eptp_list_address, 304); + CHECK_OFFSET(pml_address, 312); + CHECK_OFFSET(cr0_guest_host_mask, 344); + CHECK_OFFSET(cr4_guest_host_mask, 352); + CHECK_OFFSET(cr0_read_shadow, 360); + CHECK_OFFSET(cr4_read_shadow, 368); + CHECK_OFFSET(cr3_target_value0, 376); + CHECK_OFFSET(cr3_target_value1, 384); + CHECK_OFFSET(cr3_target_value2, 392); + CHECK_OFFSET(cr3_target_value3, 400); + CHECK_OFFSET(exit_qualification, 408); + CHECK_OFFSET(guest_linear_address, 416); + CHECK_OFFSET(guest_cr0, 424); + CHECK_OFFSET(guest_cr3, 432); + CHECK_OFFSET(guest_cr4, 440); + CHECK_OFFSET(guest_es_base, 448); + CHECK_OFFSET(guest_cs_base, 456); + CHECK_OFFSET(guest_ss_base, 464); + CHECK_OFFSET(guest_ds_base, 472); + CHECK_OFFSET(guest_fs_base, 480); + CHECK_OFFSET(guest_gs_base, 488); + CHECK_OFFSET(guest_ldtr_base, 496); + CHECK_OFFSET(guest_tr_base, 504); + CHECK_OFFSET(guest_gdtr_base, 512); + CHECK_OFFSET(guest_idtr_base, 520); + CHECK_OFFSET(guest_dr7, 528); + CHECK_OFFSET(guest_rsp, 536); + CHECK_OFFSET(guest_rip, 544); + CHECK_OFFSET(guest_rflags, 552); + CHECK_OFFSET(guest_pending_dbg_exceptions, 560); + CHECK_OFFSET(guest_sysenter_esp, 568); + CHECK_OFFSET(guest_sysenter_eip, 576); + CHECK_OFFSET(host_cr0, 584); + CHECK_OFFSET(host_cr3, 592); + CHECK_OFFSET(host_cr4, 600); + CHECK_OFFSET(host_fs_base, 608); + CHECK_OFFSET(host_gs_base, 616); + CHECK_OFFSET(host_tr_base, 624); + CHECK_OFFSET(host_gdtr_base, 632); + CHECK_OFFSET(host_idtr_base, 640); + CHECK_OFFSET(host_ia32_sysenter_esp, 648); + CHECK_OFFSET(host_ia32_sysenter_eip, 656); + CHECK_OFFSET(host_rsp, 664); + CHECK_OFFSET(host_rip, 672); + CHECK_OFFSET(pin_based_vm_exec_control, 744); + CHECK_OFFSET(cpu_based_vm_exec_control, 748); + CHECK_OFFSET(exception_bitmap, 752); + CHECK_OFFSET(page_fault_error_code_mask, 756); + CHECK_OFFSET(page_fault_error_code_match, 760); + CHECK_OFFSET(cr3_target_count, 764); + CHECK_OFFSET(vm_exit_controls, 768); + CHECK_OFFSET(vm_exit_msr_store_count, 772); + CHECK_OFFSET(vm_exit_msr_load_count, 776); + CHECK_OFFSET(vm_entry_controls, 780); + CHECK_OFFSET(vm_entry_msr_load_count, 784); + CHECK_OFFSET(vm_entry_intr_info_field, 788); + CHECK_OFFSET(vm_entry_exception_error_code, 792); + CHECK_OFFSET(vm_entry_instruction_len, 796); + CHECK_OFFSET(tpr_threshold, 800); + CHECK_OFFSET(secondary_vm_exec_control, 804); + CHECK_OFFSET(vm_instruction_error, 808); + CHECK_OFFSET(vm_exit_reason, 812); + CHECK_OFFSET(vm_exit_intr_info, 816); + CHECK_OFFSET(vm_exit_intr_error_code, 820); + CHECK_OFFSET(idt_vectoring_info_field, 824); + CHECK_OFFSET(idt_vectoring_error_code, 828); + CHECK_OFFSET(vm_exit_instruction_len, 832); + CHECK_OFFSET(vmx_instruction_info, 836); + CHECK_OFFSET(guest_es_limit, 840); + CHECK_OFFSET(guest_cs_limit, 844); + CHECK_OFFSET(guest_ss_limit, 848); + CHECK_OFFSET(guest_ds_limit, 852); + CHECK_OFFSET(guest_fs_limit, 856); + CHECK_OFFSET(guest_gs_limit, 860); + CHECK_OFFSET(guest_ldtr_limit, 864); + CHECK_OFFSET(guest_tr_limit, 868); + CHECK_OFFSET(guest_gdtr_limit, 872); + CHECK_OFFSET(guest_idtr_limit, 876); + CHECK_OFFSET(guest_es_ar_bytes, 880); + CHECK_OFFSET(guest_cs_ar_bytes, 884); + CHECK_OFFSET(guest_ss_ar_bytes, 888); + CHECK_OFFSET(guest_ds_ar_bytes, 892); + CHECK_OFFSET(guest_fs_ar_bytes, 896); + CHECK_OFFSET(guest_gs_ar_bytes, 900); + CHECK_OFFSET(guest_ldtr_ar_bytes, 904); + CHECK_OFFSET(guest_tr_ar_bytes, 908); + CHECK_OFFSET(guest_interruptibility_info, 912); + CHECK_OFFSET(guest_activity_state, 916); + CHECK_OFFSET(guest_sysenter_cs, 920); + CHECK_OFFSET(host_ia32_sysenter_cs, 924); + CHECK_OFFSET(vmx_preemption_timer_value, 928); + CHECK_OFFSET(virtual_processor_id, 960); + CHECK_OFFSET(posted_intr_nv, 962); + CHECK_OFFSET(guest_es_selector, 964); + CHECK_OFFSET(guest_cs_selector, 966); + CHECK_OFFSET(guest_ss_selector, 968); + CHECK_OFFSET(guest_ds_selector, 970); + CHECK_OFFSET(guest_fs_selector, 972); + CHECK_OFFSET(guest_gs_selector, 974); + CHECK_OFFSET(guest_ldtr_selector, 976); + CHECK_OFFSET(guest_tr_selector, 978); + CHECK_OFFSET(guest_intr_status, 980); + CHECK_OFFSET(host_es_selector, 982); + CHECK_OFFSET(host_cs_selector, 984); + CHECK_OFFSET(host_ss_selector, 986); + CHECK_OFFSET(host_ds_selector, 988); + CHECK_OFFSET(host_fs_selector, 990); + CHECK_OFFSET(host_gs_selector, 992); + CHECK_OFFSET(host_tr_selector, 994); + CHECK_OFFSET(guest_pml_index, 996); +} + +/* + * VMCS12_REVISION is an arbitrary id that should be changed if the content or + * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and + * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. + * + * IMPORTANT: Changing this value will break save/restore compatibility with + * older kvm releases. + */ +#define VMCS12_REVISION 0x11e57ed0 + +/* + * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region + * and any VMCS region. Although only sizeof(struct vmcs12) are used by the + * current implementation, 4K are reserved to avoid future complications. + */ +#define VMCS12_SIZE 0x1000 + +/* + * VMCS12_MAX_FIELD_INDEX is the highest index value used in any + * supported VMCS12 field encoding. + */ +#define VMCS12_MAX_FIELD_INDEX 0x17 + +struct nested_vmx_msrs { + /* + * We only store the "true" versions of the VMX capability MSRs. We + * generate the "non-true" versions by setting the must-be-1 bits + * according to the SDM. + */ + u32 procbased_ctls_low; + u32 procbased_ctls_high; + u32 secondary_ctls_low; + u32 secondary_ctls_high; + u32 pinbased_ctls_low; + u32 pinbased_ctls_high; + u32 exit_ctls_low; + u32 exit_ctls_high; + u32 entry_ctls_low; + u32 entry_ctls_high; + u32 misc_low; + u32 misc_high; + u32 ept_caps; + u32 vpid_caps; + u64 basic; + u64 cr0_fixed0; + u64 cr0_fixed1; + u64 cr4_fixed0; + u64 cr4_fixed1; + u64 vmcs_enum; + u64 vmfunc_controls; +}; + +/* + * The nested_vmx structure is part of vcpu_vmx, and holds information we need + * for correct emulation of VMX (i.e., nested VMX) on this vcpu. + */ +struct nested_vmx { + /* Has the level1 guest done vmxon? */ + bool vmxon; + gpa_t vmxon_ptr; + bool pml_full; + + /* The guest-physical address of the current VMCS L1 keeps for L2 */ + gpa_t current_vmptr; + /* + * Cache of the guest's VMCS, existing outside of guest memory. + * Loaded from guest memory during VMPTRLD. Flushed to guest + * memory during VMCLEAR and VMPTRLD. + */ + struct vmcs12 *cached_vmcs12; + /* + * Cache of the guest's shadow VMCS, existing outside of guest + * memory. Loaded from guest memory during VM entry. Flushed + * to guest memory during VM exit. + */ + struct vmcs12 *cached_shadow_vmcs12; + /* + * Indicates if the shadow vmcs or enlightened vmcs must be updated + * with the data held by struct vmcs12. + */ + bool need_vmcs12_sync; + bool dirty_vmcs12; + + /* + * vmcs02 has been initialized, i.e. state that is constant for + * vmcs02 has been written to the backing VMCS. Initialization + * is delayed until L1 actually attempts to run a nested VM. + */ + bool vmcs02_initialized; + + bool change_vmcs01_virtual_apic_mode; + + /* + * Enlightened VMCS has been enabled. It does not mean that L1 has to + * use it. However, VMX features available to L1 will be limited based + * on what the enlightened VMCS supports. + */ + bool enlightened_vmcs_enabled; + + /* L2 must run next, and mustn't decide to exit to L1. */ + bool nested_run_pending; + + struct loaded_vmcs vmcs02; + + /* + * Guest pages referred to in the vmcs02 with host-physical + * pointers, so we must keep them pinned while L2 runs. + */ + struct page *apic_access_page; + struct page *virtual_apic_page; + struct page *pi_desc_page; + struct pi_desc *pi_desc; + bool pi_pending; + u16 posted_intr_nv; + + struct hrtimer preemption_timer; + bool preemption_timer_expired; + + /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ + u64 vmcs01_debugctl; + u64 vmcs01_guest_bndcfgs; + + u16 vpid02; + u16 last_vpid; + + struct nested_vmx_msrs msrs; + + /* SMM related state */ + struct { + /* in VMX operation on SMM entry? */ + bool vmxon; + /* in guest mode on SMM entry? */ + bool guest_mode; + } smm; + + gpa_t hv_evmcs_vmptr; + struct page *hv_evmcs_page; + struct hv_enlightened_vmcs *hv_evmcs; +}; + +#define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; +} __aligned(64); + +static bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + return clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + return set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +struct vmx_msrs { + unsigned int nr; + struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; +}; + +struct vcpu_vmx { + struct kvm_vcpu vcpu; + unsigned long host_rsp; + u8 fail; + u8 msr_bitmap_mode; + u32 exit_intr_info; + u32 idt_vectoring_info; + ulong rflags; + struct shared_msr_entry *guest_msrs; + int nmsrs; + int save_nmsrs; + bool guest_msrs_dirty; + unsigned long host_idt_base; +#ifdef CONFIG_X86_64 + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; +#endif + + u64 arch_capabilities; + u64 spec_ctrl; + + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; + u32 secondary_exec_control; + + /* + * loaded_vmcs points to the VMCS currently used in this vcpu. For a + * non-nested (L1) guest, it always points to vmcs01. For a nested + * guest (L2), it points to a different VMCS. loaded_cpu_state points + * to the VMCS whose state is loaded into the CPU registers that only + * need to be switched when transitioning to/from the kernel; a NULL + * value indicates that host state is loaded. + */ + struct loaded_vmcs vmcs01; + struct loaded_vmcs *loaded_vmcs; + struct loaded_vmcs *loaded_cpu_state; + bool __launched; /* temporary, used in vmx_vcpu_run */ + struct msr_autoload { + struct vmx_msrs guest; + struct vmx_msrs host; + } msr_autoload; + + struct { + int vm86_active; + ulong save_rflags; + struct kvm_segment segs[8]; + } rmode; + struct { + u32 bitmask; /* 4 bits per segment (1 bit per field) */ + struct kvm_save_segment { + u16 selector; + unsigned long base; + u32 limit; + u32 ar; + } seg[8]; + } segment_cache; + int vpid; + bool emulation_required; + + u32 exit_reason; + + /* Posted interrupt descriptor */ + struct pi_desc pi_desc; + + /* Support for a guest hypervisor (nested VMX) */ + struct nested_vmx nested; + + /* Dynamic PLE window. */ + int ple_window; + bool ple_window_dirty; + + bool req_immediate_exit; + + /* Support for PML */ +#define PML_ENTITY_NUM 512 + struct page *pml_pg; + + /* apic deadline value in host tsc */ + u64 hv_deadline_tsc; + + u64 current_tsc_ratio; + + u32 host_pkru; + + unsigned long host_debugctlmsr; + + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included + * in msr_ia32_feature_control_valid_bits. + */ + u64 msr_ia32_feature_control; + u64 msr_ia32_feature_control_valid_bits; + u64 ept_pointer; +}; + +enum segment_cache_field { + SEG_FIELD_SEL = 0, + SEG_FIELD_BASE = 1, + SEG_FIELD_LIMIT = 2, + SEG_FIELD_AR = 3, + + SEG_FIELD_NR = 4 +}; + +static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) +{ + return container_of(kvm, struct kvm_vmx, kvm); +} + +static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_vmx, vcpu); +} + +static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) +#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) +#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) +#define FIELD64(number, name) \ + FIELD(number, name), \ + [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) + + +static u16 shadow_read_only_fields[] = { +#define SHADOW_FIELD_RO(x) x, +#include "vmx_shadow_fields.h" +}; +static int max_shadow_read_only_fields = + ARRAY_SIZE(shadow_read_only_fields); + +static u16 shadow_read_write_fields[] = { +#define SHADOW_FIELD_RW(x) x, +#include "vmx_shadow_fields.h" +}; +static int max_shadow_read_write_fields = + ARRAY_SIZE(shadow_read_write_fields); + +static const unsigned short vmcs_field_to_offset_table[] = { + FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), + FIELD(POSTED_INTR_NV, posted_intr_nv), + FIELD(GUEST_ES_SELECTOR, guest_es_selector), + FIELD(GUEST_CS_SELECTOR, guest_cs_selector), + FIELD(GUEST_SS_SELECTOR, guest_ss_selector), + FIELD(GUEST_DS_SELECTOR, guest_ds_selector), + FIELD(GUEST_FS_SELECTOR, guest_fs_selector), + FIELD(GUEST_GS_SELECTOR, guest_gs_selector), + FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), + FIELD(GUEST_TR_SELECTOR, guest_tr_selector), + FIELD(GUEST_INTR_STATUS, guest_intr_status), + FIELD(GUEST_PML_INDEX, guest_pml_index), + FIELD(HOST_ES_SELECTOR, host_es_selector), + FIELD(HOST_CS_SELECTOR, host_cs_selector), + FIELD(HOST_SS_SELECTOR, host_ss_selector), + FIELD(HOST_DS_SELECTOR, host_ds_selector), + FIELD(HOST_FS_SELECTOR, host_fs_selector), + FIELD(HOST_GS_SELECTOR, host_gs_selector), + FIELD(HOST_TR_SELECTOR, host_tr_selector), + FIELD64(IO_BITMAP_A, io_bitmap_a), + FIELD64(IO_BITMAP_B, io_bitmap_b), + FIELD64(MSR_BITMAP, msr_bitmap), + FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), + FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), + FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), + FIELD64(PML_ADDRESS, pml_address), + FIELD64(TSC_OFFSET, tsc_offset), + FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), + FIELD64(APIC_ACCESS_ADDR, apic_access_addr), + FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), + FIELD64(VM_FUNCTION_CONTROL, vm_function_control), + FIELD64(EPT_POINTER, ept_pointer), + FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), + FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), + FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), + FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), + FIELD64(EPTP_LIST_ADDRESS, eptp_list_address), + FIELD64(VMREAD_BITMAP, vmread_bitmap), + FIELD64(VMWRITE_BITMAP, vmwrite_bitmap), + FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), + FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), + FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), + FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), + FIELD64(GUEST_IA32_PAT, guest_ia32_pat), + FIELD64(GUEST_IA32_EFER, guest_ia32_efer), + FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), + FIELD64(GUEST_PDPTR0, guest_pdptr0), + FIELD64(GUEST_PDPTR1, guest_pdptr1), + FIELD64(GUEST_PDPTR2, guest_pdptr2), + FIELD64(GUEST_PDPTR3, guest_pdptr3), + FIELD64(GUEST_BNDCFGS, guest_bndcfgs), + FIELD64(HOST_IA32_PAT, host_ia32_pat), + FIELD64(HOST_IA32_EFER, host_ia32_efer), + FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), + FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), + FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), + FIELD(EXCEPTION_BITMAP, exception_bitmap), + FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), + FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), + FIELD(CR3_TARGET_COUNT, cr3_target_count), + FIELD(VM_EXIT_CONTROLS, vm_exit_controls), + FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), + FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), + FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), + FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), + FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), + FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), + FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), + FIELD(TPR_THRESHOLD, tpr_threshold), + FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), + FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), + FIELD(VM_EXIT_REASON, vm_exit_reason), + FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), + FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), + FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), + FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), + FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), + FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), + FIELD(GUEST_ES_LIMIT, guest_es_limit), + FIELD(GUEST_CS_LIMIT, guest_cs_limit), + FIELD(GUEST_SS_LIMIT, guest_ss_limit), + FIELD(GUEST_DS_LIMIT, guest_ds_limit), + FIELD(GUEST_FS_LIMIT, guest_fs_limit), + FIELD(GUEST_GS_LIMIT, guest_gs_limit), + FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), + FIELD(GUEST_TR_LIMIT, guest_tr_limit), + FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), + FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), + FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), + FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), + FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), + FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), + FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), + FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), + FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), + FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), + FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), + FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), + FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), + FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), + FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), + FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), + FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), + FIELD(CR0_READ_SHADOW, cr0_read_shadow), + FIELD(CR4_READ_SHADOW, cr4_read_shadow), + FIELD(CR3_TARGET_VALUE0, cr3_target_value0), + FIELD(CR3_TARGET_VALUE1, cr3_target_value1), + FIELD(CR3_TARGET_VALUE2, cr3_target_value2), + FIELD(CR3_TARGET_VALUE3, cr3_target_value3), + FIELD(EXIT_QUALIFICATION, exit_qualification), + FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), + FIELD(GUEST_CR0, guest_cr0), + FIELD(GUEST_CR3, guest_cr3), + FIELD(GUEST_CR4, guest_cr4), + FIELD(GUEST_ES_BASE, guest_es_base), + FIELD(GUEST_CS_BASE, guest_cs_base), + FIELD(GUEST_SS_BASE, guest_ss_base), + FIELD(GUEST_DS_BASE, guest_ds_base), + FIELD(GUEST_FS_BASE, guest_fs_base), + FIELD(GUEST_GS_BASE, guest_gs_base), + FIELD(GUEST_LDTR_BASE, guest_ldtr_base), + FIELD(GUEST_TR_BASE, guest_tr_base), + FIELD(GUEST_GDTR_BASE, guest_gdtr_base), + FIELD(GUEST_IDTR_BASE, guest_idtr_base), + FIELD(GUEST_DR7, guest_dr7), + FIELD(GUEST_RSP, guest_rsp), + FIELD(GUEST_RIP, guest_rip), + FIELD(GUEST_RFLAGS, guest_rflags), + FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), + FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), + FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), + FIELD(HOST_CR0, host_cr0), + FIELD(HOST_CR3, host_cr3), + FIELD(HOST_CR4, host_cr4), + FIELD(HOST_FS_BASE, host_fs_base), + FIELD(HOST_GS_BASE, host_gs_base), + FIELD(HOST_TR_BASE, host_tr_base), + FIELD(HOST_GDTR_BASE, host_gdtr_base), + FIELD(HOST_IDTR_BASE, host_idtr_base), + FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), + FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), + FIELD(HOST_RSP, host_rsp), + FIELD(HOST_RIP, host_rip), +}; + +static inline short vmcs_field_to_offset(unsigned long field) +{ + const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table); + unsigned short offset; + unsigned index; + + if (field >> 15) + return -ENOENT; + + index = ROL16(field, 6); + if (index >= size) + return -ENOENT; + + index = array_index_nospec(index, size); + offset = vmcs_field_to_offset_table[index]; + if (offset == 0) + return -ENOENT; + return offset; +} + +static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.cached_vmcs12; +} + +static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.cached_shadow_vmcs12; +} + +static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); +static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); +static bool vmx_xsaves_supported(void); +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); +static void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); +static bool guest_state_valid(struct kvm_vcpu *vcpu); +static u32 vmx_segment_access_rights(struct kvm_segment *var); +static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); +static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); +static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code); +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); +static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type); + +static DEFINE_PER_CPU(struct vmcs *, vmxarea); +static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +/* + * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed + * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. + */ +static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); + +/* + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we + * can find which vCPU should be waken up. + */ +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + +enum { + VMX_VMREAD_BITMAP, + VMX_VMWRITE_BITMAP, + VMX_BITMAP_NR +}; + +static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; + +#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) +#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) + +static bool cpu_has_load_ia32_efer; +static bool cpu_has_load_perf_global_ctrl; + +static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); +static DEFINE_SPINLOCK(vmx_vpid_lock); + +static struct vmcs_config { + int size; + int order; + u32 basic_cap; + u32 revision_id; + u32 pin_based_exec_ctrl; + u32 cpu_based_exec_ctrl; + u32 cpu_based_2nd_exec_ctrl; + u32 vmexit_ctrl; + u32 vmentry_ctrl; + struct nested_vmx_msrs nested; +} vmcs_config; + +static struct vmx_capability { + u32 ept; + u32 vpid; +} vmx_capability; + +#define VMX_SEGMENT_FIELD(seg) \ + [VCPU_SREG_##seg] = { \ + .selector = GUEST_##seg##_SELECTOR, \ + .base = GUEST_##seg##_BASE, \ + .limit = GUEST_##seg##_LIMIT, \ + .ar_bytes = GUEST_##seg##_AR_BYTES, \ + } + +static const struct kvm_vmx_segment_field { + unsigned selector; + unsigned base; + unsigned limit; + unsigned ar_bytes; +} kvm_vmx_segment_fields[] = { + VMX_SEGMENT_FIELD(CS), + VMX_SEGMENT_FIELD(DS), + VMX_SEGMENT_FIELD(ES), + VMX_SEGMENT_FIELD(FS), + VMX_SEGMENT_FIELD(GS), + VMX_SEGMENT_FIELD(SS), + VMX_SEGMENT_FIELD(TR), + VMX_SEGMENT_FIELD(LDTR), +}; + +static u64 host_efer; + +static void ept_save_pdptrs(struct kvm_vcpu *vcpu); + +/* + * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it + * away by decrementing the array size. + */ +static const u32 vmx_msr_index[] = { +#ifdef CONFIG_X86_64 + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, +#endif + MSR_EFER, MSR_TSC_AUX, MSR_STAR, +}; + +DEFINE_STATIC_KEY_FALSE(enable_evmcs); + +#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) + +#define KVM_EVMCS_VERSION 1 + +/* + * Enlightened VMCSv1 doesn't support these: + * + * POSTED_INTR_NV = 0x00000002, + * GUEST_INTR_STATUS = 0x00000810, + * APIC_ACCESS_ADDR = 0x00002014, + * POSTED_INTR_DESC_ADDR = 0x00002016, + * EOI_EXIT_BITMAP0 = 0x0000201c, + * EOI_EXIT_BITMAP1 = 0x0000201e, + * EOI_EXIT_BITMAP2 = 0x00002020, + * EOI_EXIT_BITMAP3 = 0x00002022, + * GUEST_PML_INDEX = 0x00000812, + * PML_ADDRESS = 0x0000200e, + * VM_FUNCTION_CONTROL = 0x00002018, + * EPTP_LIST_ADDRESS = 0x00002024, + * VMREAD_BITMAP = 0x00002026, + * VMWRITE_BITMAP = 0x00002028, + * + * TSC_MULTIPLIER = 0x00002032, + * PLE_GAP = 0x00004020, + * PLE_WINDOW = 0x00004022, + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, + * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, + * + * Currently unsupported in KVM: + * GUEST_IA32_RTIT_CTL = 0x00002814, + */ +#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ + PIN_BASED_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ + SECONDARY_EXEC_APIC_REGISTER_VIRT | \ + SECONDARY_EXEC_ENABLE_PML | \ + SECONDARY_EXEC_ENABLE_VMFUNC | \ + SECONDARY_EXEC_SHADOW_VMCS | \ + SECONDARY_EXEC_TSC_SCALING | \ + SECONDARY_EXEC_PAUSE_LOOP_EXITING) +#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) +#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) +#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) + +#if IS_ENABLED(CONFIG_HYPERV) +static bool __read_mostly enlightened_vmcs = true; +module_param(enlightened_vmcs, bool, 0444); + +static inline void evmcs_write64(unsigned long field, u64 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u64 *)((char *)current_evmcs + offset) = value; + + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline void evmcs_write32(unsigned long field, u32 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u32 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline void evmcs_write16(unsigned long field, u16 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u16 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline u64 evmcs_read64(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u64 *)((char *)current_evmcs + offset); +} + +static inline u32 evmcs_read32(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u32 *)((char *)current_evmcs + offset); +} + +static inline u16 evmcs_read16(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u16 *)((char *)current_evmcs + offset); +} + +static inline void evmcs_touch_msr_bitmap(void) +{ + if (unlikely(!current_evmcs)) + return; + + if (current_evmcs->hv_enlightenments_control.msr_bitmap) + current_evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; +} + +static void evmcs_load(u64 phys_addr) +{ + struct hv_vp_assist_page *vp_ap = + hv_get_vp_assist_page(smp_processor_id()); + + vp_ap->current_nested_vmcs = phys_addr; + vp_ap->enlighten_vmentry = 1; +} + +static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) +{ + vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + + vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; + vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; + +} + +/* check_ept_pointer() should be under protection of ept_pointer_lock. */ +static void check_ept_pointer_match(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + u64 tmp_eptp = INVALID_PAGE; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!VALID_PAGE(tmp_eptp)) { + tmp_eptp = to_vmx(vcpu)->ept_pointer; + } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_MISMATCH; + return; + } + } + + to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; +} + +static int vmx_hv_remote_flush_tlb(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int ret = -ENOTSUPP, i; + + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + + if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) + check_ept_pointer_match(kvm); + + /* + * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the + * base of EPT PML4 table, strip off EPT configuration information. + */ + if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { + kvm_for_each_vcpu(i, vcpu, kvm) + ret |= hyperv_flush_guest_mapping( + to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer & PAGE_MASK); + } else { + ret = hyperv_flush_guest_mapping( + to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK); + } + + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + return ret; +} +#else /* !IS_ENABLED(CONFIG_HYPERV) */ +static inline void evmcs_write64(unsigned long field, u64 value) {} +static inline void evmcs_write32(unsigned long field, u32 value) {} +static inline void evmcs_write16(unsigned long field, u16 value) {} +static inline u64 evmcs_read64(unsigned long field) { return 0; } +static inline u32 evmcs_read32(unsigned long field) { return 0; } +static inline u16 evmcs_read16(unsigned long field) { return 0; } +static inline void evmcs_load(u64 phys_addr) {} +static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} +static inline void evmcs_touch_msr_bitmap(void) {} +#endif /* IS_ENABLED(CONFIG_HYPERV) */ + +static int nested_enable_evmcs(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * vmcs_version represents the range of supported Enlightened VMCS + * versions: lower 8 bits is the minimal version, higher 8 bits is the + * maximum supported version. KVM supports versions from 1 to + * KVM_EVMCS_VERSION. + */ + if (vmcs_version) + *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1; + + /* We don't support disabling the feature for simplicity. */ + if (vmx->nested.enlightened_vmcs_enabled) + return 0; + + vmx->nested.enlightened_vmcs_enabled = true; + + vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; + vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; + vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; + vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC; + + return 0; +} + +static inline bool is_exception_n(u32 intr_info, u8 vector) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK); +} + +static inline bool is_debug(u32 intr_info) +{ + return is_exception_n(intr_info, DB_VECTOR); +} + +static inline bool is_breakpoint(u32 intr_info) +{ + return is_exception_n(intr_info, BP_VECTOR); +} + +static inline bool is_page_fault(u32 intr_info) +{ + return is_exception_n(intr_info, PF_VECTOR); +} + +static inline bool is_invalid_opcode(u32 intr_info) +{ + return is_exception_n(intr_info, UD_VECTOR); +} + +static inline bool is_gp_fault(u32 intr_info) +{ + return is_exception_n(intr_info, GP_VECTOR); +} + +static inline bool is_machine_check(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); +} + +/* Undocumented: icebp/int1 */ +static inline bool is_icebp(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) + == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK); +} + +static inline bool cpu_has_vmx_msr_bitmap(void) +{ + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; +} + +static inline bool cpu_has_vmx_tpr_shadow(void) +{ + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; +} + +static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) +{ + return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); +} + +static inline bool cpu_has_secondary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; +} + +static inline bool cpu_has_vmx_virtualize_apic_accesses(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +} + +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +} + +static inline bool cpu_has_vmx_apic_register_virt(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_APIC_REGISTER_VIRT; +} + +static inline bool cpu_has_vmx_virtual_intr_delivery(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; +} + +static inline bool cpu_has_vmx_encls_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENCLS_EXITING; +} + +/* + * Comment's format: document - errata name - stepping - processor name. + * Refer from + * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp + */ +static u32 vmx_preemption_cpu_tfms[] = { +/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ +0x000206E6, +/* 323056.pdf - AAX65 - C2 - Xeon L3406 */ +/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ +/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020652, +/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020655, +/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ +/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ +/* + * 320767.pdf - AAP86 - B1 - + * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile + */ +0x000106E5, +/* 321333.pdf - AAM126 - C0 - Xeon 3500 */ +0x000106A0, +/* 321333.pdf - AAM126 - C1 - Xeon 3500 */ +0x000106A1, +/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ +0x000106A4, + /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ + /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ + /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ +0x000106A5, +}; + +static inline bool cpu_has_broken_vmx_preemption_timer(void) +{ + u32 eax = cpuid_eax(0x00000001), i; + + /* Clear the reserved bits */ + eax &= ~(0x3U << 14 | 0xfU << 28); + for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) + if (eax == vmx_preemption_cpu_tfms[i]) + return true; + + return false; +} + +static inline bool cpu_has_vmx_preemption_timer(void) +{ + return vmcs_config.pin_based_exec_ctrl & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + +static inline bool cpu_has_vmx_posted_intr(void) +{ + return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && + vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; +} + +static inline bool cpu_has_vmx_apicv(void) +{ + return cpu_has_vmx_apic_register_virt() && + cpu_has_vmx_virtual_intr_delivery() && + cpu_has_vmx_posted_intr(); +} + +static inline bool cpu_has_vmx_flexpriority(void) +{ + return cpu_has_vmx_tpr_shadow() && + cpu_has_vmx_virtualize_apic_accesses(); +} + +static inline bool cpu_has_vmx_ept_execute_only(void) +{ + return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; +} + +static inline bool cpu_has_vmx_ept_2m_page(void) +{ + return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; +} + +static inline bool cpu_has_vmx_ept_1g_page(void) +{ + return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; +} + +static inline bool cpu_has_vmx_ept_4levels(void) +{ + return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; +} + +static inline bool cpu_has_vmx_ept_mt_wb(void) +{ + return vmx_capability.ept & VMX_EPTP_WB_BIT; +} + +static inline bool cpu_has_vmx_ept_5levels(void) +{ + return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; +} + +static inline bool cpu_has_vmx_ept_ad_bits(void) +{ + return vmx_capability.ept & VMX_EPT_AD_BIT; +} + +static inline bool cpu_has_vmx_invept_context(void) +{ + return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; +} + +static inline bool cpu_has_vmx_invept_global(void) +{ + return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; +} + +static inline bool cpu_has_vmx_invvpid_individual_addr(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT; +} + +static inline bool cpu_has_vmx_invvpid_single(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; +} + +static inline bool cpu_has_vmx_invvpid_global(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; +} + +static inline bool cpu_has_vmx_invvpid(void) +{ + return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; +} + +static inline bool cpu_has_vmx_ept(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_EPT; +} + +static inline bool cpu_has_vmx_unrestricted_guest(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_UNRESTRICTED_GUEST; +} + +static inline bool cpu_has_vmx_ple(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PAUSE_LOOP_EXITING; +} + +static inline bool cpu_has_vmx_basic_inout(void) +{ + return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); +} + +static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) +{ + return flexpriority_enabled && lapic_in_kernel(vcpu); +} + +static inline bool cpu_has_vmx_vpid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VPID; +} + +static inline bool cpu_has_vmx_rdtscp(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDTSCP; +} + +static inline bool cpu_has_vmx_invpcid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_INVPCID; +} + +static inline bool cpu_has_virtual_nmis(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; +} + +static inline bool cpu_has_vmx_wbinvd_exit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_WBINVD_EXITING; +} + +static inline bool cpu_has_vmx_shadow_vmcs(void) +{ + u64 vmx_msr; + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + /* check if the cpu supports writing r/o exit information fields */ + if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) + return false; + + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_SHADOW_VMCS; +} + +static inline bool cpu_has_vmx_pml(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; +} + +static inline bool cpu_has_vmx_tsc_scaling(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_TSC_SCALING; +} + +static inline bool cpu_has_vmx_vmfunc(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VMFUNC; +} + +static bool vmx_umip_emulated(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_DESC; +} + +static inline bool report_flexpriority(void) +{ + return flexpriority_enabled; +} + +static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) +{ + return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low); +} + +/* + * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE + * to modify any valid field of the VMCS, or are the VM-exit + * information fields read-only? + */ +static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.misc_low & + MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; +} + +static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS; +} + +static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.procbased_ctls_high & + CPU_BASED_MONITOR_TRAP_FLAG; +} + +static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_SHADOW_VMCS; +} + +static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) +{ + return vmcs12->cpu_based_vm_exec_control & bit; +} + +static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) +{ + return (vmcs12->cpu_based_vm_exec_control & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && + (vmcs12->secondary_vm_exec_control & bit); +} + +static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + +static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING; +} + +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; +} + +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); +} + +static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); +} + +static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML); +} + +static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); +} + +static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); +} + +static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); +} + +static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); +} + +static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; +} + +static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); +} + +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) +{ + return nested_cpu_has_vmfunc(vmcs12) && + (vmcs12->vm_function_control & + VMX_VMFUNC_EPTP_SWITCHING); +} + +static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); +} + +static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; +} + +static inline bool is_nmi(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) + == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); +} + +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification); + +static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) +{ + int i; + + for (i = 0; i < vmx->nmsrs; ++i) + if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) + return i; + return -1; +} + +static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) +{ + struct { + u64 vpid : 16; + u64 rsvd : 48; + u64 gva; + } operand = { vpid, 0, gva }; + bool error; + + asm volatile (__ex("invvpid %2, %1") CC_SET(na) + : CC_OUT(na) (error) : "r"(ext), "m"(operand)); + BUG_ON(error); +} + +static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) +{ + struct { + u64 eptp, gpa; + } operand = {eptp, gpa}; + bool error; + + asm volatile (__ex("invept %2, %1") CC_SET(na) + : CC_OUT(na) (error) : "r"(ext), "m"(operand)); + BUG_ON(error); +} + +static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +{ + int i; + + i = __find_msr_index(vmx, msr); + if (i >= 0) + return &vmx->guest_msrs[i]; + return NULL; +} + +static void vmcs_clear(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + bool error; + + asm volatile (__ex("vmclear %1") CC_SET(na) + : CC_OUT(na) (error) : "m"(phys_addr)); + if (unlikely(error)) + printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", + vmcs, phys_addr); +} + +static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) +{ + vmcs_clear(loaded_vmcs->vmcs); + if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) + vmcs_clear(loaded_vmcs->shadow_vmcs); + loaded_vmcs->cpu = -1; + loaded_vmcs->launched = 0; +} + +static void vmcs_load(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + bool error; + + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_load(phys_addr); + + asm volatile (__ex("vmptrld %1") CC_SET(na) + : CC_OUT(na) (error) : "m"(phys_addr)); + if (unlikely(error)) + printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", + vmcs, phys_addr); +} + +#ifdef CONFIG_KEXEC_CORE +/* + * This bitmap is used to indicate whether the vmclear + * operation is enabled on all cpus. All disabled by + * default. + */ +static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; + +static inline void crash_enable_local_vmclear(int cpu) +{ + cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline void crash_disable_local_vmclear(int cpu) +{ + cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline int crash_local_vmclear_enabled(int cpu) +{ + return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static void crash_vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v; + + if (!crash_local_vmclear_enabled(cpu)) + return; + + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + vmcs_clear(v->vmcs); +} +#else +static inline void crash_enable_local_vmclear(int cpu) { } +static inline void crash_disable_local_vmclear(int cpu) { } +#endif /* CONFIG_KEXEC_CORE */ + +static void __loaded_vmcs_clear(void *arg) +{ + struct loaded_vmcs *loaded_vmcs = arg; + int cpu = raw_smp_processor_id(); + + if (loaded_vmcs->cpu != cpu) + return; /* vcpu migration can race with cpu offline */ + if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) + per_cpu(current_vmcs, cpu) = NULL; + crash_disable_local_vmclear(cpu); + list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); + + /* + * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link + * is before setting loaded_vmcs->vcpu to -1 which is done in + * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist + * then adds the vmcs into percpu list before it is deleted. + */ + smp_wmb(); + + loaded_vmcs_init(loaded_vmcs); + crash_enable_local_vmclear(cpu); +} + +static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) +{ + int cpu = loaded_vmcs->cpu; + + if (cpu != -1) + smp_call_function_single(cpu, + __loaded_vmcs_clear, loaded_vmcs, 1); +} + +static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) +{ + if (vpid == 0) + return true; + + if (cpu_has_vmx_invvpid_individual_addr()) { + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); + return true; + } + + return false; +} + +static inline void vpid_sync_vcpu_single(int vpid) +{ + if (vpid == 0) + return; + + if (cpu_has_vmx_invvpid_single()) + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); +} + +static inline void vpid_sync_vcpu_global(void) +{ + if (cpu_has_vmx_invvpid_global()) + __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); +} + +static inline void vpid_sync_context(int vpid) +{ + if (cpu_has_vmx_invvpid_single()) + vpid_sync_vcpu_single(vpid); + else + vpid_sync_vcpu_global(); +} + +static inline void ept_sync_global(void) +{ + __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); +} + +static inline void ept_sync_context(u64 eptp) +{ + if (cpu_has_vmx_invept_context()) + __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); + else + ept_sync_global(); +} + +static __always_inline void vmcs_check16(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, + "16-bit accessor invalid for 64-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "16-bit accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "16-bit accessor invalid for 32-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "16-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_check32(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "32-bit accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "32-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_check64(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "64-bit accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "64-bit accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "64-bit accessor invalid for 32-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "64-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_checkl(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "Natural width accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, + "Natural width accessor invalid for 64-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "Natural width accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "Natural width accessor invalid for 32-bit field"); +} + +static __always_inline unsigned long __vmcs_readl(unsigned long field) +{ + unsigned long value; + + asm volatile (__ex_clear("vmread %1, %0", "%k0") + : "=r"(value) : "r"(field)); + return value; +} + +static __always_inline u16 vmcs_read16(unsigned long field) +{ + vmcs_check16(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read16(field); + return __vmcs_readl(field); +} + +static __always_inline u32 vmcs_read32(unsigned long field) +{ + vmcs_check32(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read32(field); + return __vmcs_readl(field); +} + +static __always_inline u64 vmcs_read64(unsigned long field) +{ + vmcs_check64(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read64(field); +#ifdef CONFIG_X86_64 + return __vmcs_readl(field); +#else + return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32); +#endif +} + +static __always_inline unsigned long vmcs_readl(unsigned long field) +{ + vmcs_checkl(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read64(field); + return __vmcs_readl(field); +} + +static noinline void vmwrite_error(unsigned long field, unsigned long value) +{ + printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); + dump_stack(); +} + +static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) +{ + bool error; + + asm volatile (__ex("vmwrite %2, %1") CC_SET(na) + : CC_OUT(na) (error) : "r"(field), "rm"(value)); + if (unlikely(error)) + vmwrite_error(field, value); +} + +static __always_inline void vmcs_write16(unsigned long field, u16 value) +{ + vmcs_check16(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write16(field, value); + + __vmcs_writel(field, value); +} + +static __always_inline void vmcs_write32(unsigned long field, u32 value) +{ + vmcs_check32(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, value); + + __vmcs_writel(field, value); +} + +static __always_inline void vmcs_write64(unsigned long field, u64 value) +{ + vmcs_check64(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write64(field, value); + + __vmcs_writel(field, value); +#ifndef CONFIG_X86_64 + asm volatile (""); + __vmcs_writel(field+1, value >> 32); +#endif +} + +static __always_inline void vmcs_writel(unsigned long field, unsigned long value) +{ + vmcs_checkl(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write64(field, value); + + __vmcs_writel(field, value); +} + +static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, + "vmcs_clear_bits does not support 64-bit fields"); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, evmcs_read32(field) & ~mask); + + __vmcs_writel(field, __vmcs_readl(field) & ~mask); +} + +static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, + "vmcs_set_bits does not support 64-bit fields"); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, evmcs_read32(field) | mask); + + __vmcs_writel(field, __vmcs_readl(field) | mask); +} + +static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); +} + +static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VM_ENTRY_CONTROLS, val); + vmx->vm_entry_controls_shadow = val; +} + +static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) +{ + if (vmx->vm_entry_controls_shadow != val) + vm_entry_controls_init(vmx, val); +} + +static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) +{ + return vmx->vm_entry_controls_shadow; +} + + +static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); +} + +static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); +} + +static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); +} + +static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VM_EXIT_CONTROLS, val); + vmx->vm_exit_controls_shadow = val; +} + +static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) +{ + if (vmx->vm_exit_controls_shadow != val) + vm_exit_controls_init(vmx, val); +} + +static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) +{ + return vmx->vm_exit_controls_shadow; +} + + +static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); +} + +static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); +} + +static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +{ + vmx->segment_cache.bitmask = 0; +} + +static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, + unsigned field) +{ + bool ret; + u32 mask = 1 << (seg * SEG_FIELD_NR + field); + + if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { + vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); + vmx->segment_cache.bitmask = 0; + } + ret = vmx->segment_cache.bitmask & mask; + vmx->segment_cache.bitmask |= mask; + return ret; +} + +static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) +{ + u16 *p = &vmx->segment_cache.seg[seg].selector; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) + *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); + return *p; +} + +static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) +{ + ulong *p = &vmx->segment_cache.seg[seg].base; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) + *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); + return *p; +} + +static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) +{ + u32 *p = &vmx->segment_cache.seg[seg].limit; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) + *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); + return *p; +} + +static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) +{ + u32 *p = &vmx->segment_cache.seg[seg].ar; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) + *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); + return *p; +} + +static void update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + u32 eb; + + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | + (1u << DB_VECTOR) | (1u << AC_VECTOR); + /* + * Guest access to VMware backdoor ports could legitimately + * trigger #GP because of TSS I/O permission bitmap. + * We intercept those #GP and allow access to them anyway + * as VMware does. + */ + if (enable_vmware_backdoor) + eb |= (1u << GP_VECTOR); + if ((vcpu->guest_debug & + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) + eb |= 1u << BP_VECTOR; + if (to_vmx(vcpu)->rmode.vm86_active) + eb = ~0; + if (enable_ept) + eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ + + /* When we are running a nested L2 guest and L1 specified for it a + * certain exception bitmap, we must trap the same exceptions and pass + * them to L1. When running L2, we will only handle the exceptions + * specified above if L1 did not want them. + */ + if (is_guest_mode(vcpu)) + eb |= get_vmcs12(vcpu)->exception_bitmap; + + vmcs_write32(EXCEPTION_BITMAP, eb); +} + +/* + * Check if MSR is intercepted for currently loaded MSR bitmap. + */ +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) +{ + unsigned long *msr_bitmap; + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return true; + + msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; + + if (msr <= 0x1fff) { + return !!test_bit(msr, msr_bitmap + 0x800 / f); + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + return !!test_bit(msr, msr_bitmap + 0xc00 / f); + } + + return true; +} + +/* + * Check if MSR is intercepted for L01 MSR bitmap. + */ +static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) +{ + unsigned long *msr_bitmap; + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return true; + + msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; + + if (msr <= 0x1fff) { + return !!test_bit(msr, msr_bitmap + 0x800 / f); + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + return !!test_bit(msr, msr_bitmap + 0xc00 / f); + } + + return true; +} + +static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit) +{ + vm_entry_controls_clearbit(vmx, entry); + vm_exit_controls_clearbit(vmx, exit); +} + +static int find_msr(struct vmx_msrs *m, unsigned int msr) +{ + unsigned int i; + + for (i = 0; i < m->nr; ++i) { + if (m->val[i].index == msr) + return i; + } + return -ENOENT; +} + +static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) +{ + int i; + struct msr_autoload *m = &vmx->msr_autoload; + + switch (msr) { + case MSR_EFER: + if (cpu_has_load_ia32_efer) { + clear_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_EFER, + VM_EXIT_LOAD_IA32_EFER); + return; + } + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (cpu_has_load_perf_global_ctrl) { + clear_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + return; + } + break; + } + i = find_msr(&m->guest, msr); + if (i < 0) + goto skip_guest; + --m->guest.nr; + m->guest.val[i] = m->guest.val[m->guest.nr]; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); + +skip_guest: + i = find_msr(&m->host, msr); + if (i < 0) + return; + + --m->host.nr; + m->host.val[i] = m->host.val[m->host.nr]; + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); +} + +static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit, + unsigned long guest_val_vmcs, unsigned long host_val_vmcs, + u64 guest_val, u64 host_val) +{ + vmcs_write64(guest_val_vmcs, guest_val); + if (host_val_vmcs != HOST_IA32_EFER) + vmcs_write64(host_val_vmcs, host_val); + vm_entry_controls_setbit(vmx, entry); + vm_exit_controls_setbit(vmx, exit); +} + +static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, + u64 guest_val, u64 host_val, bool entry_only) +{ + int i, j = 0; + struct msr_autoload *m = &vmx->msr_autoload; + + switch (msr) { + case MSR_EFER: + if (cpu_has_load_ia32_efer) { + add_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_EFER, + VM_EXIT_LOAD_IA32_EFER, + GUEST_IA32_EFER, + HOST_IA32_EFER, + guest_val, host_val); + return; + } + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (cpu_has_load_perf_global_ctrl) { + add_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, + GUEST_IA32_PERF_GLOBAL_CTRL, + HOST_IA32_PERF_GLOBAL_CTRL, + guest_val, host_val); + return; + } + break; + case MSR_IA32_PEBS_ENABLE: + /* PEBS needs a quiescent period after being disabled (to write + * a record). Disabling PEBS through VMX MSR swapping doesn't + * provide that period, so a CPU could write host's record into + * guest's memory. + */ + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); + } + + i = find_msr(&m->guest, msr); + if (!entry_only) + j = find_msr(&m->host, msr); + + if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) { + printk_once(KERN_WARNING "Not enough msr switch entries. " + "Can't add msr %x\n", msr); + return; + } + if (i < 0) { + i = m->guest.nr++; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); + } + m->guest.val[i].index = msr; + m->guest.val[i].value = guest_val; + + if (entry_only) + return; + + if (j < 0) { + j = m->host.nr++; + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); + } + m->host.val[j].index = msr; + m->host.val[j].value = host_val; +} + +static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) +{ + u64 guest_efer = vmx->vcpu.arch.efer; + u64 ignore_bits = 0; + + if (!enable_ept) { + /* + * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing + * host CPUID is more efficient than testing guest CPUID + * or CR4. Host SMEP is anyway a requirement for guest SMEP. + */ + if (boot_cpu_has(X86_FEATURE_SMEP)) + guest_efer |= EFER_NX; + else if (!(guest_efer & EFER_NX)) + ignore_bits |= EFER_NX; + } + + /* + * LMA and LME handled by hardware; SCE meaningless outside long mode. + */ + ignore_bits |= EFER_SCE; +#ifdef CONFIG_X86_64 + ignore_bits |= EFER_LMA | EFER_LME; + /* SCE is meaningful only in long mode on Intel */ + if (guest_efer & EFER_LMA) + ignore_bits &= ~(u64)EFER_SCE; +#endif + + /* + * On EPT, we can't emulate NX, so we must switch EFER atomically. + * On CPUs that support "load IA32_EFER", always switch EFER + * atomically, since it's faster than switching it manually. + */ + if (cpu_has_load_ia32_efer || + (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { + if (!(guest_efer & EFER_LMA)) + guest_efer &= ~EFER_LME; + if (guest_efer != host_efer) + add_atomic_switch_msr(vmx, MSR_EFER, + guest_efer, host_efer, false); + else + clear_atomic_switch_msr(vmx, MSR_EFER); + return false; + } else { + clear_atomic_switch_msr(vmx, MSR_EFER); + + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; + + vmx->guest_msrs[efer_offset].data = guest_efer; + vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + + return true; + } +} + +#ifdef CONFIG_X86_32 +/* + * On 32-bit kernels, VM exits still load the FS and GS bases from the + * VMCS rather than the segment table. KVM uses this helper to figure + * out the current bases to poke them into the VMCS before entry. + */ +static unsigned long segment_base(u16 selector) +{ + struct desc_struct *table; + unsigned long v; + + if (!(selector & ~SEGMENT_RPL_MASK)) + return 0; + + table = get_current_gdt_ro(); + + if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { + u16 ldt_selector = kvm_read_ldt(); + + if (!(ldt_selector & ~SEGMENT_RPL_MASK)) + return 0; + + table = (struct desc_struct *)segment_base(ldt_selector); + } + v = get_desc_base(&table[selector >> 3]); + return v; +} +#endif + +static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs_host_state *host_state; +#ifdef CONFIG_X86_64 + int cpu = raw_smp_processor_id(); +#endif + unsigned long fs_base, gs_base; + u16 fs_sel, gs_sel; + int i; + + vmx->req_immediate_exit = false; + + /* + * Note that guest MSRs to be saved/restored can also be changed + * when guest state is loaded. This happens when guest transitions + * to/from long-mode by setting MSR_EFER.LMA. + */ + if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) { + vmx->guest_msrs_dirty = false; + for (i = 0; i < vmx->save_nmsrs; ++i) + kvm_set_shared_msr(vmx->guest_msrs[i].index, + vmx->guest_msrs[i].data, + vmx->guest_msrs[i].mask); + + } + + if (vmx->loaded_cpu_state) + return; + + vmx->loaded_cpu_state = vmx->loaded_vmcs; + host_state = &vmx->loaded_cpu_state->host_state; + + /* + * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * allow segment selectors with cpl > 0 or ti == 1. + */ + host_state->ldt_sel = kvm_read_ldt(); + +#ifdef CONFIG_X86_64 + savesegment(ds, host_state->ds_sel); + savesegment(es, host_state->es_sel); + + gs_base = cpu_kernelmode_gs_base(cpu); + if (likely(is_64bit_mm(current->mm))) { + save_fsgs_for_kvm(); + fs_sel = current->thread.fsindex; + gs_sel = current->thread.gsindex; + fs_base = current->thread.fsbase; + vmx->msr_host_kernel_gs_base = current->thread.gsbase; + } else { + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); + fs_base = read_msr(MSR_FS_BASE); + vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + } + + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#else + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); + fs_base = segment_base(fs_sel); + gs_base = segment_base(gs_sel); +#endif + + if (unlikely(fs_sel != host_state->fs_sel)) { + if (!(fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, fs_sel); + else + vmcs_write16(HOST_FS_SELECTOR, 0); + host_state->fs_sel = fs_sel; + } + if (unlikely(gs_sel != host_state->gs_sel)) { + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + else + vmcs_write16(HOST_GS_SELECTOR, 0); + host_state->gs_sel = gs_sel; + } + if (unlikely(fs_base != host_state->fs_base)) { + vmcs_writel(HOST_FS_BASE, fs_base); + host_state->fs_base = fs_base; + } + if (unlikely(gs_base != host_state->gs_base)) { + vmcs_writel(HOST_GS_BASE, gs_base); + host_state->gs_base = gs_base; + } +} + +static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) +{ + struct vmcs_host_state *host_state; + + if (!vmx->loaded_cpu_state) + return; + + WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); + host_state = &vmx->loaded_cpu_state->host_state; + + ++vmx->vcpu.stat.host_state_reload; + vmx->loaded_cpu_state = NULL; + +#ifdef CONFIG_X86_64 + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#endif + if (host_state->ldt_sel || (host_state->gs_sel & 7)) { + kvm_load_ldt(host_state->ldt_sel); +#ifdef CONFIG_X86_64 + load_gs_index(host_state->gs_sel); +#else + loadsegment(gs, host_state->gs_sel); +#endif + } + if (host_state->fs_sel & 7) + loadsegment(fs, host_state->fs_sel); +#ifdef CONFIG_X86_64 + if (unlikely(host_state->ds_sel | host_state->es_sel)) { + loadsegment(ds, host_state->ds_sel); + loadsegment(es, host_state->es_sel); + } +#endif + invalidate_tss_limit(); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); +#endif + load_fixmap_gdt(raw_smp_processor_id()); +} + +#ifdef CONFIG_X86_64 +static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) +{ + preempt_disable(); + if (vmx->loaded_cpu_state) + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + preempt_enable(); + return vmx->msr_guest_kernel_gs_base; +} + +static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) +{ + preempt_disable(); + if (vmx->loaded_cpu_state) + wrmsrl(MSR_KERNEL_GS_BASE, data); + preempt_enable(); + vmx->msr_guest_kernel_gs_base = data; +} +#endif + +static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + /* + * In case of hot-plug or hot-unplug, we may have to undo + * vmx_vcpu_pi_put even if there is no assigned device. And we + * always keep PI.NDST up to date for simplicity: it makes the + * code easier, and CPU migration is not a fast path. + */ + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + return; + + /* + * First handle the simple case where no cmpxchg is necessary; just + * allow posting non-urgent interrupts. + * + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block will do it for us and the wakeup_handler + * expects the VCPU to be on the blocked_vcpu_list that matches + * PI.NDST. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || + vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); + return; + } + + /* The full case. */ + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + new.sn = 0; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); +} + +static void decache_tsc_multiplier(struct vcpu_vmx *vmx) +{ + vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; + vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); +} + +/* + * Switches to specified vcpu, until a matching vcpu_put(), but assumes + * vcpu mutex is already taken. + */ +static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool already_loaded = vmx->loaded_vmcs->cpu == cpu; + + if (!already_loaded) { + loaded_vmcs_clear(vmx->loaded_vmcs); + local_irq_disable(); + crash_disable_local_vmclear(cpu); + + /* + * Read loaded_vmcs->cpu should be before fetching + * loaded_vmcs->loaded_vmcss_on_cpu_link. + * See the comments in __loaded_vmcs_clear(). + */ + smp_rmb(); + + list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, + &per_cpu(loaded_vmcss_on_cpu, cpu)); + crash_enable_local_vmclear(cpu); + local_irq_enable(); + } + + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); + indirect_branch_prediction_barrier(); + } + + if (!already_loaded) { + void *gdt = get_current_gdt_ro(); + unsigned long sysenter_esp; + + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + /* + * Linux uses per-cpu TSS and GDT, so set these when switching + * processors. See 22.2.4. + */ + vmcs_writel(HOST_TR_BASE, + (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); + vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ + + /* + * VM exits change the host TR limit to 0x67 after a VM + * exit. This is okay, since 0x67 covers everything except + * the IO bitmap and have have code to handle the IO bitmap + * being lost after a VM exit. + */ + BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67); + + rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); + vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + + vmx->loaded_vmcs->cpu = cpu; + } + + /* Setup TSC multiplier */ + if (kvm_has_tsc_control && + vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) + decache_tsc_multiplier(vmx); + + vmx_vcpu_pi_load(vcpu, cpu); + vmx->host_pkru = read_pkru(); + vmx->host_debugctlmsr = get_debugctlmsr(); +} + +static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) + return; + + /* Set SN when the vCPU is preempted */ + if (vcpu->preempted) + pi_set_sn(pi_desc); +} + +static void vmx_vcpu_put(struct kvm_vcpu *vcpu) +{ + vmx_vcpu_pi_put(vcpu); + + vmx_prepare_switch_to_host(to_vmx(vcpu)); +} + +static bool emulation_required(struct kvm_vcpu *vcpu) +{ + return emulate_invalid_guest_state && !guest_state_valid(vcpu); +} + +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); + +/* + * Return the cr0 value that a nested guest would read. This is a combination + * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by + * its hypervisor (cr0_read_shadow). + */ +static inline unsigned long nested_read_cr0(struct vmcs12 *fields) +{ + return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | + (fields->cr0_read_shadow & fields->cr0_guest_host_mask); +} +static inline unsigned long nested_read_cr4(struct vmcs12 *fields) +{ + return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | + (fields->cr4_read_shadow & fields->cr4_guest_host_mask); +} + +static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) +{ + unsigned long rflags, save_rflags; + + if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + rflags = vmcs_readl(GUEST_RFLAGS); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } + to_vmx(vcpu)->rflags = rflags; + } + return to_vmx(vcpu)->rflags; +} + +static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + unsigned long old_rflags = vmx_get_rflags(vcpu); + + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + to_vmx(vcpu)->rflags = rflags; + if (to_vmx(vcpu)->rmode.vm86_active) { + to_vmx(vcpu)->rmode.save_rflags = rflags; + rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + } + vmcs_writel(GUEST_RFLAGS, rflags); + + if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM) + to_vmx(vcpu)->emulation_required = emulation_required(vcpu); +} + +static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) +{ + u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + int ret = 0; + + if (interruptibility & GUEST_INTR_STATE_STI) + ret |= KVM_X86_SHADOW_INT_STI; + if (interruptibility & GUEST_INTR_STATE_MOV_SS) + ret |= KVM_X86_SHADOW_INT_MOV_SS; + + return ret; +} + +static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + u32 interruptibility = interruptibility_old; + + interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); + + if (mask & KVM_X86_SHADOW_INT_MOV_SS) + interruptibility |= GUEST_INTR_STATE_MOV_SS; + else if (mask & KVM_X86_SHADOW_INT_STI) + interruptibility |= GUEST_INTR_STATE_STI; + + if ((interruptibility != interruptibility_old)) + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); +} + +static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + unsigned long rip; + + rip = kvm_rip_read(vcpu); + rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + kvm_rip_write(vcpu, rip); + + /* skipping an emulated instruction also counts */ + vmx_set_interrupt_shadow(vcpu, 0); +} + +static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, + unsigned long exit_qual) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned int nr = vcpu->arch.exception.nr; + u32 intr_info = nr | INTR_INFO_VALID_MASK; + + if (vcpu->arch.exception.has_error_code) { + vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; + intr_info |= INTR_INFO_DELIVER_CODE_MASK; + } + + if (kvm_exception_is_soft(nr)) + intr_info |= INTR_TYPE_SOFT_EXCEPTION; + else + intr_info |= INTR_TYPE_HARD_EXCEPTION; + + if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && + vmx_get_nmi_mask(vcpu)) + intr_info |= INTR_INFO_UNBLOCK_NMI; + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); +} + +/* + * KVM wants to inject page-faults which it got to the guest. This function + * checks whether in a nested guest, we need to inject them to L1 or L2. + */ +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned int nr = vcpu->arch.exception.nr; + bool has_payload = vcpu->arch.exception.has_payload; + unsigned long payload = vcpu->arch.exception.payload; + + if (nr == PF_VECTOR) { + if (vcpu->arch.exception.nested_apf) { + *exit_qual = vcpu->arch.apf.nested_apf_token; + return 1; + } + if (nested_vmx_is_page_fault_vmexit(vmcs12, + vcpu->arch.exception.error_code)) { + *exit_qual = has_payload ? payload : vcpu->arch.cr2; + return 1; + } + } else if (vmcs12->exception_bitmap & (1u << nr)) { + if (nr == DB_VECTOR) { + if (!has_payload) { + payload = vcpu->arch.dr6; + payload &= ~(DR6_FIXED_1 | DR6_BT); + payload ^= DR6_RTM; + } + *exit_qual = payload; + } else + *exit_qual = 0; + return 1; + } + + return 0; +} + +static void vmx_clear_hlt(struct kvm_vcpu *vcpu) +{ + /* + * Ensure that we clear the HLT state in the VMCS. We don't need to + * explicitly skip the instruction because if the HLT state is set, + * then the instruction is already executing and RIP has already been + * advanced. + */ + if (kvm_hlt_in_guest(vcpu->kvm) && + vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); +} + +static void vmx_queue_exception(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned nr = vcpu->arch.exception.nr; + bool has_error_code = vcpu->arch.exception.has_error_code; + u32 error_code = vcpu->arch.exception.error_code; + u32 intr_info = nr | INTR_INFO_VALID_MASK; + + kvm_deliver_exception_payload(vcpu); + + if (has_error_code) { + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + intr_info |= INTR_INFO_DELIVER_CODE_MASK; + } + + if (vmx->rmode.vm86_active) { + int inc_eip = 0; + if (kvm_exception_is_soft(nr)) + inc_eip = vcpu->arch.event_exit_inst_len; + if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + return; + } + + WARN_ON_ONCE(vmx->emulation_required); + + if (kvm_exception_is_soft(nr)) { + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmx->vcpu.arch.event_exit_inst_len); + intr_info |= INTR_TYPE_SOFT_EXCEPTION; + } else + intr_info |= INTR_TYPE_HARD_EXCEPTION; + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); + + vmx_clear_hlt(vcpu); +} + +static bool vmx_rdtscp_supported(void) +{ + return cpu_has_vmx_rdtscp(); +} + +static bool vmx_invpcid_supported(void) +{ + return cpu_has_vmx_invpcid(); +} + +/* + * Swap MSR entry in host/guest MSR entry array. + */ +static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) +{ + struct shared_msr_entry tmp; + + tmp = vmx->guest_msrs[to]; + vmx->guest_msrs[to] = vmx->guest_msrs[from]; + vmx->guest_msrs[from] = tmp; +} + +/* + * Set up the vmcs to automatically save and restore system + * msrs. Don't touch the 64-bit msrs if the guest is in legacy + * mode, as fiddling with msrs is very expensive. + */ +static void setup_msrs(struct vcpu_vmx *vmx) +{ + int save_nmsrs, index; + + save_nmsrs = 0; +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) { + index = __find_msr_index(vmx, MSR_SYSCALL_MASK); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_LSTAR); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_CSTAR); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_TSC_AUX); + if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) + move_msr_up(vmx, index, save_nmsrs++); + /* + * MSR_STAR is only needed on long mode guests, and only + * if efer.sce is enabled. + */ + index = __find_msr_index(vmx, MSR_STAR); + if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) + move_msr_up(vmx, index, save_nmsrs++); + } +#endif + index = __find_msr_index(vmx, MSR_EFER); + if (index >= 0 && update_transition_efer(vmx, index)) + move_msr_up(vmx, index, save_nmsrs++); + + vmx->save_nmsrs = save_nmsrs; + vmx->guest_msrs_dirty = true; + + if (cpu_has_vmx_msr_bitmap()) + vmx_update_msr_bitmap(&vmx->vcpu); +} + +static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (is_guest_mode(vcpu) && + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + return vcpu->arch.tsc_offset - vmcs12->tsc_offset; + + return vcpu->arch.tsc_offset; +} + +static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u64 g_tsc_offset = 0; + + /* + * We're here if L1 chose not to trap WRMSR to TSC. According + * to the spec, this should set L1's TSC; The offset that L1 + * set for L2 remains unchanged, and still needs to be added + * to the newly set TSC to get L2's TSC. + */ + if (is_guest_mode(vcpu) && + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + g_tsc_offset = vmcs12->tsc_offset; + + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + vcpu->arch.tsc_offset - g_tsc_offset, + offset); + vmcs_write64(TSC_OFFSET, offset + g_tsc_offset); + return offset + g_tsc_offset; +} + +/* + * nested_vmx_allowed() checks whether a guest should be allowed to use VMX + * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for + * all guests if the "nested" module option is off, and can also be disabled + * for a single guest by disabling its VMX cpuid bit. + */ +static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) +{ + return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); +} + +/* + * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be + * returned for the various VMX controls MSRs when nested VMX is enabled. + * The same values should also be used to verify that vmcs12 control fields are + * valid during nested entry from L1 to L2. + * Each of these control msrs has a low and high 32-bit half: A low bit is on + * if the corresponding bit in the (32-bit) control field *must* be on, and a + * bit in the high half is on if the corresponding bit in the control field + * may be on. See also vmx_control_verify(). + */ +static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) +{ + if (!nested) { + memset(msrs, 0, sizeof(*msrs)); + return; + } + + /* + * Note that as a general rule, the high half of the MSRs (bits in + * the control fields which may be 1) should be initialized by the + * intersection of the underlying hardware's MSR (i.e., features which + * can be supported) and the list of features we want to expose - + * because they are known to be properly supported in our code. + * Also, usually, the low half of the MSRs (bits which must be 1) can + * be set to 0, meaning that L1 may turn off any of these bits. The + * reason is that if one of these bits is necessary, it will appear + * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control + * fields of vmcs01 and vmcs02, will turn these bits off - and + * nested_vmx_exit_reflected() will not pass related exits to L1. + * These rules have exceptions below. + */ + + /* pin-based controls */ + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, + msrs->pinbased_ctls_low, + msrs->pinbased_ctls_high); + msrs->pinbased_ctls_low |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + msrs->pinbased_ctls_high &= + PIN_BASED_EXT_INTR_MASK | + PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS | + (apicv ? PIN_BASED_POSTED_INTR : 0); + msrs->pinbased_ctls_high |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + PIN_BASED_VMX_PREEMPTION_TIMER; + + /* exit controls */ + rdmsr(MSR_IA32_VMX_EXIT_CTLS, + msrs->exit_ctls_low, + msrs->exit_ctls_high); + msrs->exit_ctls_low = + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + + msrs->exit_ctls_high &= +#ifdef CONFIG_X86_64 + VM_EXIT_HOST_ADDR_SPACE_SIZE | +#endif + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + msrs->exit_ctls_high |= + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; + + /* We support free control of debug control saving. */ + msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; + + /* entry controls */ + rdmsr(MSR_IA32_VMX_ENTRY_CTLS, + msrs->entry_ctls_low, + msrs->entry_ctls_high); + msrs->entry_ctls_low = + VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + msrs->entry_ctls_high &= +#ifdef CONFIG_X86_64 + VM_ENTRY_IA32E_MODE | +#endif + VM_ENTRY_LOAD_IA32_PAT; + msrs->entry_ctls_high |= + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); + + /* We support free control of debug control loading. */ + msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; + + /* cpu-based controls */ + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, + msrs->procbased_ctls_low, + msrs->procbased_ctls_high); + msrs->procbased_ctls_low = + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + msrs->procbased_ctls_high &= + CPU_BASED_VIRTUAL_INTR_PENDING | + CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | + CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | +#ifdef CONFIG_X86_64 + CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | +#endif + CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | + CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | + CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | + CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | + CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + /* + * We can allow some features even when not supported by the + * hardware. For example, L1 can specify an MSR bitmap - and we + * can use it to avoid exits to L1 - even when L0 runs L2 + * without MSR bitmaps. + */ + msrs->procbased_ctls_high |= + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + CPU_BASED_USE_MSR_BITMAPS; + + /* We support free control of CR3 access interception. */ + msrs->procbased_ctls_low &= + ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); + + /* + * secondary cpu-based controls. Do not include those that + * depend on CPUID bits, they are added later by vmx_cpuid_update. + */ + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, + msrs->secondary_ctls_low, + msrs->secondary_ctls_high); + msrs->secondary_ctls_low = 0; + msrs->secondary_ctls_high &= + SECONDARY_EXEC_DESC | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_WBINVD_EXITING; + + /* + * We can emulate "VMCS shadowing," even if the hardware + * doesn't support it. + */ + msrs->secondary_ctls_high |= + SECONDARY_EXEC_SHADOW_VMCS; + + if (enable_ept) { + /* nested EPT: emulate EPT also to L1 */ + msrs->secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_EPT; + msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; + if (cpu_has_vmx_ept_execute_only()) + msrs->ept_caps |= + VMX_EPT_EXECUTE_ONLY_BIT; + msrs->ept_caps &= vmx_capability.ept; + msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | + VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | + VMX_EPT_1GB_PAGE_BIT; + if (enable_ept_ad_bits) { + msrs->secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_PML; + msrs->ept_caps |= VMX_EPT_AD_BIT; + } + } + + if (cpu_has_vmx_vmfunc()) { + msrs->secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_VMFUNC; + /* + * Advertise EPTP switching unconditionally + * since we emulate it + */ + if (enable_ept) + msrs->vmfunc_controls = + VMX_VMFUNC_EPTP_SWITCHING; + } + + /* + * Old versions of KVM use the single-context version without + * checking for support, so declare that it is supported even + * though it is treated as global context. The alternative is + * not failing the single-context invvpid, and it is worse. + */ + if (enable_vpid) { + msrs->secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_VPID; + msrs->vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_SUPPORTED_MASK; + } + + if (enable_unrestricted_guest) + msrs->secondary_ctls_high |= + SECONDARY_EXEC_UNRESTRICTED_GUEST; + + if (flexpriority_enabled) + msrs->secondary_ctls_high |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + + /* miscellaneous data */ + rdmsr(MSR_IA32_VMX_MISC, + msrs->misc_low, + msrs->misc_high); + msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; + msrs->misc_low |= + MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | + VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + VMX_MISC_ACTIVITY_HLT; + msrs->misc_high = 0; + + /* + * This MSR reports some information about VMX support. We + * should return information about the VMX we emulate for the + * guest, and the VMCS structure we give it - not about the + * VMX support of the underlying hardware. + */ + msrs->basic = + VMCS12_REVISION | + VMX_BASIC_TRUE_CTLS | + ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | + (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + + if (cpu_has_vmx_basic_inout()) + msrs->basic |= VMX_BASIC_INOUT; + + /* + * These MSRs specify bits which the guest must keep fixed on + * while L1 is in VMXON mode (in L1's root mode, or running an L2). + * We picked the standard core2 setting. + */ +#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) +#define VMXON_CR4_ALWAYSON X86_CR4_VMXE + msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; + msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; + + /* These MSRs specify bits which the guest must keep fixed off. */ + rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); + rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); + + /* highest index: VMX_PREEMPTION_TIMER_VALUE */ + msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; +} + +/* + * if fixed0[i] == 1: val[i] must be 1 + * if fixed1[i] == 0: val[i] must be 0 + */ +static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) +{ + return ((val & fixed1) | fixed0) == val; +} + +static inline bool vmx_control_verify(u32 control, u32 low, u32 high) +{ + return fixed_bits_valid(control, low, high); +} + +static inline u64 vmx_control_msr(u32 low, u32 high) +{ + return low | ((u64)high << 32); +} + +static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) +{ + superset &= mask; + subset &= mask; + + return (superset | subset) == superset; +} + +static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved = + /* feature (except bit 48; see below) */ + BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | + /* reserved */ + BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); + u64 vmx_basic = vmx->nested.msrs.basic; + + if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) + return -EINVAL; + + /* + * KVM does not emulate a version of VMX that constrains physical + * addresses of VMX structures (e.g. VMCS) to 32-bits. + */ + if (data & BIT_ULL(48)) + return -EINVAL; + + if (vmx_basic_vmcs_revision_id(vmx_basic) != + vmx_basic_vmcs_revision_id(data)) + return -EINVAL; + + if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) + return -EINVAL; + + vmx->nested.msrs.basic = data; + return 0; +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 supported; + u32 *lowp, *highp; + + switch (msr_index) { + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + lowp = &vmx->nested.msrs.pinbased_ctls_low; + highp = &vmx->nested.msrs.pinbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + lowp = &vmx->nested.msrs.procbased_ctls_low; + highp = &vmx->nested.msrs.procbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + lowp = &vmx->nested.msrs.exit_ctls_low; + highp = &vmx->nested.msrs.exit_ctls_high; + break; + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + lowp = &vmx->nested.msrs.entry_ctls_low; + highp = &vmx->nested.msrs.entry_ctls_high; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + lowp = &vmx->nested.msrs.secondary_ctls_low; + highp = &vmx->nested.msrs.secondary_ctls_high; + break; + default: + BUG(); + } + + supported = vmx_control_msr(*lowp, *highp); + + /* Check must-be-1 bits are still 1. */ + if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) + return -EINVAL; + + /* Check must-be-0 bits are still 0. */ + if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) + return -EINVAL; + + *lowp = data; + *highp = data >> 32; + return 0; +} + +static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved_bits = + /* feature */ + BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | + BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | + /* reserved */ + GENMASK_ULL(13, 9) | BIT_ULL(31); + u64 vmx_misc; + + vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); + + if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) + return -EINVAL; + + if ((vmx->nested.msrs.pinbased_ctls_high & + PIN_BASED_VMX_PREEMPTION_TIMER) && + vmx_misc_preemption_timer_rate(data) != + vmx_misc_preemption_timer_rate(vmx_misc)) + return -EINVAL; + + if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) + return -EINVAL; + + if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) + return -EINVAL; + + if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) + return -EINVAL; + + vmx->nested.msrs.misc_low = data; + vmx->nested.msrs.misc_high = data >> 32; + + /* + * If L1 has read-only VM-exit information fields, use the + * less permissive vmx_vmwrite_bitmap to specify write + * permissions for the shadow VMCS. + */ + if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); + + return 0; +} + +static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) +{ + u64 vmx_ept_vpid_cap; + + vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, + vmx->nested.msrs.vpid_caps); + + /* Every bit is either reserved or a feature bit. */ + if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) + return -EINVAL; + + vmx->nested.msrs.ept_caps = data; + vmx->nested.msrs.vpid_caps = data >> 32; + return 0; +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 *msr; + + switch (msr_index) { + case MSR_IA32_VMX_CR0_FIXED0: + msr = &vmx->nested.msrs.cr0_fixed0; + break; + case MSR_IA32_VMX_CR4_FIXED0: + msr = &vmx->nested.msrs.cr4_fixed0; + break; + default: + BUG(); + } + + /* + * 1 bits (which indicates bits which "must-be-1" during VMX operation) + * must be 1 in the restored value. + */ + if (!is_bitwise_subset(data, *msr, -1ULL)) + return -EINVAL; + + *msr = data; + return 0; +} + +/* + * Called when userspace is restoring VMX MSRs. + * + * Returns 0 on success, non-0 otherwise. + */ +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Don't allow changes to the VMX capability MSRs while the vCPU + * is in VMX operation. + */ + if (vmx->nested.vmxon) + return -EBUSY; + + switch (msr_index) { + case MSR_IA32_VMX_BASIC: + return vmx_restore_vmx_basic(vmx, data); + case MSR_IA32_VMX_PINBASED_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS: + case MSR_IA32_VMX_EXIT_CTLS: + case MSR_IA32_VMX_ENTRY_CTLS: + /* + * The "non-true" VMX capability MSRs are generated from the + * "true" MSRs, so we do not support restoring them directly. + * + * If userspace wants to emulate VMX_BASIC[55]=0, userspace + * should restore the "true" MSRs with the must-be-1 bits + * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND + * DEFAULT SETTINGS". + */ + return -EINVAL; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS2: + return vmx_restore_control_msr(vmx, msr_index, data); + case MSR_IA32_VMX_MISC: + return vmx_restore_vmx_misc(vmx, data); + case MSR_IA32_VMX_CR0_FIXED0: + case MSR_IA32_VMX_CR4_FIXED0: + return vmx_restore_fixed0_msr(vmx, msr_index, data); + case MSR_IA32_VMX_CR0_FIXED1: + case MSR_IA32_VMX_CR4_FIXED1: + /* + * These MSRs are generated based on the vCPU's CPUID, so we + * do not support restoring them directly. + */ + return -EINVAL; + case MSR_IA32_VMX_EPT_VPID_CAP: + return vmx_restore_vmx_ept_vpid_cap(vmx, data); + case MSR_IA32_VMX_VMCS_ENUM: + vmx->nested.msrs.vmcs_enum = data; + return 0; + default: + /* + * The rest of the VMX capability MSRs do not support restore. + */ + return -EINVAL; + } +} + +/* Returns 0 on success, non-0 otherwise. */ +static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) +{ + switch (msr_index) { + case MSR_IA32_VMX_BASIC: + *pdata = msrs->basic; + break; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + case MSR_IA32_VMX_PINBASED_CTLS: + *pdata = vmx_control_msr( + msrs->pinbased_ctls_low, + msrs->pinbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) + *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + break; + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS: + *pdata = vmx_control_msr( + msrs->procbased_ctls_low, + msrs->procbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) + *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + break; + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + case MSR_IA32_VMX_EXIT_CTLS: + *pdata = vmx_control_msr( + msrs->exit_ctls_low, + msrs->exit_ctls_high); + if (msr_index == MSR_IA32_VMX_EXIT_CTLS) + *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + break; + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + case MSR_IA32_VMX_ENTRY_CTLS: + *pdata = vmx_control_msr( + msrs->entry_ctls_low, + msrs->entry_ctls_high); + if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) + *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + break; + case MSR_IA32_VMX_MISC: + *pdata = vmx_control_msr( + msrs->misc_low, + msrs->misc_high); + break; + case MSR_IA32_VMX_CR0_FIXED0: + *pdata = msrs->cr0_fixed0; + break; + case MSR_IA32_VMX_CR0_FIXED1: + *pdata = msrs->cr0_fixed1; + break; + case MSR_IA32_VMX_CR4_FIXED0: + *pdata = msrs->cr4_fixed0; + break; + case MSR_IA32_VMX_CR4_FIXED1: + *pdata = msrs->cr4_fixed1; + break; + case MSR_IA32_VMX_VMCS_ENUM: + *pdata = msrs->vmcs_enum; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + *pdata = vmx_control_msr( + msrs->secondary_ctls_low, + msrs->secondary_ctls_high); + break; + case MSR_IA32_VMX_EPT_VPID_CAP: + *pdata = msrs->ept_caps | + ((u64)msrs->vpid_caps << 32); + break; + case MSR_IA32_VMX_VMFUNC: + *pdata = msrs->vmfunc_controls; + break; + default: + return 1; + } + + return 0; +} + +static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, + uint64_t val) +{ + uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; + + return !(val & ~valid_bits); +} + +static int vmx_get_msr_feature(struct kvm_msr_entry *msr) +{ + switch (msr->index) { + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + if (!nested) + return 1; + return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); + default: + return 1; + } + + return 0; +} + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct shared_msr_entry *msr; + + switch (msr_info->index) { +#ifdef CONFIG_X86_64 + case MSR_FS_BASE: + msr_info->data = vmcs_readl(GUEST_FS_BASE); + break; + case MSR_GS_BASE: + msr_info->data = vmcs_readl(GUEST_GS_BASE); + break; + case MSR_KERNEL_GS_BASE: + msr_info->data = vmx_read_guest_kernel_gs_base(vmx); + break; +#endif + case MSR_EFER: + return kvm_get_msr_common(vcpu, msr_info); + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + return 1; + + msr_info->data = to_vmx(vcpu)->spec_ctrl; + break; + case MSR_IA32_ARCH_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + return 1; + msr_info->data = to_vmx(vcpu)->arch_capabilities; + break; + case MSR_IA32_SYSENTER_CS: + msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); + break; + case MSR_IA32_SYSENTER_EIP: + msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); + break; + case MSR_IA32_SYSENTER_ESP: + msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); + break; + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) + return 1; + msr_info->data = vmcs_read64(GUEST_BNDCFGS); + break; + case MSR_IA32_MCG_EXT_CTL: + if (!msr_info->host_initiated && + !(vmx->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) + return 1; + msr_info->data = vcpu->arch.mcg_ext_ctl; + break; + case MSR_IA32_FEATURE_CONTROL: + msr_info->data = vmx->msr_ia32_feature_control; + break; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, + &msr_info->data); + case MSR_IA32_XSS: + if (!vmx_xsaves_supported()) + return 1; + msr_info->data = vcpu->arch.ia32_xss; + break; + case MSR_TSC_AUX: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + return 1; + /* Otherwise falls through */ + default: + msr = find_msr_entry(vmx, msr_info->index); + if (msr) { + msr_info->data = msr->data; + break; + } + return kvm_get_msr_common(vcpu, msr_info); + } + + return 0; +} + +static void vmx_leave_nested(struct kvm_vcpu *vcpu); + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct shared_msr_entry *msr; + int ret = 0; + u32 msr_index = msr_info->index; + u64 data = msr_info->data; + + switch (msr_index) { + case MSR_EFER: + ret = kvm_set_msr_common(vcpu, msr_info); + break; +#ifdef CONFIG_X86_64 + case MSR_FS_BASE: + vmx_segment_cache_clear(vmx); + vmcs_writel(GUEST_FS_BASE, data); + break; + case MSR_GS_BASE: + vmx_segment_cache_clear(vmx); + vmcs_writel(GUEST_GS_BASE, data); + break; + case MSR_KERNEL_GS_BASE: + vmx_write_guest_kernel_gs_base(vmx, data); + break; +#endif + case MSR_IA32_SYSENTER_CS: + vmcs_write32(GUEST_SYSENTER_CS, data); + break; + case MSR_IA32_SYSENTER_EIP: + vmcs_writel(GUEST_SYSENTER_EIP, data); + break; + case MSR_IA32_SYSENTER_ESP: + vmcs_writel(GUEST_SYSENTER_ESP, data); + break; + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) + return 1; + if (is_noncanonical_address(data & PAGE_MASK, vcpu) || + (data & MSR_IA32_BNDCFGS_RSVD)) + return 1; + vmcs_write64(GUEST_BNDCFGS, data); + break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + return 1; + + /* The STIBP bit doesn't fault even if it's not advertised */ + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) + return 1; + + vmx->spec_ctrl = data; + + if (!data) + break; + + /* + * For non-nested: + * When it's written (to non-zero) for the first time, pass + * it through. + * + * For nested: + * The handling of the MSR bitmap for L2 guests is done in + * nested_vmx_merge_msr_bitmap. We should not touch the + * vmcs02.msr_bitmap here since it gets completely overwritten + * in the merging. We update the vmcs01 here for L1 as well + * since it will end up touching the MSR anyway now. + */ + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_RW); + break; + case MSR_IA32_PRED_CMD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + return 1; + + if (data & ~PRED_CMD_IBPB) + return 1; + + if (!data) + break; + + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); + + /* + * For non-nested: + * When it's written (to non-zero) for the first time, pass + * it through. + * + * For nested: + * The handling of the MSR bitmap for L2 guests is done in + * nested_vmx_merge_msr_bitmap. We should not touch the + * vmcs02.msr_bitmap here since it gets completely overwritten + * in the merging. + */ + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, + MSR_TYPE_W); + break; + case MSR_IA32_ARCH_CAPABILITIES: + if (!msr_info->host_initiated) + return 1; + vmx->arch_capabilities = data; + break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + return 1; + vmcs_write64(GUEST_IA32_PAT, data); + vcpu->arch.pat = data; + break; + } + ret = kvm_set_msr_common(vcpu, msr_info); + break; + case MSR_IA32_TSC_ADJUST: + ret = kvm_set_msr_common(vcpu, msr_info); + break; + case MSR_IA32_MCG_EXT_CTL: + if ((!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) || + (data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = data; + break; + case MSR_IA32_FEATURE_CONTROL: + if (!vmx_feature_control_msr_valid(vcpu, data) || + (to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) + return 1; + vmx->msr_ia32_feature_control = data; + if (msr_info->host_initiated && data == 0) + vmx_leave_nested(vcpu); + break; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + if (!msr_info->host_initiated) + return 1; /* they are read-only */ + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_set_vmx_msr(vcpu, msr_index, data); + case MSR_IA32_XSS: + if (!vmx_xsaves_supported()) + return 1; + /* + * The only supported bit as of Skylake is bit 8, but + * it is not supported on KVM. + */ + if (data != 0) + return 1; + vcpu->arch.ia32_xss = data; + if (vcpu->arch.ia32_xss != host_xss) + add_atomic_switch_msr(vmx, MSR_IA32_XSS, + vcpu->arch.ia32_xss, host_xss, false); + else + clear_atomic_switch_msr(vmx, MSR_IA32_XSS); + break; + case MSR_TSC_AUX: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + return 1; + /* Check reserved bit, higher 32 bits should be zero */ + if ((data >> 32) != 0) + return 1; + /* Otherwise falls through */ + default: + msr = find_msr_entry(vmx, msr_index); + if (msr) { + u64 old_msr_data = msr->data; + msr->data = data; + if (msr - vmx->guest_msrs < vmx->save_nmsrs) { + preempt_disable(); + ret = kvm_set_shared_msr(msr->index, msr->data, + msr->mask); + preempt_enable(); + if (ret) + msr->data = old_msr_data; + } + break; + } + ret = kvm_set_msr_common(vcpu, msr_info); + } + + return ret; +} + +static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + switch (reg) { + case VCPU_REGS_RSP: + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + break; + case VCPU_REGS_RIP: + vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); + break; + case VCPU_EXREG_PDPTR: + if (enable_ept) + ept_save_pdptrs(vcpu); + break; + default: + break; + } +} + +static __init int cpu_has_kvm_support(void) +{ + return cpu_has_vmx(); +} + +static __init int vmx_disabled_by_bios(void) +{ + u64 msr; + + rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); + if (msr & FEATURE_CONTROL_LOCKED) { + /* launched w/ TXT and VMX disabled */ + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) + && tboot_enabled()) + return 1; + /* launched w/o TXT and VMX only enabled w/ TXT */ + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) + && !tboot_enabled()) { + printk(KERN_WARNING "kvm: disable TXT in the BIOS or " + "activate TXT before enabling KVM\n"); + return 1; + } + /* launched w/o TXT and VMX disabled */ + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && !tboot_enabled()) + return 1; + } + + return 0; +} + +static void kvm_cpu_vmxon(u64 addr) +{ + cr4_set_bits(X86_CR4_VMXE); + intel_pt_handle_vmx(1); + + asm volatile ("vmxon %0" : : "m"(addr)); +} + +static int hardware_enable(void) +{ + int cpu = raw_smp_processor_id(); + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + u64 old, test_bits; + + if (cr4_read_shadow() & X86_CR4_VMXE) + return -EBUSY; + + /* + * This can happen if we hot-added a CPU but failed to allocate + * VP assist page for it. + */ + if (static_branch_unlikely(&enable_evmcs) && + !hv_get_vp_assist_page(cpu)) + return -EFAULT; + + INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + + /* + * Now we can enable the vmclear operation in kdump + * since the loaded_vmcss_on_cpu list on this cpu + * has been initialized. + * + * Though the cpu is not in VMX operation now, there + * is no problem to enable the vmclear operation + * for the loaded_vmcss_on_cpu list is empty! + */ + crash_enable_local_vmclear(cpu); + + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); + + test_bits = FEATURE_CONTROL_LOCKED; + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + if (tboot_enabled()) + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; + + if ((old & test_bits) != test_bits) { + /* enable and lock */ + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); + } + kvm_cpu_vmxon(phys_addr); + if (enable_ept) + ept_sync_global(); + + return 0; +} + +static void vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v, *n; + + list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + __loaded_vmcs_clear(v); +} + + +/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() + * tricks. + */ +static void kvm_cpu_vmxoff(void) +{ + asm volatile (__ex("vmxoff")); + + intel_pt_handle_vmx(0); + cr4_clear_bits(X86_CR4_VMXE); +} + +static void hardware_disable(void) +{ + vmclear_local_loaded_vmcss(); + kvm_cpu_vmxoff(); +} + +static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, + u32 msr, u32 *result) +{ + u32 vmx_msr_low, vmx_msr_high; + u32 ctl = ctl_min | ctl_opt; + + rdmsr(msr, vmx_msr_low, vmx_msr_high); + + ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ + ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ + + /* Ensure minimum (required) set of control bits are supported. */ + if (ctl_min & ~ctl) + return -EIO; + + *result = ctl; + return 0; +} + +static __init bool allow_1_setting(u32 msr, u32 ctl) +{ + u32 vmx_msr_low, vmx_msr_high; + + rdmsr(msr, vmx_msr_low, vmx_msr_high); + return vmx_msr_high & ctl; +} + +static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) +{ + u32 vmx_msr_low, vmx_msr_high; + u32 min, opt, min2, opt2; + u32 _pin_based_exec_control = 0; + u32 _cpu_based_exec_control = 0; + u32 _cpu_based_2nd_exec_control = 0; + u32 _vmexit_control = 0; + u32 _vmentry_control = 0; + + memset(vmcs_conf, 0, sizeof(*vmcs_conf)); + min = CPU_BASED_HLT_EXITING | +#ifdef CONFIG_X86_64 + CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING | +#endif + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_UNCOND_IO_EXITING | + CPU_BASED_MOV_DR_EXITING | + CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING | + CPU_BASED_INVLPG_EXITING | + CPU_BASED_RDPMC_EXITING; + + opt = CPU_BASED_TPR_SHADOW | + CPU_BASED_USE_MSR_BITMAPS | + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, + &_cpu_based_exec_control) < 0) + return -EIO; +#ifdef CONFIG_X86_64 + if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) + _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & + ~CPU_BASED_CR8_STORE_EXITING; +#endif + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { + min2 = 0; + opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_WBINVD_EXITING | + SECONDARY_EXEC_ENABLE_VPID | + SECONDARY_EXEC_ENABLE_EPT | + SECONDARY_EXEC_UNRESTRICTED_GUEST | + SECONDARY_EXEC_PAUSE_LOOP_EXITING | + SECONDARY_EXEC_DESC | + SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_RDSEED_EXITING | + SECONDARY_EXEC_RDRAND_EXITING | + SECONDARY_EXEC_ENABLE_PML | + SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_ENABLE_VMFUNC | + SECONDARY_EXEC_ENCLS_EXITING; + if (adjust_vmx_controls(min2, opt2, + MSR_IA32_VMX_PROCBASED_CTLS2, + &_cpu_based_2nd_exec_control) < 0) + return -EIO; + } +#ifndef CONFIG_X86_64 + if (!(_cpu_based_2nd_exec_control & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; +#endif + + if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) + _cpu_based_2nd_exec_control &= ~( + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + + rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, + &vmx_capability.ept, &vmx_capability.vpid); + + if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { + /* CR3 accesses and invlpg don't need to cause VM Exits when EPT + enabled */ + _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); + } else if (vmx_capability.ept) { + vmx_capability.ept = 0; + pr_warn_once("EPT CAP should not exist if not support " + "1-setting enable EPT VM-execution control\n"); + } + if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && + vmx_capability.vpid) { + vmx_capability.vpid = 0; + pr_warn_once("VPID CAP should not exist if not support " + "1-setting enable VPID VM-execution control\n"); + } + + min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; +#ifdef CONFIG_X86_64 + min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; +#endif + opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | + VM_EXIT_CLEAR_BNDCFGS; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, + &_vmexit_control) < 0) + return -EIO; + + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | + PIN_BASED_VMX_PREEMPTION_TIMER; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control) < 0) + return -EIO; + + if (cpu_has_broken_vmx_preemption_timer()) + _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + if (!(_cpu_based_2nd_exec_control & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; + + min = VM_ENTRY_LOAD_DEBUG_CONTROLS; + opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, + &_vmentry_control) < 0) + return -EIO; + + rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); + + /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ + if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) + return -EIO; + +#ifdef CONFIG_X86_64 + /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ + if (vmx_msr_high & (1u<<16)) + return -EIO; +#endif + + /* Require Write-Back (WB) memory type for VMCS accesses. */ + if (((vmx_msr_high >> 18) & 15) != 6) + return -EIO; + + vmcs_conf->size = vmx_msr_high & 0x1fff; + vmcs_conf->order = get_order(vmcs_conf->size); + vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; + + vmcs_conf->revision_id = vmx_msr_low; + + vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; + vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; + vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->vmexit_ctrl = _vmexit_control; + vmcs_conf->vmentry_ctrl = _vmentry_control; + + if (static_branch_unlikely(&enable_evmcs)) + evmcs_sanitize_exec_ctrls(vmcs_conf); + + cpu_has_load_ia32_efer = + allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, + VM_ENTRY_LOAD_IA32_EFER) + && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, + VM_EXIT_LOAD_IA32_EFER); + + cpu_has_load_perf_global_ctrl = + allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + + /* + * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL + * but due to errata below it can't be used. Workaround is to use + * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. + * + * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] + * + * AAK155 (model 26) + * AAP115 (model 30) + * AAT100 (model 37) + * BC86,AAY89,BD102 (model 44) + * BA97 (model 46) + * + */ + if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { + switch (boot_cpu_data.x86_model) { + case 26: + case 30: + case 37: + case 44: + case 46: + cpu_has_load_perf_global_ctrl = false; + printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + "does not work properly. Using workaround\n"); + break; + default: + break; + } + } + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + + return 0; +} + +static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) +{ + int node = cpu_to_node(cpu); + struct page *pages; + struct vmcs *vmcs; + + pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + if (!pages) + return NULL; + vmcs = page_address(pages); + memset(vmcs, 0, vmcs_config.size); + + /* KVM supports Enlightened VMCS v1 only */ + if (static_branch_unlikely(&enable_evmcs)) + vmcs->hdr.revision_id = KVM_EVMCS_VERSION; + else + vmcs->hdr.revision_id = vmcs_config.revision_id; + + if (shadow) + vmcs->hdr.shadow_vmcs = 1; + return vmcs; +} + +static void free_vmcs(struct vmcs *vmcs) +{ + free_pages((unsigned long)vmcs, vmcs_config.order); +} + +/* + * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded + */ +static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) +{ + if (!loaded_vmcs->vmcs) + return; + loaded_vmcs_clear(loaded_vmcs); + free_vmcs(loaded_vmcs->vmcs); + loaded_vmcs->vmcs = NULL; + if (loaded_vmcs->msr_bitmap) + free_page((unsigned long)loaded_vmcs->msr_bitmap); + WARN_ON(loaded_vmcs->shadow_vmcs != NULL); +} + +static struct vmcs *alloc_vmcs(bool shadow) +{ + return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); +} + +static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) +{ + loaded_vmcs->vmcs = alloc_vmcs(false); + if (!loaded_vmcs->vmcs) + return -ENOMEM; + + loaded_vmcs->shadow_vmcs = NULL; + loaded_vmcs_init(loaded_vmcs); + + if (cpu_has_vmx_msr_bitmap()) { + loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!loaded_vmcs->msr_bitmap) + goto out_vmcs; + memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); + + if (IS_ENABLED(CONFIG_HYPERV) && + static_branch_unlikely(&enable_evmcs) && + (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { + struct hv_enlightened_vmcs *evmcs = + (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs; + + evmcs->hv_enlightenments_control.msr_bitmap = 1; + } + } + + memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); + + return 0; + +out_vmcs: + free_loaded_vmcs(loaded_vmcs); + return -ENOMEM; +} + +static void free_kvm_area(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + free_vmcs(per_cpu(vmxarea, cpu)); + per_cpu(vmxarea, cpu) = NULL; + } +} + +enum vmcs_field_width { + VMCS_FIELD_WIDTH_U16 = 0, + VMCS_FIELD_WIDTH_U64 = 1, + VMCS_FIELD_WIDTH_U32 = 2, + VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3 +}; + +static inline int vmcs_field_width(unsigned long field) +{ + if (0x1 & field) /* the *_HIGH fields are all 32 bit */ + return VMCS_FIELD_WIDTH_U32; + return (field >> 13) & 0x3 ; +} + +static inline int vmcs_field_readonly(unsigned long field) +{ + return (((field >> 10) & 0x3) == 1); +} + +static void init_vmcs_shadow_fields(void) +{ + int i, j; + + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); + + for (i = j = 0; i < max_shadow_read_only_fields; i++) { + u16 field = shadow_read_only_fields[i]; + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && + (i + 1 == max_shadow_read_only_fields || + shadow_read_only_fields[i + 1] != field + 1)) + pr_err("Missing field from shadow_read_only_field %x\n", + field + 1); + + clear_bit(field, vmx_vmread_bitmap); +#ifdef CONFIG_X86_64 + if (field & 1) + continue; +#endif + if (j < i) + shadow_read_only_fields[j] = field; + j++; + } + max_shadow_read_only_fields = j; + + for (i = j = 0; i < max_shadow_read_write_fields; i++) { + u16 field = shadow_read_write_fields[i]; + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && + (i + 1 == max_shadow_read_write_fields || + shadow_read_write_fields[i + 1] != field + 1)) + pr_err("Missing field from shadow_read_write_field %x\n", + field + 1); + + /* + * PML and the preemption timer can be emulated, but the + * processor cannot vmwrite to fields that don't exist + * on bare metal. + */ + switch (field) { + case GUEST_PML_INDEX: + if (!cpu_has_vmx_pml()) + continue; + break; + case VMX_PREEMPTION_TIMER_VALUE: + if (!cpu_has_vmx_preemption_timer()) + continue; + break; + case GUEST_INTR_STATUS: + if (!cpu_has_vmx_apicv()) + continue; + break; + default: + break; + } + + clear_bit(field, vmx_vmwrite_bitmap); + clear_bit(field, vmx_vmread_bitmap); +#ifdef CONFIG_X86_64 + if (field & 1) + continue; +#endif + if (j < i) + shadow_read_write_fields[j] = field; + j++; + } + max_shadow_read_write_fields = j; +} + +static __init int alloc_kvm_area(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct vmcs *vmcs; + + vmcs = alloc_vmcs_cpu(false, cpu); + if (!vmcs) { + free_kvm_area(); + return -ENOMEM; + } + + /* + * When eVMCS is enabled, alloc_vmcs_cpu() sets + * vmcs->revision_id to KVM_EVMCS_VERSION instead of + * revision_id reported by MSR_IA32_VMX_BASIC. + * + * However, even though not explictly documented by + * TLFS, VMXArea passed as VMXON argument should + * still be marked with revision_id reported by + * physical CPU. + */ + if (static_branch_unlikely(&enable_evmcs)) + vmcs->hdr.revision_id = vmcs_config.revision_id; + + per_cpu(vmxarea, cpu) = vmcs; + } + return 0; +} + +static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, + struct kvm_segment *save) +{ + if (!emulate_invalid_guest_state) { + /* + * CS and SS RPL should be equal during guest entry according + * to VMX spec, but in reality it is not always so. Since vcpu + * is in the middle of the transition from real mode to + * protected mode it is safe to assume that RPL 0 is a good + * default value. + */ + if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) + save->selector &= ~SEGMENT_RPL_MASK; + save->dpl = save->selector & SEGMENT_RPL_MASK; + save->s = 1; + } + vmx_set_segment(vcpu, save, seg); +} + +static void enter_pmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Update real mode segment cache. It may be not up-to-date if sement + * register was written while vcpu was in a guest mode. + */ + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); + + vmx->rmode.vm86_active = 0; + + vmx_segment_cache_clear(vmx); + + vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); + + flags = vmcs_readl(GUEST_RFLAGS); + flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + vmcs_writel(GUEST_RFLAGS, flags); + + vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | + (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); + + update_exception_bitmap(vcpu); + + fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); + fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); + fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); + fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); +} + +static void fix_rmode_seg(int seg, struct kvm_segment *save) +{ + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + struct kvm_segment var = *save; + + var.dpl = 0x3; + if (seg == VCPU_SREG_CS) + var.type = 0x3; + + if (!emulate_invalid_guest_state) { + var.selector = var.base >> 4; + var.base = var.base & 0xffff0; + var.limit = 0xffff; + var.g = 0; + var.db = 0; + var.present = 1; + var.s = 1; + var.l = 0; + var.unusable = 0; + var.type = 0x3; + var.avl = 0; + if (save->base & 0xf) + printk_once(KERN_WARNING "kvm: segment base is not " + "paragraph aligned when entering " + "protected mode (seg=%d)", seg); + } + + vmcs_write16(sf->selector, var.selector); + vmcs_writel(sf->base, var.base); + vmcs_write32(sf->limit, var.limit); + vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); +} + +static void enter_rmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); + + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); + + vmx->rmode.vm86_active = 1; + + /* + * Very old userspace does not call KVM_SET_TSS_ADDR before entering + * vcpu. Warn the user that an update is overdue. + */ + if (!kvm_vmx->tss_addr) + printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " + "called before entering vcpu\n"); + + vmx_segment_cache_clear(vmx); + + vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); + vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + flags = vmcs_readl(GUEST_RFLAGS); + vmx->rmode.save_rflags = flags; + + flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + + vmcs_writel(GUEST_RFLAGS, flags); + vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); + update_exception_bitmap(vcpu); + + fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); + fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); + + kvm_mmu_reset_context(vcpu); +} + +static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + + if (!msr) + return; + + vcpu->arch.efer = efer; + if (efer & EFER_LMA) { + vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); + msr->data = efer; + } else { + vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); + + msr->data = efer & ~EFER_LME; + } + setup_msrs(vmx); +} + +#ifdef CONFIG_X86_64 + +static void enter_lmode(struct kvm_vcpu *vcpu) +{ + u32 guest_tr_ar; + + vmx_segment_cache_clear(to_vmx(vcpu)); + + guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); + if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { + pr_debug_ratelimited("%s: tss fixup for long mode. \n", + __func__); + vmcs_write32(GUEST_TR_AR_BYTES, + (guest_tr_ar & ~VMX_AR_TYPE_MASK) + | VMX_AR_TYPE_BUSY_64_TSS); + } + vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); +} + +static void exit_lmode(struct kvm_vcpu *vcpu) +{ + vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); + vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); +} + +#endif + +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, + bool invalidate_gpa) +{ + if (enable_ept && (invalidate_gpa || !enable_vpid)) { + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + return; + ept_sync_context(construct_eptp(vcpu, + vcpu->arch.mmu->root_hpa)); + } else { + vpid_sync_context(vpid); + } +} + +static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +{ + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); +} + +static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + int vpid = to_vmx(vcpu)->vpid; + + if (!vpid_sync_vcpu_addr(vpid, addr)) + vpid_sync_context(vpid); + + /* + * If VPIDs are not supported or enabled, then the above is a no-op. + * But we don't really need a TLB flush in that case anyway, because + * each VM entry/exit includes an implicit flush when VPID is 0. + */ +} + +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ + ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~cr0_guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; +} + +static void vmx_decache_cr3(struct kvm_vcpu *vcpu) +{ + if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); +} + +static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) +{ + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; +} + +static void ept_load_pdptrs(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty)) + return; + + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); + } +} + +static void ept_save_pdptrs(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + } + + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty); +} + +static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_UNRESTRICTED_GUEST && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +/* No difference in the restrictions on guest and host CR4 in VMX operation. */ +#define nested_guest_cr4_valid nested_cr4_valid +#define nested_host_cr4_valid nested_cr4_valid + +static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); + +static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, + unsigned long cr0, + struct kvm_vcpu *vcpu) +{ + if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + vmx_decache_cr3(vcpu); + if (!(cr0 & X86_CR0_PG)) { + /* From paging/starting to nonpaging */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | + (CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING)); + vcpu->arch.cr0 = cr0; + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } else if (!is_paging(vcpu)) { + /* From nonpaging to paging */ + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & + ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING)); + vcpu->arch.cr0 = cr0; + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } + + if (!(cr0 & X86_CR0_WP)) + *hw_cr0 &= ~X86_CR0_WP; +} + +static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long hw_cr0; + + hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); + if (enable_unrestricted_guest) + hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; + else { + hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; + + if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) + enter_pmode(vcpu); + + if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) + enter_rmode(vcpu); + } + +#ifdef CONFIG_X86_64 + if (vcpu->arch.efer & EFER_LME) { + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) + enter_lmode(vcpu); + if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) + exit_lmode(vcpu); + } +#endif + + if (enable_ept && !enable_unrestricted_guest) + ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + + vmcs_writel(CR0_READ_SHADOW, cr0); + vmcs_writel(GUEST_CR0, hw_cr0); + vcpu->arch.cr0 = cr0; + + /* depends on vcpu->arch.cr0 to be set to a new value */ + vmx->emulation_required = emulation_required(vcpu); +} + +static int get_ept_level(struct kvm_vcpu *vcpu) +{ + if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) + return 5; + return 4; +} + +static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) +{ + u64 eptp = VMX_EPTP_MT_WB; + + eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; + + if (enable_ept_ad_bits && + (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) + eptp |= VMX_EPTP_AD_ENABLE_BIT; + eptp |= (root_hpa & PAGE_MASK); + + return eptp; +} + +static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long guest_cr3; + u64 eptp; + + guest_cr3 = cr3; + if (enable_ept) { + eptp = construct_eptp(vcpu, cr3); + vmcs_write64(EPT_POINTER, eptp); + + if (kvm_x86_ops->tlb_remote_flush) { + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + to_vmx(vcpu)->ept_pointer = eptp; + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_CHECK; + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + } + + if (enable_unrestricted_guest || is_paging(vcpu) || + is_guest_mode(vcpu)) + guest_cr3 = kvm_read_cr3(vcpu); + else + guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; + ept_load_pdptrs(vcpu); + } + + vmcs_writel(GUEST_CR3, guest_cr3); +} + +static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + /* + * Pass through host's Machine Check Enable value to hw_cr4, which + * is in force while we are in guest mode. Do not let guests control + * this bit, even if host CR4.MCE == 0. + */ + unsigned long hw_cr4; + + hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); + if (enable_unrestricted_guest) + hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; + else if (to_vmx(vcpu)->rmode.vm86_active) + hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; + else + hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; + + if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { + if (cr4 & X86_CR4_UMIP) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_DESC); + hw_cr4 &= ~X86_CR4_UMIP; + } else if (!is_guest_mode(vcpu) || + !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_DESC); + } + + if (cr4 & X86_CR4_VMXE) { + /* + * To use VMXON (and later other VMX instructions), a guest + * must first be able to turn on cr4.VMXE (see handle_vmon()). + * So basically the check on whether to allow nested VMX + * is here. We operate under the default treatment of SMM, + * so VMX cannot be enabled under SMM. + */ + if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) + return 1; + } + + if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) + return 1; + + vcpu->arch.cr4 = cr4; + + if (!enable_unrestricted_guest) { + if (enable_ept) { + if (!is_paging(vcpu)) { + hw_cr4 &= ~X86_CR4_PAE; + hw_cr4 |= X86_CR4_PSE; + } else if (!(cr4 & X86_CR4_PAE)) { + hw_cr4 &= ~X86_CR4_PAE; + } + } + + /* + * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in + * hardware. To emulate this behavior, SMEP/SMAP/PKU needs + * to be manually disabled when guest switches to non-paging + * mode. + * + * If !enable_unrestricted_guest, the CPU is always running + * with CR0.PG=1 and CR4 needs to be modified. + * If enable_unrestricted_guest, the CPU automatically + * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. + */ + if (!is_paging(vcpu)) + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); + } + + vmcs_writel(CR4_READ_SHADOW, cr4); + vmcs_writel(GUEST_CR4, hw_cr4); + return 0; +} + +static void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 ar; + + if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { + *var = vmx->rmode.segs[seg]; + if (seg == VCPU_SREG_TR + || var->selector == vmx_read_guest_seg_selector(vmx, seg)) + return; + var->base = vmx_read_guest_seg_base(vmx, seg); + var->selector = vmx_read_guest_seg_selector(vmx, seg); + return; + } + var->base = vmx_read_guest_seg_base(vmx, seg); + var->limit = vmx_read_guest_seg_limit(vmx, seg); + var->selector = vmx_read_guest_seg_selector(vmx, seg); + ar = vmx_read_guest_seg_ar(vmx, seg); + var->unusable = (ar >> 16) & 1; + var->type = ar & 15; + var->s = (ar >> 4) & 1; + var->dpl = (ar >> 5) & 3; + /* + * Some userspaces do not preserve unusable property. Since usable + * segment has to be present according to VMX spec we can use present + * property to amend userspace bug by making unusable segment always + * nonpresent. vmx_segment_access_rights() already marks nonpresent + * segment as unusable. + */ + var->present = !var->unusable; + var->avl = (ar >> 12) & 1; + var->l = (ar >> 13) & 1; + var->db = (ar >> 14) & 1; + var->g = (ar >> 15) & 1; +} + +static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment s; + + if (to_vmx(vcpu)->rmode.vm86_active) { + vmx_get_segment(vcpu, &s, seg); + return s.base; + } + return vmx_read_guest_seg_base(to_vmx(vcpu), seg); +} + +static int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (unlikely(vmx->rmode.vm86_active)) + return 0; + else { + int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); + return VMX_AR_DPL(ar); + } +} + +static u32 vmx_segment_access_rights(struct kvm_segment *var) +{ + u32 ar; + + if (var->unusable || !var->present) + ar = 1 << 16; + else { + ar = var->type & 15; + ar |= (var->s & 1) << 4; + ar |= (var->dpl & 3) << 5; + ar |= (var->present & 1) << 7; + ar |= (var->avl & 1) << 12; + ar |= (var->l & 1) << 13; + ar |= (var->db & 1) << 14; + ar |= (var->g & 1) << 15; + } + + return ar; +} + +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + vmx_segment_cache_clear(vmx); + + if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { + vmx->rmode.segs[seg] = *var; + if (seg == VCPU_SREG_TR) + vmcs_write16(sf->selector, var->selector); + else if (var->s) + fix_rmode_seg(seg, &vmx->rmode.segs[seg]); + goto out; + } + + vmcs_writel(sf->base, var->base); + vmcs_write32(sf->limit, var->limit); + vmcs_write16(sf->selector, var->selector); + + /* + * Fix the "Accessed" bit in AR field of segment registers for older + * qemu binaries. + * IA32 arch specifies that at the time of processor reset the + * "Accessed" bit in the AR field of segment registers is 1. And qemu + * is setting it to 0 in the userland code. This causes invalid guest + * state vmexit when "unrestricted guest" mode is turned on. + * Fix for this setup issue in cpu_reset is being pushed in the qemu + * tree. Newer qemu binaries with that qemu fix would not need this + * kvm hack. + */ + if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) + var->type |= 0x1; /* Accessed */ + + vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); + +out: + vmx->emulation_required = emulation_required(vcpu); +} + +static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); + + *db = (ar >> 14) & 1; + *l = (ar >> 13) & 1; +} + +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + dt->size = vmcs_read32(GUEST_IDTR_LIMIT); + dt->address = vmcs_readl(GUEST_IDTR_BASE); +} + +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + vmcs_write32(GUEST_IDTR_LIMIT, dt->size); + vmcs_writel(GUEST_IDTR_BASE, dt->address); +} + +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + dt->size = vmcs_read32(GUEST_GDTR_LIMIT); + dt->address = vmcs_readl(GUEST_GDTR_BASE); +} + +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + vmcs_write32(GUEST_GDTR_LIMIT, dt->size); + vmcs_writel(GUEST_GDTR_BASE, dt->address); +} + +static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + u32 ar; + + vmx_get_segment(vcpu, &var, seg); + var.dpl = 0x3; + if (seg == VCPU_SREG_CS) + var.type = 0x3; + ar = vmx_segment_access_rights(&var); + + if (var.base != (var.selector << 4)) + return false; + if (var.limit != 0xffff) + return false; + if (ar != 0xf3) + return false; + + return true; +} + +static bool code_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs; + unsigned int cs_rpl; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + cs_rpl = cs.selector & SEGMENT_RPL_MASK; + + if (cs.unusable) + return false; + if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) + return false; + if (!cs.s) + return false; + if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { + if (cs.dpl > cs_rpl) + return false; + } else { + if (cs.dpl != cs_rpl) + return false; + } + if (!cs.present) + return false; + + /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ + return true; +} + +static bool stack_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ss; + unsigned int ss_rpl; + + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + ss_rpl = ss.selector & SEGMENT_RPL_MASK; + + if (ss.unusable) + return true; + if (ss.type != 3 && ss.type != 7) + return false; + if (!ss.s) + return false; + if (ss.dpl != ss_rpl) /* DPL != RPL */ + return false; + if (!ss.present) + return false; + + return true; +} + +static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + unsigned int rpl; + + vmx_get_segment(vcpu, &var, seg); + rpl = var.selector & SEGMENT_RPL_MASK; + + if (var.unusable) + return true; + if (!var.s) + return false; + if (!var.present) + return false; + if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { + if (var.dpl < rpl) /* DPL < RPL */ + return false; + } + + /* TODO: Add other members to kvm_segment_field to allow checking for other access + * rights flags + */ + return true; +} + +static bool tr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment tr; + + vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); + + if (tr.unusable) + return false; + if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ + return false; + if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ + return false; + if (!tr.present) + return false; + + return true; +} + +static bool ldtr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ldtr; + + vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); + + if (ldtr.unusable) + return true; + if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ + return false; + if (ldtr.type != 2) + return false; + if (!ldtr.present) + return false; + + return true; +} + +static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ss; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + + return ((cs.selector & SEGMENT_RPL_MASK) == + (ss.selector & SEGMENT_RPL_MASK)); +} + +/* + * Check if guest state is valid. Returns true if valid, false if + * not. + * We assume that registers are always usable + */ +static bool guest_state_valid(struct kvm_vcpu *vcpu) +{ + if (enable_unrestricted_guest) + return true; + + /* real mode guest state checks */ + if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { + if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + } else { + /* protected mode guest state checks */ + if (!cs_ss_rpl_check(vcpu)) + return false; + if (!code_segment_valid(vcpu)) + return false; + if (!stack_segment_valid(vcpu)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + if (!tr_valid(vcpu)) + return false; + if (!ldtr_valid(vcpu)) + return false; + } + /* TODO: + * - Add checks on RIP + * - Add checks on RFLAGS + */ + + return true; +} + +static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); +} + +static int init_rmode_tss(struct kvm *kvm) +{ + gfn_t fn; + u16 data = 0; + int idx, r; + + idx = srcu_read_lock(&kvm->srcu); + fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT; + r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); + if (r < 0) + goto out; + data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; + r = kvm_write_guest_page(kvm, fn++, &data, + TSS_IOPB_BASE_OFFSET, sizeof(u16)); + if (r < 0) + goto out; + r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); + if (r < 0) + goto out; + r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); + if (r < 0) + goto out; + data = ~0; + r = kvm_write_guest_page(kvm, fn, &data, + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, + sizeof(u8)); +out: + srcu_read_unlock(&kvm->srcu, idx); + return r; +} + +static int init_rmode_identity_map(struct kvm *kvm) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + int i, idx, r = 0; + kvm_pfn_t identity_map_pfn; + u32 tmp; + + /* Protect kvm_vmx->ept_identity_pagetable_done. */ + mutex_lock(&kvm->slots_lock); + + if (likely(kvm_vmx->ept_identity_pagetable_done)) + goto out2; + + if (!kvm_vmx->ept_identity_map_addr) + kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT; + + r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + kvm_vmx->ept_identity_map_addr, PAGE_SIZE); + if (r < 0) + goto out2; + + idx = srcu_read_lock(&kvm->srcu); + r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); + if (r < 0) + goto out; + /* Set up identity-mapping pagetable for EPT in real mode */ + for (i = 0; i < PT32_ENT_PER_PAGE; i++) { + tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); + r = kvm_write_guest_page(kvm, identity_map_pfn, + &tmp, i * sizeof(tmp), sizeof(tmp)); + if (r < 0) + goto out; + } + kvm_vmx->ept_identity_pagetable_done = true; + +out: + srcu_read_unlock(&kvm->srcu, idx); + +out2: + mutex_unlock(&kvm->slots_lock); + return r; +} + +static void seg_setup(int seg) +{ + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + unsigned int ar; + + vmcs_write16(sf->selector, 0); + vmcs_writel(sf->base, 0); + vmcs_write32(sf->limit, 0xffff); + ar = 0x93; + if (seg == VCPU_SREG_CS) + ar |= 0x08; /* code segment */ + + vmcs_write32(sf->ar_bytes, ar); +} + +static int alloc_apic_access_page(struct kvm *kvm) +{ + struct page *page; + int r = 0; + + mutex_lock(&kvm->slots_lock); + if (kvm->arch.apic_access_page_done) + goto out; + r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); + if (r) + goto out; + + page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) { + r = -EFAULT; + goto out; + } + + /* + * Do not pin the page in memory, so that memory hot-unplug + * is able to migrate it. + */ + put_page(page); + kvm->arch.apic_access_page_done = true; +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + +static int allocate_vpid(void) +{ + int vpid; + + if (!enable_vpid) + return 0; + spin_lock(&vmx_vpid_lock); + vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); + if (vpid < VMX_NR_VPIDS) + __set_bit(vpid, vmx_vpid_bitmap); + else + vpid = 0; + spin_unlock(&vmx_vpid_lock); + return vpid; +} + +static void free_vpid(int vpid) +{ + if (!enable_vpid || vpid == 0) + return; + spin_lock(&vmx_vpid_lock); + __clear_bit(vpid, vmx_vpid_bitmap); + spin_unlock(&vmx_vpid_lock); +} + +static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return; + + if (static_branch_unlikely(&enable_evmcs)) + evmcs_touch_msr_bitmap(); + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R) + /* read-low */ + __clear_bit(msr, msr_bitmap + 0x000 / f); + + if (type & MSR_TYPE_W) + /* write-low */ + __clear_bit(msr, msr_bitmap + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R) + /* read-high */ + __clear_bit(msr, msr_bitmap + 0x400 / f); + + if (type & MSR_TYPE_W) + /* write-high */ + __clear_bit(msr, msr_bitmap + 0xc00 / f); + + } +} + +static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return; + + if (static_branch_unlikely(&enable_evmcs)) + evmcs_touch_msr_bitmap(); + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R) + /* read-low */ + __set_bit(msr, msr_bitmap + 0x000 / f); + + if (type & MSR_TYPE_W) + /* write-low */ + __set_bit(msr, msr_bitmap + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R) + /* read-high */ + __set_bit(msr, msr_bitmap + 0x400 / f); + + if (type & MSR_TYPE_W) + /* write-high */ + __set_bit(msr, msr_bitmap + 0xc00 / f); + + } +} + +static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type, bool value) +{ + if (value) + vmx_enable_intercept_for_msr(msr_bitmap, msr, type); + else + vmx_disable_intercept_for_msr(msr_bitmap, msr, type); +} + +/* + * If a msr is allowed by L0, we should check whether it is allowed by L1. + * The corresponding bit will be cleared unless both of L0 and L1 allow it. + */ +static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_nested, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) + /* read-low */ + __clear_bit(msr, msr_bitmap_nested + 0x000 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) + /* write-low */ + __clear_bit(msr, msr_bitmap_nested + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) + /* read-high */ + __clear_bit(msr, msr_bitmap_nested + 0x400 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) + /* write-high */ + __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); + + } +} + +static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) +{ + u8 mode = 0; + + if (cpu_has_secondary_exec_ctrls() && + (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { + mode |= MSR_BITMAP_MODE_X2APIC; + if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) + mode |= MSR_BITMAP_MODE_X2APIC_APICV; + } + + return mode; +} + +#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) + +static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, + u8 mode) +{ + int msr; + + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; + msr_bitmap[word + (0x800 / sizeof(long))] = ~0; + } + + if (mode & MSR_BITMAP_MODE_X2APIC) { + /* + * TPR reads and writes can be virtualized even if virtual interrupt + * delivery is not in use. + */ + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { + vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + } + } +} + +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + u8 mode = vmx_msr_bitmap_mode(vcpu); + u8 changed = mode ^ vmx->msr_bitmap_mode; + + if (!changed) + return; + + if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) + vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); + + vmx->msr_bitmap_mode = mode; +} + +static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) +{ + return enable_apicv; +} + +static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + gfn_t gfn; + + /* + * Don't need to mark the APIC access page dirty; it is never + * written to by the CPU during APIC virtualization. + */ + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } + + if (nested_cpu_has_posted_intr(vmcs12)) { + gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } +} + + +static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; + void *vapic_page; + u16 status; + + if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) + return; + + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) + return; + + max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); + if (max_irr != 256) { + vapic_page = kmap(vmx->nested.virtual_apic_page); + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, + vapic_page, &max_irr); + kunmap(vmx->nested.virtual_apic_page); + + status = vmcs_read16(GUEST_INTR_STATUS); + if ((u8)max_irr > ((u8)status & 0xff)) { + status &= ~0xff; + status |= (u8)max_irr; + vmcs_write16(GUEST_INTR_STATUS, status); + } + } + + nested_mark_vmcs12_pages_dirty(vcpu); +} + +static u8 vmx_get_rvi(void) +{ + return vmcs_read16(GUEST_INTR_STATUS) & 0xff; +} + +static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + void *vapic_page; + u32 vppr; + int rvi; + + if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || + !nested_cpu_has_vid(get_vmcs12(vcpu)) || + WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) + return false; + + rvi = vmx_get_rvi(); + + vapic_page = kmap(vmx->nested.virtual_apic_page); + vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); + kunmap(vmx->nested.virtual_apic_page); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + +static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + bool nested) +{ +#ifdef CONFIG_SMP + int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; + + if (vcpu->mode == IN_GUEST_MODE) { + /* + * The vector of interrupt to be delivered to vcpu had + * been set in PIR before this function. + * + * Following cases will be reached in this block, and + * we always send a notification event in all cases as + * explained below. + * + * Case 1: vcpu keeps in non-root mode. Sending a + * notification event posts the interrupt to vcpu. + * + * Case 2: vcpu exits to root mode and is still + * runnable. PIR will be synced to vIRR before the + * next vcpu entry. Sending a notification event in + * this case has no effect, as vcpu is not in root + * mode. + * + * Case 3: vcpu exits to root mode and is blocked. + * vcpu_block() has already synced PIR to vIRR and + * never blocks vcpu if vIRR is not cleared. Therefore, + * a blocked vcpu here does not wait for any requested + * interrupts in PIR, and sending a notification event + * which has no effect is safe here. + */ + + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); + return true; + } +#endif + return false; +} + +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, + int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (is_guest_mode(vcpu) && + vector == vmx->nested.posted_intr_nv) { + /* + * If a posted intr is not recognized by hardware, + * we will accomplish it in the next vmentry. + */ + vmx->nested.pi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); + /* the PIR and ON have been set by L1. */ + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) + kvm_vcpu_kick(vcpu); + return 0; + } + return -1; +} +/* + * Send interrupt to vcpu via posted interrupt way. + * 1. If target vcpu is running(non-root mode), send posted interrupt + * notification to vcpu and hardware will sync PIR to vIRR atomically. + * 2. If target vcpu isn't running(root mode), kick it to pick up the + * interrupt from PIR in next vmentry. + */ +static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int r; + + r = vmx_deliver_nested_posted_interrupt(vcpu, vector); + if (!r) + return; + + if (pi_test_and_set_pir(vector, &vmx->pi_desc)) + return; + + /* If a previous notification has sent the IPI, nothing to do. */ + if (pi_test_and_set_on(&vmx->pi_desc)) + return; + + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) + kvm_vcpu_kick(vcpu); +} + +/* + * Set up the vmcs's constant host-state fields, i.e., host-state fields that + * will not change in the lifetime of the guest. + * Note that host-state that does change is set elsewhere. E.g., host-state + * that is set differently for each CPU is set in vmx_vcpu_load(), not here. + */ +static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) +{ + u32 low32, high32; + unsigned long tmpl; + struct desc_ptr dt; + unsigned long cr0, cr3, cr4; + + cr0 = read_cr0(); + WARN_ON(cr0 & X86_CR0_TS); + vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ + + /* + * Save the most likely value for this task's CR3 in the VMCS. + * We can't use __get_current_cr3_fast() because we're not atomic. + */ + cr3 = __read_cr3(); + vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ + vmx->loaded_vmcs->host_state.cr3 = cr3; + + /* Save the most likely value for this task's CR4 in the VMCS. */ + cr4 = cr4_read_shadow(); + vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ + vmx->loaded_vmcs->host_state.cr4 = cr4; + + vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ +#ifdef CONFIG_X86_64 + /* + * Load null selectors, so we can avoid reloading them in + * vmx_prepare_switch_to_host(), in case userspace uses + * the null selectors too (the expected case). + */ + vmcs_write16(HOST_DS_SELECTOR, 0); + vmcs_write16(HOST_ES_SELECTOR, 0); +#else + vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ +#endif + vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ + + store_idt(&dt); + vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ + vmx->host_idt_base = dt.address; + + vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ + + rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); + vmcs_write32(HOST_IA32_SYSENTER_CS, low32); + rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); + vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ + + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, low32, high32); + vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); + } + + if (cpu_has_load_ia32_efer) + vmcs_write64(HOST_IA32_EFER, host_efer); +} + +static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) +{ + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + if (is_guest_mode(&vmx->vcpu)) + vmx->vcpu.arch.cr4_guest_owned_bits &= + ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); +} + +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) +{ + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; + + if (!kvm_vcpu_apicv_active(&vmx->vcpu)) + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + + if (!enable_vnmi) + pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; + + /* Enable the preemption timer dynamically */ + pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + return pin_based_exec_ctrl; +} + +static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + if (cpu_has_secondary_exec_ctrls()) { + if (kvm_vcpu_apicv_active(vcpu)) + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + else + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + } + + if (cpu_has_vmx_msr_bitmap()) + vmx_update_msr_bitmap(vcpu); +} + +static u32 vmx_exec_control(struct vcpu_vmx *vmx) +{ + u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) + exec_control &= ~CPU_BASED_MOV_DR_EXITING; + + if (!cpu_need_tpr_shadow(&vmx->vcpu)) { + exec_control &= ~CPU_BASED_TPR_SHADOW; +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_STORE_EXITING | + CPU_BASED_CR8_LOAD_EXITING; +#endif + } + if (!enable_ept) + exec_control |= CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_INVLPG_EXITING; + if (kvm_mwait_in_guest(vmx->vcpu.kvm)) + exec_control &= ~(CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING); + if (kvm_hlt_in_guest(vmx->vcpu.kvm)) + exec_control &= ~CPU_BASED_HLT_EXITING; + return exec_control; +} + +static bool vmx_rdrand_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDRAND_EXITING; +} + +static bool vmx_rdseed_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDSEED_EXITING; +} + +static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) +{ + struct kvm_vcpu *vcpu = &vmx->vcpu; + + u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; + + if (!cpu_need_virtualize_apic_accesses(vcpu)) + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (vmx->vpid == 0) + exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; + if (!enable_ept) { + exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + enable_unrestricted_guest = 0; + } + if (!enable_unrestricted_guest) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (kvm_pause_in_guest(vmx->vcpu.kvm)) + exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; + if (!kvm_vcpu_apicv_active(vcpu)) + exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + + /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, + * in vmx_set_cr4. */ + exec_control &= ~SECONDARY_EXEC_DESC; + + /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD + (handle_vmptrld). + We can NOT enable shadow_vmcs here because we don't have yet + a current VMCS12 + */ + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + + if (!enable_pml) + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + + if (vmx_xsaves_supported()) { + /* Exposing XSAVES only when XSAVE is exposed */ + bool xsaves_enabled = + guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); + + if (!xsaves_enabled) + exec_control &= ~SECONDARY_EXEC_XSAVES; + + if (nested) { + if (xsaves_enabled) + vmx->nested.msrs.secondary_ctls_high |= + SECONDARY_EXEC_XSAVES; + else + vmx->nested.msrs.secondary_ctls_high &= + ~SECONDARY_EXEC_XSAVES; + } + } + + if (vmx_rdtscp_supported()) { + bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); + if (!rdtscp_enabled) + exec_control &= ~SECONDARY_EXEC_RDTSCP; + + if (nested) { + if (rdtscp_enabled) + vmx->nested.msrs.secondary_ctls_high |= + SECONDARY_EXEC_RDTSCP; + else + vmx->nested.msrs.secondary_ctls_high &= + ~SECONDARY_EXEC_RDTSCP; + } + } + + if (vmx_invpcid_supported()) { + /* Exposing INVPCID only when PCID is exposed */ + bool invpcid_enabled = + guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && + guest_cpuid_has(vcpu, X86_FEATURE_PCID); + + if (!invpcid_enabled) { + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); + } + + if (nested) { + if (invpcid_enabled) + vmx->nested.msrs.secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_INVPCID; + else + vmx->nested.msrs.secondary_ctls_high &= + ~SECONDARY_EXEC_ENABLE_INVPCID; + } + } + + if (vmx_rdrand_supported()) { + bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); + if (rdrand_enabled) + exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; + + if (nested) { + if (rdrand_enabled) + vmx->nested.msrs.secondary_ctls_high |= + SECONDARY_EXEC_RDRAND_EXITING; + else + vmx->nested.msrs.secondary_ctls_high &= + ~SECONDARY_EXEC_RDRAND_EXITING; + } + } + + if (vmx_rdseed_supported()) { + bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); + if (rdseed_enabled) + exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; + + if (nested) { + if (rdseed_enabled) + vmx->nested.msrs.secondary_ctls_high |= + SECONDARY_EXEC_RDSEED_EXITING; + else + vmx->nested.msrs.secondary_ctls_high &= + ~SECONDARY_EXEC_RDSEED_EXITING; + } + } + + vmx->secondary_exec_control = exec_control; +} + +static void ept_set_mmio_spte_mask(void) +{ + /* + * EPT Misconfigurations can be generated if the value of bits 2:0 + * of an EPT paging-structure entry is 110b (write/execute). + */ + kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, + VMX_EPT_MISCONFIG_WX_VALUE); +} + +#define VMX_XSS_EXIT_BITMAP 0 +/* + * Sets up the vmcs for emulated real mode. + */ +static void vmx_vcpu_setup(struct vcpu_vmx *vmx) +{ + int i; + + if (enable_shadow_vmcs) { + /* + * At vCPU creation, "VMWRITE to any supported field + * in the VMCS" is supported, so use the more + * permissive vmx_vmread_bitmap to specify both read + * and write permissions for the shadow VMCS. + */ + vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap)); + } + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + + /* Control */ + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + vmx->hv_deadline_tsc = -1; + + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); + + if (cpu_has_secondary_exec_ctrls()) { + vmx_compute_secondary_exec_control(vmx); + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + vmx->secondary_exec_control); + } + + if (kvm_vcpu_apicv_active(&vmx->vcpu)) { + vmcs_write64(EOI_EXIT_BITMAP0, 0); + vmcs_write64(EOI_EXIT_BITMAP1, 0); + vmcs_write64(EOI_EXIT_BITMAP2, 0); + vmcs_write64(EOI_EXIT_BITMAP3, 0); + + vmcs_write16(GUEST_INTR_STATUS, 0); + + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); + } + + if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { + vmcs_write32(PLE_GAP, ple_gap); + vmx->ple_window = ple_window; + vmx->ple_window_dirty = true; + } + + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); + vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ + + vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ + vmx_set_constant_host_state(vmx); + vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ + vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ + + if (cpu_has_vmx_vmfunc()) + vmcs_write64(VM_FUNCTION_CONTROL, 0); + + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); + + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); + + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { + u32 index = vmx_msr_index[i]; + u32 data_low, data_high; + int j = vmx->nmsrs; + + if (rdmsr_safe(index, &data_low, &data_high) < 0) + continue; + if (wrmsr_safe(index, data_low, data_high) < 0) + continue; + vmx->guest_msrs[j].index = i; + vmx->guest_msrs[j].data = 0; + vmx->guest_msrs[j].mask = -1ull; + ++vmx->nmsrs; + } + + vmx->arch_capabilities = kvm_get_arch_capabilities(); + + vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); + + /* 22.2.1, 20.8.1 */ + vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); + + vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); + + set_cr4_guest_host_mask(vmx); + + if (vmx_xsaves_supported()) + vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); + + if (enable_pml) { + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + } + + if (cpu_has_vmx_encls_vmexit()) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); +} + +static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct msr_data apic_base_msr; + u64 cr0; + + vmx->rmode.vm86_active = 0; + vmx->spec_ctrl = 0; + + vcpu->arch.microcode_version = 0x100000000ULL; + vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); + kvm_set_cr8(vcpu, 0); + + if (!init_event) { + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; + apic_base_msr.host_initiated = true; + kvm_set_apic_base(vcpu, &apic_base_msr); + } + + vmx_segment_cache_clear(vmx); + + seg_setup(VCPU_SREG_CS); + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); + vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); + + seg_setup(VCPU_SREG_DS); + seg_setup(VCPU_SREG_ES); + seg_setup(VCPU_SREG_FS); + seg_setup(VCPU_SREG_GS); + seg_setup(VCPU_SREG_SS); + + vmcs_write16(GUEST_TR_SELECTOR, 0); + vmcs_writel(GUEST_TR_BASE, 0); + vmcs_write32(GUEST_TR_LIMIT, 0xffff); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + vmcs_write16(GUEST_LDTR_SELECTOR, 0); + vmcs_writel(GUEST_LDTR_BASE, 0); + vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); + vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); + + if (!init_event) { + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + } + + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0xfff0); + + vmcs_writel(GUEST_GDTR_BASE, 0); + vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); + + vmcs_writel(GUEST_IDTR_BASE, 0); + vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); + if (kvm_mpx_supported()) + vmcs_write64(GUEST_BNDCFGS, 0); + + setup_msrs(vmx); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + + if (cpu_has_vmx_tpr_shadow() && !init_event) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (cpu_need_tpr_shadow(vcpu)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + __pa(vcpu->arch.apic->regs)); + vmcs_write32(TPR_THRESHOLD, 0); + } + + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + + if (vmx->vpid != 0) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + + cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; + vmx->vcpu.arch.cr0 = cr0; + vmx_set_cr0(vcpu, cr0); /* enter rmode */ + vmx_set_cr4(vcpu, 0); + vmx_set_efer(vcpu, 0); + + update_exception_bitmap(vcpu); + + vpid_sync_context(vmx->vpid); + if (init_event) + vmx_clear_hlt(vcpu); +} + +/* + * In nested virtualization, check if L1 asked to exit on external interrupts. + * For most existing hypervisors, this will always return true. + */ +static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->pin_based_vm_exec_control & + PIN_BASED_EXT_INTR_MASK; +} + +/* + * In nested virtualization, check if L1 has set + * VM_EXIT_ACK_INTR_ON_EXIT + */ +static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_ACK_INTR_ON_EXIT; +} + +static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) +{ + return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); +} + +static void enable_irq_window(struct kvm_vcpu *vcpu) +{ + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_INTR_PENDING); +} + +static void enable_nmi_window(struct kvm_vcpu *vcpu) +{ + if (!enable_vnmi || + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + enable_irq_window(vcpu); + return; + } + + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_NMI_PENDING); +} + +static void vmx_inject_irq(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + uint32_t intr; + int irq = vcpu->arch.interrupt.nr; + + trace_kvm_inj_virq(irq); + + ++vcpu->stat.irq_injections; + if (vmx->rmode.vm86_active) { + int inc_eip = 0; + if (vcpu->arch.interrupt.soft) + inc_eip = vcpu->arch.event_exit_inst_len; + if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + return; + } + intr = irq | INTR_INFO_VALID_MASK; + if (vcpu->arch.interrupt.soft) { + intr |= INTR_TYPE_SOFT_INTR; + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmx->vcpu.arch.event_exit_inst_len); + } else + intr |= INTR_TYPE_EXT_INTR; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); + + vmx_clear_hlt(vcpu); +} + +static void vmx_inject_nmi(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!enable_vnmi) { + /* + * Tracking the NMI-blocked state in software is built upon + * finding the next open IRQ window. This, in turn, depends on + * well-behaving guests: They have to keep IRQs disabled at + * least as long as the NMI handler runs. Otherwise we may + * cause NMI nesting, maybe breaking the guest. But as this is + * highly unlikely, we can live with the residual risk. + */ + vmx->loaded_vmcs->soft_vnmi_blocked = 1; + vmx->loaded_vmcs->vnmi_blocked_time = 0; + } + + ++vcpu->stat.nmi_injections; + vmx->loaded_vmcs->nmi_known_unmasked = false; + + if (vmx->rmode.vm86_active) { + if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + return; + } + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); + + vmx_clear_hlt(vcpu); +} + +static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool masked; + + if (!enable_vnmi) + return vmx->loaded_vmcs->soft_vnmi_blocked; + if (vmx->loaded_vmcs->nmi_known_unmasked) + return false; + masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; + vmx->loaded_vmcs->nmi_known_unmasked = !masked; + return masked; +} + +static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!enable_vnmi) { + if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { + vmx->loaded_vmcs->soft_vnmi_blocked = masked; + vmx->loaded_vmcs->vnmi_blocked_time = 0; + } + } else { + vmx->loaded_vmcs->nmi_known_unmasked = !masked; + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } +} + +static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) +{ + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; + + if (!enable_vnmi && + to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) + return 0; + + return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI + | GUEST_INTR_STATE_NMI)); +} + +static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + return (!to_vmx(vcpu)->nested.nested_run_pending && + vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); +} + +static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) +{ + int ret; + + if (enable_unrestricted_guest) + return 0; + + ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, + PAGE_SIZE * 3); + if (ret) + return ret; + to_kvm_vmx(kvm)->tss_addr = addr; + return init_rmode_tss(kvm); +} + +static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) +{ + to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; + return 0; +} + +static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) +{ + switch (vec) { + case BP_VECTOR: + /* + * Update instruction length as we may reinject the exception + * from user space while in guest debugging mode. + */ + to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + return false; + /* fall through */ + case DB_VECTOR: + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return false; + /* fall through */ + case DE_VECTOR: + case OF_VECTOR: + case BR_VECTOR: + case UD_VECTOR: + case DF_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + case MF_VECTOR: + return true; + break; + } + return false; +} + +static int handle_rmode_exception(struct kvm_vcpu *vcpu, + int vec, u32 err_code) +{ + /* + * Instruction with address size override prefix opcode 0x67 + * Cause the #SS fault with 0 error code in VM86 mode. + */ + if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { + if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) { + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; + return kvm_vcpu_halt(vcpu); + } + return 1; + } + return 0; + } + + /* + * Forward all other exceptions that are valid in real mode. + * FIXME: Breaks guest debugging in real mode, needs to be fixed with + * the required debugging infrastructure rework. + */ + kvm_queue_exception(vcpu, vec); + return 1; +} + +/* + * Trigger machine check on the host. We assume all the MSRs are already set up + * by the CPU and that we still run on the same CPU as the MCE occurred on. + * We pass a fake environment to the machine check handler because we want + * the guest to be always treated like user space, no matter what context + * it used internally. + */ +static void kvm_machine_check(void) +{ +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) + struct pt_regs regs = { + .cs = 3, /* Fake ring 3 no matter what the guest ran on */ + .flags = X86_EFLAGS_IF, + }; + + do_machine_check(®s, 0); +#endif +} + +static int handle_machine_check(struct kvm_vcpu *vcpu) +{ + /* already handled by vcpu_run */ + return 1; +} + +static int handle_exception(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_run *kvm_run = vcpu->run; + u32 intr_info, ex_no, error_code; + unsigned long cr2, rip, dr6; + u32 vect_info; + enum emulation_result er; + + vect_info = vmx->idt_vectoring_info; + intr_info = vmx->exit_intr_info; + + if (is_machine_check(intr_info)) + return handle_machine_check(vcpu); + + if (is_nmi(intr_info)) + return 1; /* already handled by vmx_vcpu_run() */ + + if (is_invalid_opcode(intr_info)) + return handle_ud(vcpu); + + error_code = 0; + if (intr_info & INTR_INFO_DELIVER_CODE_MASK) + error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + + if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { + WARN_ON_ONCE(!enable_vmware_backdoor); + er = kvm_emulate_instruction(vcpu, + EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); + if (er == EMULATE_USER_EXIT) + return 0; + else if (er != EMULATE_DONE) + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); + return 1; + } + + /* + * The #PF with PFEC.RSVD = 1 indicates the guest is accessing + * MMIO, it is better to report an internal error. + * See the comments in vmx_handle_exit. + */ + if ((vect_info & VECTORING_INFO_VALID_MASK) && + !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 3; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + vcpu->run->internal.data[2] = error_code; + return 0; + } + + if (is_page_fault(intr_info)) { + cr2 = vmcs_readl(EXIT_QUALIFICATION); + /* EPT won't cause page fault directly */ + WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); + } + + ex_no = intr_info & INTR_INFO_VECTOR_MASK; + + if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) + return handle_rmode_exception(vcpu, ex_no, error_code); + + switch (ex_no) { + case AC_VECTOR: + kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); + return 1; + case DB_VECTOR: + dr6 = vmcs_readl(EXIT_QUALIFICATION); + if (!(vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= dr6 | DR6_RTM; + if (is_icebp(intr_info)) + skip_emulated_instruction(vcpu); + + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); + /* fall through */ + case BP_VECTOR: + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. + */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + kvm_run->exit_reason = KVM_EXIT_DEBUG; + rip = kvm_rip_read(vcpu); + kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.exception = ex_no; + break; + default: + kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->ex.exception = ex_no; + kvm_run->ex.error_code = error_code; + break; + } + return 0; +} + +static int handle_external_interrupt(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.irq_exits; + return 1; +} + +static int handle_triple_fault(struct kvm_vcpu *vcpu) +{ + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->mmio_needed = 0; + return 0; +} + +static int handle_io(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + int size, in, string; + unsigned port; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + string = (exit_qualification & 16) != 0; + + ++vcpu->stat.io_exits; + + if (string) + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + + port = exit_qualification >> 16; + size = (exit_qualification & 7) + 1; + in = (exit_qualification & 8) != 0; + + return kvm_fast_pio(vcpu, size, port, in); +} + +static void +vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +{ + /* + * Patch in the VMCALL instruction: + */ + hypercall[0] = 0x0f; + hypercall[1] = 0x01; + hypercall[2] = 0xc1; +} + +/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ +static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_guest_mode(vcpu)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long orig_val = val; + + /* + * We get here when L2 changed cr0 in a way that did not change + * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), + * but did change L0 shadowed bits. So we first calculate the + * effective cr0 value that L1 would like to write into the + * hardware. It consists of the L2-owned bits from the new + * value combined with the L1-owned bits from L1's guest_cr0. + */ + val = (val & ~vmcs12->cr0_guest_host_mask) | + (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); + + if (!nested_guest_cr0_valid(vcpu, val)) + return 1; + + if (kvm_set_cr0(vcpu, val)) + return 1; + vmcs_writel(CR0_READ_SHADOW, orig_val); + return 0; + } else { + if (to_vmx(vcpu)->nested.vmxon && + !nested_host_cr0_valid(vcpu, val)) + return 1; + + return kvm_set_cr0(vcpu, val); + } +} + +static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_guest_mode(vcpu)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long orig_val = val; + + /* analogously to handle_set_cr0 */ + val = (val & ~vmcs12->cr4_guest_host_mask) | + (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); + if (kvm_set_cr4(vcpu, val)) + return 1; + vmcs_writel(CR4_READ_SHADOW, orig_val); + return 0; + } else + return kvm_set_cr4(vcpu, val); +} + +static int handle_desc(struct kvm_vcpu *vcpu) +{ + WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; +} + +static int handle_cr(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification, val; + int cr; + int reg; + int err; + int ret; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + cr = exit_qualification & 15; + reg = (exit_qualification >> 8) & 15; + switch ((exit_qualification >> 4) & 3) { + case 0: /* mov to cr */ + val = kvm_register_readl(vcpu, reg); + trace_kvm_cr_write(cr, val); + switch (cr) { + case 0: + err = handle_set_cr0(vcpu, val); + return kvm_complete_insn_gp(vcpu, err); + case 3: + WARN_ON_ONCE(enable_unrestricted_guest); + err = kvm_set_cr3(vcpu, val); + return kvm_complete_insn_gp(vcpu, err); + case 4: + err = handle_set_cr4(vcpu, val); + return kvm_complete_insn_gp(vcpu, err); + case 8: { + u8 cr8_prev = kvm_get_cr8(vcpu); + u8 cr8 = (u8)val; + err = kvm_set_cr8(vcpu, cr8); + ret = kvm_complete_insn_gp(vcpu, err); + if (lapic_in_kernel(vcpu)) + return ret; + if (cr8_prev <= cr8) + return ret; + /* + * TODO: we might be squashing a + * KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + vcpu->run->exit_reason = KVM_EXIT_SET_TPR; + return 0; + } + } + break; + case 2: /* clts */ + WARN_ONCE(1, "Guest should always own CR0.TS"); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); + return kvm_skip_emulated_instruction(vcpu); + case 1: /*mov from cr*/ + switch (cr) { + case 3: + WARN_ON_ONCE(enable_unrestricted_guest); + val = kvm_read_cr3(vcpu); + kvm_register_write(vcpu, reg, val); + trace_kvm_cr_read(cr, val); + return kvm_skip_emulated_instruction(vcpu); + case 8: + val = kvm_get_cr8(vcpu); + kvm_register_write(vcpu, reg, val); + trace_kvm_cr_read(cr, val); + return kvm_skip_emulated_instruction(vcpu); + } + break; + case 3: /* lmsw */ + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); + kvm_lmsw(vcpu, val); + + return kvm_skip_emulated_instruction(vcpu); + default: + break; + } + vcpu->run->exit_reason = 0; + vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", + (int)(exit_qualification >> 4) & 3, cr); + return 0; +} + +static int handle_dr(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + int dr, dr7, reg; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + dr = exit_qualification & DEBUG_REG_ACCESS_NUM; + + /* First, if DR does not exist, trigger UD */ + if (!kvm_require_dr(vcpu, dr)) + return 1; + + /* Do not handle if the CPL > 0, will trigger GP on re-entry */ + if (!kvm_require_cpl(vcpu, 0)) + return 1; + dr7 = vmcs_readl(GUEST_DR7); + if (dr7 & DR7_GD) { + /* + * As the vm-exit takes precedence over the debug trap, we + * need to emulate the latter, either for the host or the + * guest debugging itself. + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { + vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; + vcpu->run->debug.arch.dr7 = dr7; + vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); + vcpu->run->debug.arch.exception = DB_VECTOR; + vcpu->run->exit_reason = KVM_EXIT_DEBUG; + return 0; + } else { + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BD | DR6_RTM; + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + } + + if (vcpu->guest_debug == 0) { + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_MOV_DR_EXITING); + + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + + reg = DEBUG_REG_ACCESS_REG(exit_qualification); + if (exit_qualification & TYPE_MOV_FROM_DR) { + unsigned long val; + + if (kvm_get_dr(vcpu, dr, &val)) + return 1; + kvm_register_write(vcpu, reg, val); + } else + if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) + return 1; + + return kvm_skip_emulated_instruction(vcpu); +} + +static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.dr6; +} + +static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ +} + +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + get_debugreg(vcpu->arch.dr6, 6); + vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); +} + +static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +{ + vmcs_writel(GUEST_DR7, val); +} + +static int handle_cpuid(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_cpuid(vcpu); +} + +static int handle_rdmsr(struct kvm_vcpu *vcpu) +{ + u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + struct msr_data msr_info; + + msr_info.index = ecx; + msr_info.host_initiated = false; + if (vmx_get_msr(vcpu, &msr_info)) { + trace_kvm_msr_read_ex(ecx); + kvm_inject_gp(vcpu, 0); + return 1; + } + + trace_kvm_msr_read(ecx, msr_info.data); + + /* FIXME: handling of bits 32:63 of rax, rdx */ + vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; + vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; + return kvm_skip_emulated_instruction(vcpu); +} + +static int handle_wrmsr(struct kvm_vcpu *vcpu) +{ + struct msr_data msr; + u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) + | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + + msr.data = data; + msr.index = ecx; + msr.host_initiated = false; + if (kvm_set_msr(vcpu, &msr) != 0) { + trace_kvm_msr_write_ex(ecx, data); + kvm_inject_gp(vcpu, 0); + return 1; + } + + trace_kvm_msr_write(ecx, data); + return kvm_skip_emulated_instruction(vcpu); +} + +static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) +{ + kvm_apic_update_ppr(vcpu); + return 1; +} + +static int handle_interrupt_window(struct kvm_vcpu *vcpu) +{ + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_INTR_PENDING); + + kvm_make_request(KVM_REQ_EVENT, vcpu); + + ++vcpu->stat.irq_window_exits; + return 1; +} + +static int handle_halt(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_halt(vcpu); +} + +static int handle_vmcall(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_hypercall(vcpu); +} + +static int handle_invd(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; +} + +static int handle_invlpg(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + kvm_mmu_invlpg(vcpu, exit_qualification); + return kvm_skip_emulated_instruction(vcpu); +} + +static int handle_rdpmc(struct kvm_vcpu *vcpu) +{ + int err; + + err = kvm_rdpmc(vcpu); + return kvm_complete_insn_gp(vcpu, err); +} + +static int handle_wbinvd(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_wbinvd(vcpu); +} + +static int handle_xsetbv(struct kvm_vcpu *vcpu) +{ + u64 new_bv = kvm_read_edx_eax(vcpu); + u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); + + if (kvm_set_xcr(vcpu, index, new_bv) == 0) + return kvm_skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_xsaves(struct kvm_vcpu *vcpu) +{ + kvm_skip_emulated_instruction(vcpu); + WARN(1, "this should never happen\n"); + return 1; +} + +static int handle_xrstors(struct kvm_vcpu *vcpu) +{ + kvm_skip_emulated_instruction(vcpu); + WARN(1, "this should never happen\n"); + return 1; +} + +static int handle_apic_access(struct kvm_vcpu *vcpu) +{ + if (likely(fasteoi)) { + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + int access_type, offset; + + access_type = exit_qualification & APIC_ACCESS_TYPE; + offset = exit_qualification & APIC_ACCESS_OFFSET; + /* + * Sane guest uses MOV to write EOI, with written value + * not cared. So make a short-circuit here by avoiding + * heavy instruction emulation. + */ + if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && + (offset == APIC_EOI)) { + kvm_lapic_set_eoi(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + } + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; +} + +static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + int vector = exit_qualification & 0xff; + + /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ + kvm_apic_set_eoi_accelerated(vcpu, vector); + return 1; +} + +static int handle_apic_write(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 offset = exit_qualification & 0xfff; + + /* APIC-write VM exit is trap-like and thus no need to adjust IP */ + kvm_apic_write_nodecode(vcpu, offset); + return 1; +} + +static int handle_task_switch(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qualification; + bool has_error_code = false; + u32 error_code = 0; + u16 tss_selector; + int reason, type, idt_v, idt_index; + + idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); + type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + reason = (u32)exit_qualification >> 30; + if (reason == TASK_SWITCH_GATE && idt_v) { + switch (type) { + case INTR_TYPE_NMI_INTR: + vcpu->arch.nmi_injected = false; + vmx_set_nmi_mask(vcpu, true); + break; + case INTR_TYPE_EXT_INTR: + case INTR_TYPE_SOFT_INTR: + kvm_clear_interrupt_queue(vcpu); + break; + case INTR_TYPE_HARD_EXCEPTION: + if (vmx->idt_vectoring_info & + VECTORING_INFO_DELIVER_CODE_MASK) { + has_error_code = true; + error_code = + vmcs_read32(IDT_VECTORING_ERROR_CODE); + } + /* fall through */ + case INTR_TYPE_SOFT_EXCEPTION: + kvm_clear_exception_queue(vcpu); + break; + default: + break; + } + } + tss_selector = exit_qualification; + + if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && + type != INTR_TYPE_EXT_INTR && + type != INTR_TYPE_NMI_INTR)) + skip_emulated_instruction(vcpu); + + if (kvm_task_switch(vcpu, tss_selector, + type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, + has_error_code, error_code) == EMULATE_FAIL) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + /* + * TODO: What about debug traps on tss switch? + * Are we supposed to inject them and update dr6? + */ + + return 1; +} + +static int handle_ept_violation(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + gpa_t gpa; + u64 error_code; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + /* + * EPT violation happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + * There are errata that may cause this bit to not be set: + * AAK134, BY25. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + enable_vnmi && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + trace_kvm_page_fault(gpa, exit_qualification); + + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & + (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | + EPT_VIOLATION_EXECUTABLE)) + ? PFERR_PRESENT_MASK : 0; + + error_code |= (exit_qualification & 0x100) != 0 ? + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; + + vcpu->arch.exit_qualification = exit_qualification; + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); +} + +static int handle_ept_misconfig(struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + + /* + * A nested guest cannot optimize MMIO vmexits, because we have an + * nGPA here instead of the required GPA. + */ + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + if (!is_guest_mode(vcpu) && + !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + trace_kvm_fast_mmio(gpa); + /* + * Doing kvm_skip_emulated_instruction() depends on undefined + * behavior: Intel's manual doesn't mandate + * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG + * occurs and while on real hardware it was observed to be set, + * other hypervisors (namely Hyper-V) don't set it, we end up + * advancing IP with some random value. Disable fast mmio when + * running nested and keep it for real hardware in hope that + * VM_EXIT_INSTRUCTION_LEN will always be set correctly. + */ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + return kvm_skip_emulated_instruction(vcpu); + else + return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == + EMULATE_DONE; + } + + return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); +} + +static int handle_nmi_window(struct kvm_vcpu *vcpu) +{ + WARN_ON_ONCE(!enable_vnmi); + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_VIRTUAL_NMI_PENDING); + ++vcpu->stat.nmi_window_exits; + kvm_make_request(KVM_REQ_EVENT, vcpu); + + return 1; +} + +static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + enum emulation_result err = EMULATE_DONE; + int ret = 1; + u32 cpu_exec_ctrl; + bool intr_window_requested; + unsigned count = 130; + + /* + * We should never reach the point where we are emulating L2 + * due to invalid guest state as that means we incorrectly + * allowed a nested VMEntry with an invalid vmcs12. + */ + WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); + + cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; + + while (vmx->emulation_required && count-- != 0) { + if (intr_window_requested && vmx_interrupt_allowed(vcpu)) + return handle_interrupt_window(&vmx->vcpu); + + if (kvm_test_request(KVM_REQ_EVENT, vcpu)) + return 1; + + err = kvm_emulate_instruction(vcpu, 0); + + if (err == EMULATE_USER_EXIT) { + ++vcpu->stat.mmio_exits; + ret = 0; + goto out; + } + + if (err != EMULATE_DONE) + goto emulation_error; + + if (vmx->emulation_required && !vmx->rmode.vm86_active && + vcpu->arch.exception.pending) + goto emulation_error; + + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; + ret = kvm_vcpu_halt(vcpu); + goto out; + } + + if (signal_pending(current)) + goto out; + if (need_resched()) + schedule(); + } + +out: + return ret; + +emulation_error: + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; +} + +static void grow_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int old = vmx->ple_window; + + vmx->ple_window = __grow_ple_window(old, ple_window, + ple_window_grow, + ple_window_max); + + if (vmx->ple_window != old) + vmx->ple_window_dirty = true; + + trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); +} + +static void shrink_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int old = vmx->ple_window; + + vmx->ple_window = __shrink_ple_window(old, ple_window, + ple_window_shrink, + ple_window); + + if (vmx->ple_window != old) + vmx->ple_window_dirty = true; + + trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); +} + +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +static void wakeup_handler(void) +{ + struct kvm_vcpu *vcpu; + int cpu = smp_processor_id(); + + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), + blocked_vcpu_list) { + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (pi_test_on(pi_desc) == 1) + kvm_vcpu_kick(vcpu); + } + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + +static void vmx_enable_tdp(void) +{ + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, + enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, + VMX_EPT_RWX_MASK, 0ull); + + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); +} + +static __init int hardware_setup(void) +{ + unsigned long host_bndcfgs; + int r = -ENOMEM, i; + + rdmsrl_safe(MSR_EFER, &host_efer); + + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) + kvm_define_shared_msr(i, vmx_msr_index[i]); + + if (setup_vmcs_config(&vmcs_config) < 0) + return -EIO; + + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + + if (boot_cpu_has(X86_FEATURE_MPX)) { + rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + } + + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || + !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) + enable_vpid = 0; + + if (!cpu_has_vmx_ept() || + !cpu_has_vmx_ept_4levels() || + !cpu_has_vmx_ept_mt_wb() || + !cpu_has_vmx_invept_global()) + enable_ept = 0; + + if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) + enable_ept_ad_bits = 0; + + if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) + enable_unrestricted_guest = 0; + + if (!cpu_has_vmx_flexpriority()) + flexpriority_enabled = 0; + + if (!cpu_has_virtual_nmis()) + enable_vnmi = 0; + + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. No need to do anything if not + * using the APIC_ACCESS_ADDR VMCS field. + */ + if (!flexpriority_enabled) + kvm_x86_ops->set_apic_access_page_addr = NULL; + + if (!cpu_has_vmx_tpr_shadow()) + kvm_x86_ops->update_cr8_intercept = NULL; + + if (enable_ept && !cpu_has_vmx_ept_2m_page()) + kvm_disable_largepages(); + +#if IS_ENABLED(CONFIG_HYPERV) + if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH + && enable_ept) + kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; +#endif + + if (!cpu_has_vmx_ple()) { + ple_gap = 0; + ple_window = 0; + ple_window_grow = 0; + ple_window_max = 0; + ple_window_shrink = 0; + } + + if (!cpu_has_vmx_apicv()) { + enable_apicv = 0; + kvm_x86_ops->sync_pir_to_irr = NULL; + } + + if (cpu_has_vmx_tsc_scaling()) { + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_tsc_scaling_ratio_frac_bits = 48; + } + + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + + if (enable_ept) + vmx_enable_tdp(); + else + kvm_disable_tdp(); + + if (!nested) { + kvm_x86_ops->get_nested_state = NULL; + kvm_x86_ops->set_nested_state = NULL; + } + + /* + * Only enable PML when hardware supports PML feature, and both EPT + * and EPT A/D bit features are enabled -- PML depends on them to work. + */ + if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) + enable_pml = 0; + + if (!enable_pml) { + kvm_x86_ops->slot_enable_log_dirty = NULL; + kvm_x86_ops->slot_disable_log_dirty = NULL; + kvm_x86_ops->flush_log_dirty = NULL; + kvm_x86_ops->enable_log_dirty_pt_masked = NULL; + } + + if (!cpu_has_vmx_preemption_timer()) + kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; + + if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + cpu_preemption_timer_multi = + vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + } else { + kvm_x86_ops->set_hv_timer = NULL; + kvm_x86_ops->cancel_hv_timer = NULL; + } + + if (!cpu_has_vmx_shadow_vmcs() || !nested) + enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) { + vmx_bitmap[i] = (unsigned long *) + __get_free_page(GFP_KERNEL); + if (!vmx_bitmap[i]) + goto out; + } + + init_vmcs_shadow_fields(); + } + + kvm_set_posted_intr_wakeup_handler(wakeup_handler); + nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv); + + kvm_mce_cap_supported |= MCG_LMCE_P; + + r = alloc_kvm_area(); + if (r) + goto out; + return 0; + +out: + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); + } + return r; +} + +static __exit void hardware_unsetup(void) +{ + int i; + + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); + } + + free_kvm_area(); +} + +/* + * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE + * exiting, so only get here on cpu with PAUSE-Loop-Exiting. + */ +static int handle_pause(struct kvm_vcpu *vcpu) +{ + if (!kvm_pause_in_guest(vcpu->kvm)) + grow_ple_window(vcpu); + + /* + * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" + * VM-execution control is ignored if CPL > 0. OTOH, KVM + * never set PAUSE_EXITING and just set PLE if supported, + * so the vcpu must be CPL=0 if it gets a PAUSE exit. + */ + kvm_vcpu_on_spin(vcpu, true); + return kvm_skip_emulated_instruction(vcpu); +} + +static int handle_nop(struct kvm_vcpu *vcpu) +{ + return kvm_skip_emulated_instruction(vcpu); +} + +static int handle_mwait(struct kvm_vcpu *vcpu) +{ + printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); + return handle_nop(vcpu); +} + +static int handle_invalid_op(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + +static int handle_monitor_trap(struct kvm_vcpu *vcpu) +{ + return 1; +} + +static int handle_monitor(struct kvm_vcpu *vcpu) +{ + printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); + return handle_nop(vcpu); +} + +/* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction (as specified + * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated + * instruction. + */ +static int nested_vmx_succeed(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); + return kvm_skip_emulated_instruction(vcpu); +} + +static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_CF); + return kvm_skip_emulated_instruction(vcpu); +} + +static int nested_vmx_failValid(struct kvm_vcpu *vcpu, + u32 vm_instruction_error) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * failValid writes the error number to the current VMCS, which + * can't be done if there isn't a current VMCS. + */ + if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) + return nested_vmx_failInvalid(vcpu); + + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_ZF); + get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; + /* + * We don't need to force a shadow sync because + * VM_INSTRUCTION_ERROR is not shadowed + */ + return kvm_skip_emulated_instruction(vcpu); +} + +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) +{ + /* TODO: not to reset guest simply here. */ + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); +} + +static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) +{ + struct vcpu_vmx *vmx = + container_of(timer, struct vcpu_vmx, nested.preemption_timer); + + vmx->nested.preemption_timer_expired = true; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + kvm_vcpu_kick(&vmx->vcpu); + + return HRTIMER_NORESTART; +} + +/* + * Decode the memory-address operand of a vmx instruction, as recorded on an + * exit caused by such an instruction (run by a guest hypervisor). + * On success, returns 0. When the operand is invalid, returns 1 and throws + * #UD or #GP. + */ +static int get_vmx_mem_address(struct kvm_vcpu *vcpu, + unsigned long exit_qualification, + u32 vmx_instruction_info, bool wr, gva_t *ret) +{ + gva_t off; + bool exn; + struct kvm_segment s; + + /* + * According to Vol. 3B, "Information for VM Exits Due to Instruction + * Execution", on an exit, vmx_instruction_info holds most of the + * addressing components of the operand. Only the displacement part + * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). + * For how an actual address is calculated from all these components, + * refer to Vol. 1, "Operand Addressing". + */ + int scaling = vmx_instruction_info & 3; + int addr_size = (vmx_instruction_info >> 7) & 7; + bool is_reg = vmx_instruction_info & (1u << 10); + int seg_reg = (vmx_instruction_info >> 15) & 7; + int index_reg = (vmx_instruction_info >> 18) & 0xf; + bool index_is_valid = !(vmx_instruction_info & (1u << 22)); + int base_reg = (vmx_instruction_info >> 23) & 0xf; + bool base_is_valid = !(vmx_instruction_info & (1u << 27)); + + if (is_reg) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* Addr = segment_base + offset */ + /* offset = base + [index * scale] + displacement */ + off = exit_qualification; /* holds the displacement */ + if (base_is_valid) + off += kvm_register_read(vcpu, base_reg); + if (index_is_valid) + off += kvm_register_read(vcpu, index_reg)< s.limit); + } + if (exn) { + kvm_queue_exception_e(vcpu, + seg_reg == VCPU_SREG_SS ? + SS_VECTOR : GP_VECTOR, + 0); + return 1; + } + + return 0; +} + +static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) +{ + gva_t gva; + struct x86_exception e; + + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) + return 1; + + if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + return 0; +} + +/* + * Allocate a shadow VMCS and associate it with the currently loaded + * VMCS, unless such a shadow VMCS already exists. The newly allocated + * VMCS is also VMCLEARed, so that it is ready for use. + */ +static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; + + /* + * We should allocate a shadow vmcs for vmcs01 only when L1 + * executes VMXON and free it when L1 executes VMXOFF. + * As it is invalid to execute VMXON twice, we shouldn't reach + * here when vmcs01 already have an allocated shadow vmcs. + */ + WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); + + if (!loaded_vmcs->shadow_vmcs) { + loaded_vmcs->shadow_vmcs = alloc_vmcs(true); + if (loaded_vmcs->shadow_vmcs) + vmcs_clear(loaded_vmcs->shadow_vmcs); + } + return loaded_vmcs->shadow_vmcs; +} + +static int enter_vmx_operation(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int r; + + r = alloc_loaded_vmcs(&vmx->nested.vmcs02); + if (r < 0) + goto out_vmcs02; + + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_vmcs12) + goto out_cached_vmcs12; + + vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_shadow_vmcs12) + goto out_cached_shadow_vmcs12; + + if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) + goto out_shadow_vmcs; + + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + + vmx->nested.vpid02 = allocate_vpid(); + + vmx->nested.vmcs02_initialized = false; + vmx->nested.vmxon = true; + return 0; + +out_shadow_vmcs: + kfree(vmx->nested.cached_shadow_vmcs12); + +out_cached_shadow_vmcs12: + kfree(vmx->nested.cached_vmcs12); + +out_cached_vmcs12: + free_loaded_vmcs(&vmx->nested.vmcs02); + +out_vmcs02: + return -ENOMEM; +} + +/* + * Emulate the VMXON instruction. + * Currently, we just remember that VMX is active, and do not save or even + * inspect the argument to VMXON (the so-called "VMXON pointer") because we + * do not currently need to store anything in that guest-allocated memory + * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their + * argument is different from the VMXON pointer (which the spec says they do). + */ +static int handle_vmon(struct kvm_vcpu *vcpu) +{ + int ret; + gpa_t vmptr; + struct page *page; + struct vcpu_vmx *vmx = to_vmx(vcpu); + const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED + | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + + /* + * The Intel VMX Instruction Reference lists a bunch of bits that are + * prerequisite to running VMXON, most notably cr4.VMXE must be set to + * 1 (see vmx_set_cr4() for when we allow the guest to set this). + * Otherwise, we should fail with #UD. But most faulting conditions + * have already been checked by hardware, prior to the VM-exit for + * VMXON. We do test guest cr4.VMXE because processor CR4 always has + * that bit set to 1 in non-root mode. + */ + if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* CPL=0 must be checked manually. */ + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (vmx->nested.vmxon) + return nested_vmx_failValid(vcpu, + VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (nested_vmx_get_vmptr(vcpu, &vmptr)) + return 1; + + /* + * SDM 3: 24.11.5 + * The first 4 bytes of VMXON region contain the supported + * VMCS revision identifier + * + * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; + * which replaces physical address width with 32 + */ + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + return nested_vmx_failInvalid(vcpu); + + page = kvm_vcpu_gpa_to_page(vcpu, vmptr); + if (is_error_page(page)) + return nested_vmx_failInvalid(vcpu); + + if (*(u32 *)kmap(page) != VMCS12_REVISION) { + kunmap(page); + kvm_release_page_clean(page); + return nested_vmx_failInvalid(vcpu); + } + kunmap(page); + kvm_release_page_clean(page); + + vmx->nested.vmxon_ptr = vmptr; + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; + + return nested_vmx_succeed(vcpu); +} + +/* + * Intel's VMX Instruction Reference specifies a common set of prerequisites + * for running VMX instructions (except VMXON, whose prerequisites are + * slightly different). It also specifies what exception to inject otherwise. + * Note that many of these exceptions have priority over VM exits, so they + * don't have to be checked again here. + */ +static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) +{ + if (!to_vmx(vcpu)->nested.vmxon) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); + return 0; + } + + return 1; +} + +static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) +{ + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, -1ull); +} + +static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!vmx->nested.hv_evmcs) + return; + + kunmap(vmx->nested.hv_evmcs_page); + kvm_release_page_dirty(vmx->nested.hv_evmcs_page); + vmx->nested.hv_evmcs_vmptr = -1ull; + vmx->nested.hv_evmcs_page = NULL; + vmx->nested.hv_evmcs = NULL; +} + +static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->nested.current_vmptr == -1ull) + return; + + if (enable_shadow_vmcs) { + /* copy to memory all shadowed fields in case + they were modified */ + copy_shadow_to_vmcs12(vmx); + vmx->nested.need_vmcs12_sync = false; + vmx_disable_shadow_vmcs(vmx); + } + vmx->nested.posted_intr_nv = -1; + + /* Flush VMCS12 to guest memory */ + kvm_vcpu_write_guest_page(vcpu, + vmx->nested.current_vmptr >> PAGE_SHIFT, + vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); + + kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + + vmx->nested.current_vmptr = -1ull; +} + +/* + * Free whatever needs to be freed from vmx->nested when L1 goes down, or + * just stops using VMX. + */ +static void free_nested(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) + return; + + vmx->nested.vmxon = false; + vmx->nested.smm.vmxon = false; + free_vpid(vmx->nested.vpid02); + vmx->nested.posted_intr_nv = -1; + vmx->nested.current_vmptr = -1ull; + if (enable_shadow_vmcs) { + vmx_disable_shadow_vmcs(vmx); + vmcs_clear(vmx->vmcs01.shadow_vmcs); + free_vmcs(vmx->vmcs01.shadow_vmcs); + vmx->vmcs01.shadow_vmcs = NULL; + } + kfree(vmx->nested.cached_vmcs12); + kfree(vmx->nested.cached_shadow_vmcs12); + /* Unpin physical memory we referred to in the vmcs02 */ + if (vmx->nested.apic_access_page) { + kvm_release_page_dirty(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = NULL; + } + if (vmx->nested.virtual_apic_page) { + kvm_release_page_dirty(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; + } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + kvm_release_page_dirty(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; + } + + kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + + nested_release_evmcs(vcpu); + + free_loaded_vmcs(&vmx->nested.vmcs02); +} + +/* Emulate the VMXOFF instruction */ +static int handle_vmoff(struct kvm_vcpu *vcpu) +{ + if (!nested_vmx_check_permission(vcpu)) + return 1; + free_nested(vcpu); + return nested_vmx_succeed(vcpu); +} + +/* Emulate the VMCLEAR instruction */ +static int handle_vmclear(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 zero = 0; + gpa_t vmptr; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (nested_vmx_get_vmptr(vcpu, &vmptr)) + return 1; + + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + return nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_INVALID_ADDRESS); + + if (vmptr == vmx->nested.vmxon_ptr) + return nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); + + if (vmx->nested.hv_evmcs_page) { + if (vmptr == vmx->nested.hv_evmcs_vmptr) + nested_release_evmcs(vcpu); + } else { + if (vmptr == vmx->nested.current_vmptr) + nested_release_vmcs12(vcpu); + + kvm_vcpu_write_guest(vcpu, + vmptr + offsetof(struct vmcs12, + launch_state), + &zero, sizeof(zero)); + } + + return nested_vmx_succeed(vcpu); +} + +static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); + +/* Emulate the VMLAUNCH instruction */ +static int handle_vmlaunch(struct kvm_vcpu *vcpu) +{ + return nested_vmx_run(vcpu, true); +} + +/* Emulate the VMRESUME instruction */ +static int handle_vmresume(struct kvm_vcpu *vcpu) +{ + + return nested_vmx_run(vcpu, false); +} + +/* + * Read a vmcs12 field. Since these can have varying lengths and we return + * one type, we chose the biggest type (u64) and zero-extend the return value + * to that size. Note that the caller, handle_vmread, might need to use only + * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of + * 64-bit fields are to be returned). + */ +static inline int vmcs12_read_any(struct vmcs12 *vmcs12, + unsigned long field, u64 *ret) +{ + short offset = vmcs_field_to_offset(field); + char *p; + + if (offset < 0) + return offset; + + p = (char *)vmcs12 + offset; + + switch (vmcs_field_width(field)) { + case VMCS_FIELD_WIDTH_NATURAL_WIDTH: + *ret = *((natural_width *)p); + return 0; + case VMCS_FIELD_WIDTH_U16: + *ret = *((u16 *)p); + return 0; + case VMCS_FIELD_WIDTH_U32: + *ret = *((u32 *)p); + return 0; + case VMCS_FIELD_WIDTH_U64: + *ret = *((u64 *)p); + return 0; + default: + WARN_ON(1); + return -ENOENT; + } +} + + +static inline int vmcs12_write_any(struct vmcs12 *vmcs12, + unsigned long field, u64 field_value){ + short offset = vmcs_field_to_offset(field); + char *p = (char *)vmcs12 + offset; + if (offset < 0) + return offset; + + switch (vmcs_field_width(field)) { + case VMCS_FIELD_WIDTH_U16: + *(u16 *)p = field_value; + return 0; + case VMCS_FIELD_WIDTH_U32: + *(u32 *)p = field_value; + return 0; + case VMCS_FIELD_WIDTH_U64: + *(u64 *)p = field_value; + return 0; + case VMCS_FIELD_WIDTH_NATURAL_WIDTH: + *(natural_width *)p = field_value; + return 0; + default: + WARN_ON(1); + return -ENOENT; + } + +} + +static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) +{ + struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + + /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ + vmcs12->tpr_threshold = evmcs->tpr_threshold; + vmcs12->guest_rip = evmcs->guest_rip; + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { + vmcs12->guest_rsp = evmcs->guest_rsp; + vmcs12->guest_rflags = evmcs->guest_rflags; + vmcs12->guest_interruptibility_info = + evmcs->guest_interruptibility_info; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { + vmcs12->cpu_based_vm_exec_control = + evmcs->cpu_based_vm_exec_control; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { + vmcs12->exception_bitmap = evmcs->exception_bitmap; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { + vmcs12->vm_entry_controls = evmcs->vm_entry_controls; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { + vmcs12->vm_entry_intr_info_field = + evmcs->vm_entry_intr_info_field; + vmcs12->vm_entry_exception_error_code = + evmcs->vm_entry_exception_error_code; + vmcs12->vm_entry_instruction_len = + evmcs->vm_entry_instruction_len; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { + vmcs12->host_ia32_pat = evmcs->host_ia32_pat; + vmcs12->host_ia32_efer = evmcs->host_ia32_efer; + vmcs12->host_cr0 = evmcs->host_cr0; + vmcs12->host_cr3 = evmcs->host_cr3; + vmcs12->host_cr4 = evmcs->host_cr4; + vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; + vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; + vmcs12->host_rip = evmcs->host_rip; + vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; + vmcs12->host_es_selector = evmcs->host_es_selector; + vmcs12->host_cs_selector = evmcs->host_cs_selector; + vmcs12->host_ss_selector = evmcs->host_ss_selector; + vmcs12->host_ds_selector = evmcs->host_ds_selector; + vmcs12->host_fs_selector = evmcs->host_fs_selector; + vmcs12->host_gs_selector = evmcs->host_gs_selector; + vmcs12->host_tr_selector = evmcs->host_tr_selector; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { + vmcs12->pin_based_vm_exec_control = + evmcs->pin_based_vm_exec_control; + vmcs12->vm_exit_controls = evmcs->vm_exit_controls; + vmcs12->secondary_vm_exec_control = + evmcs->secondary_vm_exec_control; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { + vmcs12->io_bitmap_a = evmcs->io_bitmap_a; + vmcs12->io_bitmap_b = evmcs->io_bitmap_b; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { + vmcs12->msr_bitmap = evmcs->msr_bitmap; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { + vmcs12->guest_es_base = evmcs->guest_es_base; + vmcs12->guest_cs_base = evmcs->guest_cs_base; + vmcs12->guest_ss_base = evmcs->guest_ss_base; + vmcs12->guest_ds_base = evmcs->guest_ds_base; + vmcs12->guest_fs_base = evmcs->guest_fs_base; + vmcs12->guest_gs_base = evmcs->guest_gs_base; + vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; + vmcs12->guest_tr_base = evmcs->guest_tr_base; + vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; + vmcs12->guest_idtr_base = evmcs->guest_idtr_base; + vmcs12->guest_es_limit = evmcs->guest_es_limit; + vmcs12->guest_cs_limit = evmcs->guest_cs_limit; + vmcs12->guest_ss_limit = evmcs->guest_ss_limit; + vmcs12->guest_ds_limit = evmcs->guest_ds_limit; + vmcs12->guest_fs_limit = evmcs->guest_fs_limit; + vmcs12->guest_gs_limit = evmcs->guest_gs_limit; + vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; + vmcs12->guest_tr_limit = evmcs->guest_tr_limit; + vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; + vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; + vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; + vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; + vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; + vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; + vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; + vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; + vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; + vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; + vmcs12->guest_es_selector = evmcs->guest_es_selector; + vmcs12->guest_cs_selector = evmcs->guest_cs_selector; + vmcs12->guest_ss_selector = evmcs->guest_ss_selector; + vmcs12->guest_ds_selector = evmcs->guest_ds_selector; + vmcs12->guest_fs_selector = evmcs->guest_fs_selector; + vmcs12->guest_gs_selector = evmcs->guest_gs_selector; + vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; + vmcs12->guest_tr_selector = evmcs->guest_tr_selector; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { + vmcs12->tsc_offset = evmcs->tsc_offset; + vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; + vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { + vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; + vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; + vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; + vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; + vmcs12->guest_cr0 = evmcs->guest_cr0; + vmcs12->guest_cr3 = evmcs->guest_cr3; + vmcs12->guest_cr4 = evmcs->guest_cr4; + vmcs12->guest_dr7 = evmcs->guest_dr7; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { + vmcs12->host_fs_base = evmcs->host_fs_base; + vmcs12->host_gs_base = evmcs->host_gs_base; + vmcs12->host_tr_base = evmcs->host_tr_base; + vmcs12->host_gdtr_base = evmcs->host_gdtr_base; + vmcs12->host_idtr_base = evmcs->host_idtr_base; + vmcs12->host_rsp = evmcs->host_rsp; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { + vmcs12->ept_pointer = evmcs->ept_pointer; + vmcs12->virtual_processor_id = evmcs->virtual_processor_id; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { + vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; + vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; + vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; + vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; + vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; + vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; + vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; + vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; + vmcs12->guest_pending_dbg_exceptions = + evmcs->guest_pending_dbg_exceptions; + vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; + vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; + vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; + vmcs12->guest_activity_state = evmcs->guest_activity_state; + vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; + } + + /* + * Not used? + * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; + * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; + * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; + * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; + * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; + * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; + * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; + * vmcs12->page_fault_error_code_mask = + * evmcs->page_fault_error_code_mask; + * vmcs12->page_fault_error_code_match = + * evmcs->page_fault_error_code_match; + * vmcs12->cr3_target_count = evmcs->cr3_target_count; + * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; + * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; + * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; + */ + + /* + * Read only fields: + * vmcs12->guest_physical_address = evmcs->guest_physical_address; + * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; + * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; + * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; + * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; + * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; + * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; + * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; + * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; + * vmcs12->exit_qualification = evmcs->exit_qualification; + * vmcs12->guest_linear_address = evmcs->guest_linear_address; + * + * Not present in struct vmcs12: + * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; + * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; + * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; + * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; + */ + + return 0; +} + +static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) +{ + struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + + /* + * Should not be changed by KVM: + * + * evmcs->host_es_selector = vmcs12->host_es_selector; + * evmcs->host_cs_selector = vmcs12->host_cs_selector; + * evmcs->host_ss_selector = vmcs12->host_ss_selector; + * evmcs->host_ds_selector = vmcs12->host_ds_selector; + * evmcs->host_fs_selector = vmcs12->host_fs_selector; + * evmcs->host_gs_selector = vmcs12->host_gs_selector; + * evmcs->host_tr_selector = vmcs12->host_tr_selector; + * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; + * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; + * evmcs->host_cr0 = vmcs12->host_cr0; + * evmcs->host_cr3 = vmcs12->host_cr3; + * evmcs->host_cr4 = vmcs12->host_cr4; + * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; + * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; + * evmcs->host_rip = vmcs12->host_rip; + * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; + * evmcs->host_fs_base = vmcs12->host_fs_base; + * evmcs->host_gs_base = vmcs12->host_gs_base; + * evmcs->host_tr_base = vmcs12->host_tr_base; + * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; + * evmcs->host_idtr_base = vmcs12->host_idtr_base; + * evmcs->host_rsp = vmcs12->host_rsp; + * sync_vmcs12() doesn't read these: + * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; + * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; + * evmcs->msr_bitmap = vmcs12->msr_bitmap; + * evmcs->ept_pointer = vmcs12->ept_pointer; + * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; + * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; + * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; + * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; + * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; + * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; + * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; + * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; + * evmcs->tpr_threshold = vmcs12->tpr_threshold; + * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; + * evmcs->exception_bitmap = vmcs12->exception_bitmap; + * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; + * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; + * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; + * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; + * evmcs->page_fault_error_code_mask = + * vmcs12->page_fault_error_code_mask; + * evmcs->page_fault_error_code_match = + * vmcs12->page_fault_error_code_match; + * evmcs->cr3_target_count = vmcs12->cr3_target_count; + * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; + * evmcs->tsc_offset = vmcs12->tsc_offset; + * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; + * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; + * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; + * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; + * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; + * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; + * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; + * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; + * + * Not present in struct vmcs12: + * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; + * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; + * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; + * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; + */ + + evmcs->guest_es_selector = vmcs12->guest_es_selector; + evmcs->guest_cs_selector = vmcs12->guest_cs_selector; + evmcs->guest_ss_selector = vmcs12->guest_ss_selector; + evmcs->guest_ds_selector = vmcs12->guest_ds_selector; + evmcs->guest_fs_selector = vmcs12->guest_fs_selector; + evmcs->guest_gs_selector = vmcs12->guest_gs_selector; + evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; + evmcs->guest_tr_selector = vmcs12->guest_tr_selector; + + evmcs->guest_es_limit = vmcs12->guest_es_limit; + evmcs->guest_cs_limit = vmcs12->guest_cs_limit; + evmcs->guest_ss_limit = vmcs12->guest_ss_limit; + evmcs->guest_ds_limit = vmcs12->guest_ds_limit; + evmcs->guest_fs_limit = vmcs12->guest_fs_limit; + evmcs->guest_gs_limit = vmcs12->guest_gs_limit; + evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; + evmcs->guest_tr_limit = vmcs12->guest_tr_limit; + evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; + evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; + + evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; + evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; + evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; + evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; + evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; + evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; + evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; + evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; + + evmcs->guest_es_base = vmcs12->guest_es_base; + evmcs->guest_cs_base = vmcs12->guest_cs_base; + evmcs->guest_ss_base = vmcs12->guest_ss_base; + evmcs->guest_ds_base = vmcs12->guest_ds_base; + evmcs->guest_fs_base = vmcs12->guest_fs_base; + evmcs->guest_gs_base = vmcs12->guest_gs_base; + evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; + evmcs->guest_tr_base = vmcs12->guest_tr_base; + evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; + evmcs->guest_idtr_base = vmcs12->guest_idtr_base; + + evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; + evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; + + evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; + evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; + evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; + evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; + + evmcs->guest_pending_dbg_exceptions = + vmcs12->guest_pending_dbg_exceptions; + evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; + evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; + + evmcs->guest_activity_state = vmcs12->guest_activity_state; + evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; + + evmcs->guest_cr0 = vmcs12->guest_cr0; + evmcs->guest_cr3 = vmcs12->guest_cr3; + evmcs->guest_cr4 = vmcs12->guest_cr4; + evmcs->guest_dr7 = vmcs12->guest_dr7; + + evmcs->guest_physical_address = vmcs12->guest_physical_address; + + evmcs->vm_instruction_error = vmcs12->vm_instruction_error; + evmcs->vm_exit_reason = vmcs12->vm_exit_reason; + evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; + evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; + evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; + evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; + evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; + evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; + + evmcs->exit_qualification = vmcs12->exit_qualification; + + evmcs->guest_linear_address = vmcs12->guest_linear_address; + evmcs->guest_rsp = vmcs12->guest_rsp; + evmcs->guest_rflags = vmcs12->guest_rflags; + + evmcs->guest_interruptibility_info = + vmcs12->guest_interruptibility_info; + evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; + evmcs->vm_entry_controls = vmcs12->vm_entry_controls; + evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; + evmcs->vm_entry_exception_error_code = + vmcs12->vm_entry_exception_error_code; + evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; + + evmcs->guest_rip = vmcs12->guest_rip; + + evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; + + return 0; +} + +/* + * Copy the writable VMCS shadow fields back to the VMCS12, in case + * they have been modified by the L1 guest. Note that the "read-only" + * VM-exit information fields are actually writable if the vCPU is + * configured to support "VMWRITE to any supported field in the VMCS." + */ +static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) +{ + const u16 *fields[] = { + shadow_read_write_fields, + shadow_read_only_fields + }; + const int max_fields[] = { + max_shadow_read_write_fields, + max_shadow_read_only_fields + }; + int i, q; + unsigned long field; + u64 field_value; + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + + preempt_disable(); + + vmcs_load(shadow_vmcs); + + for (q = 0; q < ARRAY_SIZE(fields); q++) { + for (i = 0; i < max_fields[q]; i++) { + field = fields[q][i]; + field_value = __vmcs_readl(field); + vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); + } + /* + * Skip the VM-exit information fields if they are read-only. + */ + if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) + break; + } + + vmcs_clear(shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); + + preempt_enable(); +} + +static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) +{ + const u16 *fields[] = { + shadow_read_write_fields, + shadow_read_only_fields + }; + const int max_fields[] = { + max_shadow_read_write_fields, + max_shadow_read_only_fields + }; + int i, q; + unsigned long field; + u64 field_value = 0; + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + + vmcs_load(shadow_vmcs); + + for (q = 0; q < ARRAY_SIZE(fields); q++) { + for (i = 0; i < max_fields[q]; i++) { + field = fields[q][i]; + vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); + __vmcs_writel(field, field_value); + } + } + + vmcs_clear(shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); +} + +static int handle_vmread(struct kvm_vcpu *vcpu) +{ + unsigned long field; + u64 field_value; + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + gva_t gva = 0; + struct vmcs12 *vmcs12; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (to_vmx(vcpu)->nested.current_vmptr == -1ull) + return nested_vmx_failInvalid(vcpu); + + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMREAD + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) + return nested_vmx_failInvalid(vcpu); + vmcs12 = get_shadow_vmcs12(vcpu); + } + + /* Decode instruction info and find the field to read */ + field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + /* Read the field, zero-extended to a u64 field_value */ + if (vmcs12_read_any(vmcs12, field, &field_value) < 0) + return nested_vmx_failValid(vcpu, + VMXERR_UNSUPPORTED_VMCS_COMPONENT); + + /* + * Now copy part of this value to register or memory, as requested. + * Note that the number of bits actually copied is 32 or 64 depending + * on the guest's mode (32 or 64 bit), not on the given field's length. + */ + if (vmx_instruction_info & (1u << 10)) { + kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), + field_value); + } else { + if (get_vmx_mem_address(vcpu, exit_qualification, + vmx_instruction_info, true, &gva)) + return 1; + /* _system ok, nested_vmx_check_permission has verified cpl=0 */ + kvm_write_guest_virt_system(vcpu, gva, &field_value, + (is_long_mode(vcpu) ? 8 : 4), NULL); + } + + return nested_vmx_succeed(vcpu); +} + + +static int handle_vmwrite(struct kvm_vcpu *vcpu) +{ + unsigned long field; + gva_t gva; + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + + /* The value to write might be 32 or 64 bits, depending on L1's long + * mode, and eventually we need to write that into a field of several + * possible lengths. The code below first zero-extends the value to 64 + * bit (field_value), and then copies only the appropriate number of + * bits into the vmcs12 field. + */ + u64 field_value = 0; + struct x86_exception e; + struct vmcs12 *vmcs12; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (vmx->nested.current_vmptr == -1ull) + return nested_vmx_failInvalid(vcpu); + + if (vmx_instruction_info & (1u << 10)) + field_value = kvm_register_readl(vcpu, + (((vmx_instruction_info) >> 3) & 0xf)); + else { + if (get_vmx_mem_address(vcpu, exit_qualification, + vmx_instruction_info, false, &gva)) + return 1; + if (kvm_read_guest_virt(vcpu, gva, &field_value, + (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + } + + + field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + /* + * If the vCPU supports "VMWRITE to any supported field in the + * VMCS," then the "read-only" fields are actually read/write. + */ + if (vmcs_field_readonly(field) && + !nested_cpu_has_vmwrite_any_field(vcpu)) + return nested_vmx_failValid(vcpu, + VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); + + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) + return nested_vmx_failInvalid(vcpu); + vmcs12 = get_shadow_vmcs12(vcpu); + } + + if (vmcs12_write_any(vmcs12, field, field_value) < 0) + return nested_vmx_failValid(vcpu, + VMXERR_UNSUPPORTED_VMCS_COMPONENT); + + /* + * Do not track vmcs12 dirty-state if in guest-mode + * as we actually dirty shadow vmcs12 instead of vmcs12. + */ + if (!is_guest_mode(vcpu)) { + switch (field) { +#define SHADOW_FIELD_RW(x) case x: +#include "vmx_shadow_fields.h" + /* + * The fields that can be updated by L1 without a vmexit are + * always updated in the vmcs02, the others go down the slow + * path of prepare_vmcs02. + */ + break; + default: + vmx->nested.dirty_vmcs12 = true; + break; + } + } + + return nested_vmx_succeed(vcpu); +} + +static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) +{ + vmx->nested.current_vmptr = vmptr; + if (enable_shadow_vmcs) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, + __pa(vmx->vmcs01.shadow_vmcs)); + vmx->nested.need_vmcs12_sync = true; + } + vmx->nested.dirty_vmcs12 = true; +} + +/* Emulate the VMPTRLD instruction */ +static int handle_vmptrld(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + gpa_t vmptr; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (nested_vmx_get_vmptr(vcpu, &vmptr)) + return 1; + + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + return nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INVALID_ADDRESS); + + if (vmptr == vmx->nested.vmxon_ptr) + return nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_VMXON_POINTER); + + /* Forbid normal VMPTRLD if Enlightened version was used */ + if (vmx->nested.hv_evmcs) + return 1; + + if (vmx->nested.current_vmptr != vmptr) { + struct vmcs12 *new_vmcs12; + struct page *page; + page = kvm_vcpu_gpa_to_page(vcpu, vmptr); + if (is_error_page(page)) { + /* + * Reads from an unbacked page return all 1s, + * which means that the 32 bits located at the + * given physical address won't match the required + * VMCS12_REVISION identifier. + */ + nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + return kvm_skip_emulated_instruction(vcpu); + } + new_vmcs12 = kmap(page); + if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || + (new_vmcs12->hdr.shadow_vmcs && + !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { + kunmap(page); + kvm_release_page_clean(page); + return nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } + + nested_release_vmcs12(vcpu); + + /* + * Load VMCS12 from guest memory since it is not already + * cached. + */ + memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); + kunmap(page); + kvm_release_page_clean(page); + + set_current_vmptr(vmx, vmptr); + } + + return nested_vmx_succeed(vcpu); +} + +/* + * This is an equivalent of the nested hypervisor executing the vmptrld + * instruction. + */ +static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, + bool from_launch) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_vp_assist_page assist_page; + + if (likely(!vmx->nested.enlightened_vmcs_enabled)) + return 1; + + if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) + return 1; + + if (unlikely(!assist_page.enlighten_vmentry)) + return 1; + + if (unlikely(assist_page.current_nested_vmcs != + vmx->nested.hv_evmcs_vmptr)) { + + if (!vmx->nested.hv_evmcs) + vmx->nested.current_vmptr = -1ull; + + nested_release_evmcs(vcpu); + + vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( + vcpu, assist_page.current_nested_vmcs); + + if (unlikely(is_error_page(vmx->nested.hv_evmcs_page))) + return 0; + + vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); + + /* + * Currently, KVM only supports eVMCS version 1 + * (== KVM_EVMCS_VERSION) and thus we expect guest to set this + * value to first u32 field of eVMCS which should specify eVMCS + * VersionNumber. + * + * Guest should be aware of supported eVMCS versions by host by + * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is + * expected to set this CPUID leaf according to the value + * returned in vmcs_version from nested_enable_evmcs(). + * + * However, it turns out that Microsoft Hyper-V fails to comply + * to their own invented interface: When Hyper-V use eVMCS, it + * just sets first u32 field of eVMCS to revision_id specified + * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number + * which is one of the supported versions specified in + * CPUID.0x4000000A.EAX[0:15]. + * + * To overcome Hyper-V bug, we accept here either a supported + * eVMCS version or VMCS12 revision_id as valid values for first + * u32 field of eVMCS. + */ + if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && + (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { + nested_release_evmcs(vcpu); + return 0; + } + + vmx->nested.dirty_vmcs12 = true; + /* + * As we keep L2 state for one guest only 'hv_clean_fields' mask + * can't be used when we switch between them. Reset it here for + * simplicity. + */ + vmx->nested.hv_evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs; + + /* + * Unlike normal vmcs12, enlightened vmcs12 is not fully + * reloaded from guest's memory (read only fields, fields not + * present in struct hv_enlightened_vmcs, ...). Make sure there + * are no leftovers. + */ + if (from_launch) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + memset(vmcs12, 0, sizeof(*vmcs12)); + vmcs12->hdr.revision_id = VMCS12_REVISION; + } + + } + return 1; +} + +/* Emulate the VMPTRST instruction */ +static int handle_vmptrst(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; + struct x86_exception e; + gva_t gva; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) + return 1; + + if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) + return 1; + /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ + if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, + sizeof(gpa_t), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + return nested_vmx_succeed(vcpu); +} + +/* Emulate the INVEPT instruction */ +static int handle_invept(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmx_instruction_info, types; + unsigned long type; + gva_t gva; + struct x86_exception e; + struct { + u64 eptp, gpa; + } operand; + + if (!(vmx->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_ENABLE_EPT) || + !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + + if (type >= 32 || !(types & (1 << type))) + return nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + + /* According to the Intel VMX instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_EPT_EXTENT_GLOBAL: + /* + * TODO: track mappings and invalidate + * single context requests appropriately + */ + case VMX_EPT_EXTENT_CONTEXT: + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + break; + default: + BUG_ON(1); + break; + } + + return nested_vmx_succeed(vcpu); +} + +static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; +} + +static int handle_invvpid(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmx_instruction_info; + unsigned long type, types; + gva_t gva; + struct x86_exception e; + struct { + u64 vpid; + u64 gla; + } operand; + u16 vpid02; + + if (!(vmx->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_ENABLE_VPID) || + !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (vmx->nested.msrs.vpid_caps & + VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; + + if (type >= 32 || !(types & (1 << type))) + return nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + + /* according to the intel vmx instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + if (operand.vpid >> 16) + return nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + + vpid02 = nested_get_vpid02(vcpu); + switch (type) { + case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: + if (!operand.vpid || + is_noncanonical_address(operand.gla, vcpu)) + return nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + if (cpu_has_vmx_invvpid_individual_addr()) { + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, + vpid02, operand.gla); + } else + __vmx_flush_tlb(vcpu, vpid02, false); + break; + case VMX_VPID_EXTENT_SINGLE_CONTEXT: + case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: + if (!operand.vpid) + return nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + __vmx_flush_tlb(vcpu, vpid02, false); + break; + case VMX_VPID_EXTENT_ALL_CONTEXT: + __vmx_flush_tlb(vcpu, vpid02, false); + break; + default: + WARN_ON_ONCE(1); + return kvm_skip_emulated_instruction(vcpu); + } + + return nested_vmx_succeed(vcpu); +} + +static int handle_invpcid(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info; + unsigned long type; + bool pcid_enabled; + gva_t gva; + struct x86_exception e; + unsigned i; + unsigned long roots_to_free = 0; + struct { + u64 pcid; + u64 gla; + } operand; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + if (type > 3) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* According to the Intel instruction reference, the memory operand + * is read even if it isn't needed (e.g., for type==all) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (kvm_get_active_pcid(vcpu) == operand.pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) + == operand.pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); + /* + * If neither the current cr3 nor any of the prev_roots use the + * given PCID, then nothing needs to be done here because a + * resync will happen anyway before switching to any other CR3. + */ + + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. + */ + + /* fall-through */ + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_mmu_unload(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + default: + BUG(); /* We have already checked above that type <= 3 */ + } +} + +static int handle_pml_full(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + + trace_kvm_pml_full(vcpu->vcpu_id); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + /* + * PML buffer FULL happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + enable_vnmi && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + /* + * PML buffer already flushed at beginning of VMEXIT. Nothing to do + * here.., and there's no userspace involvement needed for PML. + */ + return 1; +} + +static int handle_preemption_timer(struct kvm_vcpu *vcpu) +{ + if (!to_vmx(vcpu)->req_immediate_exit) + kvm_lapic_expired_hv_timer(vcpu); + return 1; +} + +static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int maxphyaddr = cpuid_maxphyaddr(vcpu); + + /* Check for memory type validity */ + switch (address & VMX_EPTP_MT_MASK) { + case VMX_EPTP_MT_UC: + if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)) + return false; + break; + case VMX_EPTP_MT_WB: + if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)) + return false; + break; + default: + return false; + } + + /* only 4 levels page-walk length are valid */ + if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4) + return false; + + /* Reserved bits should not be set */ + if (address >> maxphyaddr || ((address >> 7) & 0x1f)) + return false; + + /* AD, if set, should be supported */ + if (address & VMX_EPTP_AD_ENABLE_BIT) { + if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)) + return false; + } + + return true; +} + +static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; + u64 address; + bool accessed_dirty; + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + if (!nested_cpu_has_eptp_switching(vmcs12) || + !nested_cpu_has_ept(vmcs12)) + return 1; + + if (index >= VMFUNC_EPTP_ENTRIES) + return 1; + + + if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, + &address, index * 8, 8)) + return 1; + + accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); + + /* + * If the (L2) guest does a vmfunc to the currently + * active ept pointer, we don't have to do anything else + */ + if (vmcs12->ept_pointer != address) { + if (!valid_ept_address(vcpu, address)) + return 1; + + kvm_mmu_unload(vcpu); + mmu->ept_ad = accessed_dirty; + mmu->mmu_role.base.ad_disabled = !accessed_dirty; + vmcs12->ept_pointer = address; + /* + * TODO: Check what's the correct approach in case + * mmu reload fails. Currently, we just let the next + * reload potentially fail + */ + kvm_mmu_reload(vcpu); + } + + return 0; +} + +static int handle_vmfunc(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12; + u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; + + /* + * VMFUNC is only supported for nested guests, but we always enable the + * secondary control for simplicity; for non-nested mode, fake that we + * didn't by injecting #UD. + */ + if (!is_guest_mode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmcs12 = get_vmcs12(vcpu); + if ((vmcs12->vm_function_control & (1 << function)) == 0) + goto fail; + + switch (function) { + case 0: + if (nested_vmx_eptp_switching(vcpu, vmcs12)) + goto fail; + break; + default: + goto fail; + } + return kvm_skip_emulated_instruction(vcpu); + +fail: + nested_vmx_vmexit(vcpu, vmx->exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); + return 1; +} + +static int handle_encls(struct kvm_vcpu *vcpu) +{ + /* + * SGX virtualization is not yet supported. There is no software + * enable bit for SGX, so we have to trap ENCLS and inject a #UD + * to prevent the guest from executing ENCLS. + */ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + +/* + * The exit handlers return 1 if the exit was handled fully and guest execution + * may resume. Otherwise they set the kvm_run parameter to indicate what needs + * to be done to userspace and return 0. + */ +static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { + [EXIT_REASON_EXCEPTION_NMI] = handle_exception, + [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, + [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, + [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, + [EXIT_REASON_IO_INSTRUCTION] = handle_io, + [EXIT_REASON_CR_ACCESS] = handle_cr, + [EXIT_REASON_DR_ACCESS] = handle_dr, + [EXIT_REASON_CPUID] = handle_cpuid, + [EXIT_REASON_MSR_READ] = handle_rdmsr, + [EXIT_REASON_MSR_WRITE] = handle_wrmsr, + [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, + [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_INVD] = handle_invd, + [EXIT_REASON_INVLPG] = handle_invlpg, + [EXIT_REASON_RDPMC] = handle_rdpmc, + [EXIT_REASON_VMCALL] = handle_vmcall, + [EXIT_REASON_VMCLEAR] = handle_vmclear, + [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, + [EXIT_REASON_VMPTRLD] = handle_vmptrld, + [EXIT_REASON_VMPTRST] = handle_vmptrst, + [EXIT_REASON_VMREAD] = handle_vmread, + [EXIT_REASON_VMRESUME] = handle_vmresume, + [EXIT_REASON_VMWRITE] = handle_vmwrite, + [EXIT_REASON_VMOFF] = handle_vmoff, + [EXIT_REASON_VMON] = handle_vmon, + [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, + [EXIT_REASON_APIC_ACCESS] = handle_apic_access, + [EXIT_REASON_APIC_WRITE] = handle_apic_write, + [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, + [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_XSETBV] = handle_xsetbv, + [EXIT_REASON_TASK_SWITCH] = handle_task_switch, + [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, + [EXIT_REASON_GDTR_IDTR] = handle_desc, + [EXIT_REASON_LDTR_TR] = handle_desc, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, + [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, + [EXIT_REASON_INVEPT] = handle_invept, + [EXIT_REASON_INVVPID] = handle_invvpid, + [EXIT_REASON_RDRAND] = handle_invalid_op, + [EXIT_REASON_RDSEED] = handle_invalid_op, + [EXIT_REASON_XSAVES] = handle_xsaves, + [EXIT_REASON_XRSTORS] = handle_xrstors, + [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_INVPCID] = handle_invpcid, + [EXIT_REASON_VMFUNC] = handle_vmfunc, + [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, + [EXIT_REASON_ENCLS] = handle_encls, +}; + +static const int kvm_vmx_max_exit_handlers = + ARRAY_SIZE(kvm_vmx_exit_handlers); + +static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + unsigned long exit_qualification; + gpa_t bitmap, last_bitmap; + unsigned int port; + int size; + u8 b; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + port = exit_qualification >> 16; + size = (exit_qualification & 7) + 1; + + last_bitmap = (gpa_t)-1; + b = -1; + + while (size > 0) { + if (port < 0x8000) + bitmap = vmcs12->io_bitmap_a; + else if (port < 0x10000) + bitmap = vmcs12->io_bitmap_b; + else + return true; + bitmap += (port & 0x7fff) / 8; + + if (last_bitmap != bitmap) + if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) + return true; + if (b & (1 << (port & 7))) + return true; + + port++; + size--; + last_bitmap = bitmap; + } + + return false; +} + +/* + * Return 1 if we should exit from L2 to L1 to handle an MSR access access, + * rather than handle it ourselves in L0. I.e., check whether L1 expressed + * disinterest in the current event (read or write a specific MSR) by using an + * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. + */ +static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, u32 exit_reason) +{ + u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; + gpa_t bitmap; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return true; + + /* + * The MSR_BITMAP page is divided into four 1024-byte bitmaps, + * for the four combinations of read/write and low/high MSR numbers. + * First we need to figure out which of the four to use: + */ + bitmap = vmcs12->msr_bitmap; + if (exit_reason == EXIT_REASON_MSR_WRITE) + bitmap += 2048; + if (msr_index >= 0xc0000000) { + msr_index -= 0xc0000000; + bitmap += 1024; + } + + /* Then read the msr_index'th bit from this bitmap: */ + if (msr_index < 1024*8) { + unsigned char b; + if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) + return true; + return 1 & (b >> (msr_index & 7)); + } else + return true; /* let L1 handle the wrong parameter */ +} + +/* + * Return 1 if we should exit from L2 to L1 to handle a CR access exit, + * rather than handle it ourselves in L0. I.e., check if L1 wanted to + * intercept (via guest_host_mask etc.) the current event. + */ +static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + int cr = exit_qualification & 15; + int reg; + unsigned long val; + + switch ((exit_qualification >> 4) & 3) { + case 0: /* mov to cr */ + reg = (exit_qualification >> 8) & 15; + val = kvm_register_readl(vcpu, reg); + switch (cr) { + case 0: + if (vmcs12->cr0_guest_host_mask & + (val ^ vmcs12->cr0_read_shadow)) + return true; + break; + case 3: + if ((vmcs12->cr3_target_count >= 1 && + vmcs12->cr3_target_value0 == val) || + (vmcs12->cr3_target_count >= 2 && + vmcs12->cr3_target_value1 == val) || + (vmcs12->cr3_target_count >= 3 && + vmcs12->cr3_target_value2 == val) || + (vmcs12->cr3_target_count >= 4 && + vmcs12->cr3_target_value3 == val)) + return false; + if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) + return true; + break; + case 4: + if (vmcs12->cr4_guest_host_mask & + (vmcs12->cr4_read_shadow ^ val)) + return true; + break; + case 8: + if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) + return true; + break; + } + break; + case 2: /* clts */ + if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && + (vmcs12->cr0_read_shadow & X86_CR0_TS)) + return true; + break; + case 1: /* mov from cr */ + switch (cr) { + case 3: + if (vmcs12->cpu_based_vm_exec_control & + CPU_BASED_CR3_STORE_EXITING) + return true; + break; + case 8: + if (vmcs12->cpu_based_vm_exec_control & + CPU_BASED_CR8_STORE_EXITING) + return true; + break; + } + break; + case 3: /* lmsw */ + /* + * lmsw can change bits 1..3 of cr0, and only set bit 0 of + * cr0. Other attempted changes are ignored, with no exit. + */ + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + if (vmcs12->cr0_guest_host_mask & 0xe & + (val ^ vmcs12->cr0_read_shadow)) + return true; + if ((vmcs12->cr0_guest_host_mask & 0x1) && + !(vmcs12->cr0_read_shadow & 0x1) && + (val & 0x1)) + return true; + break; + } + return false; +} + +static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, gpa_t bitmap) +{ + u32 vmx_instruction_info; + unsigned long field; + u8 b; + + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return true; + + /* Decode instruction info and find the field to access */ + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + + /* Out-of-range fields always cause a VM exit from L2 to L1 */ + if (field >> 15) + return true; + + if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) + return true; + + return 1 & (b >> (field & 7)); +} + +/* + * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we + * should handle it ourselves in L0 (and then continue L2). Only call this + * when in is_guest_mode (L2). + */ +static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) +{ + u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (vmx->nested.nested_run_pending) + return false; + + if (unlikely(vmx->fail)) { + pr_info_ratelimited("%s failed vm entry %x\n", __func__, + vmcs_read32(VM_INSTRUCTION_ERROR)); + return true; + } + + /* + * The host physical addresses of some pages of guest memory + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC + * Page). The CPU may write to these pages via their host + * physical address while L2 is running, bypassing any + * address-translation-based dirty tracking (e.g. EPT write + * protection). + * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. + */ + nested_mark_vmcs12_pages_dirty(vcpu); + + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, + vmcs_readl(EXIT_QUALIFICATION), + vmx->idt_vectoring_info, + intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); + + switch (exit_reason) { + case EXIT_REASON_EXCEPTION_NMI: + if (is_nmi(intr_info)) + return false; + else if (is_page_fault(intr_info)) + return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; + else if (is_debug(intr_info) && + vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return false; + else if (is_breakpoint(intr_info) && + vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + return false; + return vmcs12->exception_bitmap & + (1u << (intr_info & INTR_INFO_VECTOR_MASK)); + case EXIT_REASON_EXTERNAL_INTERRUPT: + return false; + case EXIT_REASON_TRIPLE_FAULT: + return true; + case EXIT_REASON_PENDING_INTERRUPT: + return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); + case EXIT_REASON_NMI_WINDOW: + return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); + case EXIT_REASON_TASK_SWITCH: + return true; + case EXIT_REASON_CPUID: + return true; + case EXIT_REASON_HLT: + return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); + case EXIT_REASON_INVD: + return true; + case EXIT_REASON_INVLPG: + return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); + case EXIT_REASON_RDPMC: + return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); + case EXIT_REASON_RDRAND: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); + case EXIT_REASON_RDSEED: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); + case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: + return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); + case EXIT_REASON_VMREAD: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmread_bitmap); + case EXIT_REASON_VMWRITE: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmwrite_bitmap); + case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: + case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: + case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: + case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: + /* + * VMX instructions trap unconditionally. This allows L1 to + * emulate them for its L2 guest, i.e., allows 3-level nesting! + */ + return true; + case EXIT_REASON_CR_ACCESS: + return nested_vmx_exit_handled_cr(vcpu, vmcs12); + case EXIT_REASON_DR_ACCESS: + return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); + case EXIT_REASON_IO_INSTRUCTION: + return nested_vmx_exit_handled_io(vcpu, vmcs12); + case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); + case EXIT_REASON_MSR_READ: + case EXIT_REASON_MSR_WRITE: + return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); + case EXIT_REASON_INVALID_STATE: + return true; + case EXIT_REASON_MWAIT_INSTRUCTION: + return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); + case EXIT_REASON_MONITOR_TRAP_FLAG: + return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); + case EXIT_REASON_MONITOR_INSTRUCTION: + return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); + case EXIT_REASON_PAUSE_INSTRUCTION: + return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || + nested_cpu_has2(vmcs12, + SECONDARY_EXEC_PAUSE_LOOP_EXITING); + case EXIT_REASON_MCE_DURING_VMENTRY: + return false; + case EXIT_REASON_TPR_BELOW_THRESHOLD: + return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); + case EXIT_REASON_APIC_ACCESS: + case EXIT_REASON_APIC_WRITE: + case EXIT_REASON_EOI_INDUCED: + /* + * The controls for "virtualize APIC accesses," "APIC- + * register virtualization," and "virtual-interrupt + * delivery" only come from vmcs12. + */ + return true; + case EXIT_REASON_EPT_VIOLATION: + /* + * L0 always deals with the EPT violation. If nested EPT is + * used, and the nested mmu code discovers that the address is + * missing in the guest EPT table (EPT12), the EPT violation + * will be injected with nested_ept_inject_page_fault() + */ + return false; + case EXIT_REASON_EPT_MISCONFIG: + /* + * L2 never uses directly L1's EPT, but rather L0's own EPT + * table (shadow on EPT) or a merged EPT table that L0 built + * (EPT on EPT). So any problems with the structure of the + * table is L0's fault. + */ + return false; + case EXIT_REASON_INVPCID: + return + nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && + nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); + case EXIT_REASON_WBINVD: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); + case EXIT_REASON_XSETBV: + return true; + case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: + /* + * This should never happen, since it is not possible to + * set XSS to a non-zero value---neither in L1 nor in L2. + * If if it were, XSS would have to be checked against + * the XSS exit bitmap in vmcs12. + */ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + case EXIT_REASON_PREEMPTION_TIMER: + return false; + case EXIT_REASON_PML_FULL: + /* We emulate PML support to L1. */ + return false; + case EXIT_REASON_VMFUNC: + /* VM functions are emulated through L2->L0 vmexits. */ + return false; + case EXIT_REASON_ENCLS: + /* SGX is never exposed to L1 */ + return false; + default: + return true; + } +} + +static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason) +{ + u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + /* + * At this point, the exit interruption info in exit_intr_info + * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT + * we need to query the in-kernel LAPIC. + */ + WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT); + if ((exit_intr_info & + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + vmcs12->vm_exit_intr_error_code = + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + } + + nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, + vmcs_readl(EXIT_QUALIFICATION)); + return 1; +} + +static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) +{ + *info1 = vmcs_readl(EXIT_QUALIFICATION); + *info2 = vmcs_read32(VM_EXIT_INTR_INFO); +} + +static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) +{ + if (vmx->pml_pg) { + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + } +} + +static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 *pml_buf; + u16 pml_idx; + + pml_idx = vmcs_read16(GUEST_PML_INDEX); + + /* Do nothing if PML buffer is empty */ + if (pml_idx == (PML_ENTITY_NUM - 1)) + return; + + /* PML index always points to next available PML buffer entity */ + if (pml_idx >= PML_ENTITY_NUM) + pml_idx = 0; + else + pml_idx++; + + pml_buf = page_address(vmx->pml_pg); + for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { + u64 gpa; + + gpa = pml_buf[pml_idx]; + WARN_ON(gpa & (PAGE_SIZE - 1)); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); + } + + /* reset PML index */ + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); +} + +/* + * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. + * Called before reporting dirty_bitmap to userspace. + */ +static void kvm_flush_pml_buffers(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + /* + * We only need to kick vcpu out of guest mode here, as PML buffer + * is flushed at beginning of all VMEXITs, and it's obvious that only + * vcpus running in guest are possible to have unflushed GPAs in PML + * buffer. + */ + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); +} + +static void vmx_dump_sel(char *name, uint32_t sel) +{ + pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", + name, vmcs_read16(sel), + vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), + vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), + vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); +} + +static void vmx_dump_dtsel(char *name, uint32_t limit) +{ + pr_err("%s limit=0x%08x, base=0x%016lx\n", + name, vmcs_read32(limit), + vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); +} + +static void dump_vmcs(void) +{ + u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); + u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); + u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); + u32 secondary_exec_control = 0; + unsigned long cr4 = vmcs_readl(GUEST_CR4); + u64 efer = vmcs_read64(GUEST_IA32_EFER); + int i, n; + + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + pr_err("*** Guest State ***\n"); + pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), + vmcs_readl(CR0_GUEST_HOST_MASK)); + pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); + pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && + (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) + { + pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); + pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); + } + pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", + vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); + pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", + vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(GUEST_SYSENTER_ESP), + vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); + vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); + vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); + vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); + vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); + vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); + vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); + vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); + vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); + vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); + vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); + if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || + (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) + pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", + efer, vmcs_read64(GUEST_IA32_PAT)); + pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", + vmcs_read64(GUEST_IA32_DEBUGCTL), + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); + if (cpu_has_load_perf_global_ctrl && + vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + pr_err("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); + if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) + pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); + pr_err("Interruptibility = %08x ActivityState = %08x\n", + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), + vmcs_read32(GUEST_ACTIVITY_STATE)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) + pr_err("InterruptStatus = %04x\n", + vmcs_read16(GUEST_INTR_STATUS)); + + pr_err("*** Host State ***\n"); + pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", + vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); + pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", + vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), + vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), + vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), + vmcs_read16(HOST_TR_SELECTOR)); + pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", + vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), + vmcs_readl(HOST_TR_BASE)); + pr_err("GDTBase=%016lx IDTBase=%016lx\n", + vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); + pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", + vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), + vmcs_readl(HOST_CR4)); + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(HOST_IA32_SYSENTER_ESP), + vmcs_read32(HOST_IA32_SYSENTER_CS), + vmcs_readl(HOST_IA32_SYSENTER_EIP)); + if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) + pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", + vmcs_read64(HOST_IA32_EFER), + vmcs_read64(HOST_IA32_PAT)); + if (cpu_has_load_perf_global_ctrl && + vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + pr_err("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); + + pr_err("*** Control State ***\n"); + pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", + pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); + pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", + vmcs_read32(EXCEPTION_BITMAP), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); + pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), + vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); + pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); + pr_err(" reason=%08x qualification=%016lx\n", + vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); + pr_err("IDTVectoring: info=%08x errcode=%08x\n", + vmcs_read32(IDT_VECTORING_INFO_FIELD), + vmcs_read32(IDT_VECTORING_ERROR_CODE)); + pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); + if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) + pr_err("TSC Multiplier = 0x%016llx\n", + vmcs_read64(TSC_MULTIPLIER)); + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) + pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) + pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) + pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); + n = vmcs_read32(CR3_TARGET_COUNT); + for (i = 0; i + 1 < n; i += 4) + pr_err("CR3 target%u=%016lx target%u=%016lx\n", + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), + i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); + if (i < n) + pr_err("CR3 target%u=%016lx\n", + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); + if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) + pr_err("PLE Gap=%08x Window=%08x\n", + vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); + if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) + pr_err("Virtual processor ID = 0x%04x\n", + vmcs_read16(VIRTUAL_PROCESSOR_ID)); +} + +/* + * The guest has exited. See if we can fix it or if we need userspace + * assistance. + */ +static int vmx_handle_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exit_reason = vmx->exit_reason; + u32 vectoring_info = vmx->idt_vectoring_info; + + trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); + + /* + * Flush logged GPAs PML buffer, this will make dirty_bitmap more + * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before + * querying dirty_bitmap, we only need to kick all vcpus out of guest + * mode as if vcpus is in root mode, the PML buffer must has been + * flushed already. + */ + if (enable_pml) + vmx_flush_pml_buffer(vcpu); + + /* If guest state is invalid, start emulating */ + if (vmx->emulation_required) + return handle_invalid_guest_state(vcpu); + + if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) + return nested_vmx_reflect_vmexit(vcpu, exit_reason); + + if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + dump_vmcs(); + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = exit_reason; + return 0; + } + + if (unlikely(vmx->fail)) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = vmcs_read32(VM_INSTRUCTION_ERROR); + return 0; + } + + /* + * Note: + * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by + * delivery event since it indicates guest is accessing MMIO. + * The vm-exit can be triggered again after return to guest that + * will cause infinite loop. + */ + if ((vectoring_info & VECTORING_INFO_VALID_MASK) && + (exit_reason != EXIT_REASON_EXCEPTION_NMI && + exit_reason != EXIT_REASON_EPT_VIOLATION && + exit_reason != EXIT_REASON_PML_FULL && + exit_reason != EXIT_REASON_TASK_SWITCH)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + vcpu->run->internal.ndata = 3; + vcpu->run->internal.data[0] = vectoring_info; + vcpu->run->internal.data[1] = exit_reason; + vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; + if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { + vcpu->run->internal.ndata++; + vcpu->run->internal.data[3] = + vmcs_read64(GUEST_PHYSICAL_ADDRESS); + } + return 0; + } + + if (unlikely(!enable_vnmi && + vmx->loaded_vmcs->soft_vnmi_blocked)) { + if (vmx_interrupt_allowed(vcpu)) { + vmx->loaded_vmcs->soft_vnmi_blocked = 0; + } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && + vcpu->arch.nmi_pending) { + /* + * This CPU don't support us in finding the end of an + * NMI-blocked window if the guest runs with IRQs + * disabled. So we pull the trigger after 1 s of + * futile waiting, but inform the user about this. + */ + printk(KERN_WARNING "%s: Breaking out of NMI-blocked " + "state on VCPU %d after 1 s timeout\n", + __func__, vcpu->vcpu_id); + vmx->loaded_vmcs->soft_vnmi_blocked = 0; + } + } + + if (exit_reason < kvm_vmx_max_exit_handlers + && kvm_vmx_exit_handlers[exit_reason]) + return kvm_vmx_exit_handlers[exit_reason](vcpu); + else { + vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", + exit_reason); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } +} + +/* + * Software based L1D cache flush which is used when microcode providing + * the cache control MSR is not loaded. + * + * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to + * flush it is required to read in 64 KiB because the replacement algorithm + * is not exactly LRU. This could be sized at runtime via topology + * information but as all relevant affected CPUs have 32KiB L1D cache size + * there is no point in doing so. + */ +static void vmx_l1d_flush(struct kvm_vcpu *vcpu) +{ + int size = PAGE_SIZE << L1D_CACHE_ORDER; + + /* + * This code is only executed when the the flush mode is 'cond' or + * 'always' + */ + if (static_branch_likely(&vmx_l1d_flush_cond)) { + bool flush_l1d; + + /* + * Clear the per-vcpu flush bit, it gets set again + * either from vcpu_run() or from one of the unsafe + * VMEXIT handlers. + */ + flush_l1d = vcpu->arch.l1tf_flush_l1d; + vcpu->arch.l1tf_flush_l1d = false; + + /* + * Clear the per-cpu flush bit, it gets set again from + * the interrupt handlers. + */ + flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); + kvm_clear_cpu_l1tf_flush_l1d(); + + if (!flush_l1d) + return; + } + + vcpu->stat.l1d_flush++; + + if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + return; + } + + asm volatile( + /* First ensure the pages are in the TLB */ + "xorl %%eax, %%eax\n" + ".Lpopulate_tlb:\n\t" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $4096, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lpopulate_tlb\n\t" + "xorl %%eax, %%eax\n\t" + "cpuid\n\t" + /* Now fill the cache */ + "xorl %%eax, %%eax\n" + ".Lfill_cache:\n" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $64, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lfill_cache\n\t" + "lfence\n" + :: [flush_pages] "r" (vmx_l1d_flush_pages), + [size] "r" (size) + : "eax", "ebx", "ecx", "edx"); +} + +static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (is_guest_mode(vcpu) && + nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return; + + if (irr == -1 || tpr < irr) { + vmcs_write32(TPR_THRESHOLD, 0); + return; + } + + vmcs_write32(TPR_THRESHOLD, irr); +} + +static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + u32 sec_exec_control; + + if (!lapic_in_kernel(vcpu)) + return; + + if (!flexpriority_enabled && + !cpu_has_vmx_virtualize_x2apic_mode()) + return; + + /* Postpone execution until vmcs01 is the current VMCS. */ + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; + return; + } + + sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); + + switch (kvm_get_apic_mode(vcpu)) { + case LAPIC_MODE_INVALID: + WARN_ONCE(true, "Invalid local APIC state"); + case LAPIC_MODE_DISABLED: + break; + case LAPIC_MODE_XAPIC: + if (flexpriority_enabled) { + sec_exec_control |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmx_flush_tlb(vcpu, true); + } + break; + case LAPIC_MODE_X2APIC: + if (cpu_has_vmx_virtualize_x2apic_mode()) + sec_exec_control |= + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + break; + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); + + vmx_update_msr_bitmap(vcpu); +} + +static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) +{ + if (!is_guest_mode(vcpu)) { + vmcs_write64(APIC_ACCESS_ADDR, hpa); + vmx_flush_tlb(vcpu, true); + } +} + +static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +{ + u16 status; + u8 old; + + if (max_isr == -1) + max_isr = 0; + + status = vmcs_read16(GUEST_INTR_STATUS); + old = status >> 8; + if (max_isr != old) { + status &= 0xff; + status |= max_isr << 8; + vmcs_write16(GUEST_INTR_STATUS, status); + } +} + +static void vmx_set_rvi(int vector) +{ + u16 status; + u8 old; + + if (vector == -1) + vector = 0; + + status = vmcs_read16(GUEST_INTR_STATUS); + old = (u8)status & 0xff; + if ((u8)vector != old) { + status &= ~0xff; + status |= (u8)vector; + vmcs_write16(GUEST_INTR_STATUS, status); + } +} + +static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +{ + /* + * When running L2, updating RVI is only relevant when + * vmcs12 virtual-interrupt-delivery enabled. + * However, it can be enabled only when L1 also + * intercepts external-interrupts and in that case + * we should not update vmcs02 RVI but instead intercept + * interrupt. Therefore, do nothing when running L2. + */ + if (!is_guest_mode(vcpu)) + vmx_set_rvi(max_irr); +} + +static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; + bool max_irr_updated; + + WARN_ON(!vcpu->arch.apicv_active); + if (pi_test_on(&vmx->pi_desc)) { + pi_clear_on(&vmx->pi_desc); + /* + * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * But on x86 this is just a compiler barrier anyway. + */ + smp_mb__after_atomic(); + max_irr_updated = + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); + + /* + * If we are running L2 and L1 has a new pending interrupt + * which can be injected, we should re-evaluate + * what should be done with this new L1 interrupt. + * If L1 intercepts external-interrupts, we should + * exit from L2 to L1. Otherwise, interrupt should be + * delivered directly to L2. + */ + if (is_guest_mode(vcpu) && max_irr_updated) { + if (nested_exit_on_intr(vcpu)) + kvm_vcpu_exiting_guest_mode(vcpu); + else + kvm_make_request(KVM_REQ_EVENT, vcpu); + } + } else { + max_irr = kvm_lapic_find_highest_irr(vcpu); + } + vmx_hwapic_irr_update(vcpu, max_irr); + return max_irr; +} + +static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) +{ + u8 rvi = vmx_get_rvi(); + u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); + vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); + vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); + vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); +} + +static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + pi_clear_on(&vmx->pi_desc); + memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); +} + +static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info = 0; + u16 basic_exit_reason = (u16)vmx->exit_reason; + + if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY + || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) + return; + + if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmx->exit_intr_info = exit_intr_info; + + /* if exit due to PF check for async PF */ + if (is_page_fault(exit_intr_info)) + vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); + + /* Handle machine checks before interrupts are enabled */ + if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || + is_machine_check(exit_intr_info)) + kvm_machine_check(); + + /* We need to handle NMIs before interrupts are enabled */ + if (is_nmi(exit_intr_info)) { + kvm_before_interrupt(&vmx->vcpu); + asm("int $2"); + kvm_after_interrupt(&vmx->vcpu); + } +} + +static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) +{ + u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) + == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { + unsigned int vector; + unsigned long entry; + gate_desc *desc; + struct vcpu_vmx *vmx = to_vmx(vcpu); +#ifdef CONFIG_X86_64 + unsigned long tmp; +#endif + + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + desc = (gate_desc *)vmx->host_idt_base + vector; + entry = gate_offset(desc); + asm volatile( +#ifdef CONFIG_X86_64 + "mov %%" _ASM_SP ", %[sp]\n\t" + "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" + "push $%c[ss]\n\t" + "push %[sp]\n\t" +#endif + "pushf\n\t" + __ASM_SIZE(push) " $%c[cs]\n\t" + CALL_NOSPEC + : +#ifdef CONFIG_X86_64 + [sp]"=&r"(tmp), +#endif + ASM_CALL_CONSTRAINT + : + THUNK_TARGET(entry), + [ss]"i"(__KERNEL_DS), + [cs]"i"(__KERNEL_CS) + ); + } +} +STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); + +static bool vmx_has_emulated_msr(int index) +{ + switch (index) { + case MSR_IA32_SMBASE: + /* + * We cannot do SMM unless we can run the guest in big + * real mode. + */ + return enable_unrestricted_guest || emulate_invalid_guest_state; + case MSR_AMD64_VIRT_SPEC_CTRL: + /* This is AMD only. */ + return false; + default: + return true; + } +} + +static bool vmx_mpx_supported(void) +{ + return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && + (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); +} + +static bool vmx_xsaves_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_XSAVES; +} + +static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info; + bool unblock_nmi; + u8 vector; + bool idtv_info_valid; + + idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; + + if (enable_vnmi) { + if (vmx->loaded_vmcs->nmi_known_unmasked) + return; + /* + * Can't use vmx->exit_intr_info since we're not sure what + * the exit reason is. + */ + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. + */ + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); + } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) + vmx->loaded_vmcs->vnmi_blocked_time += + ktime_to_ns(ktime_sub(ktime_get(), + vmx->loaded_vmcs->entry_time)); +} + +static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, + u32 idt_vectoring_info, + int instr_len_field, + int error_code_field) +{ + u8 vector; + int type; + bool idtv_info_valid; + + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); + + if (!idtv_info_valid) + return; + + kvm_make_request(KVM_REQ_EVENT, vcpu); + + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; + type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; + + switch (type) { + case INTR_TYPE_NMI_INTR: + vcpu->arch.nmi_injected = true; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Clear bit "block by NMI" before VM entry if a NMI + * delivery faulted. + */ + vmx_set_nmi_mask(vcpu, false); + break; + case INTR_TYPE_SOFT_EXCEPTION: + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); + /* fall through */ + case INTR_TYPE_HARD_EXCEPTION: + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { + u32 err = vmcs_read32(error_code_field); + kvm_requeue_exception_e(vcpu, vector, err); + } else + kvm_requeue_exception(vcpu, vector); + break; + case INTR_TYPE_SOFT_INTR: + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); + /* fall through */ + case INTR_TYPE_EXT_INTR: + kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); + break; + default: + break; + } +} + +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, + VM_EXIT_INSTRUCTION_LEN, + IDT_VECTORING_ERROR_CODE); +} + +static void vmx_cancel_injection(struct kvm_vcpu *vcpu) +{ + __vmx_complete_interrupts(vcpu, + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + VM_ENTRY_INSTRUCTION_LEN, + VM_ENTRY_EXCEPTION_ERROR_CODE); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); +} + +static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) +{ + int i, nr_msrs; + struct perf_guest_switch_msr *msrs; + + msrs = perf_guest_get_msrs(&nr_msrs); + + if (!msrs) + return; + + for (i = 0; i < nr_msrs; i++) + if (msrs[i].host == msrs[i].guest) + clear_atomic_switch_msr(vmx, msrs[i].msr); + else + add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, + msrs[i].host, false); +} + +static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); + if (!vmx->loaded_vmcs->hv_timer_armed) + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = true; +} + +static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 tscl; + u32 delta_tsc; + + if (vmx->req_immediate_exit) { + vmx_arm_hv_timer(vmx, 0); + return; + } + + if (vmx->hv_deadline_tsc != -1) { + tscl = rdtsc(); + if (vmx->hv_deadline_tsc > tscl) + /* set_hv_timer ensures the delta fits in 32-bits */ + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> + cpu_preemption_timer_multi); + else + delta_tsc = 0; + + vmx_arm_hv_timer(vmx, delta_tsc); + return; + } + + if (vmx->loaded_vmcs->hv_timer_armed) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = false; +} + +static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4, evmcs_rsp; + + /* Record the guest's net vcpu time for enforced NMI injections. */ + if (unlikely(!enable_vnmi && + vmx->loaded_vmcs->soft_vnmi_blocked)) + vmx->loaded_vmcs->entry_time = ktime_get(); + + /* Don't enter VMX if guest state is invalid, let the exit handler + start emulation until we arrive back to a valid state */ + if (vmx->emulation_required) + return; + + if (vmx->ple_window_dirty) { + vmx->ple_window_dirty = false; + vmcs_write32(PLE_WINDOW, vmx->ple_window); + } + + if (vmx->nested.need_vmcs12_sync) { + /* + * hv_evmcs may end up being not mapped after migration (when + * L2 was running), map it here to make sure vmcs12 changes are + * properly reflected. + */ + if (vmx->nested.enlightened_vmcs_enabled && + !vmx->nested.hv_evmcs) + nested_vmx_handle_enlightened_vmptrld(vcpu, false); + + if (vmx->nested.hv_evmcs) { + copy_vmcs12_to_enlightened(vmx); + /* All fields are clean */ + vmx->nested.hv_evmcs->hv_clean_fields |= + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + } else { + copy_vmcs12_to_shadow(vmx); + } + vmx->nested.need_vmcs12_sync = false; + } + + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } + + cr4 = cr4_read_shadow(); + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { + vmcs_writel(HOST_CR4, cr4); + vmx->loaded_vmcs->host_state.cr4 = cr4; + } + + /* When single-stepping over STI and MOV SS, we must clear the + * corresponding interruptibility bits in the guest state. Otherwise + * vmentry fails as it then expects bit 14 (BS) in pending debug + * exceptions being set, but that's not correct for the guest debugging + * case. */ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmx_set_interrupt_shadow(vcpu, 0); + + if (static_cpu_has(X86_FEATURE_PKU) && + kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && + vcpu->arch.pkru != vmx->host_pkru) + __write_pkru(vcpu->arch.pkru); + + atomic_switch_perf_msrs(vmx); + + vmx_update_hv_timer(vcpu); + + /* + * If this vCPU has touched SPEC_CTRL, restore the guest's value if + * it's non-zero. Since vmentry is serialising on affected CPUs, there + * is no need to worry about the conditional branch over the wrmsr + * being speculatively taken. + */ + x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); + + vmx->__launched = vmx->loaded_vmcs->launched; + + evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? + (unsigned long)¤t_evmcs->host_rsp : 0; + + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); + + asm( + /* Store host registers */ + "push %%" _ASM_DX "; push %%" _ASM_BP ";" + "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ + "push %%" _ASM_CX " \n\t" + "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" + "je 1f \n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" + /* Avoid VMWRITE when Enlightened VMCS is in use */ + "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" + "jz 2f \n\t" + "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" + "jmp 1f \n\t" + "2: \n\t" + __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" + "1: \n\t" + /* Reload cr2 if changed */ + "mov %c[cr2](%0), %%" _ASM_AX " \n\t" + "mov %%cr2, %%" _ASM_DX " \n\t" + "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" + "je 3f \n\t" + "mov %%" _ASM_AX", %%cr2 \n\t" + "3: \n\t" + /* Check if vmlaunch or vmresume is needed */ + "cmpl $0, %c[launched](%0) \n\t" + /* Load guest registers. Don't clobber flags. */ + "mov %c[rax](%0), %%" _ASM_AX " \n\t" + "mov %c[rbx](%0), %%" _ASM_BX " \n\t" + "mov %c[rdx](%0), %%" _ASM_DX " \n\t" + "mov %c[rsi](%0), %%" _ASM_SI " \n\t" + "mov %c[rdi](%0), %%" _ASM_DI " \n\t" + "mov %c[rbp](%0), %%" _ASM_BP " \n\t" +#ifdef CONFIG_X86_64 + "mov %c[r8](%0), %%r8 \n\t" + "mov %c[r9](%0), %%r9 \n\t" + "mov %c[r10](%0), %%r10 \n\t" + "mov %c[r11](%0), %%r11 \n\t" + "mov %c[r12](%0), %%r12 \n\t" + "mov %c[r13](%0), %%r13 \n\t" + "mov %c[r14](%0), %%r14 \n\t" + "mov %c[r15](%0), %%r15 \n\t" +#endif + "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ + + /* Enter guest mode */ + "jne 1f \n\t" + __ex("vmlaunch") "\n\t" + "jmp 2f \n\t" + "1: " __ex("vmresume") "\n\t" + "2: " + /* Save guest registers, load host registers, keep flags */ + "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" + "pop %0 \n\t" + "setbe %c[fail](%0)\n\t" + "mov %%" _ASM_AX ", %c[rax](%0) \n\t" + "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" + __ASM_SIZE(pop) " %c[rcx](%0) \n\t" + "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" + "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" + "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" + "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" +#ifdef CONFIG_X86_64 + "mov %%r8, %c[r8](%0) \n\t" + "mov %%r9, %c[r9](%0) \n\t" + "mov %%r10, %c[r10](%0) \n\t" + "mov %%r11, %c[r11](%0) \n\t" + "mov %%r12, %c[r12](%0) \n\t" + "mov %%r13, %c[r13](%0) \n\t" + "mov %%r14, %c[r14](%0) \n\t" + "mov %%r15, %c[r15](%0) \n\t" + /* + * Clear host registers marked as clobbered to prevent + * speculative use. + */ + "xor %%r8d, %%r8d \n\t" + "xor %%r9d, %%r9d \n\t" + "xor %%r10d, %%r10d \n\t" + "xor %%r11d, %%r11d \n\t" + "xor %%r12d, %%r12d \n\t" + "xor %%r13d, %%r13d \n\t" + "xor %%r14d, %%r14d \n\t" + "xor %%r15d, %%r15d \n\t" +#endif + "mov %%cr2, %%" _ASM_AX " \n\t" + "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" + + "xor %%eax, %%eax \n\t" + "xor %%ebx, %%ebx \n\t" + "xor %%esi, %%esi \n\t" + "xor %%edi, %%edi \n\t" + "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" + ".pushsection .rodata \n\t" + ".global vmx_return \n\t" + "vmx_return: " _ASM_PTR " 2b \n\t" + ".popsection" + : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), + [launched]"i"(offsetof(struct vcpu_vmx, __launched)), + [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), + [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), +#ifdef CONFIG_X86_64 + [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), + [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), + [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), +#endif + [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), + [wordsize]"i"(sizeof(ulong)) + : "cc", "memory" +#ifdef CONFIG_X86_64 + , "rax", "rbx", "rdi" + , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#else + , "eax", "ebx", "edi" +#endif + ); + + /* + * We do not use IBRS in the kernel. If this vCPU has used the + * SPEC_CTRL MSR it may have left it on; save the value and + * turn it off. This is much more efficient than blindly adding + * it to the atomic save/restore list. Especially as the former + * (Saving guest MSRs on vmexit) doesn't even exist in KVM. + * + * For non-nested case: + * If the L01 MSR bitmap does not intercept the MSR, then we need to + * save it. + * + * For nested case: + * If the L02 MSR bitmap does not intercept the MSR, then we need to + * save it. + */ + if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) + vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + + x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); + + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + + /* All fields are clean at this point */ + if (static_branch_unlikely(&enable_evmcs)) + current_evmcs->hv_clean_fields |= + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ + if (vmx->host_debugctlmsr) + update_debugctlmsr(vmx->host_debugctlmsr); + +#ifndef CONFIG_X86_64 + /* + * The sysexit path does not restore ds/es, so we must set them to + * a reasonable value ourselves. + * + * We can't defer this to vmx_prepare_switch_to_host() since that + * function may be executed in interrupt context, which saves and + * restore segments around it, nullifying its effect. + */ + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#endif + + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_RFLAGS) + | (1 << VCPU_EXREG_PDPTR) + | (1 << VCPU_EXREG_SEGMENTS) + | (1 << VCPU_EXREG_CR3)); + vcpu->arch.regs_dirty = 0; + + /* + * eager fpu is enabled if PKEY is supported and CR4 is switched + * back on host, so it is safe to read guest PKRU from current + * XSAVE. + */ + if (static_cpu_has(X86_FEATURE_PKU) && + kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { + vcpu->arch.pkru = __read_pkru(); + if (vcpu->arch.pkru != vmx->host_pkru) + __write_pkru(vmx->host_pkru); + } + + vmx->nested.nested_run_pending = 0; + vmx->idt_vectoring_info = 0; + + vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); + if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + return; + + vmx->loaded_vmcs->launched = 1; + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + + vmx_complete_atomic_exit(vmx); + vmx_recover_nmi_blocking(vmx); + vmx_complete_interrupts(vmx); +} +STACK_FRAME_NON_STANDARD(vmx_vcpu_run); + +static struct kvm *vmx_vm_alloc(void) +{ + struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx)); + return &kvm_vmx->kvm; +} + +static void vmx_vm_free(struct kvm *kvm) +{ + vfree(to_kvm_vmx(kvm)); +} + +static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + + if (vmx->loaded_vmcs == vmcs) + return; + + cpu = get_cpu(); + vmx_vcpu_put(vcpu); + vmx->loaded_vmcs = vmcs; + vmx_vcpu_load(vcpu, cpu); + put_cpu(); + + vm_entry_controls_reset_shadow(vmx); + vm_exit_controls_reset_shadow(vmx); + vmx_segment_cache_clear(vmx); +} + +/* + * Ensure that the current vmcs of the logical processor is the + * vmcs01 of the vcpu before calling free_nested(). + */ +static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) +{ + vcpu_load(vcpu); + vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); + free_nested(vcpu); + vcpu_put(vcpu); +} + +static void vmx_free_vcpu(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (enable_pml) + vmx_destroy_pml_buffer(vmx); + free_vpid(vmx->vpid); + leave_guest_mode(vcpu); + vmx_free_vcpu_nested(vcpu); + free_loaded_vmcs(vmx->loaded_vmcs); + kfree(vmx->guest_msrs); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vmx); +} + +static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) +{ + int err; + struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + unsigned long *msr_bitmap; + int cpu; + + if (!vmx) + return ERR_PTR(-ENOMEM); + + vmx->vpid = allocate_vpid(); + + err = kvm_vcpu_init(&vmx->vcpu, kvm, id); + if (err) + goto free_vcpu; + + err = -ENOMEM; + + /* + * If PML is turned on, failure on enabling PML just results in failure + * of creating the vcpu, therefore we can simplify PML logic (by + * avoiding dealing with cases, such as enabling PML partially on vcpus + * for the guest, etc. + */ + if (enable_pml) { + vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!vmx->pml_pg) + goto uninit_vcpu; + } + + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) + > PAGE_SIZE); + + if (!vmx->guest_msrs) + goto free_pml; + + err = alloc_loaded_vmcs(&vmx->vmcs01); + if (err < 0) + goto free_msrs; + + msr_bitmap = vmx->vmcs01.msr_bitmap; + vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + vmx->msr_bitmap_mode = 0; + + vmx->loaded_vmcs = &vmx->vmcs01; + cpu = get_cpu(); + vmx_vcpu_load(&vmx->vcpu, cpu); + vmx->vcpu.cpu = cpu; + vmx_vcpu_setup(vmx); + vmx_vcpu_put(&vmx->vcpu); + put_cpu(); + if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { + err = alloc_apic_access_page(kvm); + if (err) + goto free_vmcs; + } + + if (enable_ept && !enable_unrestricted_guest) { + err = init_rmode_identity_map(kvm); + if (err) + goto free_vmcs; + } + + if (nested) + nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, + kvm_vcpu_apicv_active(&vmx->vcpu)); + + vmx->nested.posted_intr_nv = -1; + vmx->nested.current_vmptr = -1ull; + + vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. + */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; + + return &vmx->vcpu; + +free_vmcs: + free_loaded_vmcs(vmx->loaded_vmcs); +free_msrs: + kfree(vmx->guest_msrs); +free_pml: + vmx_destroy_pml_buffer(vmx); +uninit_vcpu: + kvm_vcpu_uninit(&vmx->vcpu); +free_vcpu: + free_vpid(vmx->vpid); + kmem_cache_free(kvm_vcpu_cache, vmx); + return ERR_PTR(err); +} + +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" + +static int vmx_vm_init(struct kvm *kvm) +{ + spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); + + if (!ple_gap) + kvm->arch.pause_in_guest = true; + + if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + case L1TF_MITIGATION_FLUSH_NOWARN: + /* 'I explicitly don't care' is set */ + break; + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + case L1TF_MITIGATION_FULL: + /* + * Warn upon starting the first VM in a potentially + * insecure environment. + */ + if (cpu_smt_control == CPU_SMT_ENABLED) + pr_warn_once(L1TF_MSG_SMT); + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) + pr_warn_once(L1TF_MSG_L1D); + break; + case L1TF_MITIGATION_FULL_FORCE: + /* Flush is enforced */ + break; + } + } + return 0; +} + +static void __init vmx_check_processor_compat(void *rtn) +{ + struct vmcs_config vmcs_conf; + + *(int *)rtn = 0; + if (setup_vmcs_config(&vmcs_conf) < 0) + *(int *)rtn = -EIO; + nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv); + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { + printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", + smp_processor_id()); + *(int *)rtn = -EIO; + } +} + +static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +{ + u8 cache; + u64 ipat = 0; + + /* For VT-d and EPT combination + * 1. MMIO: always map as UC + * 2. EPT with VT-d: + * a. VT-d without snooping control feature: can't guarantee the + * result, try to trust guest. + * b. VT-d with snooping control feature: snooping control feature of + * VT-d engine can guarantee the cache correctness. Just set it + * to WB to keep consistent with host. So the same as item 3. + * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep + * consistent with host MTRR + */ + if (is_mmio) { + cache = MTRR_TYPE_UNCACHABLE; + goto exit; + } + + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + ipat = VMX_EPT_IPAT_BIT; + cache = MTRR_TYPE_WRBACK; + goto exit; + } + + if (kvm_read_cr0(vcpu) & X86_CR0_CD) { + ipat = VMX_EPT_IPAT_BIT; + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + cache = MTRR_TYPE_WRBACK; + else + cache = MTRR_TYPE_UNCACHABLE; + goto exit; + } + + cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); + +exit: + return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; +} + +static int vmx_get_lpage_level(void) +{ + if (enable_ept && !cpu_has_vmx_ept_1g_page()) + return PT_DIRECTORY_LEVEL; + else + /* For shadow and EPT supported 1GB page */ + return PT_PDPE_LEVEL; +} + +static void vmcs_set_secondary_exec_control(u32 new_ctl) +{ + /* + * These bits in the secondary execution controls field + * are dynamic, the others are mostly based on the hypervisor + * architecture and the guest's CPUID. Do not touch the + * dynamic bits. + */ + u32 mask = + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_DESC; + + u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + (new_ctl & ~mask) | (cur_ctl & mask)); +} + +/* + * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits + * (indicating "allowed-1") if they are supported in the guest's CPUID. + */ +static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *entry; + + vmx->nested.msrs.cr0_fixed1 = 0xffffffff; + vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; + +#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ + if (entry && (entry->_reg & (_cpuid_mask))) \ + vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ +} while (0) + + entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); + cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); + cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); + cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); + cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); + cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); + cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); + cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); + cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); + cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); + + entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); + cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); + cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); + cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); + cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); + +#undef cr4_fixed1_update +} + +static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (kvm_mpx_supported()) { + bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); + + if (mpx_enabled) { + vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + } else { + vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; + } + } +} + +static void vmx_cpuid_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (cpu_has_secondary_exec_ctrls()) { + vmx_compute_secondary_exec_control(vmx); + vmcs_set_secondary_exec_control(vmx->secondary_exec_control); + } + + if (nested_vmx_allowed(vcpu)) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + + if (nested_vmx_allowed(vcpu)) { + nested_vmx_cr_fixed1_bits_update(vcpu); + nested_vmx_entry_exit_ctls_update(vcpu); + } +} + +static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +{ + if (func == 1 && nested) + entry->ecx |= bit(X86_FEATURE_VMX); +} + +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exit_reason; + unsigned long exit_qualification = vcpu->arch.exit_qualification; + + if (vmx->nested.pml_full) { + exit_reason = EXIT_REASON_PML_FULL; + vmx->nested.pml_full = false; + exit_qualification &= INTR_INFO_UNBLOCK_NMI; + } else if (fault->error_code & PFERR_RSVD_MASK) + exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + exit_reason = EXIT_REASON_EPT_VIOLATION; + + nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); + vmcs12->guest_physical_address = fault->address; +} + +static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) +{ + return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT; +} + +/* Callbacks for nested_ept_init_mmu_context: */ + +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) +{ + /* return the page table to be shadowed - in our case, EPT12 */ + return get_vmcs12(vcpu)->ept_pointer; +} + +static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +{ + WARN_ON(mmu_is_nested(vcpu)); + + vcpu->arch.mmu = &vcpu->arch.guest_mmu; + kvm_init_shadow_ept_mmu(vcpu, + to_vmx(vcpu)->nested.msrs.ept_caps & + VMX_EPT_EXECUTE_ONLY_BIT, + nested_ept_ad_enabled(vcpu), + nested_ept_get_cr3(vcpu)); + vcpu->arch.mmu->set_cr3 = vmx_set_cr3; + vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; + vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; + vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; + + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; +} + +static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.mmu = &vcpu->arch.root_mmu; + vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; +} + +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code) +{ + bool inequality, bit; + + bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; + inequality = + (error_code & vmcs12->page_fault_error_code_mask) != + vmcs12->page_fault_error_code_match; + return inequality ^ bit; +} + +static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + WARN_ON(!is_guest_mode(vcpu)); + + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && + !to_vmx(vcpu)->nested.nested_run_pending) { + vmcs12->vm_exit_intr_error_code = fault->error_code; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | + INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, + fault->address); + } else { + kvm_inject_page_fault(vcpu, fault); + } +} + +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12); + +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct page *page; + u64 hpa; + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + /* + * Translate L1 physical address to host physical + * address for vmcs02. Keep the page pinned, so this + * physical address remains valid. We keep a reference + * to it so we can release it later. + */ + if (vmx->nested.apic_access_page) { /* shouldn't happen */ + kvm_release_page_dirty(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = NULL; + } + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); + /* + * If translation failed, no matter: This feature asks + * to exit when accessing the given address, and if it + * can never be accessed, this feature won't do + * anything anyway. + */ + if (!is_error_page(page)) { + vmx->nested.apic_access_page = page; + hpa = page_to_phys(vmx->nested.apic_access_page); + vmcs_write64(APIC_ACCESS_ADDR, hpa); + } else { + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + } + } + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + if (vmx->nested.virtual_apic_page) { /* shouldn't happen */ + kvm_release_page_dirty(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; + } + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr); + + /* + * If translation failed, VM entry will fail because + * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. + * Failing the vm entry is _not_ what the processor + * does but it's basically the only possibility we + * have. We could still enter the guest if CR8 load + * exits are enabled, CR8 store exits are enabled, and + * virtualize APIC access is disabled; in this case + * the processor would never use the TPR shadow and we + * could simply clear the bit from the execution + * control. But such a configuration is useless, so + * let's keep the code simple. + */ + if (!is_error_page(page)) { + vmx->nested.virtual_apic_page = page; + hpa = page_to_phys(vmx->nested.virtual_apic_page); + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); + } + } + + if (nested_cpu_has_posted_intr(vmcs12)) { + if (vmx->nested.pi_desc_page) { /* shouldn't happen */ + kunmap(vmx->nested.pi_desc_page); + kvm_release_page_dirty(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + } + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr); + if (is_error_page(page)) + return; + vmx->nested.pi_desc_page = page; + vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page); + vmx->nested.pi_desc = + (struct pi_desc *)((void *)vmx->nested.pi_desc + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + vmcs_write64(POSTED_INTR_DESC_ADDR, + page_to_phys(vmx->nested.pi_desc_page) + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + } + if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_USE_MSR_BITMAPS); + else + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_USE_MSR_BITMAPS); +} + +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +{ + u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * A timer value of zero is architecturally guaranteed to cause + * a VMExit prior to executing any instructions in the guest. + */ + if (preemption_timeout == 0) { + vmx_preemption_timer_fn(&vmx->nested.preemption_timer); + return; + } + + if (vcpu->arch.virtual_tsc_khz == 0) + return; + + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + preemption_timeout *= 1000000; + do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); + hrtimer_start(&vmx->nested.preemption_timer, + ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); +} + +static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || + !page_address_valid(vcpu, vmcs12->io_bitmap_b)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)) + return -EINVAL; + + return 0; +} + +/* + * Merge L0's and L1's MSR bitmap, return false to indicate that + * we do not use the hardware. + */ +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int msr; + struct page *page; + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; + /* + * pred_cmd & spec_ctrl are trying to verify two things: + * + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This + * ensures that we do not accidentally generate an L02 MSR bitmap + * from the L12 MSR bitmap that is too permissive. + * 2. That L1 or L2s have actually used the MSR. This avoids + * unnecessarily merging of the bitmap if the MSR is unused. This + * works properly because we only update the L01 MSR bitmap lazily. + * So even if L0 should pass L1 these MSRs, the L01 bitmap is only + * updated to reflect this when L1 (or its L2s) actually write to + * the MSR. + */ + bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); + bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); + + /* Nothing to do if the MSR bitmap is not in use. */ + if (!cpu_has_vmx_msr_bitmap() || + !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return false; + + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !pred_cmd && !spec_ctrl) + return false; + + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); + if (is_error_page(page)) + return false; + + msr_bitmap_l1 = (unsigned long *)kmap(page); + if (nested_cpu_has_apic_reg_virt(vmcs12)) { + /* + * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it + * just lets the processor take the value from the virtual-APIC page; + * take those 256 bits directly from the L1 bitmap. + */ + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap_l0[word] = msr_bitmap_l1[word]; + msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; + } + } else { + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap_l0[word] = ~0; + msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; + } + } + + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_TASKPRI), + MSR_TYPE_W); + + if (nested_cpu_has_vid(vmcs12)) { + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_EOI), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_SELF_IPI), + MSR_TYPE_W); + } + + if (spec_ctrl) + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_R | MSR_TYPE_W); + + if (pred_cmd) + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PRED_CMD, + MSR_TYPE_W); + + kunmap(page); + kvm_release_page_clean(page); + + return true; +} + +static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vmcs12 *shadow; + struct page *page; + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + shadow = get_shadow_vmcs12(vcpu); + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + + memcpy(shadow, kmap(page), VMCS12_SIZE); + + kunmap(page); + kvm_release_page_clean(page); +} + +static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, + get_shadow_vmcs12(vcpu), VMCS12_SIZE); +} + +static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && + !page_address_valid(vcpu, vmcs12->apic_access_addr)) + return -EINVAL; + else + return 0; +} + +static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !nested_cpu_has_apic_reg_virt(vmcs12) && + !nested_cpu_has_vid(vmcs12) && + !nested_cpu_has_posted_intr(vmcs12)) + return 0; + + /* + * If virtualize x2apic mode is enabled, + * virtualize apic access must be disabled. + */ + if (nested_cpu_has_virt_x2apic_mode(vmcs12) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + return -EINVAL; + + /* + * If virtual interrupt delivery is enabled, + * we must exit on external interrupts. + */ + if (nested_cpu_has_vid(vmcs12) && + !nested_exit_on_intr(vcpu)) + return -EINVAL; + + /* + * bits 15:8 should be zero in posted_intr_nv, + * the descriptor address has been already checked + * in nested_get_vmcs12_pages. + * + * bits 5:0 of posted_intr_desc_addr should be zero. + */ + if (nested_cpu_has_posted_intr(vmcs12) && + (!nested_cpu_has_vid(vmcs12) || + !nested_exit_intr_ack_set(vcpu) || + (vmcs12->posted_intr_nv & 0xff00) || + (vmcs12->posted_intr_desc_addr & 0x3f) || + (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))) + return -EINVAL; + + /* tpr shadow is needed by all apicv features. */ + if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, + unsigned long count_field, + unsigned long addr_field) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + int maxphyaddr; + u64 count, addr; + + if (vmcs12_read_any(vmcs12, count_field, &count) || + vmcs12_read_any(vmcs12, addr_field, &addr)) { + WARN_ON(1); + return -EINVAL; + } + if (count == 0) + return 0; + maxphyaddr = cpuid_maxphyaddr(vcpu); + if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || + (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { + pr_debug_ratelimited( + "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", + addr_field, maxphyaddr, count, addr); + return -EINVAL; + } + return 0; +} + +static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (vmcs12->vm_exit_msr_load_count == 0 && + vmcs12->vm_exit_msr_store_count == 0 && + vmcs12->vm_entry_msr_load_count == 0) + return 0; /* Fast path */ + if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, + VM_EXIT_MSR_LOAD_ADDR) || + nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, + VM_EXIT_MSR_STORE_ADDR) || + nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, + VM_ENTRY_MSR_LOAD_ADDR)) + return -EINVAL; + return 0; +} + +static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_pml(vmcs12)) + return 0; + + if (!nested_cpu_has_ept(vmcs12) || + !page_address_valid(vcpu, vmcs12->pml_address)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && + !nested_cpu_has_ept(vmcs12)) + return -EINVAL; + return 0; +} + +static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && + !nested_cpu_has_ept(vmcs12)) + return -EINVAL; + return 0; +} + +static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || + !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + /* x2APIC MSR accesses are not allowed */ + if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) + return -EINVAL; + if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ + e->index == MSR_IA32_UCODE_REV) + return -EINVAL; + if (e->reserved != 0) + return -EINVAL; + return 0; +} + +static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_FS_BASE || + e->index == MSR_GS_BASE || + e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +/* + * Load guest's/host's msr at nested entry/exit. + * return 0 for success, entry index for failure. + */ +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + struct msr_data msr; + + msr.host_initiated = false; + for (i = 0; i < count; i++) { + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), + &e, sizeof(e))) { + pr_debug_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + goto fail; + } + if (nested_vmx_load_msr_check(vcpu, &e)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + goto fail; + } + msr.index = e.index; + msr.data = e.value; + if (kvm_set_msr(vcpu, &msr)) { + pr_debug_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + goto fail; + } + } + return 0; +fail: + return i + 1; +} + +static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + + for (i = 0; i < count; i++) { + struct msr_data msr_info; + if (kvm_vcpu_read_guest(vcpu, + gpa + i * sizeof(e), + &e, 2 * sizeof(u32))) { + pr_debug_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + return -EINVAL; + } + if (nested_vmx_store_msr_check(vcpu, &e)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + return -EINVAL; + } + msr_info.host_initiated = false; + msr_info.index = e.index; + if (kvm_get_msr(vcpu, &msr_info)) { + pr_debug_ratelimited( + "%s cannot read MSR (%u, 0x%x)\n", + __func__, i, e.index); + return -EINVAL; + } + if (kvm_vcpu_write_guest(vcpu, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &msr_info.data, sizeof(msr_info.data))) { + pr_debug_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, msr_info.data); + return -EINVAL; + } + } + return 0; +} + +static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + unsigned long invalid_mask; + + invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + return (val & invalid_mask) == 0; +} + +/* + * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are + * emulating VM entry into a guest with EPT enabled. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, + u32 *entry_failure_code) +{ + if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { + if (!nested_cr3_valid(vcpu, cr3)) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } + + /* + * If PAE paging and EPT are both on, CR3 is not used by the CPU and + * must not be dereferenced. + */ + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && + !nested_ept) { + if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { + *entry_failure_code = ENTRY_FAIL_PDPTE; + return 1; + } + } + } + + if (!nested_ept) + kvm_mmu_new_cr3(vcpu, cr3, false); + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + kvm_init_mmu(vcpu, false); + + return 0; +} + +/* + * Returns if KVM is able to config CPU to tag TLB entries + * populated by L2 differently than TLB entries populated + * by L1. + * + * If L1 uses EPT, then TLB entries are tagged with different EPTP. + * + * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged + * with different VPID (L1 entries are tagged with vmx->vpid + * while L2 entries are tagged with vmx->nested.vpid02). + */ +static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + return nested_cpu_has_ept(vmcs12) || + (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); +} + +static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +{ + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) + return vmcs12->guest_ia32_efer; + else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) + return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); + else + return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); +} + +static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) +{ + /* + * If vmcs02 hasn't been initialized, set the constant vmcs02 state + * according to L0's settings (vmcs12 is irrelevant here). Host + * fields that come from L0 and are not constant, e.g. HOST_CR3, + * will be set as needed prior to VMLAUNCH/VMRESUME. + */ + if (vmx->nested.vmcs02_initialized) + return; + vmx->nested.vmcs02_initialized = true; + + /* + * We don't care what the EPTP value is we just need to guarantee + * it's valid so we don't get a false positive when doing early + * consistency checks. + */ + if (enable_ept && nested_early_check) + vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); + + /* All VMFUNCs are currently emulated through L0 vmexits. */ + if (cpu_has_vmx_vmfunc()) + vmcs_write64(VM_FUNCTION_CONTROL, 0); + + if (cpu_has_vmx_posted_intr()) + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); + + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); + + if (enable_pml) + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + + /* + * Set the MSR load/store lists to match L0's settings. Only the + * addresses are constant (for vmcs02), the counts can change based + * on L2's behavior, e.g. switching to/from long mode. + */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); + + vmx_set_constant_host_state(vmx); +} + +static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx, + struct vmcs12 *vmcs12) +{ + prepare_vmcs02_constant_state(vmx); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); + + if (enable_vpid) { + if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + else + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + } +} + +static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +{ + u32 exec_control, vmcs12_exec_ctrl; + u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); + + if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) + prepare_vmcs02_early_full(vmx, vmcs12); + + /* + * HOST_RSP is normally set correctly in vmx_vcpu_run() just before + * entry, but only if the current (host) sp changed from the value + * we wrote last (vmx->host_rsp). This cache is no longer relevant + * if we switch vmcs, and rather than hold a separate cache per vmcs, + * here we just force the write to happen on entry. host_rsp will + * also be written unconditionally by nested_vmx_check_vmentry_hw() + * if we are doing early consistency checks via hardware. + */ + vmx->host_rsp = 0; + + /* + * PIN CONTROLS + */ + exec_control = vmcs12->pin_based_vm_exec_control; + + /* Preemption timer setting is computed directly in vmx_vcpu_run. */ + exec_control |= vmcs_config.pin_based_exec_ctrl; + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + vmx->loaded_vmcs->hv_timer_armed = false; + + /* Posted interrupts setting is only taken from vmcs12. */ + if (nested_cpu_has_posted_intr(vmcs12)) { + vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; + vmx->nested.pi_pending = false; + } else { + exec_control &= ~PIN_BASED_POSTED_INTR; + } + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); + + /* + * EXEC CONTROLS + */ + exec_control = vmx_exec_control(vmx); /* L0's desires */ + exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + exec_control &= ~CPU_BASED_TPR_SHADOW; + exec_control |= vmcs12->cpu_based_vm_exec_control; + + /* + * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if + * nested_get_vmcs12_pages can't fix it up, the illegal value + * will result in a VM entry failure. + */ + if (exec_control & CPU_BASED_TPR_SHADOW) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } else { +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif + } + + /* + * A vmexit (to either L1 hypervisor or L0 userspace) is always needed + * for I/O port accesses. + */ + exec_control &= ~CPU_BASED_USE_IO_BITMAPS; + exec_control |= CPU_BASED_UNCOND_IO_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + + /* + * SECONDARY EXEC CONTROLS + */ + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmx->secondary_exec_control; + + /* Take the following fields only from vmcs12 */ + exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_ENABLE_VMFUNC); + if (nested_cpu_has(vmcs12, + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { + vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & + ~SECONDARY_EXEC_ENABLE_PML; + exec_control |= vmcs12_exec_ctrl; + } + + /* VMCS shadowing for L2 is emulated for now */ + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) + vmcs_write16(GUEST_INTR_STATUS, + vmcs12->guest_intr_status); + + /* + * Write an illegal value to APIC_ACCESS_ADDR. Later, + * nested_get_vmcs12_pages will either fix it up or + * remove the VM execution control. + */ + if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) + vmcs_write64(APIC_ACCESS_ADDR, -1ull); + + if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + } + + /* + * ENTRY CONTROLS + * + * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE + * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate + * on the related bits (if supported by the CPU) in the hope that + * we can avoid VMWrites during vmx_set_efer(). + */ + exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) & + ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; + if (cpu_has_load_ia32_efer) { + if (guest_efer & EFER_LMA) + exec_control |= VM_ENTRY_IA32E_MODE; + if (guest_efer != host_efer) + exec_control |= VM_ENTRY_LOAD_IA32_EFER; + } + vm_entry_controls_init(vmx, exec_control); + + /* + * EXIT CONTROLS + * + * L2->L1 exit controls are emulated - the hardware exit is to L0 so + * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER + * bits may be modified by vmx_set_efer() in prepare_vmcs02(). + */ + exec_control = vmcs_config.vmexit_ctrl; + if (cpu_has_load_ia32_efer && guest_efer != host_efer) + exec_control |= VM_EXIT_LOAD_IA32_EFER; + vm_exit_controls_init(vmx, exec_control); + + /* + * Conceptually we want to copy the PML address and index from + * vmcs01 here, and then back to vmcs01 on nested vmexit. But, + * since we always flush the log on each vmexit and never change + * the PML address (once set), this happens to be equivalent to + * simply resetting the index in vmcs02. + */ + if (enable_pml) + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + + /* + * Interrupt/Exception Fields + */ + if (vmx->nested.nested_run_pending) { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs12->vm_entry_intr_info_field); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs12->vm_entry_exception_error_code); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs12->vm_entry_instruction_len); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs12->guest_interruptibility_info); + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); + } else { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); + } +} + +static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +{ + struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { + vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); + vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); + vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); + vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); + vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); + vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); + vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); + vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); + vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); + vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); + vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); + vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); + vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); + vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); + vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); + vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); + vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); + vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); + vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); + vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); + vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); + vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); + vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); + vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); + vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); + vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); + vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); + vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); + vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); + vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); + vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); + vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); + vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); + vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + } + + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs12->guest_pending_dbg_exceptions); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + + /* + * L1 may access the L2's PDPTR, so save them to construct + * vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } + } + + if (nested_cpu_has_xsaves(vmcs12)) + vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); + + /* + * Whether page-faults are trapped is determined by a combination of + * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. + * If enable_ept, L0 doesn't care about page faults and we should + * set all of these to L1's desires. However, if !enable_ept, L0 does + * care about (at least some) page faults, and because it is not easy + * (if at all possible?) to merge L0 and L1's desires, we simply ask + * to exit on each and every L2 page fault. This is done by setting + * MASK=MATCH=0 and (see below) EB.PF=1. + * Note that below we don't need special code to set EB.PF beyond the + * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, + * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when + * !enable_ept, EB.PF is 1, so the "or" will always be 1. + */ + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, + enable_ept ? vmcs12->page_fault_error_code_mask : 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, + enable_ept ? vmcs12->page_fault_error_code_match : 0); + + if (cpu_has_vmx_apicv()) { + vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); + vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); + vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); + vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); + } + + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + + set_cr4_guest_host_mask(vmx); + + if (kvm_mpx_supported()) { + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + else + vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + } +} + +/* + * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested + * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 + * guest in a way that will both be appropriate to L1's requests, and our + * needs. In addition to modifying the active vmcs (which is vmcs02), this + * function also has additional necessary side-effects, like setting various + * vcpu->arch fields. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 *entry_failure_code) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + + if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { + prepare_vmcs02_full(vmx, vmcs12); + vmx->nested.dirty_vmcs12 = false; + } + + /* + * First, the fields that are shadowed. This must be kept in sync + * with vmx_shadow_fields.h. + */ + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { + vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); + } + + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + } + vmx_set_rflags(vcpu, vmcs12->guest_rflags); + + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); + + /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the + * bitwise-or of what L1 wants to trap for L2, and what we want to + * trap. Note that CR0.TS also needs updating - we do this later. + */ + update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { + vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); + vcpu->arch.pat = vmcs12->guest_ia32_pat; + } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); + } + + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); + + if (enable_vpid) { + /* + * There is no direct mapping between vpid02 and vpid12, the + * vpid02 is per-vCPU for L0 and reused while the value of + * vpid12 is changed w/ one invvpid during nested vmentry. + * The vpid12 is allocated by L1 for L2, so it will not + * influence global bitmap(for vpid01 and vpid02 allocation) + * even if spawn a lot of nested vCPUs. + */ + if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { + if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + vmx->nested.last_vpid = vmcs12->virtual_processor_id; + __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); + } + } else { + /* + * If L1 use EPT, then L0 needs to execute INVEPT on + * EPTP02 instead of EPTP01. Therefore, delay TLB + * flush until vmcs02->eptp is fully updated by + * KVM_REQ_LOAD_CR3. Note that this assumes + * KVM_REQ_TLB_FLUSH is evaluated after + * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). + */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + } + + if (nested_cpu_has_ept(vmcs12)) + nested_ept_init_mmu_context(vcpu); + else if (nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + vmx_flush_tlb(vcpu, true); + + /* + * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those + * bits which we consider mandatory enabled. + * The CR0_READ_SHADOW is what L2 should have expected to read given + * the specifications by L1; It's not enough to take + * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we + * have more bits than L1 expected. + */ + vmx_set_cr0(vcpu, vmcs12->guest_cr0); + vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); + + vmx_set_cr4(vcpu, vmcs12->guest_cr4); + vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); + + vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); + /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ + vmx_set_efer(vcpu, vcpu->arch.efer); + + /* + * Guest state is invalid and unrestricted guest is disabled, + * which means L1 attempted VMEntry to L2 with invalid state. + * Fail the VMEntry. + */ + if (vmx->emulation_required) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } + + /* Shadow page tables on either EPT or shadow page tables. */ + if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), + entry_failure_code)) + return 1; + + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; + + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); + kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); + return 0; +} + +static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_nmi_exiting(vmcs12) && + nested_cpu_has_virtual_nmis(vmcs12)) + return -EINVAL; + + if (!nested_cpu_has_virtual_nmis(vmcs12) && + nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)) + return -EINVAL; + + return 0; +} + +static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool ia32e; + + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_apic_access_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (!nested_cpu_has_preemption_timer(vmcs12) && + nested_cpu_has_save_preemption_timer(vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_pml_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, + vmx->nested.msrs.procbased_ctls_low, + vmx->nested.msrs.procbased_ctls_high) || + (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && + !vmx_control_verify(vmcs12->secondary_vm_exec_control, + vmx->nested.msrs.secondary_ctls_low, + vmx->nested.msrs.secondary_ctls_high)) || + !vmx_control_verify(vmcs12->pin_based_vm_exec_control, + vmx->nested.msrs.pinbased_ctls_low, + vmx->nested.msrs.pinbased_ctls_high) || + !vmx_control_verify(vmcs12->vm_exit_controls, + vmx->nested.msrs.exit_ctls_low, + vmx->nested.msrs.exit_ctls_high) || + !vmx_control_verify(vmcs12->vm_entry_controls, + vmx->nested.msrs.entry_ctls_low, + vmx->nested.msrs.entry_ctls_high)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_nmi_controls(vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_cpu_has_vmfunc(vmcs12)) { + if (vmcs12->vm_function_control & + ~vmx->nested.msrs.vmfunc_controls) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_cpu_has_eptp_switching(vmcs12)) { + if (!nested_cpu_has_ept(vmcs12) || + !page_address_valid(vcpu, vmcs12->eptp_list_address)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + } + } + + if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || + !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || + !nested_cr3_valid(vcpu, vmcs12->host_cr3)) + return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + + /* + * If the load IA32_EFER VM-exit control is 1, bits reserved in the + * IA32_EFER MSR must be 0 in the field for that register. In addition, + * the values of the LMA and LME bits in the field must each be that of + * the host address-space size VM-exit control. + */ + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { + ia32e = (vmcs12->vm_exit_controls & + VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; + if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) + return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + } + + /* + * From the Intel SDM, volume 3: + * Fields relevant to VM-entry event injection must be set properly. + * These fields are the VM-entry interruption-information field, the + * VM-entry exception error code, and the VM-entry instruction length. + */ + if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { + u32 intr_info = vmcs12->vm_entry_intr_info_field; + u8 vector = intr_info & INTR_INFO_VECTOR_MASK; + u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; + bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; + bool should_have_error_code; + bool urg = nested_cpu_has2(vmcs12, + SECONDARY_EXEC_UNRESTRICTED_GUEST); + bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; + + /* VM-entry interruption-info field: interruption type */ + if (intr_type == INTR_TYPE_RESERVED || + (intr_type == INTR_TYPE_OTHER_EVENT && + !nested_cpu_supports_monitor_trap_flag(vcpu))) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry interruption-info field: vector */ + if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || + (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || + (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry interruption-info field: deliver error code */ + should_have_error_code = + intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && + x86_exception_has_error_code(vector); + if (has_error_code != should_have_error_code) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry exception error code */ + if (has_error_code && + vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry interruption-info field: reserved bits */ + if (intr_info & INTR_INFO_RESVD_BITS_MASK) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry instruction length */ + switch (intr_type) { + case INTR_TYPE_SOFT_EXCEPTION: + case INTR_TYPE_SOFT_INTR: + case INTR_TYPE_PRIV_SW_EXCEPTION: + if ((vmcs12->vm_entry_instruction_len > 15) || + (vmcs12->vm_entry_instruction_len == 0 && + !nested_cpu_has_zero_length_injection(vcpu))) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + } + } + + if (nested_cpu_has_ept(vmcs12) && + !valid_ept_address(vcpu, vmcs12->ept_pointer)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + return 0; +} + +static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int r; + struct page *page; + struct vmcs12 *shadow; + + if (vmcs12->vmcs_link_pointer == -1ull) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) + return -EINVAL; + + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + if (is_error_page(page)) + return -EINVAL; + + r = 0; + shadow = kmap(page); + if (shadow->hdr.revision_id != VMCS12_REVISION || + shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) + r = -EINVAL; + kunmap(page); + kvm_release_page_clean(page); + return r; +} + +static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 *exit_qual) +{ + bool ia32e; + + *exit_qual = ENTRY_FAIL_DEFAULT; + + if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || + !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) + return 1; + + if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { + *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; + return 1; + } + + /* + * If the load IA32_EFER VM-entry control is 1, the following checks + * are performed on the field for the IA32_EFER MSR: + * - Bits reserved in the IA32_EFER MSR must be 0. + * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of + * the IA-32e mode guest VM-exit control. It must also be identical + * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to + * CR0.PG) is 1. + */ + if (to_vmx(vcpu)->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { + ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; + if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || + ((vmcs12->guest_cr0 & X86_CR0_PG) && + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) + return 1; + } + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && + (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || + (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) + return 1; + + return 0; +} + +static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4; + + if (!nested_early_check) + return 0; + + if (vmx->msr_autoload.host.nr) + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + if (vmx->msr_autoload.guest.nr) + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); + + preempt_disable(); + + vmx_prepare_switch_to_guest(vcpu); + + /* + * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, + * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to + * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e. + * there is no need to preserve other bits or save/restore the field. + */ + vmcs_writel(GUEST_RFLAGS, 0); + + vmcs_writel(HOST_RIP, vmx_early_consistency_check_return); + + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } + + cr4 = cr4_read_shadow(); + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { + vmcs_writel(HOST_CR4, cr4); + vmx->loaded_vmcs->host_state.cr4 = cr4; + } + + vmx->__launched = vmx->loaded_vmcs->launched; + + asm( + /* Set HOST_RSP */ + __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t" + + /* Check if vmlaunch or vmresume is needed */ + "cmpl $0, %c[launched](%0)\n\t" + "jne 1f\n\t" + __ex("vmlaunch") "\n\t" + "jmp 2f\n\t" + "1: " __ex("vmresume") "\n\t" + "2: " + /* Set vmx->fail accordingly */ + "setbe %c[fail](%0)\n\t" + + ".pushsection .rodata\n\t" + ".global vmx_early_consistency_check_return\n\t" + "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t" + ".popsection" + : + : "c"(vmx), "d"((unsigned long)HOST_RSP), + [launched]"i"(offsetof(struct vcpu_vmx, __launched)), + [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)) + : "rax", "cc", "memory" + ); + + vmcs_writel(HOST_RIP, vmx_return); + + preempt_enable(); + + if (vmx->msr_autoload.host.nr) + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + if (vmx->msr_autoload.guest.nr) + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + + if (vmx->fail) { + WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != + VMXERR_ENTRY_INVALID_CONTROL_FIELD); + vmx->fail = 0; + return 1; + } + + /* + * VMExit clears RFLAGS.IF and DR7, even on a consistency check. + */ + local_irq_enable(); + if (hw_breakpoint_active()) + set_debugreg(__this_cpu_read(cpu_dr7), 7); + + /* + * A non-failing VMEntry means we somehow entered guest mode with + * an illegal RIP, and that's just the tip of the iceberg. There + * is no telling what memory has been modified or what state has + * been exposed to unknown code. Hitting this all but guarantees + * a (very critical) hardware issue. + */ + WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & + VMX_EXIT_REASONS_FAILED_VMENTRY)); + + return 0; +} +STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw); + +static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12); + +/* + * If from_vmentry is false, this is being called from state restore (either RSM + * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. ++ * ++ * Returns: ++ * 0 - success, i.e. proceed with actual VMEnter ++ * 1 - consistency check VMExit ++ * -1 - consistency check VMFail + */ +static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + bool evaluate_pending_interrupts; + u32 exit_reason = EXIT_REASON_INVALID_STATE; + u32 exit_qual; + + evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & + (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); + if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) + evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); + + if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (kvm_mpx_supported() && + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + + vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); + + prepare_vmcs02_early(vmx, vmcs12); + + if (from_vmentry) { + nested_get_vmcs12_pages(vcpu); + + if (nested_vmx_check_vmentry_hw(vcpu)) { + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + return -1; + } + + if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + goto vmentry_fail_vmexit; + } + + enter_guest_mode(vcpu); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset += vmcs12->tsc_offset; + + if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) + goto vmentry_fail_vmexit_guest_mode; + + if (from_vmentry) { + exit_reason = EXIT_REASON_MSR_LOAD_FAIL; + exit_qual = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (exit_qual) + goto vmentry_fail_vmexit_guest_mode; + } else { + /* + * The MMU is not initialized to point at the right entities yet and + * "get pages" would need to read data from the guest (i.e. we will + * need to perform gpa to hpa translation). Request a call + * to nested_get_vmcs12_pages before the next VM-entry. The MSRs + * have already been set at vmentry time and should not be reset. + */ + kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + } + + /* + * If L1 had a pending IRQ/NMI until it executed + * VMLAUNCH/VMRESUME which wasn't delivered because it was + * disallowed (e.g. interrupts disabled), L0 needs to + * evaluate if this pending event should cause an exit from L2 + * to L1 or delivered directly to L2 (e.g. In case L1 don't + * intercept EXTERNAL_INTERRUPT). + * + * Usually this would be handled by the processor noticing an + * IRQ/NMI window request, or checking RVI during evaluation of + * pending virtual interrupts. However, this setting was done + * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 + * to perform pending event evaluation by requesting a KVM_REQ_EVENT. + */ + if (unlikely(evaluate_pending_interrupts)) + kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * Note no nested_vmx_succeed or nested_vmx_fail here. At this point + * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet + * returned as far as L1 is concerned. It will only return (and set + * the success flag) when L2 exits (see nested_vmx_vmexit()). + */ + return 0; + + /* + * A failed consistency check that leads to a VMExit during L1's + * VMEnter to L2 is a variation of a normal VMexit, as explained in + * 26.7 "VM-entry failures during or after loading guest state". + */ +vmentry_fail_vmexit_guest_mode: + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + leave_guest_mode(vcpu); + +vmentry_fail_vmexit: + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + + if (!from_vmentry) + return 1; + + load_vmcs12_host_state(vcpu, vmcs12); + vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; + vmcs12->exit_qualification = exit_qual; + if (enable_shadow_vmcs || vmx->nested.hv_evmcs) + vmx->nested.need_vmcs12_sync = true; + return 1; +} + +/* + * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 + * for running an L2 nested guest. + */ +static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); + int ret; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true)) + return 1; + + if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) + return nested_vmx_failInvalid(vcpu); + + vmcs12 = get_vmcs12(vcpu); + + /* + * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact + * that there *is* a valid VMCS pointer, RFLAGS.CF is set + * rather than RFLAGS.ZF, and no error number is stored to the + * VM-instruction error field. + */ + if (vmcs12->hdr.shadow_vmcs) + return nested_vmx_failInvalid(vcpu); + + if (vmx->nested.hv_evmcs) { + copy_enlightened_to_vmcs12(vmx); + /* Enlightened VMCS doesn't have launch state */ + vmcs12->launch_state = !launch; + } else if (enable_shadow_vmcs) { + copy_shadow_to_vmcs12(vmx); + } + + /* + * The nested entry process starts with enforcing various prerequisites + * on vmcs12 as required by the Intel SDM, and act appropriately when + * they fail: As the SDM explains, some conditions should cause the + * instruction to fail, while others will cause the instruction to seem + * to succeed, but return an EXIT_REASON_INVALID_STATE. + * To speed up the normal (success) code path, we should avoid checking + * for misconfigurations which will anyway be caught by the processor + * when using the merged vmcs02. + */ + if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) + return nested_vmx_failValid(vcpu, + VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); + + if (vmcs12->launch_state == launch) + return nested_vmx_failValid(vcpu, + launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS + : VMXERR_VMRESUME_NONLAUNCHED_VMCS); + + ret = check_vmentry_prereqs(vcpu, vmcs12); + if (ret) + return nested_vmx_failValid(vcpu, ret); + + /* + * We're finally done with prerequisite checking, and can start with + * the nested entry. + */ + vmx->nested.nested_run_pending = 1; + ret = nested_vmx_enter_non_root_mode(vcpu, true); + vmx->nested.nested_run_pending = !ret; + if (ret > 0) + return 1; + else if (ret) + return nested_vmx_failValid(vcpu, + VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + /* Hide L1D cache contents from the nested guest. */ + vmx->vcpu.arch.l1tf_flush_l1d = true; + + /* + * Must happen outside of nested_vmx_enter_non_root_mode() as it will + * also be used as part of restoring nVMX state for + * snapshot restore (migration). + * + * In this flow, it is assumed that vmcs12 cache was + * trasferred as part of captured nVMX state and should + * therefore not be read from guest memory (which may not + * exist on destination host yet). + */ + nested_cache_shadow_vmcs12(vcpu, vmcs12); + + /* + * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken + * by event injection, halt vcpu. + */ + if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && + !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) { + vmx->nested.nested_run_pending = 0; + return kvm_vcpu_halt(vcpu); + } + return 1; +} + +/* + * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date + * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK). + * This function returns the new value we should put in vmcs12.guest_cr0. + * It's not enough to just return the vmcs02 GUEST_CR0. Rather, + * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now + * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 + * didn't trap the bit, because if L1 did, so would L0). + * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have + * been modified by L2, and L1 knows it. So just leave the old value of + * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 + * isn't relevant, because if L0 traps this bit it can set it to anything. + * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have + * changed these bits, and therefore they need to be updated, but L0 + * didn't necessarily allow them to be changed in GUEST_CR0 - and rather + * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. + */ +static inline unsigned long +vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + return + /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | + /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | + /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | + vcpu->arch.cr0_guest_owned_bits)); +} + +static inline unsigned long +vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + return + /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | + /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | + /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | + vcpu->arch.cr4_guest_owned_bits)); +} + +static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 idt_vectoring; + unsigned int nr; + + if (vcpu->arch.exception.injected) { + nr = vcpu->arch.exception.nr; + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; + + if (kvm_exception_is_soft(nr)) { + vmcs12->vm_exit_instruction_len = + vcpu->arch.event_exit_inst_len; + idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; + } else + idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; + + if (vcpu->arch.exception.has_error_code) { + idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; + vmcs12->idt_vectoring_error_code = + vcpu->arch.exception.error_code; + } + + vmcs12->idt_vectoring_info_field = idt_vectoring; + } else if (vcpu->arch.nmi_injected) { + vmcs12->idt_vectoring_info_field = + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; + } else if (vcpu->arch.interrupt.injected) { + nr = vcpu->arch.interrupt.nr; + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; + + if (vcpu->arch.interrupt.soft) { + idt_vectoring |= INTR_TYPE_SOFT_INTR; + vmcs12->vm_entry_instruction_len = + vcpu->arch.event_exit_inst_len; + } else + idt_vectoring |= INTR_TYPE_EXT_INTR; + + vmcs12->idt_vectoring_info_field = idt_vectoring; + } +} + +static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qual; + bool block_nested_events = + vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); + + if (vcpu->arch.exception.pending && + nested_vmx_check_exception(vcpu, &exit_qual)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_inject_exception_vmexit(vcpu, exit_qual); + return 0; + } + + if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + vmx->nested.preemption_timer_expired) { + if (block_nested_events) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); + return 0; + } + + if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + NMI_VECTOR | INTR_TYPE_NMI_INTR | + INTR_INFO_VALID_MASK, 0); + /* + * The NMI-triggered VM exit counts as injection: + * clear this one and block further NMIs. + */ + vcpu->arch.nmi_pending = 0; + vmx_set_nmi_mask(vcpu, true); + return 0; + } + + if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && + nested_exit_on_intr(vcpu)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + return 0; + } + + vmx_complete_nested_posted_interrupt(vcpu); + return 0; +} + +static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) +{ + to_vmx(vcpu)->req_immediate_exit = true; +} + +static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + ktime_t remaining = + hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); + u64 value; + + if (ktime_to_ns(remaining) <= 0) + return 0; + + value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; + do_div(value, 1000000); + return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; +} + +/* + * Update the guest state fields of vmcs12 to reflect changes that + * occurred while L2 was running. (The "IA-32e mode guest" bit of the + * VM-entry controls is also updated, since this is really a guest + * state bit.) + */ +static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); + vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); + + vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); + vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); + + vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); + vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); + vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); + vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); + vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); + vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); + vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); + vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); + vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); + vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); + vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); + vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); + vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); + vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); + vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); + vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); + vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); + vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); + vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); + vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); + vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); + vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); + vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); + vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); + vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); + vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); + vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); + vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); + vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); + vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); + vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); + vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); + vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); + vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); + vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); + vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); + + vmcs12->guest_interruptibility_info = + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + vmcs12->guest_pending_dbg_exceptions = + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; + else + vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; + + if (nested_cpu_has_preemption_timer(vmcs12)) { + if (vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + } + + /* + * In some cases (usually, nested EPT), L2 is allowed to change its + * own CR3 without exiting. If it has changed it, we must keep it. + * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined + * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. + * + * Additionally, restore L2's PDPTR to vmcs12. + */ + if (enable_ept) { + vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); + vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); + vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); + vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); + vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + } + + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + + if (nested_cpu_has_vid(vmcs12)) + vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); + + vmcs12->vm_entry_controls = + (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | + (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); + + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { + kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + } + + /* TODO: These cannot have changed unless we have MSR bitmaps and + * the relevant bit asks not to trap the change */ + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) + vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) + vmcs12->guest_ia32_efer = vcpu->arch.efer; + vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); + vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); + vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + if (kvm_mpx_supported()) + vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); +} + +/* + * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits + * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), + * and this function updates it to reflect the changes to the guest state while + * L2 was running (and perhaps made some exits which were handled directly by L0 + * without going back to L1), and to reflect the exit reason. + * Note that we do not have to copy here all VMCS fields, just those that + * could have changed by the L2 guest or the exit - i.e., the guest-state and + * exit-information fields only. Other fields are modified by L1 with VMWRITE, + * which already writes to vmcs12 directly. + */ +static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 exit_reason, u32 exit_intr_info, + unsigned long exit_qualification) +{ + /* update guest state fields: */ + sync_vmcs12(vcpu, vmcs12); + + /* update exit information fields: */ + + vmcs12->vm_exit_reason = exit_reason; + vmcs12->exit_qualification = exit_qualification; + vmcs12->vm_exit_intr_info = exit_intr_info; + + vmcs12->idt_vectoring_info_field = 0; + vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + + if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + vmcs12->launch_state = 1; + + /* vm_entry_intr_info_field is cleared on exit. Emulate this + * instead of reading the real value. */ + vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; + + /* + * Transfer the event that L0 or L1 may wanted to inject into + * L2 to IDT_VECTORING_INFO_FIELD. + */ + vmcs12_save_pending_event(vcpu, vmcs12); + } + + /* + * Drop what we picked up for L2 via vmx_complete_interrupts. It is + * preserved above and would only end up incorrectly in L1. + */ + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); +} + +/* + * A part of what we need to when the nested L2 guest exits and we want to + * run its L1 parent, is to reset L1's guest state to the host state specified + * in vmcs12. + * This function is to be called not only on normal nested exit, but also on + * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry + * Failures During or After Loading Guest State"). + * This function should be called when the active VMCS is L1's (vmcs01). + */ +static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct kvm_segment seg; + u32 entry_failure_code; + + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) + vcpu->arch.efer = vmcs12->host_ia32_efer; + else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + vcpu->arch.efer |= (EFER_LMA | EFER_LME); + else + vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); + vmx_set_efer(vcpu, vcpu->arch.efer); + + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); + kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); + vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); + vmx_set_interrupt_shadow(vcpu, 0); + + /* + * Note that calling vmx_set_cr0 is important, even if cr0 hasn't + * actually changed, because vmx_set_cr0 refers to efer set above. + * + * CR0_GUEST_HOST_MASK is already set in the original vmcs01 + * (KVM doesn't change it); + */ + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmx_set_cr0(vcpu, vmcs12->host_cr0); + + /* Same as above - no reason to call set_cr4_guest_host_mask(). */ + vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); + vmx_set_cr4(vcpu, vmcs12->host_cr4); + + nested_ept_uninit_mmu_context(vcpu); + + /* + * Only PDPTE load can fail as the value of cr3 was checked on entry and + * couldn't have changed. + */ + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); + + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + + /* + * If vmcs01 doesn't use VPID, CPU flushes TLB on every + * VMEntry/VMExit. Thus, no need to flush TLB. + * + * If vmcs12 doesn't use VPID, L1 expects TLB to be + * flushed on every VMEntry/VMExit. + * + * Otherwise, we can preserve TLB entries as long as we are + * able to tag L1 TLB entries differently than L2 TLB entries. + * + * If vmcs12 uses EPT, we need to execute this flush on EPTP01 + * and therefore we request the TLB flush to happen only after VMCS EPTP + * has been set by KVM_REQ_LOAD_CR3. + */ + if (enable_vpid && + (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); + vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); + vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); + vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); + + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, 0); + + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); + vcpu->arch.pat = vmcs12->host_ia32_pat; + } + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl); + + /* Set L1 segment info according to Intel SDM + 27.5.2 Loading Host Segment and Descriptor-Table Registers */ + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .selector = vmcs12->host_cs_selector, + .type = 11, + .present = 1, + .s = 1, + .g = 1 + }; + if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + seg.l = 1; + else + seg.db = 1; + vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .type = 3, + .present = 1, + .s = 1, + .db = 1, + .g = 1 + }; + seg.selector = vmcs12->host_ds_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + seg.selector = vmcs12->host_es_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + seg.selector = vmcs12->host_ss_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + seg.selector = vmcs12->host_fs_selector; + seg.base = vmcs12->host_fs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + seg.selector = vmcs12->host_gs_selector; + seg.base = vmcs12->host_gs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + seg = (struct kvm_segment) { + .base = vmcs12->host_tr_base, + .limit = 0x67, + .selector = vmcs12->host_tr_selector, + .type = 11, + .present = 1 + }; + vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + + kvm_set_dr(vcpu, 7, 0x400); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_msr_bitmap()) + vmx_update_msr_bitmap(vcpu); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); +} + +static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) +{ + struct shared_msr_entry *efer_msr; + unsigned int i; + + if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) + return vmcs_read64(GUEST_IA32_EFER); + + if (cpu_has_load_ia32_efer) + return host_efer; + + for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { + if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) + return vmx->msr_autoload.guest.val[i].value; + } + + efer_msr = find_msr_entry(vmx, MSR_EFER); + if (efer_msr) + return efer_msr->data; + + return host_efer; +} + +static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmx_msr_entry g, h; + struct msr_data msr; + gpa_t gpa; + u32 i, j; + + vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); + + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { + /* + * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set + * as vmcs01.GUEST_DR7 contains a userspace defined value + * and vcpu->arch.dr7 is not squirreled away before the + * nested VMENTER (not worth adding a variable in nested_vmx). + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + kvm_set_dr(vcpu, 7, DR7_FIXED_1); + else + WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); + } + + /* + * Note that calling vmx_set_{efer,cr0,cr4} is important as they + * handle a variety of side effects to KVM's software model. + */ + vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); + + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); + + vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); + vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); + + nested_ept_uninit_mmu_context(vcpu); + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + /* + * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs + * from vmcs01 (if necessary). The PDPTRs are not loaded on + * VMFail, like everything else we just need to ensure our + * software model is up-to-date. + */ + ept_save_pdptrs(vcpu); + + kvm_mmu_reset_context(vcpu); + + if (cpu_has_vmx_msr_bitmap()) + vmx_update_msr_bitmap(vcpu); + + /* + * This nasty bit of open coding is a compromise between blindly + * loading L1's MSRs using the exit load lists (incorrect emulation + * of VMFail), leaving the nested VM's MSRs in the software model + * (incorrect behavior) and snapshotting the modified MSRs (too + * expensive since the lists are unbound by hardware). For each + * MSR that was (prematurely) loaded from the nested VMEntry load + * list, reload it from the exit load list if it exists and differs + * from the guest value. The intent is to stuff host state as + * silently as possible, not to fully process the exit load list. + */ + msr.host_initiated = false; + for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { + gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); + if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { + pr_debug_ratelimited( + "%s read MSR index failed (%u, 0x%08llx)\n", + __func__, i, gpa); + goto vmabort; + } + + for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { + gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); + if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { + pr_debug_ratelimited( + "%s read MSR failed (%u, 0x%08llx)\n", + __func__, j, gpa); + goto vmabort; + } + if (h.index != g.index) + continue; + if (h.value == g.value) + break; + + if (nested_vmx_load_msr_check(vcpu, &h)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, j, h.index, h.reserved); + goto vmabort; + } + + msr.index = h.index; + msr.data = h.value; + if (kvm_set_msr(vcpu, &msr)) { + pr_debug_ratelimited( + "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", + __func__, j, h.index, h.value); + goto vmabort; + } + } + } + + return; + +vmabort: + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); +} + +/* + * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 + * and modify vmcs12 to make it see what it would expect to see there if + * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) + */ +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + /* trying to cancel vmlaunch/vmresume is a bug */ + WARN_ON_ONCE(vmx->nested.nested_run_pending); + + leave_guest_mode(vcpu); + + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + + if (likely(!vmx->fail)) { + if (exit_reason == -1) + sync_vmcs12(vcpu, vmcs12); + else + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); + + /* + * Must happen outside of sync_vmcs12() as it will + * also be used to capture vmcs12 cache as part of + * capturing nVMX state for snapshot (migration). + * + * Otherwise, this flush will dirty guest memory at a + * point it is already assumed by user-space to be + * immutable. + */ + nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); + + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + } else { + /* + * The only expected VM-instruction error is "VM entry with + * invalid control field(s)." Anything else indicates a + * problem with L0. And we should never get here with a + * VMFail of any type if early consistency checks are enabled. + */ + WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != + VMXERR_ENTRY_INVALID_CONTROL_FIELD); + WARN_ON_ONCE(nested_early_check); + } + + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + + /* Update any VMCS fields that might have changed while L2 ran */ + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); + + if (vmx->nested.change_vmcs01_virtual_apic_mode) { + vmx->nested.change_vmcs01_virtual_apic_mode = false; + vmx_set_virtual_apic_mode(vcpu); + } else if (!nested_cpu_has_ept(vmcs12) && + nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + vmx_flush_tlb(vcpu, true); + } + + /* This is needed for same reason as it was needed in prepare_vmcs02 */ + vmx->host_rsp = 0; + + /* Unpin physical memory we referred to in vmcs02 */ + if (vmx->nested.apic_access_page) { + kvm_release_page_dirty(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = NULL; + } + if (vmx->nested.virtual_apic_page) { + kvm_release_page_dirty(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; + } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + kvm_release_page_dirty(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; + } + + /* + * We are now running in L2, mmu_notifier will force to reload the + * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. + */ + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + + if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) + vmx->nested.need_vmcs12_sync = true; + + /* in case we halted in L2 */ + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + if (likely(!vmx->fail)) { + /* + * TODO: SDM says that with acknowledge interrupt on + * exit, bit 31 of the VM-exit interrupt information + * (valid interrupt) is always set to 1 on + * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't + * need kvm_cpu_has_interrupt(). See the commit + * message for details. + */ + if (nested_exit_intr_ack_set(vcpu) && + exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + kvm_cpu_has_interrupt(vcpu)) { + int irq = kvm_cpu_get_interrupt(vcpu); + WARN_ON(irq < 0); + vmcs12->vm_exit_intr_info = irq | + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + } + + if (exit_reason != -1) + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); + + load_vmcs12_host_state(vcpu, vmcs12); + + return; + } + + /* + * After an early L2 VM-entry failure, we're now back + * in L1 which thinks it just finished a VMLAUNCH or + * VMRESUME instruction, so we need to set the failure + * flag and the VM-instruction error field of the VMCS + * accordingly, and skip the emulated instruction. + */ + (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + /* + * Restore L1's host state to KVM's software model. We're here + * because a consistency check was caught by hardware, which + * means some amount of guest state has been propagated to KVM's + * model and needs to be unwound to the host's state. + */ + nested_vmx_restore_host_state(vcpu); + + vmx->fail = 0; +} + +/* + * Forcibly leave nested mode in order to be able to reset the VCPU later on. + */ +static void vmx_leave_nested(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.nested_run_pending = 0; + nested_vmx_vmexit(vcpu, -1, 0, 0); + } + free_nested(vcpu); +} + +static int vmx_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + /* + * RDPID causes #UD if disabled through secondary execution controls. + * Because it is marked as EmulateOnUD, we need to intercept it here. + */ + if (info->intercept == x86_intercept_rdtscp && + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { + ctxt->exception.vector = UD_VECTOR; + ctxt->exception.error_code_valid = false; + return X86EMUL_PROPAGATE_FAULT; + } + + /* TODO: check more intercepts... */ + return X86EMUL_CONTINUE; +} + +#ifdef CONFIG_X86_64 +/* (a << shift) / divisor, return 1 if overflow otherwise 0 */ +static inline int u64_shl_div_u64(u64 a, unsigned int shift, + u64 divisor, u64 *result) +{ + u64 low = a << shift, high = a >> (64 - shift); + + /* To avoid the overflow on divq */ + if (high >= divisor) + return 1; + + /* Low hold the result, high hold rem which is discarded */ + asm("divq %2\n\t" : "=a" (low), "=d" (high) : + "rm" (divisor), "0" (low), "1" (high)); + *result = low; + + return 0; +} + +static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) +{ + struct vcpu_vmx *vmx; + u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; + + if (kvm_mwait_in_guest(vcpu->kvm)) + return -EOPNOTSUPP; + + vmx = to_vmx(vcpu); + tscl = rdtsc(); + guest_tscl = kvm_read_l1_tsc(vcpu, tscl); + delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; + lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns); + + if (delta_tsc > lapic_timer_advance_cycles) + delta_tsc -= lapic_timer_advance_cycles; + else + delta_tsc = 0; + + /* Convert to host delta tsc if tsc scaling is enabled */ + if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + u64_shl_div_u64(delta_tsc, + kvm_tsc_scaling_ratio_frac_bits, + vcpu->arch.tsc_scaling_ratio, + &delta_tsc)) + return -ERANGE; + + /* + * If the delta tsc can't fit in the 32 bit after the multi shift, + * we can't use the preemption timer. + * It's possible that it fits on later vmentries, but checking + * on every vmentry is costly so we just use an hrtimer. + */ + if (delta_tsc >> (cpu_preemption_timer_multi + 32)) + return -ERANGE; + + vmx->hv_deadline_tsc = tscl + delta_tsc; + return delta_tsc == 0; +} + +static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) +{ + to_vmx(vcpu)->hv_deadline_tsc = -1; +} +#endif + +static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) +{ + if (!kvm_pause_in_guest(vcpu->kvm)) + shrink_ple_window(vcpu); +} + +static void vmx_slot_enable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); + kvm_mmu_slot_largepage_remove_write_access(kvm, slot); +} + +static void vmx_slot_disable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_set_dirty(kvm, slot); +} + +static void vmx_flush_log_dirty(struct kvm *kvm) +{ + kvm_flush_pml_buffers(kvm); +} + +static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gpa_t gpa; + struct page *page = NULL; + u64 *pml_address; + + if (is_guest_mode(vcpu)) { + WARN_ON_ONCE(vmx->nested.pml_full); + + /* + * Check if PML is enabled for the nested guest. + * Whether eptp bit 6 is set is already checked + * as part of A/D emulation. + */ + vmcs12 = get_vmcs12(vcpu); + if (!nested_cpu_has_pml(vmcs12)) + return 0; + + if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { + vmx->nested.pml_full = true; + return 1; + } + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; + + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address); + if (is_error_page(page)) + return 0; + + pml_address = kmap(page); + pml_address[vmcs12->guest_pml_index--] = gpa; + kunmap(page); + kvm_release_page_clean(page); + } + + return 0; +} + +static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *memslot, + gfn_t offset, unsigned long mask) +{ + kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); +} + +static void __pi_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + do { + old.control = new.control = pi_desc->control; + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the VCPU is blocked\n"); + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + vcpu->pre_pcpu = -1; + } +} + +/* + * This routine does the following things for vCPU which is going + * to be blocked if VT-d PI is enabled. + * - Store the vCPU to the wakeup list, so when interrupts happen + * we can find the right vCPU to wake up. + * - Change the Posted-interrupt descriptor as below: + * 'NDST' <-- vcpu->pre_pcpu + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR + * - If 'ON' is set during this process, which means at least one + * interrupt is posted for this vCPU, we cannot block it, in + * this case, return 1, otherwise, return 0. + * + */ +static int pi_pre_block(struct kvm_vcpu *vcpu) +{ + unsigned int dest; + struct pi_desc old, new; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) + return 0; + + WARN_ON(irqs_disabled()); + local_irq_disable(); + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + } + + do { + old.control = new.control = pi_desc->control; + + WARN((pi_desc->sn == 1), + "Warning: SN field of posted-interrupts " + "is set before blocking\n"); + + /* + * Since vCPU can be preempted during this process, + * vcpu->cpu could be different with pre_pcpu, we + * need to set pre_pcpu as the destination of wakeup + * notification event, then we can find the right vCPU + * to wakeup in wakeup handler if interrupts happen + * when the vCPU is in blocked state. + */ + dest = cpu_physical_id(vcpu->pre_pcpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'wakeup vector' */ + new.nv = POSTED_INTR_WAKEUP_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + /* We should not block the vCPU if an interrupt is posted for it. */ + if (pi_test_on(pi_desc) == 1) + __pi_post_block(vcpu); + + local_irq_enable(); + return (vcpu->pre_pcpu == -1); +} + +static int vmx_pre_block(struct kvm_vcpu *vcpu) +{ + if (pi_pre_block(vcpu)) + return 1; + + if (kvm_lapic_hv_timer_in_use(vcpu)) + kvm_lapic_switch_to_sw_timer(vcpu); + + return 0; +} + +static void pi_post_block(struct kvm_vcpu *vcpu) +{ + if (vcpu->pre_pcpu == -1) + return; + + WARN_ON(irqs_disabled()); + local_irq_disable(); + __pi_post_block(vcpu); + local_irq_enable(); +} + +static void vmx_post_block(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->set_hv_timer) + kvm_lapic_switch_to_hv_timer(vcpu); + + pi_post_block(vcpu); +} + +/* + * vmx_update_pi_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; + int idx, ret = 0; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(kvm->vcpus[0])) + return 0; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. + */ + + kvm_set_msi_irq(kvm, e, &irq); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + /* + * Make sure the IRTE is in remapped mode if + * we don't handle it in posted mode. + */ + ret = irq_set_vcpu_affinity(host_irq, NULL); + if (ret < 0) { + printk(KERN_INFO + "failed to back to remapped mode, irq: %u\n", + host_irq); + goto out; + } + + continue; + } + + vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); + vcpu_info.vector = irq.vector; + + trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, + vcpu_info.vector, vcpu_info.pi_desc_addr, set); + + if (set) + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + else + ret = irq_set_vcpu_affinity(host_irq, NULL); + + if (ret < 0) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + +static void vmx_setup_mce(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_LMCE; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_LMCE; +} + +static int vmx_smi_allowed(struct kvm_vcpu *vcpu) +{ + /* we need a nested vmexit to enter SMM, postpone if run is pending */ + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; + return 1; +} + +static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmx->nested.smm.guest_mode = is_guest_mode(vcpu); + if (vmx->nested.smm.guest_mode) + nested_vmx_vmexit(vcpu, -1, 0, 0); + + vmx->nested.smm.vmxon = vmx->nested.vmxon; + vmx->nested.vmxon = false; + vmx_clear_hlt(vcpu); + return 0; +} + +static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int ret; + + if (vmx->nested.smm.vmxon) { + vmx->nested.vmxon = true; + vmx->nested.smm.vmxon = false; + } + + if (vmx->nested.smm.guest_mode) { + vcpu->arch.hflags &= ~HF_SMM_MASK; + ret = nested_vmx_enter_non_root_mode(vcpu, false); + vcpu->arch.hflags |= HF_SMM_MASK; + if (ret) + return ret; + + vmx->nested.smm.guest_mode = false; + } + return 0; +} + +static int enable_smi_window(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * In case we do two consecutive get/set_nested_state()s while L2 was + * running hv_evmcs may end up not being mapped (we map it from + * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode() as we always + * have vmcs12 if it is true. + */ + return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull || + vmx->nested.hv_evmcs; +} + +static int vmx_get_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + u32 user_data_size) +{ + struct vcpu_vmx *vmx; + struct vmcs12 *vmcs12; + struct kvm_nested_state kvm_state = { + .flags = 0, + .format = 0, + .size = sizeof(kvm_state), + .vmx.vmxon_pa = -1ull, + .vmx.vmcs_pa = -1ull, + }; + + if (!vcpu) + return kvm_state.size + 2 * VMCS12_SIZE; + + vmx = to_vmx(vcpu); + vmcs12 = get_vmcs12(vcpu); + + if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled) + kvm_state.flags |= KVM_STATE_NESTED_EVMCS; + + if (nested_vmx_allowed(vcpu) && + (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { + kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; + kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; + + if (vmx_has_valid_vmcs12(vcpu)) { + kvm_state.size += VMCS12_SIZE; + + if (is_guest_mode(vcpu) && + nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) + kvm_state.size += VMCS12_SIZE; + } + + if (vmx->nested.smm.vmxon) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; + + if (vmx->nested.smm.guest_mode) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; + + if (is_guest_mode(vcpu)) { + kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; + + if (vmx->nested.nested_run_pending) + kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + } + } + + if (user_data_size < kvm_state.size) + goto out; + + if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) + return -EFAULT; + + if (!vmx_has_valid_vmcs12(vcpu)) + goto out; + + /* + * When running L2, the authoritative vmcs12 state is in the + * vmcs02. When running L1, the authoritative vmcs12 state is + * in the shadow or enlightened vmcs linked to vmcs01, unless + * need_vmcs12_sync is set, in which case, the authoritative + * vmcs12 state is in the vmcs12 already. + */ + if (is_guest_mode(vcpu)) { + sync_vmcs12(vcpu, vmcs12); + } else if (!vmx->nested.need_vmcs12_sync) { + if (vmx->nested.hv_evmcs) + copy_enlightened_to_vmcs12(vmx); + else if (enable_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + } + + if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) + return -EFAULT; + + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, + get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) + return -EFAULT; + } + +out: + return kvm_state.size; +} + +static int vmx_set_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12; + u32 exit_qual; + int ret; + + if (kvm_state->format != 0) + return -EINVAL; + + if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) + nested_enable_evmcs(vcpu, NULL); + + if (!nested_vmx_allowed(vcpu)) + return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL; + + if (kvm_state->vmx.vmxon_pa == -1ull) { + if (kvm_state->vmx.smm.flags) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa != -1ull) + return -EINVAL; + + vmx_leave_nested(vcpu); + return 0; + } + + if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + if (kvm_state->vmx.smm.flags & + ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + /* + * SMM temporarily disables VMX, so we cannot be in guest mode, + * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags + * must be zero. + */ + if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + vmx_leave_nested(vcpu); + if (kvm_state->vmx.vmxon_pa == -1ull) + return 0; + + vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; + + /* Empty 'VMXON' state is permitted */ + if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) + return 0; + + if (kvm_state->vmx.vmcs_pa != -1ull) { + if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || + !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) + return -EINVAL; + + set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); + } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { + /* + * Sync eVMCS upon entry as we may not have + * HV_X64_MSR_VP_ASSIST_PAGE set up yet. + */ + vmx->nested.need_vmcs12_sync = true; + } else { + return -EINVAL; + } + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { + vmx->nested.smm.vmxon = true; + vmx->nested.vmxon = false; + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) + vmx->nested.smm.guest_mode = true; + } + + vmcs12 = get_vmcs12(vcpu); + if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) + return -EFAULT; + + if (vmcs12->hdr.revision_id != VMCS12_REVISION) + return -EINVAL; + + if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return 0; + + vmx->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); + if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) + return -EINVAL; + + if (copy_from_user(shadow_vmcs12, + user_kvm_nested_state->data + VMCS12_SIZE, + sizeof(*vmcs12))) + return -EFAULT; + + if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || + !shadow_vmcs12->hdr.shadow_vmcs) + return -EINVAL; + } + + if (check_vmentry_prereqs(vcpu, vmcs12) || + check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + return -EINVAL; + + vmx->nested.dirty_vmcs12 = true; + ret = nested_vmx_enter_non_root_mode(vcpu, false); + if (ret) + return -EINVAL; + + return 0; +} + +static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { + .cpu_has_kvm_support = cpu_has_kvm_support, + .disabled_by_bios = vmx_disabled_by_bios, + .hardware_setup = hardware_setup, + .hardware_unsetup = hardware_unsetup, + .check_processor_compatibility = vmx_check_processor_compat, + .hardware_enable = hardware_enable, + .hardware_disable = hardware_disable, + .cpu_has_accelerated_tpr = report_flexpriority, + .has_emulated_msr = vmx_has_emulated_msr, + + .vm_init = vmx_vm_init, + .vm_alloc = vmx_vm_alloc, + .vm_free = vmx_vm_free, + + .vcpu_create = vmx_create_vcpu, + .vcpu_free = vmx_free_vcpu, + .vcpu_reset = vmx_vcpu_reset, + + .prepare_guest_switch = vmx_prepare_switch_to_guest, + .vcpu_load = vmx_vcpu_load, + .vcpu_put = vmx_vcpu_put, + + .update_bp_intercept = update_exception_bitmap, + .get_msr_feature = vmx_get_msr_feature, + .get_msr = vmx_get_msr, + .set_msr = vmx_set_msr, + .get_segment_base = vmx_get_segment_base, + .get_segment = vmx_get_segment, + .set_segment = vmx_set_segment, + .get_cpl = vmx_get_cpl, + .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, + .decache_cr3 = vmx_decache_cr3, + .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, + .set_cr0 = vmx_set_cr0, + .set_cr3 = vmx_set_cr3, + .set_cr4 = vmx_set_cr4, + .set_efer = vmx_set_efer, + .get_idt = vmx_get_idt, + .set_idt = vmx_set_idt, + .get_gdt = vmx_get_gdt, + .set_gdt = vmx_set_gdt, + .get_dr6 = vmx_get_dr6, + .set_dr6 = vmx_set_dr6, + .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, + .cache_reg = vmx_cache_reg, + .get_rflags = vmx_get_rflags, + .set_rflags = vmx_set_rflags, + + .tlb_flush = vmx_flush_tlb, + .tlb_flush_gva = vmx_flush_tlb_gva, + + .run = vmx_vcpu_run, + .handle_exit = vmx_handle_exit, + .skip_emulated_instruction = skip_emulated_instruction, + .set_interrupt_shadow = vmx_set_interrupt_shadow, + .get_interrupt_shadow = vmx_get_interrupt_shadow, + .patch_hypercall = vmx_patch_hypercall, + .set_irq = vmx_inject_irq, + .set_nmi = vmx_inject_nmi, + .queue_exception = vmx_queue_exception, + .cancel_injection = vmx_cancel_injection, + .interrupt_allowed = vmx_interrupt_allowed, + .nmi_allowed = vmx_nmi_allowed, + .get_nmi_mask = vmx_get_nmi_mask, + .set_nmi_mask = vmx_set_nmi_mask, + .enable_nmi_window = enable_nmi_window, + .enable_irq_window = enable_irq_window, + .update_cr8_intercept = update_cr8_intercept, + .set_virtual_apic_mode = vmx_set_virtual_apic_mode, + .set_apic_access_page_addr = vmx_set_apic_access_page_addr, + .get_enable_apicv = vmx_get_enable_apicv, + .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, + .load_eoi_exitmap = vmx_load_eoi_exitmap, + .apicv_post_state_restore = vmx_apicv_post_state_restore, + .hwapic_irr_update = vmx_hwapic_irr_update, + .hwapic_isr_update = vmx_hwapic_isr_update, + .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, + .sync_pir_to_irr = vmx_sync_pir_to_irr, + .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + + .set_tss_addr = vmx_set_tss_addr, + .set_identity_map_addr = vmx_set_identity_map_addr, + .get_tdp_level = get_ept_level, + .get_mt_mask = vmx_get_mt_mask, + + .get_exit_info = vmx_get_exit_info, + + .get_lpage_level = vmx_get_lpage_level, + + .cpuid_update = vmx_cpuid_update, + + .rdtscp_supported = vmx_rdtscp_supported, + .invpcid_supported = vmx_invpcid_supported, + + .set_supported_cpuid = vmx_set_supported_cpuid, + + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + + .read_l1_tsc_offset = vmx_read_l1_tsc_offset, + .write_l1_tsc_offset = vmx_write_l1_tsc_offset, + + .set_tdp_cr3 = vmx_set_cr3, + + .check_intercept = vmx_check_intercept, + .handle_external_intr = vmx_handle_external_intr, + .mpx_supported = vmx_mpx_supported, + .xsaves_supported = vmx_xsaves_supported, + .umip_emulated = vmx_umip_emulated, + + .check_nested_events = vmx_check_nested_events, + .request_immediate_exit = vmx_request_immediate_exit, + + .sched_in = vmx_sched_in, + + .slot_enable_log_dirty = vmx_slot_enable_log_dirty, + .slot_disable_log_dirty = vmx_slot_disable_log_dirty, + .flush_log_dirty = vmx_flush_log_dirty, + .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .write_log_dirty = vmx_write_pml_buffer, + + .pre_block = vmx_pre_block, + .post_block = vmx_post_block, + + .pmu_ops = &intel_pmu_ops, + + .update_pi_irte = vmx_update_pi_irte, + +#ifdef CONFIG_X86_64 + .set_hv_timer = vmx_set_hv_timer, + .cancel_hv_timer = vmx_cancel_hv_timer, +#endif + + .setup_mce = vmx_setup_mce, + + .get_nested_state = vmx_get_nested_state, + .set_nested_state = vmx_set_nested_state, + .get_vmcs12_pages = nested_get_vmcs12_pages, + + .smi_allowed = vmx_smi_allowed, + .pre_enter_smm = vmx_pre_enter_smm, + .pre_leave_smm = vmx_pre_leave_smm, + .enable_smi_window = enable_smi_window, + + .nested_enable_evmcs = nested_enable_evmcs, +}; + +static void vmx_cleanup_l1d_flush(void) +{ + if (vmx_l1d_flush_pages) { + free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); + vmx_l1d_flush_pages = NULL; + } + /* Restore state so sysfs ignores VMX */ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +} + +static void vmx_exit(void) +{ +#ifdef CONFIG_KEXEC_CORE + RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); + synchronize_rcu(); +#endif + + kvm_exit(); + +#if IS_ENABLED(CONFIG_HYPERV) + if (static_branch_unlikely(&enable_evmcs)) { + int cpu; + struct hv_vp_assist_page *vp_ap; + /* + * Reset everything to support using non-enlightened VMCS + * access later (e.g. when we reload the module with + * enlightened_vmcs=0) + */ + for_each_online_cpu(cpu) { + vp_ap = hv_get_vp_assist_page(cpu); + + if (!vp_ap) + continue; + + vp_ap->current_nested_vmcs = 0; + vp_ap->enlighten_vmentry = 0; + } + + static_branch_disable(&enable_evmcs); + } +#endif + vmx_cleanup_l1d_flush(); +} +module_exit(vmx_exit); + +static int __init vmx_init(void) +{ + int r; + +#if IS_ENABLED(CONFIG_HYPERV) + /* + * Enlightened VMCS usage should be recommended and the host needs + * to support eVMCS v1 or above. We can also disable eVMCS support + * with module parameter. + */ + if (enlightened_vmcs && + ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= + KVM_EVMCS_VERSION) { + int cpu; + + /* Check that we have assist pages on all online CPUs */ + for_each_online_cpu(cpu) { + if (!hv_get_vp_assist_page(cpu)) { + enlightened_vmcs = false; + break; + } + } + + if (enlightened_vmcs) { + pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); + static_branch_enable(&enable_evmcs); + } + } else { + enlightened_vmcs = false; + } +#endif + + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), + __alignof__(struct vcpu_vmx), THIS_MODULE); + if (r) + return r; + + /* + * Must be called after kvm_init() so enable_ept is properly set + * up. Hand the parameter mitigation value in which was stored in + * the pre module init parser. If no parameter was given, it will + * contain 'auto' which will be turned into the default 'cond' + * mitigation mode. + */ + if (boot_cpu_has(X86_BUG_L1TF)) { + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + if (r) { + vmx_exit(); + return r; + } + } + +#ifdef CONFIG_KEXEC_CORE + rcu_assign_pointer(crash_vmclear_loaded_vmcss, + crash_vmclear_local_loaded_vmcss); +#endif + vmx_check_vmcs12_offsets(); + + return 0; +} +module_init(vmx_init); diff --git a/arch/x86/kvm/vmx/vmx_evmcs.h b/arch/x86/kvm/vmx/vmx_evmcs.h new file mode 100644 index 000000000000..210a884090ad --- /dev/null +++ b/arch/x86/kvm/vmx/vmx_evmcs.h @@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_EVMCS_H +#define __KVM_X86_VMX_EVMCS_H + +#include + +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) +#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) +#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ + {EVMCS1_OFFSET(name), clean_field} + +struct evmcs_field { + u16 offset; + u16 clean_field; +}; + +static const struct evmcs_field vmcs_field_to_evmcs_1[] = { + /* 64 bit rw */ + EVMCS1_FIELD(GUEST_RIP, guest_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_RSP, guest_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR0, host_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR3, host_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR4, host_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_RIP, host_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), + EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(TSC_OFFSET, tsc_offset, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR0, guest_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR3, guest_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR4, guest_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_DR7, guest_dr7, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_RSP, host_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(EPT_POINTER, ept_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), + EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + + /* 64 bit read only */ + EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + /* + * Not defined in KVM: + * + * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + */ + EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* + * No mask defined in the spec as Hyper-V doesn't currently support + * these. Future proof by resetting the whole clean field mask on + * access. + */ + EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 32 bit rw */ + EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), + EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), + EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), + EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, + vm_entry_exception_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + + /* 32 bit read only */ + EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* No mask defined in the spec (not used) */ + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 16 bit rw */ + EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), +}; + +static __always_inline int get_evmcs_offset(unsigned long field, + u16 *clean_field) +{ + unsigned int index = ROL16(field, 6); + const struct evmcs_field *evmcs_field; + + if (unlikely(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1))) { + WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n", + field); + return -ENOENT; + } + + evmcs_field = &vmcs_field_to_evmcs_1[index]; + + if (clean_field) + *clean_field = evmcs_field->clean_field; + + return evmcs_field->offset; +} + +#undef ROL16 + +#endif /* __KVM_X86_VMX_EVMCS_H */ diff --git a/arch/x86/kvm/vmx/vmx_shadow_fields.h b/arch/x86/kvm/vmx/vmx_shadow_fields.h new file mode 100644 index 000000000000..132432f375c2 --- /dev/null +++ b/arch/x86/kvm/vmx/vmx_shadow_fields.h @@ -0,0 +1,74 @@ +#ifndef SHADOW_FIELD_RO +#define SHADOW_FIELD_RO(x) +#endif +#ifndef SHADOW_FIELD_RW +#define SHADOW_FIELD_RW(x) +#endif + +/* + * We do NOT shadow fields that are modified when L0 + * traps and emulates any vmx instruction (e.g. VMPTRLD, + * VMXON...) executed by L1. + * For example, VM_INSTRUCTION_ERROR is read + * by L1 if a vmx instruction fails (part of the error path). + * Note the code assumes this logic. If for some reason + * we start shadowing these fields then we need to + * force a shadow sync when L0 emulates vmx instructions + * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified + * by nested_vmx_failValid) + * + * When adding or removing fields here, note that shadowed + * fields must always be synced by prepare_vmcs02, not just + * prepare_vmcs02_full. + */ + +/* + * Keeping the fields ordered by size is an attempt at improving + * branch prediction in vmcs_read_any and vmcs_write_any. + */ + +/* 16-bits */ +SHADOW_FIELD_RW(GUEST_INTR_STATUS) +SHADOW_FIELD_RW(GUEST_PML_INDEX) +SHADOW_FIELD_RW(HOST_FS_SELECTOR) +SHADOW_FIELD_RW(HOST_GS_SELECTOR) + +/* 32-bits */ +SHADOW_FIELD_RO(VM_EXIT_REASON) +SHADOW_FIELD_RO(VM_EXIT_INTR_INFO) +SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN) +SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD) +SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE) +SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE) +SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL) +SHADOW_FIELD_RW(EXCEPTION_BITMAP) +SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE) +SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD) +SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN) +SHADOW_FIELD_RW(TPR_THRESHOLD) +SHADOW_FIELD_RW(GUEST_CS_AR_BYTES) +SHADOW_FIELD_RW(GUEST_SS_AR_BYTES) +SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO) +SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE) + +/* Natural width */ +SHADOW_FIELD_RO(EXIT_QUALIFICATION) +SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS) +SHADOW_FIELD_RW(GUEST_RIP) +SHADOW_FIELD_RW(GUEST_RSP) +SHADOW_FIELD_RW(GUEST_CR0) +SHADOW_FIELD_RW(GUEST_CR3) +SHADOW_FIELD_RW(GUEST_CR4) +SHADOW_FIELD_RW(GUEST_RFLAGS) +SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK) +SHADOW_FIELD_RW(CR0_READ_SHADOW) +SHADOW_FIELD_RW(CR4_READ_SHADOW) +SHADOW_FIELD_RW(HOST_FS_BASE) +SHADOW_FIELD_RW(HOST_GS_BASE) + +/* 64-bit */ +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS) +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH) + +#undef SHADOW_FIELD_RO +#undef SHADOW_FIELD_RW diff --git a/arch/x86/kvm/vmx_evmcs.h b/arch/x86/kvm/vmx_evmcs.h deleted file mode 100644 index 210a884090ad..000000000000 --- a/arch/x86/kvm/vmx_evmcs.h +++ /dev/null @@ -1,324 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __KVM_X86_VMX_EVMCS_H -#define __KVM_X86_VMX_EVMCS_H - -#include - -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) -#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) -#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ - {EVMCS1_OFFSET(name), clean_field} - -struct evmcs_field { - u16 offset; - u16 clean_field; -}; - -static const struct evmcs_field vmcs_field_to_evmcs_1[] = { - /* 64 bit rw */ - EVMCS1_FIELD(GUEST_RIP, guest_rip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(GUEST_RSP, guest_rsp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR0, host_cr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR3, host_cr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR4, host_cr4, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_RIP, host_rip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), - EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), - EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), - EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(TSC_OFFSET, tsc_offset, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR0, guest_cr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR3, guest_cr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR4, guest_cr4, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_DR7, guest_dr7, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_RSP, host_rsp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(EPT_POINTER, ept_pointer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), - EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - - /* 64 bit read only */ - EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - /* - * Not defined in KVM: - * - * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - */ - EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - - /* - * No mask defined in the spec as Hyper-V doesn't currently support - * these. Future proof by resetting the whole clean field mask on - * access. - */ - EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - - /* 32 bit rw */ - EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), - EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), - EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), - EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, - vm_entry_exception_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - - /* 32 bit read only */ - EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - - /* No mask defined in the spec (not used) */ - EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - - /* 16 bit rw */ - EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), -}; - -static __always_inline int get_evmcs_offset(unsigned long field, - u16 *clean_field) -{ - unsigned int index = ROL16(field, 6); - const struct evmcs_field *evmcs_field; - - if (unlikely(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1))) { - WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n", - field); - return -ENOENT; - } - - evmcs_field = &vmcs_field_to_evmcs_1[index]; - - if (clean_field) - *clean_field = evmcs_field->clean_field; - - return evmcs_field->offset; -} - -#undef ROL16 - -#endif /* __KVM_X86_VMX_EVMCS_H */ diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx_shadow_fields.h deleted file mode 100644 index 132432f375c2..000000000000 --- a/arch/x86/kvm/vmx_shadow_fields.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef SHADOW_FIELD_RO -#define SHADOW_FIELD_RO(x) -#endif -#ifndef SHADOW_FIELD_RW -#define SHADOW_FIELD_RW(x) -#endif - -/* - * We do NOT shadow fields that are modified when L0 - * traps and emulates any vmx instruction (e.g. VMPTRLD, - * VMXON...) executed by L1. - * For example, VM_INSTRUCTION_ERROR is read - * by L1 if a vmx instruction fails (part of the error path). - * Note the code assumes this logic. If for some reason - * we start shadowing these fields then we need to - * force a shadow sync when L0 emulates vmx instructions - * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified - * by nested_vmx_failValid) - * - * When adding or removing fields here, note that shadowed - * fields must always be synced by prepare_vmcs02, not just - * prepare_vmcs02_full. - */ - -/* - * Keeping the fields ordered by size is an attempt at improving - * branch prediction in vmcs_read_any and vmcs_write_any. - */ - -/* 16-bits */ -SHADOW_FIELD_RW(GUEST_INTR_STATUS) -SHADOW_FIELD_RW(GUEST_PML_INDEX) -SHADOW_FIELD_RW(HOST_FS_SELECTOR) -SHADOW_FIELD_RW(HOST_GS_SELECTOR) - -/* 32-bits */ -SHADOW_FIELD_RO(VM_EXIT_REASON) -SHADOW_FIELD_RO(VM_EXIT_INTR_INFO) -SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN) -SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD) -SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE) -SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE) -SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL) -SHADOW_FIELD_RW(EXCEPTION_BITMAP) -SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE) -SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD) -SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN) -SHADOW_FIELD_RW(TPR_THRESHOLD) -SHADOW_FIELD_RW(GUEST_CS_AR_BYTES) -SHADOW_FIELD_RW(GUEST_SS_AR_BYTES) -SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO) -SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE) - -/* Natural width */ -SHADOW_FIELD_RO(EXIT_QUALIFICATION) -SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS) -SHADOW_FIELD_RW(GUEST_RIP) -SHADOW_FIELD_RW(GUEST_RSP) -SHADOW_FIELD_RW(GUEST_CR0) -SHADOW_FIELD_RW(GUEST_CR3) -SHADOW_FIELD_RW(GUEST_CR4) -SHADOW_FIELD_RW(GUEST_RFLAGS) -SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK) -SHADOW_FIELD_RW(CR0_READ_SHADOW) -SHADOW_FIELD_RW(CR4_READ_SHADOW) -SHADOW_FIELD_RW(HOST_FS_BASE) -SHADOW_FIELD_RW(HOST_GS_BASE) - -/* 64-bit */ -SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS) -SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH) - -#undef SHADOW_FIELD_RO -#undef SHADOW_FIELD_RW -- cgit v1.2.3 From e0123119a564de1a9aff712e0b968b3e21296a22 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:52:57 -0800 Subject: KVM: VMX: rename vmx_shadow_fields.h to vmcs_shadow_fields.h VMX specific files now reside in a dedicated subdirectory. Drop the "vmx" prefix, which is redundant, and add a "vmcs" prefix to clarify that the file is referring to VMCS shadow fields. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmcs_shadow_fields.h | 74 +++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 8 ++-- arch/x86/kvm/vmx/vmx_shadow_fields.h | 74 ----------------------------------- 3 files changed, 78 insertions(+), 78 deletions(-) create mode 100644 arch/x86/kvm/vmx/vmcs_shadow_fields.h delete mode 100644 arch/x86/kvm/vmx/vmx_shadow_fields.h (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h new file mode 100644 index 000000000000..132432f375c2 --- /dev/null +++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h @@ -0,0 +1,74 @@ +#ifndef SHADOW_FIELD_RO +#define SHADOW_FIELD_RO(x) +#endif +#ifndef SHADOW_FIELD_RW +#define SHADOW_FIELD_RW(x) +#endif + +/* + * We do NOT shadow fields that are modified when L0 + * traps and emulates any vmx instruction (e.g. VMPTRLD, + * VMXON...) executed by L1. + * For example, VM_INSTRUCTION_ERROR is read + * by L1 if a vmx instruction fails (part of the error path). + * Note the code assumes this logic. If for some reason + * we start shadowing these fields then we need to + * force a shadow sync when L0 emulates vmx instructions + * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified + * by nested_vmx_failValid) + * + * When adding or removing fields here, note that shadowed + * fields must always be synced by prepare_vmcs02, not just + * prepare_vmcs02_full. + */ + +/* + * Keeping the fields ordered by size is an attempt at improving + * branch prediction in vmcs_read_any and vmcs_write_any. + */ + +/* 16-bits */ +SHADOW_FIELD_RW(GUEST_INTR_STATUS) +SHADOW_FIELD_RW(GUEST_PML_INDEX) +SHADOW_FIELD_RW(HOST_FS_SELECTOR) +SHADOW_FIELD_RW(HOST_GS_SELECTOR) + +/* 32-bits */ +SHADOW_FIELD_RO(VM_EXIT_REASON) +SHADOW_FIELD_RO(VM_EXIT_INTR_INFO) +SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN) +SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD) +SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE) +SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE) +SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL) +SHADOW_FIELD_RW(EXCEPTION_BITMAP) +SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE) +SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD) +SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN) +SHADOW_FIELD_RW(TPR_THRESHOLD) +SHADOW_FIELD_RW(GUEST_CS_AR_BYTES) +SHADOW_FIELD_RW(GUEST_SS_AR_BYTES) +SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO) +SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE) + +/* Natural width */ +SHADOW_FIELD_RO(EXIT_QUALIFICATION) +SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS) +SHADOW_FIELD_RW(GUEST_RIP) +SHADOW_FIELD_RW(GUEST_RSP) +SHADOW_FIELD_RW(GUEST_CR0) +SHADOW_FIELD_RW(GUEST_CR3) +SHADOW_FIELD_RW(GUEST_CR4) +SHADOW_FIELD_RW(GUEST_RFLAGS) +SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK) +SHADOW_FIELD_RW(CR0_READ_SHADOW) +SHADOW_FIELD_RW(CR4_READ_SHADOW) +SHADOW_FIELD_RW(HOST_FS_BASE) +SHADOW_FIELD_RW(HOST_GS_BASE) + +/* 64-bit */ +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS) +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH) + +#undef SHADOW_FIELD_RO +#undef SHADOW_FIELD_RW diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 589230c923e2..2fe25c13adfc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1103,14 +1103,14 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) static u16 shadow_read_only_fields[] = { #define SHADOW_FIELD_RO(x) x, -#include "vmx_shadow_fields.h" +#include "vmcs_shadow_fields.h" }; static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); static u16 shadow_read_write_fields[] = { #define SHADOW_FIELD_RW(x) x, -#include "vmx_shadow_fields.h" +#include "vmcs_shadow_fields.h" }; static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); @@ -9266,7 +9266,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu)) { switch (field) { #define SHADOW_FIELD_RW(x) case x: -#include "vmx_shadow_fields.h" +#include "vmcs_shadow_fields.h" /* * The fields that can be updated by L1 without a vmexit are * always updated in the vmcs02, the others go down the slow @@ -12910,7 +12910,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* * First, the fields that are shadowed. This must be kept in sync - * with vmx_shadow_fields.h. + * with vmcs_shadow_fields.h. */ if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { diff --git a/arch/x86/kvm/vmx/vmx_shadow_fields.h b/arch/x86/kvm/vmx/vmx_shadow_fields.h deleted file mode 100644 index 132432f375c2..000000000000 --- a/arch/x86/kvm/vmx/vmx_shadow_fields.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef SHADOW_FIELD_RO -#define SHADOW_FIELD_RO(x) -#endif -#ifndef SHADOW_FIELD_RW -#define SHADOW_FIELD_RW(x) -#endif - -/* - * We do NOT shadow fields that are modified when L0 - * traps and emulates any vmx instruction (e.g. VMPTRLD, - * VMXON...) executed by L1. - * For example, VM_INSTRUCTION_ERROR is read - * by L1 if a vmx instruction fails (part of the error path). - * Note the code assumes this logic. If for some reason - * we start shadowing these fields then we need to - * force a shadow sync when L0 emulates vmx instructions - * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified - * by nested_vmx_failValid) - * - * When adding or removing fields here, note that shadowed - * fields must always be synced by prepare_vmcs02, not just - * prepare_vmcs02_full. - */ - -/* - * Keeping the fields ordered by size is an attempt at improving - * branch prediction in vmcs_read_any and vmcs_write_any. - */ - -/* 16-bits */ -SHADOW_FIELD_RW(GUEST_INTR_STATUS) -SHADOW_FIELD_RW(GUEST_PML_INDEX) -SHADOW_FIELD_RW(HOST_FS_SELECTOR) -SHADOW_FIELD_RW(HOST_GS_SELECTOR) - -/* 32-bits */ -SHADOW_FIELD_RO(VM_EXIT_REASON) -SHADOW_FIELD_RO(VM_EXIT_INTR_INFO) -SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN) -SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD) -SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE) -SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE) -SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL) -SHADOW_FIELD_RW(EXCEPTION_BITMAP) -SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE) -SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD) -SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN) -SHADOW_FIELD_RW(TPR_THRESHOLD) -SHADOW_FIELD_RW(GUEST_CS_AR_BYTES) -SHADOW_FIELD_RW(GUEST_SS_AR_BYTES) -SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO) -SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE) - -/* Natural width */ -SHADOW_FIELD_RO(EXIT_QUALIFICATION) -SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS) -SHADOW_FIELD_RW(GUEST_RIP) -SHADOW_FIELD_RW(GUEST_RSP) -SHADOW_FIELD_RW(GUEST_CR0) -SHADOW_FIELD_RW(GUEST_CR3) -SHADOW_FIELD_RW(GUEST_CR4) -SHADOW_FIELD_RW(GUEST_RFLAGS) -SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK) -SHADOW_FIELD_RW(CR0_READ_SHADOW) -SHADOW_FIELD_RW(CR4_READ_SHADOW) -SHADOW_FIELD_RW(HOST_FS_BASE) -SHADOW_FIELD_RW(HOST_GS_BASE) - -/* 64-bit */ -SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS) -SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH) - -#undef SHADOW_FIELD_RO -#undef SHADOW_FIELD_RW -- cgit v1.2.3 From 4cebd747d78559d1e8666d3e68a75f08966abc0c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:52:58 -0800 Subject: KVM: VMX: Drop the "vmx" prefix from vmx_evmcs.h VMX specific files now reside in a dedicated subdirectory, i.e. the file name prefix is redundant. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.h | 324 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx_evmcs.h | 324 ------------------------------------------- 3 files changed, 325 insertions(+), 325 deletions(-) create mode 100644 arch/x86/kvm/vmx/evmcs.h delete mode 100644 arch/x86/kvm/vmx/vmx_evmcs.h (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h new file mode 100644 index 000000000000..210a884090ad --- /dev/null +++ b/arch/x86/kvm/vmx/evmcs.h @@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_EVMCS_H +#define __KVM_X86_VMX_EVMCS_H + +#include + +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) +#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) +#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ + {EVMCS1_OFFSET(name), clean_field} + +struct evmcs_field { + u16 offset; + u16 clean_field; +}; + +static const struct evmcs_field vmcs_field_to_evmcs_1[] = { + /* 64 bit rw */ + EVMCS1_FIELD(GUEST_RIP, guest_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_RSP, guest_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR0, host_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR3, host_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR4, host_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_RIP, host_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), + EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(TSC_OFFSET, tsc_offset, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR0, guest_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR3, guest_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR4, guest_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_DR7, guest_dr7, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_RSP, host_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(EPT_POINTER, ept_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), + EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + + /* 64 bit read only */ + EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + /* + * Not defined in KVM: + * + * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + */ + EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* + * No mask defined in the spec as Hyper-V doesn't currently support + * these. Future proof by resetting the whole clean field mask on + * access. + */ + EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 32 bit rw */ + EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), + EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), + EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), + EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, + vm_entry_exception_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + + /* 32 bit read only */ + EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* No mask defined in the spec (not used) */ + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 16 bit rw */ + EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), +}; + +static __always_inline int get_evmcs_offset(unsigned long field, + u16 *clean_field) +{ + unsigned int index = ROL16(field, 6); + const struct evmcs_field *evmcs_field; + + if (unlikely(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1))) { + WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n", + field); + return -ENOENT; + } + + evmcs_field = &vmcs_field_to_evmcs_1[index]; + + if (clean_field) + *clean_field = evmcs_field->clean_field; + + return evmcs_field->offset; +} + +#undef ROL16 + +#endif /* __KVM_X86_VMX_EVMCS_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2fe25c13adfc..68a0bb7609b8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -49,6 +49,7 @@ #include #include "cpuid.h" +#include "evmcs.h" #include "hyperv.h" #include "irq.h" #include "kvm_cache_regs.h" @@ -56,7 +57,6 @@ #include "mmu.h" #include "pmu.h" #include "trace.h" -#include "vmx_evmcs.h" #include "x86.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) diff --git a/arch/x86/kvm/vmx/vmx_evmcs.h b/arch/x86/kvm/vmx/vmx_evmcs.h deleted file mode 100644 index 210a884090ad..000000000000 --- a/arch/x86/kvm/vmx/vmx_evmcs.h +++ /dev/null @@ -1,324 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __KVM_X86_VMX_EVMCS_H -#define __KVM_X86_VMX_EVMCS_H - -#include - -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) -#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) -#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ - {EVMCS1_OFFSET(name), clean_field} - -struct evmcs_field { - u16 offset; - u16 clean_field; -}; - -static const struct evmcs_field vmcs_field_to_evmcs_1[] = { - /* 64 bit rw */ - EVMCS1_FIELD(GUEST_RIP, guest_rip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(GUEST_RSP, guest_rsp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR0, host_cr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR3, host_cr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR4, host_cr4, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_RIP, host_rip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), - EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), - EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), - EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(TSC_OFFSET, tsc_offset, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR0, guest_cr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR3, guest_cr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR4, guest_cr4, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_DR7, guest_dr7, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_RSP, host_rsp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(EPT_POINTER, ept_pointer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), - EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - - /* 64 bit read only */ - EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - /* - * Not defined in KVM: - * - * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - */ - EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - - /* - * No mask defined in the spec as Hyper-V doesn't currently support - * these. Future proof by resetting the whole clean field mask on - * access. - */ - EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - - /* 32 bit rw */ - EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), - EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), - EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), - EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, - vm_entry_exception_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - - /* 32 bit read only */ - EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - - /* No mask defined in the spec (not used) */ - EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - - /* 16 bit rw */ - EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), -}; - -static __always_inline int get_evmcs_offset(unsigned long field, - u16 *clean_field) -{ - unsigned int index = ROL16(field, 6); - const struct evmcs_field *evmcs_field; - - if (unlikely(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1))) { - WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n", - field); - return -ENOENT; - } - - evmcs_field = &vmcs_field_to_evmcs_1[index]; - - if (clean_field) - *clean_field = evmcs_field->clean_field; - - return evmcs_field->offset; -} - -#undef ROL16 - -#endif /* __KVM_X86_VMX_EVMCS_H */ -- cgit v1.2.3 From 71d9409e20934e16f2d2ea88f0d1fb9851a7da3b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:52:59 -0800 Subject: KVM: VMX: Move caching of MSR_IA32_XSS to hardware_setup() MSR_IA32_XSS has no relation to the VMCS whatsoever, it doesn't belong in setup_vmcs_config() and its reference to host_xss prevents moving setup_vmcs_config() to a dedicated file. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 68a0bb7609b8..1c99ca95d6c3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4740,9 +4740,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) } } - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - return 0; } @@ -7917,6 +7914,9 @@ static __init int hardware_setup(void) WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); } + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; -- cgit v1.2.3 From c73da3fcab43357feb68cac227194b13e998a8db Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:00 -0800 Subject: KVM: VMX: Properly handle dynamic VM Entry/Exit controls EFER and PERF_GLOBAL_CTRL MSRs have dedicated VM Entry/Exit controls that KVM dynamically toggles based on whether or not the guest's value for each MSRs differs from the host. Handle the dynamic behavior by adding a helper that clears the dynamic bits so the bits aren't set when initializing the VMCS field outside of the dynamic toggling flow. This makes the handling consistent with similar behavior for other controls, e.g. pin, exec and sec_exec. More importantly, it eliminates two global bools that are stealthily modified by setup_vmcs_config. Opportunistically clean up a comment and print related to errata for IA32_PERF_GLOBAL_CTRL. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 145 +++++++++++++++++++++++++------------------------ 1 file changed, 74 insertions(+), 71 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1c99ca95d6c3..df874f330ec8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1336,9 +1336,6 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) -static bool cpu_has_load_ia32_efer; -static bool cpu_has_load_perf_global_ctrl; - static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); @@ -1681,6 +1678,18 @@ static inline bool is_icebp(u32 intr_info) == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK); } +static inline bool cpu_has_load_ia32_efer(void) +{ + return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER) && + (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_EFER); +} + +static inline bool cpu_has_load_perf_global_ctrl(void) +{ + return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); +} + static inline bool cpu_has_vmx_msr_bitmap(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; @@ -2696,7 +2705,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) switch (msr) { case MSR_EFER: - if (cpu_has_load_ia32_efer) { + if (cpu_has_load_ia32_efer()) { clear_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER); @@ -2704,7 +2713,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; case MSR_CORE_PERF_GLOBAL_CTRL: - if (cpu_has_load_perf_global_ctrl) { + if (cpu_has_load_perf_global_ctrl()) { clear_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); @@ -2749,7 +2758,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, switch (msr) { case MSR_EFER: - if (cpu_has_load_ia32_efer) { + if (cpu_has_load_ia32_efer()) { add_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER, @@ -2760,7 +2769,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, } break; case MSR_CORE_PERF_GLOBAL_CTRL: - if (cpu_has_load_perf_global_ctrl) { + if (cpu_has_load_perf_global_ctrl()) { add_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, @@ -2839,7 +2848,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) * On CPUs that support "load IA32_EFER", always switch EFER * atomically, since it's faster than switching it manually. */ - if (cpu_has_load_ia32_efer || + if (cpu_has_load_ia32_efer() || (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; @@ -4533,14 +4542,6 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } -static __init bool allow_1_setting(u32 msr, u32 ctl) -{ - u32 vmx_msr_low, vmx_msr_high; - - rdmsr(msr, vmx_msr_low, vmx_msr_high); - return vmx_msr_high & ctl; -} - static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) { u32 vmx_msr_low, vmx_msr_high; @@ -4642,8 +4643,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif - opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_CLEAR_BNDCFGS; + opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VM_EXIT_SAVE_IA32_PAT | + VM_EXIT_LOAD_IA32_PAT | + VM_EXIT_LOAD_IA32_EFER | + VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -4662,11 +4666,38 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; min = VM_ENTRY_LOAD_DEBUG_CONTROLS; - opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; + opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | + VM_ENTRY_LOAD_IA32_PAT | + VM_ENTRY_LOAD_IA32_EFER | + VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; + /* + * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they + * can't be used due to an errata where VM Exit may incorrectly clear + * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the + * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. + */ + if (boot_cpu_data.x86 == 0x6) { + switch (boot_cpu_data.x86_model) { + case 26: /* AAK155 */ + case 30: /* AAP115 */ + case 37: /* AAT100 */ + case 44: /* BC86,AAY89,BD102 */ + case 46: /* BA97 */ + _vmexit_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + "does not work properly. Using workaround\n"); + break; + default: + break; + } + } + + rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ @@ -4698,48 +4729,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (static_branch_unlikely(&enable_evmcs)) evmcs_sanitize_exec_ctrls(vmcs_conf); - cpu_has_load_ia32_efer = - allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, - VM_ENTRY_LOAD_IA32_EFER) - && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, - VM_EXIT_LOAD_IA32_EFER); - - cpu_has_load_perf_global_ctrl = - allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) - && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); - - /* - * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL - * but due to errata below it can't be used. Workaround is to use - * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. - * - * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] - * - * AAK155 (model 26) - * AAP115 (model 30) - * AAT100 (model 37) - * BC86,AAY89,BD102 (model 44) - * BA97 (model 46) - * - */ - if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { - switch (boot_cpu_data.x86_model) { - case 26: - case 30: - case 37: - case 44: - case 46: - cpu_has_load_perf_global_ctrl = false; - printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " - "does not work properly. Using workaround\n"); - break; - default: - break; - } - } - return 0; } @@ -6375,7 +6364,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); } - if (cpu_has_load_ia32_efer) + if (cpu_has_load_ia32_efer()) vmcs_write64(HOST_IA32_EFER, host_efer); } @@ -6425,6 +6414,20 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) vmx_update_msr_bitmap(vcpu); } +static u32 vmx_vmentry_ctrl(void) +{ + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmcs_config.vmentry_ctrl & + ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); +} + +static u32 vmx_vmexit_ctrl(void) +{ + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmcs_config.vmexit_ctrl & + ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); +} + static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; @@ -6690,10 +6693,10 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->arch_capabilities = kvm_get_arch_capabilities(); - vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); + vm_exit_controls_init(vmx, vmx_vmexit_ctrl()); /* 22.2.1, 20.8.1 */ - vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); + vm_entry_controls_init(vmx, vmx_vmentry_ctrl()); vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); @@ -10468,7 +10471,7 @@ static void dump_vmcs(void) pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", vmcs_read64(GUEST_IA32_DEBUGCTL), vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); - if (cpu_has_load_perf_global_ctrl && + if (cpu_has_load_perf_global_ctrl() && vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); @@ -10505,7 +10508,7 @@ static void dump_vmcs(void) pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_EFER), vmcs_read64(HOST_IA32_PAT)); - if (cpu_has_load_perf_global_ctrl && + if (cpu_has_load_perf_global_ctrl() && vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); @@ -12730,9 +12733,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * on the related bits (if supported by the CPU) in the hope that * we can avoid VMWrites during vmx_set_efer(). */ - exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) & + exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; - if (cpu_has_load_ia32_efer) { + if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) exec_control |= VM_ENTRY_IA32E_MODE; if (guest_efer != host_efer) @@ -12747,8 +12750,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits may be modified by vmx_set_efer() in prepare_vmcs02(). */ - exec_control = vmcs_config.vmexit_ctrl; - if (cpu_has_load_ia32_efer && guest_efer != host_efer) + exec_control = vmx_vmexit_ctrl(); + if (cpu_has_load_ia32_efer() && guest_efer != host_efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; vm_exit_controls_init(vmx, exec_control); @@ -14073,7 +14076,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) return vmcs_read64(GUEST_IA32_EFER); - if (cpu_has_load_ia32_efer) + if (cpu_has_load_ia32_efer()) return host_efer; for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { -- cgit v1.2.3 From 7caaa71108676944d60672b3446fadb6bcf29de4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:01 -0800 Subject: KVM: VMX: Pass vmx_capability struct to setup_vmcs_config() ...instead of referencing the global struct. This will allow moving setup_vmcs_config() to a separate file that may not have access to the global variable. Modify nested_vmx_setup_ctls_msrs() appropriately since vmx_capability.ept may not be accurate when called by vmx_check_processor_compat(). No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index df874f330ec8..de76d3871463 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3521,7 +3521,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify(). */ -static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) +static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, + u32 ept_caps, bool apicv) { if (!nested) { memset(msrs, 0, sizeof(*msrs)); @@ -3660,7 +3661,7 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) if (cpu_has_vmx_ept_execute_only()) msrs->ept_caps |= VMX_EPT_EXECUTE_ONLY_BIT; - msrs->ept_caps &= vmx_capability.ept; + msrs->ept_caps &= ept_caps; msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_1GB_PAGE_BIT; @@ -4542,7 +4543,8 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } -static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) +static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, + struct vmx_capability *vmx_cap) { u32 vmx_msr_low, vmx_msr_high; u32 min, opt, min2, opt2; @@ -4619,7 +4621,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, - &vmx_capability.ept, &vmx_capability.vpid); + &vmx_cap->ept, &vmx_cap->vpid); if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT @@ -4627,14 +4629,14 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | CPU_BASED_INVLPG_EXITING); - } else if (vmx_capability.ept) { - vmx_capability.ept = 0; + } else if (vmx_cap->ept) { + vmx_cap->ept = 0; pr_warn_once("EPT CAP should not exist if not support " "1-setting enable EPT VM-execution control\n"); } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && - vmx_capability.vpid) { - vmx_capability.vpid = 0; + vmx_cap->vpid) { + vmx_cap->vpid = 0; pr_warn_once("VPID CAP should not exist if not support " "1-setting enable VPID VM-execution control\n"); } @@ -7906,7 +7908,7 @@ static __init int hardware_setup(void) for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); - if (setup_vmcs_config(&vmcs_config) < 0) + if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; if (boot_cpu_has(X86_FEATURE_NX)) @@ -8035,7 +8037,8 @@ static __init int hardware_setup(void) } kvm_set_posted_intr_wakeup_handler(wakeup_handler); - nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv); + nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept, + enable_apicv); kvm_mce_cap_supported |= MCG_LMCE_P; @@ -11608,6 +11611,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (nested) nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, + vmx_capability.ept, kvm_vcpu_apicv_active(&vmx->vcpu)); vmx->nested.posted_intr_nv = -1; @@ -11677,11 +11681,12 @@ static int vmx_vm_init(struct kvm *kvm) static void __init vmx_check_processor_compat(void *rtn) { struct vmcs_config vmcs_conf; + struct vmx_capability vmx_cap; *(int *)rtn = 0; - if (setup_vmcs_config(&vmcs_conf) < 0) + if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) *(int *)rtn = -EIO; - nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv); + nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept, enable_apicv); if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", smp_processor_id()); -- cgit v1.2.3 From 3077c191088255932c9e3780d8c6a59f147b75b8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:02 -0800 Subject: KVM: VMX: Move capabilities structs and helpers to dedicated file Defining a separate capabilities.h as opposed to putting this code in e.g. vmx.h avoids circular dependencies between (the yet-to-be-added) vmx.h and nested.h. The aforementioned circular dependencies are why struct nested_vmx_msrs also resides in capabilities instead of e.g. nested.h. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/capabilities.h | 321 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 313 +-------------------------------------- 2 files changed, 324 insertions(+), 310 deletions(-) create mode 100644 arch/x86/kvm/vmx/capabilities.h (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h new file mode 100644 index 000000000000..8326dce6900d --- /dev/null +++ b/arch/x86/kvm/vmx/capabilities.h @@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_CAPS_H +#define __KVM_X86_VMX_CAPS_H + +#include "lapic.h" + +struct nested_vmx_msrs { + /* + * We only store the "true" versions of the VMX capability MSRs. We + * generate the "non-true" versions by setting the must-be-1 bits + * according to the SDM. + */ + u32 procbased_ctls_low; + u32 procbased_ctls_high; + u32 secondary_ctls_low; + u32 secondary_ctls_high; + u32 pinbased_ctls_low; + u32 pinbased_ctls_high; + u32 exit_ctls_low; + u32 exit_ctls_high; + u32 entry_ctls_low; + u32 entry_ctls_high; + u32 misc_low; + u32 misc_high; + u32 ept_caps; + u32 vpid_caps; + u64 basic; + u64 cr0_fixed0; + u64 cr0_fixed1; + u64 cr4_fixed0; + u64 cr4_fixed1; + u64 vmcs_enum; + u64 vmfunc_controls; +}; + +struct vmcs_config { + int size; + int order; + u32 basic_cap; + u32 revision_id; + u32 pin_based_exec_ctrl; + u32 cpu_based_exec_ctrl; + u32 cpu_based_2nd_exec_ctrl; + u32 vmexit_ctrl; + u32 vmentry_ctrl; + struct nested_vmx_msrs nested; +}; +extern struct vmcs_config vmcs_config; + +struct vmx_capability { + u32 ept; + u32 vpid; +}; +extern struct vmx_capability vmx_capability; + +static inline bool cpu_has_vmx_basic_inout(void) +{ + return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); +} + +static inline bool cpu_has_virtual_nmis(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; +} + +static inline bool cpu_has_vmx_preemption_timer(void) +{ + return vmcs_config.pin_based_exec_ctrl & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + +static inline bool cpu_has_vmx_posted_intr(void) +{ + return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && + vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; +} + +static inline bool cpu_has_load_ia32_efer(void) +{ + return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER) && + (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_EFER); +} + +static inline bool cpu_has_load_perf_global_ctrl(void) +{ + return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); +} + +static inline bool vmx_mpx_supported(void) +{ + return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && + (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); +} + +static inline bool cpu_has_vmx_tpr_shadow(void) +{ + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; +} + +static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) +{ + return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); +} + +static inline bool cpu_has_vmx_msr_bitmap(void) +{ + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; +} + +static inline bool cpu_has_secondary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; +} + +static inline bool cpu_has_vmx_virtualize_apic_accesses(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +} + +static inline bool cpu_has_vmx_ept(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_EPT; +} + +static inline bool vmx_umip_emulated(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_DESC; +} + +static inline bool cpu_has_vmx_rdtscp(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDTSCP; +} + +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +} + +static inline bool cpu_has_vmx_vpid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VPID; +} + +static inline bool cpu_has_vmx_wbinvd_exit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_WBINVD_EXITING; +} + +static inline bool cpu_has_vmx_unrestricted_guest(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_UNRESTRICTED_GUEST; +} + +static inline bool cpu_has_vmx_apic_register_virt(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_APIC_REGISTER_VIRT; +} + +static inline bool cpu_has_vmx_virtual_intr_delivery(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; +} + +static inline bool cpu_has_vmx_ple(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PAUSE_LOOP_EXITING; +} + +static inline bool vmx_rdrand_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDRAND_EXITING; +} + +static inline bool cpu_has_vmx_invpcid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_INVPCID; +} + +static inline bool cpu_has_vmx_vmfunc(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VMFUNC; +} + +static inline bool cpu_has_vmx_shadow_vmcs(void) +{ + u64 vmx_msr; + + /* check if the cpu supports writing r/o exit information fields */ + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) + return false; + + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_SHADOW_VMCS; +} + +static inline bool cpu_has_vmx_encls_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENCLS_EXITING; +} + +static inline bool vmx_rdseed_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDSEED_EXITING; +} + +static inline bool cpu_has_vmx_pml(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; +} + +static inline bool vmx_xsaves_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_XSAVES; +} + +static inline bool cpu_has_vmx_tsc_scaling(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_TSC_SCALING; +} + +static inline bool cpu_has_vmx_apicv(void) +{ + return cpu_has_vmx_apic_register_virt() && + cpu_has_vmx_virtual_intr_delivery() && + cpu_has_vmx_posted_intr(); +} + +static inline bool cpu_has_vmx_flexpriority(void) +{ + return cpu_has_vmx_tpr_shadow() && + cpu_has_vmx_virtualize_apic_accesses(); +} + +static inline bool cpu_has_vmx_ept_execute_only(void) +{ + return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; +} + +static inline bool cpu_has_vmx_ept_4levels(void) +{ + return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; +} + +static inline bool cpu_has_vmx_ept_5levels(void) +{ + return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; +} + +static inline bool cpu_has_vmx_ept_mt_wb(void) +{ + return vmx_capability.ept & VMX_EPTP_WB_BIT; +} + +static inline bool cpu_has_vmx_ept_2m_page(void) +{ + return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; +} + +static inline bool cpu_has_vmx_ept_1g_page(void) +{ + return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; +} + +static inline bool cpu_has_vmx_ept_ad_bits(void) +{ + return vmx_capability.ept & VMX_EPT_AD_BIT; +} + +static inline bool cpu_has_vmx_invept_context(void) +{ + return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; +} + +static inline bool cpu_has_vmx_invept_global(void) +{ + return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; +} + +static inline bool cpu_has_vmx_invvpid(void) +{ + return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; +} + +static inline bool cpu_has_vmx_invvpid_individual_addr(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT; +} + +static inline bool cpu_has_vmx_invvpid_single(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; +} + +static inline bool cpu_has_vmx_invvpid_global(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; +} + +#endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index de76d3871463..3f2eaa886521 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -48,6 +48,7 @@ #include #include +#include "capabilities.h" #include "cpuid.h" #include "evmcs.h" #include "hyperv.h" @@ -778,35 +779,6 @@ static inline void vmx_check_vmcs12_offsets(void) { */ #define VMCS12_MAX_FIELD_INDEX 0x17 -struct nested_vmx_msrs { - /* - * We only store the "true" versions of the VMX capability MSRs. We - * generate the "non-true" versions by setting the must-be-1 bits - * according to the SDM. - */ - u32 procbased_ctls_low; - u32 procbased_ctls_high; - u32 secondary_ctls_low; - u32 secondary_ctls_high; - u32 pinbased_ctls_low; - u32 pinbased_ctls_high; - u32 exit_ctls_low; - u32 exit_ctls_high; - u32 entry_ctls_low; - u32 entry_ctls_high; - u32 misc_low; - u32 misc_high; - u32 ept_caps; - u32 vpid_caps; - u64 basic; - u64 cr0_fixed0; - u64 cr0_fixed1; - u64 cr4_fixed0; - u64 cr4_fixed1; - u64 vmcs_enum; - u64 vmfunc_controls; -}; - /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -1294,7 +1266,6 @@ static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); -static bool vmx_xsaves_supported(void); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static void vmx_get_segment(struct kvm_vcpu *vcpu, @@ -1339,23 +1310,8 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); -static struct vmcs_config { - int size; - int order; - u32 basic_cap; - u32 revision_id; - u32 pin_based_exec_ctrl; - u32 cpu_based_exec_ctrl; - u32 cpu_based_2nd_exec_ctrl; - u32 vmexit_ctrl; - u32 vmentry_ctrl; - struct nested_vmx_msrs nested; -} vmcs_config; - -static struct vmx_capability { - u32 ept; - u32 vpid; -} vmx_capability; +struct vmcs_config vmcs_config; +struct vmx_capability vmx_capability; #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ @@ -1678,69 +1634,6 @@ static inline bool is_icebp(u32 intr_info) == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK); } -static inline bool cpu_has_load_ia32_efer(void) -{ - return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER) && - (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_EFER); -} - -static inline bool cpu_has_load_perf_global_ctrl(void) -{ - return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); -} - -static inline bool cpu_has_vmx_msr_bitmap(void) -{ - return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; -} - -static inline bool cpu_has_vmx_tpr_shadow(void) -{ - return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; -} - -static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) -{ - return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); -} - -static inline bool cpu_has_secondary_exec_ctrls(void) -{ - return vmcs_config.cpu_based_exec_ctrl & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; -} - -static inline bool cpu_has_vmx_virtualize_apic_accesses(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; -} - -static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; -} - -static inline bool cpu_has_vmx_apic_register_virt(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_APIC_REGISTER_VIRT; -} - -static inline bool cpu_has_vmx_virtual_intr_delivery(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; -} - -static inline bool cpu_has_vmx_encls_vmexit(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENCLS_EXITING; -} - /* * Comment's format: document - errata name - stepping - processor name. * Refer from @@ -1787,188 +1680,11 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void) return false; } -static inline bool cpu_has_vmx_preemption_timer(void) -{ - return vmcs_config.pin_based_exec_ctrl & - PIN_BASED_VMX_PREEMPTION_TIMER; -} - -static inline bool cpu_has_vmx_posted_intr(void) -{ - return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && - vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; -} - -static inline bool cpu_has_vmx_apicv(void) -{ - return cpu_has_vmx_apic_register_virt() && - cpu_has_vmx_virtual_intr_delivery() && - cpu_has_vmx_posted_intr(); -} - -static inline bool cpu_has_vmx_flexpriority(void) -{ - return cpu_has_vmx_tpr_shadow() && - cpu_has_vmx_virtualize_apic_accesses(); -} - -static inline bool cpu_has_vmx_ept_execute_only(void) -{ - return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; -} - -static inline bool cpu_has_vmx_ept_2m_page(void) -{ - return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; -} - -static inline bool cpu_has_vmx_ept_1g_page(void) -{ - return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; -} - -static inline bool cpu_has_vmx_ept_4levels(void) -{ - return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; -} - -static inline bool cpu_has_vmx_ept_mt_wb(void) -{ - return vmx_capability.ept & VMX_EPTP_WB_BIT; -} - -static inline bool cpu_has_vmx_ept_5levels(void) -{ - return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; -} - -static inline bool cpu_has_vmx_ept_ad_bits(void) -{ - return vmx_capability.ept & VMX_EPT_AD_BIT; -} - -static inline bool cpu_has_vmx_invept_context(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; -} - -static inline bool cpu_has_vmx_invept_global(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; -} - -static inline bool cpu_has_vmx_invvpid_individual_addr(void) -{ - return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT; -} - -static inline bool cpu_has_vmx_invvpid_single(void) -{ - return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; -} - -static inline bool cpu_has_vmx_invvpid_global(void) -{ - return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; -} - -static inline bool cpu_has_vmx_invvpid(void) -{ - return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; -} - -static inline bool cpu_has_vmx_ept(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_EPT; -} - -static inline bool cpu_has_vmx_unrestricted_guest(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_UNRESTRICTED_GUEST; -} - -static inline bool cpu_has_vmx_ple(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_PAUSE_LOOP_EXITING; -} - -static inline bool cpu_has_vmx_basic_inout(void) -{ - return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); -} - static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { return flexpriority_enabled && lapic_in_kernel(vcpu); } -static inline bool cpu_has_vmx_vpid(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_VPID; -} - -static inline bool cpu_has_vmx_rdtscp(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDTSCP; -} - -static inline bool cpu_has_vmx_invpcid(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_INVPCID; -} - -static inline bool cpu_has_virtual_nmis(void) -{ - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; -} - -static inline bool cpu_has_vmx_wbinvd_exit(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_WBINVD_EXITING; -} - -static inline bool cpu_has_vmx_shadow_vmcs(void) -{ - u64 vmx_msr; - rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); - /* check if the cpu supports writing r/o exit information fields */ - if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) - return false; - - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_SHADOW_VMCS; -} - -static inline bool cpu_has_vmx_pml(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; -} - -static inline bool cpu_has_vmx_tsc_scaling(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_TSC_SCALING; -} - -static inline bool cpu_has_vmx_vmfunc(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_VMFUNC; -} - -static bool vmx_umip_emulated(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_DESC; -} - static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -6456,17 +6172,6 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } -static bool vmx_rdrand_supported(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDRAND_EXITING; -} - -static bool vmx_rdseed_supported(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDSEED_EXITING; -} static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) { @@ -11002,18 +10707,6 @@ static bool vmx_has_emulated_msr(int index) } } -static bool vmx_mpx_supported(void) -{ - return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && - (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); -} - -static bool vmx_xsaves_supported(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_XSAVES; -} - static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; -- cgit v1.2.3 From 2c4fd91d26643504011b7c2b75563a437e19d6a1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:03 -0800 Subject: KVM: VMX: Expose various module param vars via capabilities.h Expose the variables associated with various module params that are needed by the nested VMX code. There is no ulterior logic for what variables are/aren't exposed, this is purely "what's needed by the nested code". Note that @nested is intentionally not exposed. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/capabilities.h | 7 +++++++ arch/x86/kvm/vmx/vmx.c | 12 ++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 8326dce6900d..366b9dd2e4ae 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -4,6 +4,13 @@ #include "lapic.h" +extern bool __read_mostly enable_vpid; +extern bool __read_mostly flexpriority_enabled; +extern bool __read_mostly enable_ept; +extern bool __read_mostly enable_unrestricted_guest; +extern bool __read_mostly enable_ept_ad_bits; +extern bool __read_mostly enable_pml; + struct nested_vmx_msrs { /* * We only store the "true" versions of the VMX capability MSRs. We diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3f2eaa886521..ac7999d40808 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -73,23 +73,23 @@ static const struct x86_cpu_id vmx_cpu_id[] = { }; MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); -static bool __read_mostly enable_vpid = 1; +bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); static bool __read_mostly enable_vnmi = 1; module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); -static bool __read_mostly flexpriority_enabled = 1; +bool __read_mostly flexpriority_enabled = 1; module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); -static bool __read_mostly enable_ept = 1; +bool __read_mostly enable_ept = 1; module_param_named(ept, enable_ept, bool, S_IRUGO); -static bool __read_mostly enable_unrestricted_guest = 1; +bool __read_mostly enable_unrestricted_guest = 1; module_param_named(unrestricted_guest, enable_unrestricted_guest, bool, S_IRUGO); -static bool __read_mostly enable_ept_ad_bits = 1; +bool __read_mostly enable_ept_ad_bits = 1; module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); static bool __read_mostly emulate_invalid_guest_state = true; @@ -116,7 +116,7 @@ module_param(nested_early_check, bool, S_IRUGO); static u64 __read_mostly host_xss; -static bool __read_mostly enable_pml = 1; +bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); #define MSR_TYPE_R 1 -- cgit v1.2.3 From cb1d474b322550630841a58ea960a448fe8ddce3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:04 -0800 Subject: KVM: VMX: Move VMCS definitions to dedicated file This isn't intended to be a pure reflection of hardware, e.g. struct loaded_vmcs and struct vmcs_host_state are KVM-defined constructs. Similar to capabilities.h, this is a standalone file to avoid circular dependencies between yet-to-be-created vmx.h and nested.h files. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmcs.h | 132 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 125 ++------------------------------------------- 2 files changed, 135 insertions(+), 122 deletions(-) create mode 100644 arch/x86/kvm/vmx/vmcs.h (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h new file mode 100644 index 000000000000..4112190feac1 --- /dev/null +++ b/arch/x86/kvm/vmx/vmcs.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_VMCS_H +#define __KVM_X86_VMX_VMCS_H + +#include +#include + +#include + +#include "capabilities.h" + +struct vmcs_hdr { + u32 revision_id:31; + u32 shadow_vmcs:1; +}; + +struct vmcs { + struct vmcs_hdr hdr; + u32 abort; + char data[0]; +}; + +/* + * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT + * and whose values change infrequently, but are not constant. I.e. this is + * used as a write-through cache of the corresponding VMCS fields. + */ +struct vmcs_host_state { + unsigned long cr3; /* May not match real cr3 */ + unsigned long cr4; /* May not match real cr4 */ + unsigned long gs_base; + unsigned long fs_base; + + u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif +}; + +/* + * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also + * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs + * loaded on this CPU (so we can clear them if the CPU goes down). + */ +struct loaded_vmcs { + struct vmcs *vmcs; + struct vmcs *shadow_vmcs; + int cpu; + bool launched; + bool nmi_known_unmasked; + bool hv_timer_armed; + /* Support for vnmi-less CPUs */ + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; + unsigned long *msr_bitmap; + struct list_head loaded_vmcss_on_cpu_link; + struct vmcs_host_state host_state; +}; + +static inline bool is_exception_n(u32 intr_info, u8 vector) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK); +} + +static inline bool is_debug(u32 intr_info) +{ + return is_exception_n(intr_info, DB_VECTOR); +} + +static inline bool is_breakpoint(u32 intr_info) +{ + return is_exception_n(intr_info, BP_VECTOR); +} + +static inline bool is_page_fault(u32 intr_info) +{ + return is_exception_n(intr_info, PF_VECTOR); +} + +static inline bool is_invalid_opcode(u32 intr_info) +{ + return is_exception_n(intr_info, UD_VECTOR); +} + +static inline bool is_gp_fault(u32 intr_info) +{ + return is_exception_n(intr_info, GP_VECTOR); +} + +static inline bool is_machine_check(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); +} + +/* Undocumented: icebp/int1 */ +static inline bool is_icebp(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) + == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK); +} + +static inline bool is_nmi(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) + == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); +} + +enum vmcs_field_width { + VMCS_FIELD_WIDTH_U16 = 0, + VMCS_FIELD_WIDTH_U64 = 1, + VMCS_FIELD_WIDTH_U32 = 2, + VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3 +}; + +static inline int vmcs_field_width(unsigned long field) +{ + if (0x1 & field) /* the *_HIGH fields are all 32 bit */ + return VMCS_FIELD_WIDTH_U32; + return (field >> 13) & 0x3; +} + +static inline int vmcs_field_readonly(unsigned long field) +{ + return (((field >> 10) & 0x3) == 1); +} + +#endif /* __KVM_X86_VMX_VMCS_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ac7999d40808..d2bafc22d594 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -58,6 +58,7 @@ #include "mmu.h" #include "pmu.h" #include "trace.h" +#include "vmcs.h" #include "x86.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -361,57 +362,6 @@ struct kvm_vmx { spinlock_t ept_pointer_lock; }; -#define NR_AUTOLOAD_MSRS 8 - -struct vmcs_hdr { - u32 revision_id:31; - u32 shadow_vmcs:1; -}; - -struct vmcs { - struct vmcs_hdr hdr; - u32 abort; - char data[0]; -}; - -/* - * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT - * and whose values change infrequently, but are not constant. I.e. this is - * used as a write-through cache of the corresponding VMCS fields. - */ -struct vmcs_host_state { - unsigned long cr3; /* May not match real cr3 */ - unsigned long cr4; /* May not match real cr4 */ - unsigned long gs_base; - unsigned long fs_base; - - u16 fs_sel, gs_sel, ldt_sel; -#ifdef CONFIG_X86_64 - u16 ds_sel, es_sel; -#endif -}; - -/* - * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also - * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs - * loaded on this CPU (so we can clear them if the CPU goes down). - */ -struct loaded_vmcs { - struct vmcs *vmcs; - struct vmcs *shadow_vmcs; - int cpu; - bool launched; - bool nmi_known_unmasked; - bool hv_timer_armed; - /* Support for vnmi-less CPUs */ - int soft_vnmi_blocked; - ktime_t entry_time; - s64 vnmi_blocked_time; - unsigned long *msr_bitmap; - struct list_head loaded_vmcss_on_cpu_link; - struct vmcs_host_state host_state; -}; - struct shared_msr_entry { unsigned index; u64 data; @@ -940,6 +890,8 @@ static inline int pi_test_sn(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +#define NR_AUTOLOAD_MSRS 8 + struct vmx_msrs { unsigned int nr; struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; @@ -1588,52 +1540,6 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu, return 0; } -static inline bool is_exception_n(u32 intr_info, u8 vector) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK); -} - -static inline bool is_debug(u32 intr_info) -{ - return is_exception_n(intr_info, DB_VECTOR); -} - -static inline bool is_breakpoint(u32 intr_info) -{ - return is_exception_n(intr_info, BP_VECTOR); -} - -static inline bool is_page_fault(u32 intr_info) -{ - return is_exception_n(intr_info, PF_VECTOR); -} - -static inline bool is_invalid_opcode(u32 intr_info) -{ - return is_exception_n(intr_info, UD_VECTOR); -} - -static inline bool is_gp_fault(u32 intr_info) -{ - return is_exception_n(intr_info, GP_VECTOR); -} - -static inline bool is_machine_check(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); -} - -/* Undocumented: icebp/int1 */ -static inline bool is_icebp(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK); -} - /* * Comment's format: document - errata name - stepping - processor name. * Refer from @@ -1814,12 +1720,6 @@ static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12) VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; } -static inline bool is_nmi(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); -} - static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification); @@ -4542,25 +4442,6 @@ static void free_kvm_area(void) } } -enum vmcs_field_width { - VMCS_FIELD_WIDTH_U16 = 0, - VMCS_FIELD_WIDTH_U64 = 1, - VMCS_FIELD_WIDTH_U32 = 2, - VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3 -}; - -static inline int vmcs_field_width(unsigned long field) -{ - if (0x1 & field) /* the *_HIGH fields are all 32 bit */ - return VMCS_FIELD_WIDTH_U32; - return (field >> 13) & 0x3 ; -} - -static inline int vmcs_field_readonly(unsigned long field) -{ - return (((field >> 10) & 0x3) == 1); -} - static void init_vmcs_shadow_fields(void) { int i, j; -- cgit v1.2.3 From 609363cf81fcbd2c7fc93d1f920cef3a71154de8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:05 -0800 Subject: KVM: nVMX: Move vmcs12 code to dedicated files vmcs12 is the KVM-defined struct used to track a nested VMCS, e.g. a VMCS created by L1 for L2. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/vmx/vmcs.h | 4 +- arch/x86/kvm/vmx/vmcs12.c | 157 ++++++++++++ arch/x86/kvm/vmx/vmcs12.h | 462 +++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 602 +--------------------------------------------- 5 files changed, 624 insertions(+), 603 deletions(-) create mode 100644 arch/x86/kvm/vmx/vmcs12.c create mode 100644 arch/x86/kvm/vmx/vmcs12.h (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 13fd54de5449..79d97d837cf3 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -16,7 +16,7 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-intel-y += vmx/vmx.o vmx/pmu_intel.o +kvm-intel-y += vmx/vmx.o vmx/pmu_intel.o vmx/vmcs12.o kvm-amd-y += svm.o pmu_amd.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 4112190feac1..3b8da04203e4 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -2,9 +2,11 @@ #ifndef __KVM_X86_VMX_VMCS_H #define __KVM_X86_VMX_VMCS_H -#include #include +#include +#include +#include #include #include "capabilities.h" diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c new file mode 100644 index 000000000000..53dfb401316d --- /dev/null +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmcs12.h" + +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) +#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) +#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) +#define FIELD64(number, name) \ + FIELD(number, name), \ + [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) + +const unsigned short vmcs_field_to_offset_table[] = { + FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), + FIELD(POSTED_INTR_NV, posted_intr_nv), + FIELD(GUEST_ES_SELECTOR, guest_es_selector), + FIELD(GUEST_CS_SELECTOR, guest_cs_selector), + FIELD(GUEST_SS_SELECTOR, guest_ss_selector), + FIELD(GUEST_DS_SELECTOR, guest_ds_selector), + FIELD(GUEST_FS_SELECTOR, guest_fs_selector), + FIELD(GUEST_GS_SELECTOR, guest_gs_selector), + FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), + FIELD(GUEST_TR_SELECTOR, guest_tr_selector), + FIELD(GUEST_INTR_STATUS, guest_intr_status), + FIELD(GUEST_PML_INDEX, guest_pml_index), + FIELD(HOST_ES_SELECTOR, host_es_selector), + FIELD(HOST_CS_SELECTOR, host_cs_selector), + FIELD(HOST_SS_SELECTOR, host_ss_selector), + FIELD(HOST_DS_SELECTOR, host_ds_selector), + FIELD(HOST_FS_SELECTOR, host_fs_selector), + FIELD(HOST_GS_SELECTOR, host_gs_selector), + FIELD(HOST_TR_SELECTOR, host_tr_selector), + FIELD64(IO_BITMAP_A, io_bitmap_a), + FIELD64(IO_BITMAP_B, io_bitmap_b), + FIELD64(MSR_BITMAP, msr_bitmap), + FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), + FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), + FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), + FIELD64(PML_ADDRESS, pml_address), + FIELD64(TSC_OFFSET, tsc_offset), + FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), + FIELD64(APIC_ACCESS_ADDR, apic_access_addr), + FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), + FIELD64(VM_FUNCTION_CONTROL, vm_function_control), + FIELD64(EPT_POINTER, ept_pointer), + FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), + FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), + FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), + FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), + FIELD64(EPTP_LIST_ADDRESS, eptp_list_address), + FIELD64(VMREAD_BITMAP, vmread_bitmap), + FIELD64(VMWRITE_BITMAP, vmwrite_bitmap), + FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), + FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), + FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), + FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), + FIELD64(GUEST_IA32_PAT, guest_ia32_pat), + FIELD64(GUEST_IA32_EFER, guest_ia32_efer), + FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), + FIELD64(GUEST_PDPTR0, guest_pdptr0), + FIELD64(GUEST_PDPTR1, guest_pdptr1), + FIELD64(GUEST_PDPTR2, guest_pdptr2), + FIELD64(GUEST_PDPTR3, guest_pdptr3), + FIELD64(GUEST_BNDCFGS, guest_bndcfgs), + FIELD64(HOST_IA32_PAT, host_ia32_pat), + FIELD64(HOST_IA32_EFER, host_ia32_efer), + FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), + FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), + FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), + FIELD(EXCEPTION_BITMAP, exception_bitmap), + FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), + FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), + FIELD(CR3_TARGET_COUNT, cr3_target_count), + FIELD(VM_EXIT_CONTROLS, vm_exit_controls), + FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), + FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), + FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), + FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), + FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), + FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), + FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), + FIELD(TPR_THRESHOLD, tpr_threshold), + FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), + FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), + FIELD(VM_EXIT_REASON, vm_exit_reason), + FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), + FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), + FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), + FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), + FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), + FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), + FIELD(GUEST_ES_LIMIT, guest_es_limit), + FIELD(GUEST_CS_LIMIT, guest_cs_limit), + FIELD(GUEST_SS_LIMIT, guest_ss_limit), + FIELD(GUEST_DS_LIMIT, guest_ds_limit), + FIELD(GUEST_FS_LIMIT, guest_fs_limit), + FIELD(GUEST_GS_LIMIT, guest_gs_limit), + FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), + FIELD(GUEST_TR_LIMIT, guest_tr_limit), + FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), + FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), + FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), + FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), + FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), + FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), + FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), + FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), + FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), + FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), + FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), + FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), + FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), + FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), + FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), + FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), + FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), + FIELD(CR0_READ_SHADOW, cr0_read_shadow), + FIELD(CR4_READ_SHADOW, cr4_read_shadow), + FIELD(CR3_TARGET_VALUE0, cr3_target_value0), + FIELD(CR3_TARGET_VALUE1, cr3_target_value1), + FIELD(CR3_TARGET_VALUE2, cr3_target_value2), + FIELD(CR3_TARGET_VALUE3, cr3_target_value3), + FIELD(EXIT_QUALIFICATION, exit_qualification), + FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), + FIELD(GUEST_CR0, guest_cr0), + FIELD(GUEST_CR3, guest_cr3), + FIELD(GUEST_CR4, guest_cr4), + FIELD(GUEST_ES_BASE, guest_es_base), + FIELD(GUEST_CS_BASE, guest_cs_base), + FIELD(GUEST_SS_BASE, guest_ss_base), + FIELD(GUEST_DS_BASE, guest_ds_base), + FIELD(GUEST_FS_BASE, guest_fs_base), + FIELD(GUEST_GS_BASE, guest_gs_base), + FIELD(GUEST_LDTR_BASE, guest_ldtr_base), + FIELD(GUEST_TR_BASE, guest_tr_base), + FIELD(GUEST_GDTR_BASE, guest_gdtr_base), + FIELD(GUEST_IDTR_BASE, guest_idtr_base), + FIELD(GUEST_DR7, guest_dr7), + FIELD(GUEST_RSP, guest_rsp), + FIELD(GUEST_RIP, guest_rip), + FIELD(GUEST_RFLAGS, guest_rflags), + FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), + FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), + FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), + FIELD(HOST_CR0, host_cr0), + FIELD(HOST_CR3, host_cr3), + FIELD(HOST_CR4, host_cr4), + FIELD(HOST_FS_BASE, host_fs_base), + FIELD(HOST_GS_BASE, host_gs_base), + FIELD(HOST_TR_BASE, host_tr_base), + FIELD(HOST_GDTR_BASE, host_gdtr_base), + FIELD(HOST_IDTR_BASE, host_idtr_base), + FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), + FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), + FIELD(HOST_RSP, host_rsp), + FIELD(HOST_RIP, host_rip), +}; +const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs_field_to_offset_table); diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h new file mode 100644 index 000000000000..3a742428ad17 --- /dev/null +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -0,0 +1,462 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_VMCS12_H +#define __KVM_X86_VMX_VMCS12_H + +#include + +#include "vmcs.h" + +/* + * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a + * single nested guest (L2), hence the name vmcs12. Any VMX implementation has + * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is + * stored in guest memory specified by VMPTRLD, but is opaque to the guest, + * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. + * More than one of these structures may exist, if L1 runs multiple L2 guests. + * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the + * underlying hardware which will be used to run L2. + * This structure is packed to ensure that its layout is identical across + * machines (necessary for live migration). + * + * IMPORTANT: Changing the layout of existing fields in this structure + * will break save/restore compatibility with older kvm releases. When + * adding new fields, either use space in the reserved padding* arrays + * or add the new fields to the end of the structure. + */ +typedef u64 natural_width; +struct __packed vmcs12 { + /* According to the Intel spec, a VMCS region must start with the + * following two fields. Then follow implementation-specific data. + */ + struct vmcs_hdr hdr; + u32 abort; + + u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ + u32 padding[7]; /* room for future expansion */ + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 apic_access_addr; + u64 posted_intr_desc_addr; + u64 ept_pointer; + u64 eoi_exit_bitmap0; + u64 eoi_exit_bitmap1; + u64 eoi_exit_bitmap2; + u64 eoi_exit_bitmap3; + u64 xss_exit_bitmap; + u64 guest_physical_address; + u64 vmcs_link_pointer; + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + u64 guest_ia32_perf_global_ctrl; + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + u64 guest_bndcfgs; + u64 host_ia32_pat; + u64 host_ia32_efer; + u64 host_ia32_perf_global_ctrl; + u64 vmread_bitmap; + u64 vmwrite_bitmap; + u64 vm_function_control; + u64 eptp_list_address; + u64 pml_address; + u64 padding64[3]; /* room for future expansion */ + /* + * To allow migration of L1 (complete with its L2 guests) between + * machines of different natural widths (32 or 64 bit), we cannot have + * unsigned long fields with no explicit size. We use u64 (aliased + * natural_width) instead. Luckily, x86 is little-endian. + */ + natural_width cr0_guest_host_mask; + natural_width cr4_guest_host_mask; + natural_width cr0_read_shadow; + natural_width cr4_read_shadow; + natural_width cr3_target_value0; + natural_width cr3_target_value1; + natural_width cr3_target_value2; + natural_width cr3_target_value3; + natural_width exit_qualification; + natural_width guest_linear_address; + natural_width guest_cr0; + natural_width guest_cr3; + natural_width guest_cr4; + natural_width guest_es_base; + natural_width guest_cs_base; + natural_width guest_ss_base; + natural_width guest_ds_base; + natural_width guest_fs_base; + natural_width guest_gs_base; + natural_width guest_ldtr_base; + natural_width guest_tr_base; + natural_width guest_gdtr_base; + natural_width guest_idtr_base; + natural_width guest_dr7; + natural_width guest_rsp; + natural_width guest_rip; + natural_width guest_rflags; + natural_width guest_pending_dbg_exceptions; + natural_width guest_sysenter_esp; + natural_width guest_sysenter_eip; + natural_width host_cr0; + natural_width host_cr3; + natural_width host_cr4; + natural_width host_fs_base; + natural_width host_gs_base; + natural_width host_tr_base; + natural_width host_gdtr_base; + natural_width host_idtr_base; + natural_width host_ia32_sysenter_esp; + natural_width host_ia32_sysenter_eip; + natural_width host_rsp; + natural_width host_rip; + natural_width paddingl[8]; /* room for future expansion */ + u32 pin_based_vm_exec_control; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + u32 cr3_target_count; + u32 vm_exit_controls; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_controls; + u32 vm_entry_msr_load_count; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + u32 secondary_vm_exec_control; + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + u32 guest_interruptibility_info; + u32 guest_activity_state; + u32 guest_sysenter_cs; + u32 host_ia32_sysenter_cs; + u32 vmx_preemption_timer_value; + u32 padding32[7]; /* room for future expansion */ + u16 virtual_processor_id; + u16 posted_intr_nv; + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + u16 guest_intr_status; + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + u16 guest_pml_index; +}; + +/* + * VMCS12_REVISION is an arbitrary id that should be changed if the content or + * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and + * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. + * + * IMPORTANT: Changing this value will break save/restore compatibility with + * older kvm releases. + */ +#define VMCS12_REVISION 0x11e57ed0 + +/* + * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region + * and any VMCS region. Although only sizeof(struct vmcs12) are used by the + * current implementation, 4K are reserved to avoid future complications. + */ +#define VMCS12_SIZE 0x1000 + +/* + * VMCS12_MAX_FIELD_INDEX is the highest index value used in any + * supported VMCS12 field encoding. + */ +#define VMCS12_MAX_FIELD_INDEX 0x17 + +/* + * For save/restore compatibility, the vmcs12 field offsets must not change. + */ +#define CHECK_OFFSET(field, loc) \ + BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ + "Offset of " #field " in struct vmcs12 has changed.") + +static inline void vmx_check_vmcs12_offsets(void) +{ + CHECK_OFFSET(hdr, 0); + CHECK_OFFSET(abort, 4); + CHECK_OFFSET(launch_state, 8); + CHECK_OFFSET(io_bitmap_a, 40); + CHECK_OFFSET(io_bitmap_b, 48); + CHECK_OFFSET(msr_bitmap, 56); + CHECK_OFFSET(vm_exit_msr_store_addr, 64); + CHECK_OFFSET(vm_exit_msr_load_addr, 72); + CHECK_OFFSET(vm_entry_msr_load_addr, 80); + CHECK_OFFSET(tsc_offset, 88); + CHECK_OFFSET(virtual_apic_page_addr, 96); + CHECK_OFFSET(apic_access_addr, 104); + CHECK_OFFSET(posted_intr_desc_addr, 112); + CHECK_OFFSET(ept_pointer, 120); + CHECK_OFFSET(eoi_exit_bitmap0, 128); + CHECK_OFFSET(eoi_exit_bitmap1, 136); + CHECK_OFFSET(eoi_exit_bitmap2, 144); + CHECK_OFFSET(eoi_exit_bitmap3, 152); + CHECK_OFFSET(xss_exit_bitmap, 160); + CHECK_OFFSET(guest_physical_address, 168); + CHECK_OFFSET(vmcs_link_pointer, 176); + CHECK_OFFSET(guest_ia32_debugctl, 184); + CHECK_OFFSET(guest_ia32_pat, 192); + CHECK_OFFSET(guest_ia32_efer, 200); + CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208); + CHECK_OFFSET(guest_pdptr0, 216); + CHECK_OFFSET(guest_pdptr1, 224); + CHECK_OFFSET(guest_pdptr2, 232); + CHECK_OFFSET(guest_pdptr3, 240); + CHECK_OFFSET(guest_bndcfgs, 248); + CHECK_OFFSET(host_ia32_pat, 256); + CHECK_OFFSET(host_ia32_efer, 264); + CHECK_OFFSET(host_ia32_perf_global_ctrl, 272); + CHECK_OFFSET(vmread_bitmap, 280); + CHECK_OFFSET(vmwrite_bitmap, 288); + CHECK_OFFSET(vm_function_control, 296); + CHECK_OFFSET(eptp_list_address, 304); + CHECK_OFFSET(pml_address, 312); + CHECK_OFFSET(cr0_guest_host_mask, 344); + CHECK_OFFSET(cr4_guest_host_mask, 352); + CHECK_OFFSET(cr0_read_shadow, 360); + CHECK_OFFSET(cr4_read_shadow, 368); + CHECK_OFFSET(cr3_target_value0, 376); + CHECK_OFFSET(cr3_target_value1, 384); + CHECK_OFFSET(cr3_target_value2, 392); + CHECK_OFFSET(cr3_target_value3, 400); + CHECK_OFFSET(exit_qualification, 408); + CHECK_OFFSET(guest_linear_address, 416); + CHECK_OFFSET(guest_cr0, 424); + CHECK_OFFSET(guest_cr3, 432); + CHECK_OFFSET(guest_cr4, 440); + CHECK_OFFSET(guest_es_base, 448); + CHECK_OFFSET(guest_cs_base, 456); + CHECK_OFFSET(guest_ss_base, 464); + CHECK_OFFSET(guest_ds_base, 472); + CHECK_OFFSET(guest_fs_base, 480); + CHECK_OFFSET(guest_gs_base, 488); + CHECK_OFFSET(guest_ldtr_base, 496); + CHECK_OFFSET(guest_tr_base, 504); + CHECK_OFFSET(guest_gdtr_base, 512); + CHECK_OFFSET(guest_idtr_base, 520); + CHECK_OFFSET(guest_dr7, 528); + CHECK_OFFSET(guest_rsp, 536); + CHECK_OFFSET(guest_rip, 544); + CHECK_OFFSET(guest_rflags, 552); + CHECK_OFFSET(guest_pending_dbg_exceptions, 560); + CHECK_OFFSET(guest_sysenter_esp, 568); + CHECK_OFFSET(guest_sysenter_eip, 576); + CHECK_OFFSET(host_cr0, 584); + CHECK_OFFSET(host_cr3, 592); + CHECK_OFFSET(host_cr4, 600); + CHECK_OFFSET(host_fs_base, 608); + CHECK_OFFSET(host_gs_base, 616); + CHECK_OFFSET(host_tr_base, 624); + CHECK_OFFSET(host_gdtr_base, 632); + CHECK_OFFSET(host_idtr_base, 640); + CHECK_OFFSET(host_ia32_sysenter_esp, 648); + CHECK_OFFSET(host_ia32_sysenter_eip, 656); + CHECK_OFFSET(host_rsp, 664); + CHECK_OFFSET(host_rip, 672); + CHECK_OFFSET(pin_based_vm_exec_control, 744); + CHECK_OFFSET(cpu_based_vm_exec_control, 748); + CHECK_OFFSET(exception_bitmap, 752); + CHECK_OFFSET(page_fault_error_code_mask, 756); + CHECK_OFFSET(page_fault_error_code_match, 760); + CHECK_OFFSET(cr3_target_count, 764); + CHECK_OFFSET(vm_exit_controls, 768); + CHECK_OFFSET(vm_exit_msr_store_count, 772); + CHECK_OFFSET(vm_exit_msr_load_count, 776); + CHECK_OFFSET(vm_entry_controls, 780); + CHECK_OFFSET(vm_entry_msr_load_count, 784); + CHECK_OFFSET(vm_entry_intr_info_field, 788); + CHECK_OFFSET(vm_entry_exception_error_code, 792); + CHECK_OFFSET(vm_entry_instruction_len, 796); + CHECK_OFFSET(tpr_threshold, 800); + CHECK_OFFSET(secondary_vm_exec_control, 804); + CHECK_OFFSET(vm_instruction_error, 808); + CHECK_OFFSET(vm_exit_reason, 812); + CHECK_OFFSET(vm_exit_intr_info, 816); + CHECK_OFFSET(vm_exit_intr_error_code, 820); + CHECK_OFFSET(idt_vectoring_info_field, 824); + CHECK_OFFSET(idt_vectoring_error_code, 828); + CHECK_OFFSET(vm_exit_instruction_len, 832); + CHECK_OFFSET(vmx_instruction_info, 836); + CHECK_OFFSET(guest_es_limit, 840); + CHECK_OFFSET(guest_cs_limit, 844); + CHECK_OFFSET(guest_ss_limit, 848); + CHECK_OFFSET(guest_ds_limit, 852); + CHECK_OFFSET(guest_fs_limit, 856); + CHECK_OFFSET(guest_gs_limit, 860); + CHECK_OFFSET(guest_ldtr_limit, 864); + CHECK_OFFSET(guest_tr_limit, 868); + CHECK_OFFSET(guest_gdtr_limit, 872); + CHECK_OFFSET(guest_idtr_limit, 876); + CHECK_OFFSET(guest_es_ar_bytes, 880); + CHECK_OFFSET(guest_cs_ar_bytes, 884); + CHECK_OFFSET(guest_ss_ar_bytes, 888); + CHECK_OFFSET(guest_ds_ar_bytes, 892); + CHECK_OFFSET(guest_fs_ar_bytes, 896); + CHECK_OFFSET(guest_gs_ar_bytes, 900); + CHECK_OFFSET(guest_ldtr_ar_bytes, 904); + CHECK_OFFSET(guest_tr_ar_bytes, 908); + CHECK_OFFSET(guest_interruptibility_info, 912); + CHECK_OFFSET(guest_activity_state, 916); + CHECK_OFFSET(guest_sysenter_cs, 920); + CHECK_OFFSET(host_ia32_sysenter_cs, 924); + CHECK_OFFSET(vmx_preemption_timer_value, 928); + CHECK_OFFSET(virtual_processor_id, 960); + CHECK_OFFSET(posted_intr_nv, 962); + CHECK_OFFSET(guest_es_selector, 964); + CHECK_OFFSET(guest_cs_selector, 966); + CHECK_OFFSET(guest_ss_selector, 968); + CHECK_OFFSET(guest_ds_selector, 970); + CHECK_OFFSET(guest_fs_selector, 972); + CHECK_OFFSET(guest_gs_selector, 974); + CHECK_OFFSET(guest_ldtr_selector, 976); + CHECK_OFFSET(guest_tr_selector, 978); + CHECK_OFFSET(guest_intr_status, 980); + CHECK_OFFSET(host_es_selector, 982); + CHECK_OFFSET(host_cs_selector, 984); + CHECK_OFFSET(host_ss_selector, 986); + CHECK_OFFSET(host_ds_selector, 988); + CHECK_OFFSET(host_fs_selector, 990); + CHECK_OFFSET(host_gs_selector, 992); + CHECK_OFFSET(host_tr_selector, 994); + CHECK_OFFSET(guest_pml_index, 996); +} + +extern const unsigned short vmcs_field_to_offset_table[]; +extern const unsigned int nr_vmcs12_fields; + +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) + +static inline short vmcs_field_to_offset(unsigned long field) +{ + unsigned short offset; + unsigned int index; + + if (field >> 15) + return -ENOENT; + + index = ROL16(field, 6); + if (index >= nr_vmcs12_fields) + return -ENOENT; + + index = array_index_nospec(index, nr_vmcs12_fields); + offset = vmcs_field_to_offset_table[index]; + if (offset == 0) + return -ENOENT; + return offset; +} + +#undef ROL16 + +/* + * Read a vmcs12 field. Since these can have varying lengths and we return + * one type, we chose the biggest type (u64) and zero-extend the return value + * to that size. Note that the caller, handle_vmread, might need to use only + * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of + * 64-bit fields are to be returned). + */ +static inline int vmcs12_read_any(struct vmcs12 *vmcs12, + unsigned long field, u64 *ret) +{ + short offset = vmcs_field_to_offset(field); + char *p; + + if (offset < 0) + return offset; + + p = (char *)vmcs12 + offset; + + switch (vmcs_field_width(field)) { + case VMCS_FIELD_WIDTH_NATURAL_WIDTH: + *ret = *((natural_width *)p); + return 0; + case VMCS_FIELD_WIDTH_U16: + *ret = *((u16 *)p); + return 0; + case VMCS_FIELD_WIDTH_U32: + *ret = *((u32 *)p); + return 0; + case VMCS_FIELD_WIDTH_U64: + *ret = *((u64 *)p); + return 0; + default: + WARN_ON(1); + return -ENOENT; + } +} + +static inline int vmcs12_write_any(struct vmcs12 *vmcs12, + unsigned long field, u64 field_value){ + short offset = vmcs_field_to_offset(field); + char *p = (char *)vmcs12 + offset; + + if (offset < 0) + return offset; + + switch (vmcs_field_width(field)) { + case VMCS_FIELD_WIDTH_U16: + *(u16 *)p = field_value; + return 0; + case VMCS_FIELD_WIDTH_U32: + *(u32 *)p = field_value; + return 0; + case VMCS_FIELD_WIDTH_U64: + *(u64 *)p = field_value; + return 0; + case VMCS_FIELD_WIDTH_NATURAL_WIDTH: + *(natural_width *)p = field_value; + return 0; + default: + WARN_ON(1); + return -ENOENT; + } + +} + +#endif /* __KVM_X86_VMX_VMCS12_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d2bafc22d594..ebe500f48bdf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -59,6 +58,7 @@ #include "pmu.h" #include "trace.h" #include "vmcs.h" +#include "vmcs12.h" #include "x86.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -368,367 +368,6 @@ struct shared_msr_entry { u64 mask; }; -/* - * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a - * single nested guest (L2), hence the name vmcs12. Any VMX implementation has - * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is - * stored in guest memory specified by VMPTRLD, but is opaque to the guest, - * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. - * More than one of these structures may exist, if L1 runs multiple L2 guests. - * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the - * underlying hardware which will be used to run L2. - * This structure is packed to ensure that its layout is identical across - * machines (necessary for live migration). - * - * IMPORTANT: Changing the layout of existing fields in this structure - * will break save/restore compatibility with older kvm releases. When - * adding new fields, either use space in the reserved padding* arrays - * or add the new fields to the end of the structure. - */ -typedef u64 natural_width; -struct __packed vmcs12 { - /* According to the Intel spec, a VMCS region must start with the - * following two fields. Then follow implementation-specific data. - */ - struct vmcs_hdr hdr; - u32 abort; - - u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ - u32 padding[7]; /* room for future expansion */ - - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - u64 tsc_offset; - u64 virtual_apic_page_addr; - u64 apic_access_addr; - u64 posted_intr_desc_addr; - u64 ept_pointer; - u64 eoi_exit_bitmap0; - u64 eoi_exit_bitmap1; - u64 eoi_exit_bitmap2; - u64 eoi_exit_bitmap3; - u64 xss_exit_bitmap; - u64 guest_physical_address; - u64 vmcs_link_pointer; - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; - u64 guest_ia32_perf_global_ctrl; - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - u64 guest_bndcfgs; - u64 host_ia32_pat; - u64 host_ia32_efer; - u64 host_ia32_perf_global_ctrl; - u64 vmread_bitmap; - u64 vmwrite_bitmap; - u64 vm_function_control; - u64 eptp_list_address; - u64 pml_address; - u64 padding64[3]; /* room for future expansion */ - /* - * To allow migration of L1 (complete with its L2 guests) between - * machines of different natural widths (32 or 64 bit), we cannot have - * unsigned long fields with no explict size. We use u64 (aliased - * natural_width) instead. Luckily, x86 is little-endian. - */ - natural_width cr0_guest_host_mask; - natural_width cr4_guest_host_mask; - natural_width cr0_read_shadow; - natural_width cr4_read_shadow; - natural_width cr3_target_value0; - natural_width cr3_target_value1; - natural_width cr3_target_value2; - natural_width cr3_target_value3; - natural_width exit_qualification; - natural_width guest_linear_address; - natural_width guest_cr0; - natural_width guest_cr3; - natural_width guest_cr4; - natural_width guest_es_base; - natural_width guest_cs_base; - natural_width guest_ss_base; - natural_width guest_ds_base; - natural_width guest_fs_base; - natural_width guest_gs_base; - natural_width guest_ldtr_base; - natural_width guest_tr_base; - natural_width guest_gdtr_base; - natural_width guest_idtr_base; - natural_width guest_dr7; - natural_width guest_rsp; - natural_width guest_rip; - natural_width guest_rflags; - natural_width guest_pending_dbg_exceptions; - natural_width guest_sysenter_esp; - natural_width guest_sysenter_eip; - natural_width host_cr0; - natural_width host_cr3; - natural_width host_cr4; - natural_width host_fs_base; - natural_width host_gs_base; - natural_width host_tr_base; - natural_width host_gdtr_base; - natural_width host_idtr_base; - natural_width host_ia32_sysenter_esp; - natural_width host_ia32_sysenter_eip; - natural_width host_rsp; - natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ - u32 pin_based_vm_exec_control; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - u32 cr3_target_count; - u32 vm_exit_controls; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_controls; - u32 vm_entry_msr_load_count; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - u32 secondary_vm_exec_control; - u32 vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 guest_idtr_limit; - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - u32 guest_interruptibility_info; - u32 guest_activity_state; - u32 guest_sysenter_cs; - u32 host_ia32_sysenter_cs; - u32 vmx_preemption_timer_value; - u32 padding32[7]; /* room for future expansion */ - u16 virtual_processor_id; - u16 posted_intr_nv; - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; - u16 guest_intr_status; - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; - u16 guest_pml_index; -}; - -/* - * For save/restore compatibility, the vmcs12 field offsets must not change. - */ -#define CHECK_OFFSET(field, loc) \ - BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ - "Offset of " #field " in struct vmcs12 has changed.") - -static inline void vmx_check_vmcs12_offsets(void) { - CHECK_OFFSET(hdr, 0); - CHECK_OFFSET(abort, 4); - CHECK_OFFSET(launch_state, 8); - CHECK_OFFSET(io_bitmap_a, 40); - CHECK_OFFSET(io_bitmap_b, 48); - CHECK_OFFSET(msr_bitmap, 56); - CHECK_OFFSET(vm_exit_msr_store_addr, 64); - CHECK_OFFSET(vm_exit_msr_load_addr, 72); - CHECK_OFFSET(vm_entry_msr_load_addr, 80); - CHECK_OFFSET(tsc_offset, 88); - CHECK_OFFSET(virtual_apic_page_addr, 96); - CHECK_OFFSET(apic_access_addr, 104); - CHECK_OFFSET(posted_intr_desc_addr, 112); - CHECK_OFFSET(ept_pointer, 120); - CHECK_OFFSET(eoi_exit_bitmap0, 128); - CHECK_OFFSET(eoi_exit_bitmap1, 136); - CHECK_OFFSET(eoi_exit_bitmap2, 144); - CHECK_OFFSET(eoi_exit_bitmap3, 152); - CHECK_OFFSET(xss_exit_bitmap, 160); - CHECK_OFFSET(guest_physical_address, 168); - CHECK_OFFSET(vmcs_link_pointer, 176); - CHECK_OFFSET(guest_ia32_debugctl, 184); - CHECK_OFFSET(guest_ia32_pat, 192); - CHECK_OFFSET(guest_ia32_efer, 200); - CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208); - CHECK_OFFSET(guest_pdptr0, 216); - CHECK_OFFSET(guest_pdptr1, 224); - CHECK_OFFSET(guest_pdptr2, 232); - CHECK_OFFSET(guest_pdptr3, 240); - CHECK_OFFSET(guest_bndcfgs, 248); - CHECK_OFFSET(host_ia32_pat, 256); - CHECK_OFFSET(host_ia32_efer, 264); - CHECK_OFFSET(host_ia32_perf_global_ctrl, 272); - CHECK_OFFSET(vmread_bitmap, 280); - CHECK_OFFSET(vmwrite_bitmap, 288); - CHECK_OFFSET(vm_function_control, 296); - CHECK_OFFSET(eptp_list_address, 304); - CHECK_OFFSET(pml_address, 312); - CHECK_OFFSET(cr0_guest_host_mask, 344); - CHECK_OFFSET(cr4_guest_host_mask, 352); - CHECK_OFFSET(cr0_read_shadow, 360); - CHECK_OFFSET(cr4_read_shadow, 368); - CHECK_OFFSET(cr3_target_value0, 376); - CHECK_OFFSET(cr3_target_value1, 384); - CHECK_OFFSET(cr3_target_value2, 392); - CHECK_OFFSET(cr3_target_value3, 400); - CHECK_OFFSET(exit_qualification, 408); - CHECK_OFFSET(guest_linear_address, 416); - CHECK_OFFSET(guest_cr0, 424); - CHECK_OFFSET(guest_cr3, 432); - CHECK_OFFSET(guest_cr4, 440); - CHECK_OFFSET(guest_es_base, 448); - CHECK_OFFSET(guest_cs_base, 456); - CHECK_OFFSET(guest_ss_base, 464); - CHECK_OFFSET(guest_ds_base, 472); - CHECK_OFFSET(guest_fs_base, 480); - CHECK_OFFSET(guest_gs_base, 488); - CHECK_OFFSET(guest_ldtr_base, 496); - CHECK_OFFSET(guest_tr_base, 504); - CHECK_OFFSET(guest_gdtr_base, 512); - CHECK_OFFSET(guest_idtr_base, 520); - CHECK_OFFSET(guest_dr7, 528); - CHECK_OFFSET(guest_rsp, 536); - CHECK_OFFSET(guest_rip, 544); - CHECK_OFFSET(guest_rflags, 552); - CHECK_OFFSET(guest_pending_dbg_exceptions, 560); - CHECK_OFFSET(guest_sysenter_esp, 568); - CHECK_OFFSET(guest_sysenter_eip, 576); - CHECK_OFFSET(host_cr0, 584); - CHECK_OFFSET(host_cr3, 592); - CHECK_OFFSET(host_cr4, 600); - CHECK_OFFSET(host_fs_base, 608); - CHECK_OFFSET(host_gs_base, 616); - CHECK_OFFSET(host_tr_base, 624); - CHECK_OFFSET(host_gdtr_base, 632); - CHECK_OFFSET(host_idtr_base, 640); - CHECK_OFFSET(host_ia32_sysenter_esp, 648); - CHECK_OFFSET(host_ia32_sysenter_eip, 656); - CHECK_OFFSET(host_rsp, 664); - CHECK_OFFSET(host_rip, 672); - CHECK_OFFSET(pin_based_vm_exec_control, 744); - CHECK_OFFSET(cpu_based_vm_exec_control, 748); - CHECK_OFFSET(exception_bitmap, 752); - CHECK_OFFSET(page_fault_error_code_mask, 756); - CHECK_OFFSET(page_fault_error_code_match, 760); - CHECK_OFFSET(cr3_target_count, 764); - CHECK_OFFSET(vm_exit_controls, 768); - CHECK_OFFSET(vm_exit_msr_store_count, 772); - CHECK_OFFSET(vm_exit_msr_load_count, 776); - CHECK_OFFSET(vm_entry_controls, 780); - CHECK_OFFSET(vm_entry_msr_load_count, 784); - CHECK_OFFSET(vm_entry_intr_info_field, 788); - CHECK_OFFSET(vm_entry_exception_error_code, 792); - CHECK_OFFSET(vm_entry_instruction_len, 796); - CHECK_OFFSET(tpr_threshold, 800); - CHECK_OFFSET(secondary_vm_exec_control, 804); - CHECK_OFFSET(vm_instruction_error, 808); - CHECK_OFFSET(vm_exit_reason, 812); - CHECK_OFFSET(vm_exit_intr_info, 816); - CHECK_OFFSET(vm_exit_intr_error_code, 820); - CHECK_OFFSET(idt_vectoring_info_field, 824); - CHECK_OFFSET(idt_vectoring_error_code, 828); - CHECK_OFFSET(vm_exit_instruction_len, 832); - CHECK_OFFSET(vmx_instruction_info, 836); - CHECK_OFFSET(guest_es_limit, 840); - CHECK_OFFSET(guest_cs_limit, 844); - CHECK_OFFSET(guest_ss_limit, 848); - CHECK_OFFSET(guest_ds_limit, 852); - CHECK_OFFSET(guest_fs_limit, 856); - CHECK_OFFSET(guest_gs_limit, 860); - CHECK_OFFSET(guest_ldtr_limit, 864); - CHECK_OFFSET(guest_tr_limit, 868); - CHECK_OFFSET(guest_gdtr_limit, 872); - CHECK_OFFSET(guest_idtr_limit, 876); - CHECK_OFFSET(guest_es_ar_bytes, 880); - CHECK_OFFSET(guest_cs_ar_bytes, 884); - CHECK_OFFSET(guest_ss_ar_bytes, 888); - CHECK_OFFSET(guest_ds_ar_bytes, 892); - CHECK_OFFSET(guest_fs_ar_bytes, 896); - CHECK_OFFSET(guest_gs_ar_bytes, 900); - CHECK_OFFSET(guest_ldtr_ar_bytes, 904); - CHECK_OFFSET(guest_tr_ar_bytes, 908); - CHECK_OFFSET(guest_interruptibility_info, 912); - CHECK_OFFSET(guest_activity_state, 916); - CHECK_OFFSET(guest_sysenter_cs, 920); - CHECK_OFFSET(host_ia32_sysenter_cs, 924); - CHECK_OFFSET(vmx_preemption_timer_value, 928); - CHECK_OFFSET(virtual_processor_id, 960); - CHECK_OFFSET(posted_intr_nv, 962); - CHECK_OFFSET(guest_es_selector, 964); - CHECK_OFFSET(guest_cs_selector, 966); - CHECK_OFFSET(guest_ss_selector, 968); - CHECK_OFFSET(guest_ds_selector, 970); - CHECK_OFFSET(guest_fs_selector, 972); - CHECK_OFFSET(guest_gs_selector, 974); - CHECK_OFFSET(guest_ldtr_selector, 976); - CHECK_OFFSET(guest_tr_selector, 978); - CHECK_OFFSET(guest_intr_status, 980); - CHECK_OFFSET(host_es_selector, 982); - CHECK_OFFSET(host_cs_selector, 984); - CHECK_OFFSET(host_ss_selector, 986); - CHECK_OFFSET(host_ds_selector, 988); - CHECK_OFFSET(host_fs_selector, 990); - CHECK_OFFSET(host_gs_selector, 992); - CHECK_OFFSET(host_tr_selector, 994); - CHECK_OFFSET(guest_pml_index, 996); -} - -/* - * VMCS12_REVISION is an arbitrary id that should be changed if the content or - * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and - * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. - * - * IMPORTANT: Changing this value will break save/restore compatibility with - * older kvm releases. - */ -#define VMCS12_REVISION 0x11e57ed0 - -/* - * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region - * and any VMCS region. Although only sizeof(struct vmcs12) are used by the - * current implementation, 4K are reserved to avoid future complications. - */ -#define VMCS12_SIZE 0x1000 - -/* - * VMCS12_MAX_FIELD_INDEX is the highest index value used in any - * supported VMCS12 field encoding. - */ -#define VMCS12_MAX_FIELD_INDEX 0x17 - /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -1017,14 +656,6 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) -#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) -#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) -#define FIELD64(number, name) \ - FIELD(number, name), \ - [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) - - static u16 shadow_read_only_fields[] = { #define SHADOW_FIELD_RO(x) x, #include "vmcs_shadow_fields.h" @@ -1039,172 +670,6 @@ static u16 shadow_read_write_fields[] = { static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); -static const unsigned short vmcs_field_to_offset_table[] = { - FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), - FIELD(POSTED_INTR_NV, posted_intr_nv), - FIELD(GUEST_ES_SELECTOR, guest_es_selector), - FIELD(GUEST_CS_SELECTOR, guest_cs_selector), - FIELD(GUEST_SS_SELECTOR, guest_ss_selector), - FIELD(GUEST_DS_SELECTOR, guest_ds_selector), - FIELD(GUEST_FS_SELECTOR, guest_fs_selector), - FIELD(GUEST_GS_SELECTOR, guest_gs_selector), - FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), - FIELD(GUEST_TR_SELECTOR, guest_tr_selector), - FIELD(GUEST_INTR_STATUS, guest_intr_status), - FIELD(GUEST_PML_INDEX, guest_pml_index), - FIELD(HOST_ES_SELECTOR, host_es_selector), - FIELD(HOST_CS_SELECTOR, host_cs_selector), - FIELD(HOST_SS_SELECTOR, host_ss_selector), - FIELD(HOST_DS_SELECTOR, host_ds_selector), - FIELD(HOST_FS_SELECTOR, host_fs_selector), - FIELD(HOST_GS_SELECTOR, host_gs_selector), - FIELD(HOST_TR_SELECTOR, host_tr_selector), - FIELD64(IO_BITMAP_A, io_bitmap_a), - FIELD64(IO_BITMAP_B, io_bitmap_b), - FIELD64(MSR_BITMAP, msr_bitmap), - FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), - FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), - FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), - FIELD64(PML_ADDRESS, pml_address), - FIELD64(TSC_OFFSET, tsc_offset), - FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), - FIELD64(APIC_ACCESS_ADDR, apic_access_addr), - FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), - FIELD64(VM_FUNCTION_CONTROL, vm_function_control), - FIELD64(EPT_POINTER, ept_pointer), - FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), - FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), - FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), - FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), - FIELD64(EPTP_LIST_ADDRESS, eptp_list_address), - FIELD64(VMREAD_BITMAP, vmread_bitmap), - FIELD64(VMWRITE_BITMAP, vmwrite_bitmap), - FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), - FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), - FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), - FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), - FIELD64(GUEST_IA32_PAT, guest_ia32_pat), - FIELD64(GUEST_IA32_EFER, guest_ia32_efer), - FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), - FIELD64(GUEST_PDPTR0, guest_pdptr0), - FIELD64(GUEST_PDPTR1, guest_pdptr1), - FIELD64(GUEST_PDPTR2, guest_pdptr2), - FIELD64(GUEST_PDPTR3, guest_pdptr3), - FIELD64(GUEST_BNDCFGS, guest_bndcfgs), - FIELD64(HOST_IA32_PAT, host_ia32_pat), - FIELD64(HOST_IA32_EFER, host_ia32_efer), - FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), - FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), - FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), - FIELD(EXCEPTION_BITMAP, exception_bitmap), - FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), - FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), - FIELD(CR3_TARGET_COUNT, cr3_target_count), - FIELD(VM_EXIT_CONTROLS, vm_exit_controls), - FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), - FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), - FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), - FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), - FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), - FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), - FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), - FIELD(TPR_THRESHOLD, tpr_threshold), - FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), - FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), - FIELD(VM_EXIT_REASON, vm_exit_reason), - FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), - FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), - FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), - FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), - FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), - FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), - FIELD(GUEST_ES_LIMIT, guest_es_limit), - FIELD(GUEST_CS_LIMIT, guest_cs_limit), - FIELD(GUEST_SS_LIMIT, guest_ss_limit), - FIELD(GUEST_DS_LIMIT, guest_ds_limit), - FIELD(GUEST_FS_LIMIT, guest_fs_limit), - FIELD(GUEST_GS_LIMIT, guest_gs_limit), - FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), - FIELD(GUEST_TR_LIMIT, guest_tr_limit), - FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), - FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), - FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), - FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), - FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), - FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), - FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), - FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), - FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), - FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), - FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), - FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), - FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), - FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), - FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), - FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), - FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), - FIELD(CR0_READ_SHADOW, cr0_read_shadow), - FIELD(CR4_READ_SHADOW, cr4_read_shadow), - FIELD(CR3_TARGET_VALUE0, cr3_target_value0), - FIELD(CR3_TARGET_VALUE1, cr3_target_value1), - FIELD(CR3_TARGET_VALUE2, cr3_target_value2), - FIELD(CR3_TARGET_VALUE3, cr3_target_value3), - FIELD(EXIT_QUALIFICATION, exit_qualification), - FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), - FIELD(GUEST_CR0, guest_cr0), - FIELD(GUEST_CR3, guest_cr3), - FIELD(GUEST_CR4, guest_cr4), - FIELD(GUEST_ES_BASE, guest_es_base), - FIELD(GUEST_CS_BASE, guest_cs_base), - FIELD(GUEST_SS_BASE, guest_ss_base), - FIELD(GUEST_DS_BASE, guest_ds_base), - FIELD(GUEST_FS_BASE, guest_fs_base), - FIELD(GUEST_GS_BASE, guest_gs_base), - FIELD(GUEST_LDTR_BASE, guest_ldtr_base), - FIELD(GUEST_TR_BASE, guest_tr_base), - FIELD(GUEST_GDTR_BASE, guest_gdtr_base), - FIELD(GUEST_IDTR_BASE, guest_idtr_base), - FIELD(GUEST_DR7, guest_dr7), - FIELD(GUEST_RSP, guest_rsp), - FIELD(GUEST_RIP, guest_rip), - FIELD(GUEST_RFLAGS, guest_rflags), - FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), - FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), - FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), - FIELD(HOST_CR0, host_cr0), - FIELD(HOST_CR3, host_cr3), - FIELD(HOST_CR4, host_cr4), - FIELD(HOST_FS_BASE, host_fs_base), - FIELD(HOST_GS_BASE, host_gs_base), - FIELD(HOST_TR_BASE, host_tr_base), - FIELD(HOST_GDTR_BASE, host_gdtr_base), - FIELD(HOST_IDTR_BASE, host_idtr_base), - FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), - FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), - FIELD(HOST_RSP, host_rsp), - FIELD(HOST_RIP, host_rip), -}; - -static inline short vmcs_field_to_offset(unsigned long field) -{ - const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table); - unsigned short offset; - unsigned index; - - if (field >> 15) - return -ENOENT; - - index = ROL16(field, 6); - if (index >= size) - return -ENOENT; - - index = array_index_nospec(index, size); - offset = vmcs_field_to_offset_table[index]; - if (offset == 0) - return -ENOENT; - return offset; -} - static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.cached_vmcs12; @@ -8206,71 +7671,6 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) return nested_vmx_run(vcpu, false); } -/* - * Read a vmcs12 field. Since these can have varying lengths and we return - * one type, we chose the biggest type (u64) and zero-extend the return value - * to that size. Note that the caller, handle_vmread, might need to use only - * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of - * 64-bit fields are to be returned). - */ -static inline int vmcs12_read_any(struct vmcs12 *vmcs12, - unsigned long field, u64 *ret) -{ - short offset = vmcs_field_to_offset(field); - char *p; - - if (offset < 0) - return offset; - - p = (char *)vmcs12 + offset; - - switch (vmcs_field_width(field)) { - case VMCS_FIELD_WIDTH_NATURAL_WIDTH: - *ret = *((natural_width *)p); - return 0; - case VMCS_FIELD_WIDTH_U16: - *ret = *((u16 *)p); - return 0; - case VMCS_FIELD_WIDTH_U32: - *ret = *((u32 *)p); - return 0; - case VMCS_FIELD_WIDTH_U64: - *ret = *((u64 *)p); - return 0; - default: - WARN_ON(1); - return -ENOENT; - } -} - - -static inline int vmcs12_write_any(struct vmcs12 *vmcs12, - unsigned long field, u64 field_value){ - short offset = vmcs_field_to_offset(field); - char *p = (char *)vmcs12 + offset; - if (offset < 0) - return offset; - - switch (vmcs_field_width(field)) { - case VMCS_FIELD_WIDTH_U16: - *(u16 *)p = field_value; - return 0; - case VMCS_FIELD_WIDTH_U32: - *(u32 *)p = field_value; - return 0; - case VMCS_FIELD_WIDTH_U64: - *(u64 *)p = field_value; - return 0; - case VMCS_FIELD_WIDTH_NATURAL_WIDTH: - *(natural_width *)p = field_value; - return 0; - default: - WARN_ON(1); - return -ENOENT; - } - -} - static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) { struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; -- cgit v1.2.3 From 8373d25d25d14fe644feae007c15a5a10cf8e888 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:08 -0800 Subject: KVM: VMX: Add vmx.h to hold VMX definitions Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 364 +------------------------------------------------ arch/x86/kvm/vmx/vmx.h | 351 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 352 insertions(+), 363 deletions(-) create mode 100644 arch/x86/kvm/vmx/vmx.h (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ebe500f48bdf..6e3da6486471 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -60,6 +60,7 @@ #include "vmcs.h" #include "vmcs12.h" #include "x86.h" +#include "vmx.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) #define __ex_clear(x, reg) \ @@ -120,10 +121,6 @@ static u64 __read_mostly host_xss; bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); -#define MSR_TYPE_R 1 -#define MSR_TYPE_W 2 -#define MSR_TYPE_RW 3 - #define MSR_BITMAP_MODE_X2APIC 1 #define MSR_BITMAP_MODE_X2APIC_APICV 2 @@ -345,317 +342,6 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = { }; module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); -enum ept_pointers_status { - EPT_POINTERS_CHECK = 0, - EPT_POINTERS_MATCH = 1, - EPT_POINTERS_MISMATCH = 2 -}; - -struct kvm_vmx { - struct kvm kvm; - - unsigned int tss_addr; - bool ept_identity_pagetable_done; - gpa_t ept_identity_map_addr; - - enum ept_pointers_status ept_pointers_match; - spinlock_t ept_pointer_lock; -}; - -struct shared_msr_entry { - unsigned index; - u64 data; - u64 mask; -}; - -/* - * The nested_vmx structure is part of vcpu_vmx, and holds information we need - * for correct emulation of VMX (i.e., nested VMX) on this vcpu. - */ -struct nested_vmx { - /* Has the level1 guest done vmxon? */ - bool vmxon; - gpa_t vmxon_ptr; - bool pml_full; - - /* The guest-physical address of the current VMCS L1 keeps for L2 */ - gpa_t current_vmptr; - /* - * Cache of the guest's VMCS, existing outside of guest memory. - * Loaded from guest memory during VMPTRLD. Flushed to guest - * memory during VMCLEAR and VMPTRLD. - */ - struct vmcs12 *cached_vmcs12; - /* - * Cache of the guest's shadow VMCS, existing outside of guest - * memory. Loaded from guest memory during VM entry. Flushed - * to guest memory during VM exit. - */ - struct vmcs12 *cached_shadow_vmcs12; - /* - * Indicates if the shadow vmcs or enlightened vmcs must be updated - * with the data held by struct vmcs12. - */ - bool need_vmcs12_sync; - bool dirty_vmcs12; - - /* - * vmcs02 has been initialized, i.e. state that is constant for - * vmcs02 has been written to the backing VMCS. Initialization - * is delayed until L1 actually attempts to run a nested VM. - */ - bool vmcs02_initialized; - - bool change_vmcs01_virtual_apic_mode; - - /* - * Enlightened VMCS has been enabled. It does not mean that L1 has to - * use it. However, VMX features available to L1 will be limited based - * on what the enlightened VMCS supports. - */ - bool enlightened_vmcs_enabled; - - /* L2 must run next, and mustn't decide to exit to L1. */ - bool nested_run_pending; - - struct loaded_vmcs vmcs02; - - /* - * Guest pages referred to in the vmcs02 with host-physical - * pointers, so we must keep them pinned while L2 runs. - */ - struct page *apic_access_page; - struct page *virtual_apic_page; - struct page *pi_desc_page; - struct pi_desc *pi_desc; - bool pi_pending; - u16 posted_intr_nv; - - struct hrtimer preemption_timer; - bool preemption_timer_expired; - - /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ - u64 vmcs01_debugctl; - u64 vmcs01_guest_bndcfgs; - - u16 vpid02; - u16 last_vpid; - - struct nested_vmx_msrs msrs; - - /* SMM related state */ - struct { - /* in VMX operation on SMM entry? */ - bool vmxon; - /* in guest mode on SMM entry? */ - bool guest_mode; - } smm; - - gpa_t hv_evmcs_vmptr; - struct page *hv_evmcs_page; - struct hv_enlightened_vmcs *hv_evmcs; -}; - -#define POSTED_INTR_ON 0 -#define POSTED_INTR_SN 1 - -/* Posted-Interrupt Descriptor */ -struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ - union { - struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ - u8 nv; - /* bit 287:280 - Reserved */ - u8 rsvd_2; - /* bit 319:288 - Notification Destination */ - u32 ndst; - }; - u64 control; - }; - u32 rsvd[6]; -} __aligned(64); - -static bool pi_test_and_set_on(struct pi_desc *pi_desc) -{ - return test_and_set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static bool pi_test_and_clear_on(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) -{ - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); -} - -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - return clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_set_sn(struct pi_desc *pi_desc) -{ - return set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_on(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_on(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_sn(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -#define NR_AUTOLOAD_MSRS 8 - -struct vmx_msrs { - unsigned int nr; - struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; -}; - -struct vcpu_vmx { - struct kvm_vcpu vcpu; - unsigned long host_rsp; - u8 fail; - u8 msr_bitmap_mode; - u32 exit_intr_info; - u32 idt_vectoring_info; - ulong rflags; - struct shared_msr_entry *guest_msrs; - int nmsrs; - int save_nmsrs; - bool guest_msrs_dirty; - unsigned long host_idt_base; -#ifdef CONFIG_X86_64 - u64 msr_host_kernel_gs_base; - u64 msr_guest_kernel_gs_base; -#endif - - u64 arch_capabilities; - u64 spec_ctrl; - - u32 vm_entry_controls_shadow; - u32 vm_exit_controls_shadow; - u32 secondary_exec_control; - - /* - * loaded_vmcs points to the VMCS currently used in this vcpu. For a - * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. loaded_cpu_state points - * to the VMCS whose state is loaded into the CPU registers that only - * need to be switched when transitioning to/from the kernel; a NULL - * value indicates that host state is loaded. - */ - struct loaded_vmcs vmcs01; - struct loaded_vmcs *loaded_vmcs; - struct loaded_vmcs *loaded_cpu_state; - bool __launched; /* temporary, used in vmx_vcpu_run */ - struct msr_autoload { - struct vmx_msrs guest; - struct vmx_msrs host; - } msr_autoload; - - struct { - int vm86_active; - ulong save_rflags; - struct kvm_segment segs[8]; - } rmode; - struct { - u32 bitmask; /* 4 bits per segment (1 bit per field) */ - struct kvm_save_segment { - u16 selector; - unsigned long base; - u32 limit; - u32 ar; - } seg[8]; - } segment_cache; - int vpid; - bool emulation_required; - - u32 exit_reason; - - /* Posted interrupt descriptor */ - struct pi_desc pi_desc; - - /* Support for a guest hypervisor (nested VMX) */ - struct nested_vmx nested; - - /* Dynamic PLE window. */ - int ple_window; - bool ple_window_dirty; - - bool req_immediate_exit; - - /* Support for PML */ -#define PML_ENTITY_NUM 512 - struct page *pml_pg; - - /* apic deadline value in host tsc */ - u64 hv_deadline_tsc; - - u64 current_tsc_ratio; - - u32 host_pkru; - - unsigned long host_debugctlmsr; - - /* - * Only bits masked by msr_ia32_feature_control_valid_bits can be set in - * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included - * in msr_ia32_feature_control_valid_bits. - */ - u64 msr_ia32_feature_control; - u64 msr_ia32_feature_control_valid_bits; - u64 ept_pointer; -}; - -enum segment_cache_field { - SEG_FIELD_SEL = 0, - SEG_FIELD_BASE = 1, - SEG_FIELD_LIMIT = 2, - SEG_FIELD_AR = 3, - - SEG_FIELD_NR = 4 -}; - -static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) -{ - return container_of(kvm, struct kvm_vmx, kvm); -} - -static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) -{ - return container_of(vcpu, struct vcpu_vmx, vcpu); -} - -static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) -{ - return &(to_vmx(vcpu)->pi_desc); -} - static u16 shadow_read_only_fields[] = { #define SHADOW_FIELD_RO(x) x, #include "vmcs_shadow_fields.h" @@ -1625,11 +1311,6 @@ static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); } -static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) -{ - vmx->segment_cache.bitmask = 0; -} - static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, unsigned field) { @@ -5152,8 +4833,6 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) return mode; } -#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) - static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, u8 mode) { @@ -5478,47 +5157,6 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) vmx_update_msr_bitmap(vcpu); } -static u32 vmx_vmentry_ctrl(void) -{ - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmcs_config.vmentry_ctrl & - ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); -} - -static u32 vmx_vmexit_ctrl(void) -{ - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmcs_config.vmexit_ctrl & - ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); -} - -static u32 vmx_exec_control(struct vcpu_vmx *vmx) -{ - u32 exec_control = vmcs_config.cpu_based_exec_ctrl; - - if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) - exec_control &= ~CPU_BASED_MOV_DR_EXITING; - - if (!cpu_need_tpr_shadow(&vmx->vcpu)) { - exec_control &= ~CPU_BASED_TPR_SHADOW; -#ifdef CONFIG_X86_64 - exec_control |= CPU_BASED_CR8_STORE_EXITING | - CPU_BASED_CR8_LOAD_EXITING; -#endif - } - if (!enable_ept) - exec_control |= CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_INVLPG_EXITING; - if (kvm_mwait_in_guest(vmx->vcpu.kvm)) - exec_control &= ~(CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING); - if (kvm_hlt_in_guest(vmx->vcpu.kvm)) - exec_control &= ~CPU_BASED_HLT_EXITING; - return exec_control; -} - - static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) { struct kvm_vcpu *vcpu = &vmx->vcpu; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h new file mode 100644 index 000000000000..c9aa150aa014 --- /dev/null +++ b/arch/x86/kvm/vmx/vmx.h @@ -0,0 +1,351 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_H +#define __KVM_X86_VMX_H + +#include + +#include + +#include "capabilities.h" +#include "vmcs.h" + +#define MSR_TYPE_R 1 +#define MSR_TYPE_W 2 +#define MSR_TYPE_RW 3 + +#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) + +#define NR_AUTOLOAD_MSRS 8 + +struct vmx_msrs { + unsigned int nr; + struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; +}; + +struct shared_msr_entry { + unsigned index; + u64 data; + u64 mask; +}; + +enum segment_cache_field { + SEG_FIELD_SEL = 0, + SEG_FIELD_BASE = 1, + SEG_FIELD_LIMIT = 2, + SEG_FIELD_AR = 3, + + SEG_FIELD_NR = 4 +}; + +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; +} __aligned(64); + + +/* + * The nested_vmx structure is part of vcpu_vmx, and holds information we need + * for correct emulation of VMX (i.e., nested VMX) on this vcpu. + */ +struct nested_vmx { + /* Has the level1 guest done vmxon? */ + bool vmxon; + gpa_t vmxon_ptr; + bool pml_full; + + /* The guest-physical address of the current VMCS L1 keeps for L2 */ + gpa_t current_vmptr; + /* + * Cache of the guest's VMCS, existing outside of guest memory. + * Loaded from guest memory during VMPTRLD. Flushed to guest + * memory during VMCLEAR and VMPTRLD. + */ + struct vmcs12 *cached_vmcs12; + /* + * Cache of the guest's shadow VMCS, existing outside of guest + * memory. Loaded from guest memory during VM entry. Flushed + * to guest memory during VM exit. + */ + struct vmcs12 *cached_shadow_vmcs12; + /* + * Indicates if the shadow vmcs or enlightened vmcs must be updated + * with the data held by struct vmcs12. + */ + bool need_vmcs12_sync; + bool dirty_vmcs12; + + /* + * vmcs02 has been initialized, i.e. state that is constant for + * vmcs02 has been written to the backing VMCS. Initialization + * is delayed until L1 actually attempts to run a nested VM. + */ + bool vmcs02_initialized; + + bool change_vmcs01_virtual_apic_mode; + + /* + * Enlightened VMCS has been enabled. It does not mean that L1 has to + * use it. However, VMX features available to L1 will be limited based + * on what the enlightened VMCS supports. + */ + bool enlightened_vmcs_enabled; + + /* L2 must run next, and mustn't decide to exit to L1. */ + bool nested_run_pending; + + struct loaded_vmcs vmcs02; + + /* + * Guest pages referred to in the vmcs02 with host-physical + * pointers, so we must keep them pinned while L2 runs. + */ + struct page *apic_access_page; + struct page *virtual_apic_page; + struct page *pi_desc_page; + struct pi_desc *pi_desc; + bool pi_pending; + u16 posted_intr_nv; + + struct hrtimer preemption_timer; + bool preemption_timer_expired; + + /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ + u64 vmcs01_debugctl; + u64 vmcs01_guest_bndcfgs; + + u16 vpid02; + u16 last_vpid; + + struct nested_vmx_msrs msrs; + + /* SMM related state */ + struct { + /* in VMX operation on SMM entry? */ + bool vmxon; + /* in guest mode on SMM entry? */ + bool guest_mode; + } smm; + + gpa_t hv_evmcs_vmptr; + struct page *hv_evmcs_page; + struct hv_enlightened_vmcs *hv_evmcs; +}; + +struct vcpu_vmx { + struct kvm_vcpu vcpu; + unsigned long host_rsp; + u8 fail; + u8 msr_bitmap_mode; + u32 exit_intr_info; + u32 idt_vectoring_info; + ulong rflags; + struct shared_msr_entry *guest_msrs; + int nmsrs; + int save_nmsrs; + bool guest_msrs_dirty; + unsigned long host_idt_base; +#ifdef CONFIG_X86_64 + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; +#endif + + u64 arch_capabilities; + u64 spec_ctrl; + + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; + u32 secondary_exec_control; + + /* + * loaded_vmcs points to the VMCS currently used in this vcpu. For a + * non-nested (L1) guest, it always points to vmcs01. For a nested + * guest (L2), it points to a different VMCS. loaded_cpu_state points + * to the VMCS whose state is loaded into the CPU registers that only + * need to be switched when transitioning to/from the kernel; a NULL + * value indicates that host state is loaded. + */ + struct loaded_vmcs vmcs01; + struct loaded_vmcs *loaded_vmcs; + struct loaded_vmcs *loaded_cpu_state; + bool __launched; /* temporary, used in vmx_vcpu_run */ + struct msr_autoload { + struct vmx_msrs guest; + struct vmx_msrs host; + } msr_autoload; + + struct { + int vm86_active; + ulong save_rflags; + struct kvm_segment segs[8]; + } rmode; + struct { + u32 bitmask; /* 4 bits per segment (1 bit per field) */ + struct kvm_save_segment { + u16 selector; + unsigned long base; + u32 limit; + u32 ar; + } seg[8]; + } segment_cache; + int vpid; + bool emulation_required; + + u32 exit_reason; + + /* Posted interrupt descriptor */ + struct pi_desc pi_desc; + + /* Support for a guest hypervisor (nested VMX) */ + struct nested_vmx nested; + + /* Dynamic PLE window. */ + int ple_window; + bool ple_window_dirty; + + bool req_immediate_exit; + + /* Support for PML */ +#define PML_ENTITY_NUM 512 + struct page *pml_pg; + + /* apic deadline value in host tsc */ + u64 hv_deadline_tsc; + + u64 current_tsc_ratio; + + u32 host_pkru; + + unsigned long host_debugctlmsr; + + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included + * in msr_ia32_feature_control_valid_bits. + */ + u64 msr_ia32_feature_control; + u64 msr_ia32_feature_control_valid_bits; + u64 ept_pointer; +}; + +enum ept_pointers_status { + EPT_POINTERS_CHECK = 0, + EPT_POINTERS_MATCH = 1, + EPT_POINTERS_MISMATCH = 2 +}; + +struct kvm_vmx { + struct kvm kvm; + + unsigned int tss_addr; + bool ept_identity_pagetable_done; + gpa_t ept_identity_map_addr; + + enum ept_pointers_status ept_pointers_match; + spinlock_t ept_pointer_lock; +}; + +#define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + +static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + return clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + return set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +{ + vmx->segment_cache.bitmask = 0; +} + +static inline u32 vmx_vmentry_ctrl(void) +{ + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmcs_config.vmentry_ctrl & + ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); +} + +static inline u32 vmx_vmexit_ctrl(void) +{ + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmcs_config.vmexit_ctrl & + ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); +} + +u32 vmx_exec_control(struct vcpu_vmx *vmx); + +static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) +{ + return container_of(kvm, struct kvm_vmx, kvm); +} + +static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_vmx, vcpu); +} + +static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + +#endif /* __KVM_X86_VMX_H */ -- cgit v1.2.3 From 75edce8a45486fe5fa5becdb43a7c36354b2a379 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:06 -0800 Subject: KVM: VMX: Move eVMCS code to dedicated files The header, evmcs.h, already exists and contains a fair amount of code, but there are a few pieces in vmx.c that can be moved verbatim. In addition, move an array definition to evmcs.c to prepare for multiple consumers of evmcs.h. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/vmx/evmcs.c | 343 +++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/evmcs.h | 457 +++++++++++++++++------------------------------ arch/x86/kvm/vmx/vmcs.h | 2 + arch/x86/kvm/vmx/vmx.c | 186 +------------------ 5 files changed, 514 insertions(+), 476 deletions(-) create mode 100644 arch/x86/kvm/vmx/evmcs.c (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 79d97d837cf3..7f3f50aaa203 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -16,7 +16,7 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-intel-y += vmx/vmx.o vmx/pmu_intel.o vmx/vmcs12.o +kvm-intel-y += vmx/vmx.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o kvm-amd-y += svm.o pmu_amd.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c new file mode 100644 index 000000000000..155fb4a39472 --- /dev/null +++ b/arch/x86/kvm/vmx/evmcs.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include "evmcs.h" +#include "vmcs.h" +#include "vmx.h" + +DEFINE_STATIC_KEY_FALSE(enable_evmcs); + +#if IS_ENABLED(CONFIG_HYPERV) + +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) +#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) +#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ + {EVMCS1_OFFSET(name), clean_field} + +const struct evmcs_field vmcs_field_to_evmcs_1[] = { + /* 64 bit rw */ + EVMCS1_FIELD(GUEST_RIP, guest_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_RSP, guest_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR0, host_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR3, host_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR4, host_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_RIP, host_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), + EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(TSC_OFFSET, tsc_offset, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR0, guest_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR3, guest_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR4, guest_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_DR7, guest_dr7, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_RSP, host_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(EPT_POINTER, ept_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), + EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + + /* 64 bit read only */ + EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + /* + * Not defined in KVM: + * + * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + */ + EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* + * No mask defined in the spec as Hyper-V doesn't currently support + * these. Future proof by resetting the whole clean field mask on + * access. + */ + EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 32 bit rw */ + EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), + EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), + EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), + EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, + vm_entry_exception_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + + /* 32 bit read only */ + EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* No mask defined in the spec (not used) */ + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 16 bit rw */ + EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), +}; +const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); + +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) +{ + vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + + vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; + vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; + +} +#endif + +int nested_enable_evmcs(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * vmcs_version represents the range of supported Enlightened VMCS + * versions: lower 8 bits is the minimal version, higher 8 bits is the + * maximum supported version. KVM supports versions from 1 to + * KVM_EVMCS_VERSION. + */ + if (vmcs_version) + *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1; + + /* We don't support disabling the feature for simplicity. */ + if (vmx->nested.enlightened_vmcs_enabled) + return 0; + + vmx->nested.enlightened_vmcs_enabled = true; + + vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; + vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; + vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; + vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC; + + return 0; +} diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 210a884090ad..1572a3759d65 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -2,302 +2,77 @@ #ifndef __KVM_X86_VMX_EVMCS_H #define __KVM_X86_VMX_EVMCS_H -#include +#include -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) -#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) -#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ - {EVMCS1_OFFSET(name), clean_field} +#include +#include +#include + +#include "capabilities.h" +#include "vmcs.h" + +struct vmcs_config; + +DECLARE_STATIC_KEY_FALSE(enable_evmcs); + +#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) + +#define KVM_EVMCS_VERSION 1 + +/* + * Enlightened VMCSv1 doesn't support these: + * + * POSTED_INTR_NV = 0x00000002, + * GUEST_INTR_STATUS = 0x00000810, + * APIC_ACCESS_ADDR = 0x00002014, + * POSTED_INTR_DESC_ADDR = 0x00002016, + * EOI_EXIT_BITMAP0 = 0x0000201c, + * EOI_EXIT_BITMAP1 = 0x0000201e, + * EOI_EXIT_BITMAP2 = 0x00002020, + * EOI_EXIT_BITMAP3 = 0x00002022, + * GUEST_PML_INDEX = 0x00000812, + * PML_ADDRESS = 0x0000200e, + * VM_FUNCTION_CONTROL = 0x00002018, + * EPTP_LIST_ADDRESS = 0x00002024, + * VMREAD_BITMAP = 0x00002026, + * VMWRITE_BITMAP = 0x00002028, + * + * TSC_MULTIPLIER = 0x00002032, + * PLE_GAP = 0x00004020, + * PLE_WINDOW = 0x00004022, + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, + * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, + * + * Currently unsupported in KVM: + * GUEST_IA32_RTIT_CTL = 0x00002814, + */ +#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ + PIN_BASED_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ + SECONDARY_EXEC_APIC_REGISTER_VIRT | \ + SECONDARY_EXEC_ENABLE_PML | \ + SECONDARY_EXEC_ENABLE_VMFUNC | \ + SECONDARY_EXEC_SHADOW_VMCS | \ + SECONDARY_EXEC_TSC_SCALING | \ + SECONDARY_EXEC_PAUSE_LOOP_EXITING) +#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) +#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) +#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) + +#if IS_ENABLED(CONFIG_HYPERV) struct evmcs_field { u16 offset; u16 clean_field; }; -static const struct evmcs_field vmcs_field_to_evmcs_1[] = { - /* 64 bit rw */ - EVMCS1_FIELD(GUEST_RIP, guest_rip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(GUEST_RSP, guest_rsp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR0, host_cr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR3, host_cr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CR4, host_cr4, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_RIP, host_rip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), - EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), - EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), - EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(TSC_OFFSET, tsc_offset, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR0, guest_cr0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR3, guest_cr3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_CR4, guest_cr4, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(GUEST_DR7, guest_dr7, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), - EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(HOST_RSP, host_rsp, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), - EVMCS1_FIELD(EPT_POINTER, ept_pointer, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), - EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), - - /* 64 bit read only */ - EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - /* - * Not defined in KVM: - * - * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, - * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); - */ - EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - - /* - * No mask defined in the spec as Hyper-V doesn't currently support - * these. Future proof by resetting the whole clean field mask on - * access. - */ - EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - - /* 32 bit rw */ - EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), - EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), - EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), - EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), - EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, - vm_entry_exception_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), - EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), - EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), - - /* 32 bit read only */ - EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), - - /* No mask defined in the spec (not used) */ - EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - - /* 16 bit rw */ - EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), - EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), - EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), -}; +extern const struct evmcs_field vmcs_field_to_evmcs_1[]; +extern const unsigned int nr_evmcs_1_fields; + +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) static __always_inline int get_evmcs_offset(unsigned long field, u16 *clean_field) @@ -305,7 +80,7 @@ static __always_inline int get_evmcs_offset(unsigned long field, unsigned int index = ROL16(field, 6); const struct evmcs_field *evmcs_field; - if (unlikely(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1))) { + if (unlikely(index >= nr_evmcs_1_fields)) { WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n", field); return -ENOENT; @@ -321,4 +96,106 @@ static __always_inline int get_evmcs_offset(unsigned long field, #undef ROL16 +static inline void evmcs_write64(unsigned long field, u64 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u64 *)((char *)current_evmcs + offset) = value; + + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline void evmcs_write32(unsigned long field, u32 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u32 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline void evmcs_write16(unsigned long field, u16 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u16 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline u64 evmcs_read64(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u64 *)((char *)current_evmcs + offset); +} + +static inline u32 evmcs_read32(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u32 *)((char *)current_evmcs + offset); +} + +static inline u16 evmcs_read16(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u16 *)((char *)current_evmcs + offset); +} + +static inline void evmcs_touch_msr_bitmap(void) +{ + if (unlikely(!current_evmcs)) + return; + + if (current_evmcs->hv_enlightenments_control.msr_bitmap) + current_evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; +} + +static inline void evmcs_load(u64 phys_addr) +{ + struct hv_vp_assist_page *vp_ap = + hv_get_vp_assist_page(smp_processor_id()); + + vp_ap->current_nested_vmcs = phys_addr; + vp_ap->enlighten_vmentry = 1; +} + +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); +#else /* !IS_ENABLED(CONFIG_HYPERV) */ +static inline void evmcs_write64(unsigned long field, u64 value) {} +static inline void evmcs_write32(unsigned long field, u32 value) {} +static inline void evmcs_write16(unsigned long field, u16 value) {} +static inline u64 evmcs_read64(unsigned long field) { return 0; } +static inline u32 evmcs_read32(unsigned long field) { return 0; } +static inline u16 evmcs_read16(unsigned long field) { return 0; } +static inline void evmcs_load(u64 phys_addr) {} +static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} +static inline void evmcs_touch_msr_bitmap(void) {} +#endif /* IS_ENABLED(CONFIG_HYPERV) */ + +int nested_enable_evmcs(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version); + #endif /* __KVM_X86_VMX_EVMCS_H */ diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 3b8da04203e4..6def3ba88e3b 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -22,6 +22,8 @@ struct vmcs { char data[0]; }; +DECLARE_PER_CPU(struct vmcs *, current_vmcs); + /* * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT * and whose values change infrequently, but are not constant. I.e. this is diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6e3da6486471..266fa6f324eb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -385,7 +385,7 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit u32 msr, int type); static DEFINE_PER_CPU(struct vmcs *, vmxarea); -static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +DEFINE_PER_CPU(struct vmcs *, current_vmcs); /* * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. @@ -455,155 +455,10 @@ static const u32 vmx_msr_index[] = { MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; -DEFINE_STATIC_KEY_FALSE(enable_evmcs); - -#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) - -#define KVM_EVMCS_VERSION 1 - -/* - * Enlightened VMCSv1 doesn't support these: - * - * POSTED_INTR_NV = 0x00000002, - * GUEST_INTR_STATUS = 0x00000810, - * APIC_ACCESS_ADDR = 0x00002014, - * POSTED_INTR_DESC_ADDR = 0x00002016, - * EOI_EXIT_BITMAP0 = 0x0000201c, - * EOI_EXIT_BITMAP1 = 0x0000201e, - * EOI_EXIT_BITMAP2 = 0x00002020, - * EOI_EXIT_BITMAP3 = 0x00002022, - * GUEST_PML_INDEX = 0x00000812, - * PML_ADDRESS = 0x0000200e, - * VM_FUNCTION_CONTROL = 0x00002018, - * EPTP_LIST_ADDRESS = 0x00002024, - * VMREAD_BITMAP = 0x00002026, - * VMWRITE_BITMAP = 0x00002028, - * - * TSC_MULTIPLIER = 0x00002032, - * PLE_GAP = 0x00004020, - * PLE_WINDOW = 0x00004022, - * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, - * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, - * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, - * - * Currently unsupported in KVM: - * GUEST_IA32_RTIT_CTL = 0x00002814, - */ -#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ - PIN_BASED_VMX_PREEMPTION_TIMER) -#define EVMCS1_UNSUPPORTED_2NDEXEC \ - (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ - SECONDARY_EXEC_APIC_REGISTER_VIRT | \ - SECONDARY_EXEC_ENABLE_PML | \ - SECONDARY_EXEC_ENABLE_VMFUNC | \ - SECONDARY_EXEC_SHADOW_VMCS | \ - SECONDARY_EXEC_TSC_SCALING | \ - SECONDARY_EXEC_PAUSE_LOOP_EXITING) -#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) -#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) -#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) - #if IS_ENABLED(CONFIG_HYPERV) static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); -static inline void evmcs_write64(unsigned long field, u64 value) -{ - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u64 *)((char *)current_evmcs + offset) = value; - - current_evmcs->hv_clean_fields &= ~clean_field; -} - -static inline void evmcs_write32(unsigned long field, u32 value) -{ - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u32 *)((char *)current_evmcs + offset) = value; - current_evmcs->hv_clean_fields &= ~clean_field; -} - -static inline void evmcs_write16(unsigned long field, u16 value) -{ - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u16 *)((char *)current_evmcs + offset) = value; - current_evmcs->hv_clean_fields &= ~clean_field; -} - -static inline u64 evmcs_read64(unsigned long field) -{ - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u64 *)((char *)current_evmcs + offset); -} - -static inline u32 evmcs_read32(unsigned long field) -{ - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u32 *)((char *)current_evmcs + offset); -} - -static inline u16 evmcs_read16(unsigned long field) -{ - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u16 *)((char *)current_evmcs + offset); -} - -static inline void evmcs_touch_msr_bitmap(void) -{ - if (unlikely(!current_evmcs)) - return; - - if (current_evmcs->hv_enlightenments_control.msr_bitmap) - current_evmcs->hv_clean_fields &= - ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; -} - -static void evmcs_load(u64 phys_addr) -{ - struct hv_vp_assist_page *vp_ap = - hv_get_vp_assist_page(smp_processor_id()); - - vp_ap->current_nested_vmcs = phys_addr; - vp_ap->enlighten_vmentry = 1; -} - -static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) -{ - vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; - - vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; - vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; - -} - /* check_ept_pointer() should be under protection of ept_pointer_lock. */ static void check_ept_pointer_match(struct kvm *kvm) { @@ -650,47 +505,8 @@ static int vmx_hv_remote_flush_tlb(struct kvm *kvm) spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); return ret; } -#else /* !IS_ENABLED(CONFIG_HYPERV) */ -static inline void evmcs_write64(unsigned long field, u64 value) {} -static inline void evmcs_write32(unsigned long field, u32 value) {} -static inline void evmcs_write16(unsigned long field, u16 value) {} -static inline u64 evmcs_read64(unsigned long field) { return 0; } -static inline u32 evmcs_read32(unsigned long field) { return 0; } -static inline u16 evmcs_read16(unsigned long field) { return 0; } -static inline void evmcs_load(u64 phys_addr) {} -static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} -static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ -static int nested_enable_evmcs(struct kvm_vcpu *vcpu, - uint16_t *vmcs_version) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * vmcs_version represents the range of supported Enlightened VMCS - * versions: lower 8 bits is the minimal version, higher 8 bits is the - * maximum supported version. KVM supports versions from 1 to - * KVM_EVMCS_VERSION. - */ - if (vmcs_version) - *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1; - - /* We don't support disabling the feature for simplicity. */ - if (vmx->nested.enlightened_vmcs_enabled) - return 0; - - vmx->nested.enlightened_vmcs_enabled = true; - - vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; - vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; - vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; - vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; - vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC; - - return 0; -} - /* * Comment's format: document - errata name - stepping - processor name. * Refer from -- cgit v1.2.3 From 89b0c9f58350f6820f062ea12000e8a171177f3b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:07 -0800 Subject: KVM: VMX: Move VMX instruction wrappers to a dedicated header file VMX has a few hundred lines of code just to wrap various VMX specific instructions, e.g. VMWREAD, INVVPID, etc... Move them to a dedicated header so it's easier to find/isolate the boilerplate. With this change, more inlines can be moved from vmx.c to vmx.h. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/ops.h | 285 +++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 416 +++++-------------------------------------------- arch/x86/kvm/vmx/vmx.h | 108 +++++++++++++ 3 files changed, 429 insertions(+), 380 deletions(-) create mode 100644 arch/x86/kvm/vmx/ops.h (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h new file mode 100644 index 000000000000..b8e50f76fefc --- /dev/null +++ b/arch/x86/kvm/vmx/ops.h @@ -0,0 +1,285 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_INSN_H +#define __KVM_X86_VMX_INSN_H + +#include + +#include +#include + +#include "evmcs.h" +#include "vmcs.h" + +#define __ex(x) __kvm_handle_fault_on_reboot(x) +#define __ex_clear(x, reg) \ + ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) + +static __always_inline void vmcs_check16(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, + "16-bit accessor invalid for 64-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "16-bit accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "16-bit accessor invalid for 32-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "16-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_check32(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "32-bit accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "32-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_check64(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "64-bit accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "64-bit accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "64-bit accessor invalid for 32-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "64-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_checkl(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "Natural width accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, + "Natural width accessor invalid for 64-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "Natural width accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "Natural width accessor invalid for 32-bit field"); +} + +static __always_inline unsigned long __vmcs_readl(unsigned long field) +{ + unsigned long value; + + asm volatile (__ex_clear("vmread %1, %0", "%k0") + : "=r"(value) : "r"(field)); + return value; +} + +static __always_inline u16 vmcs_read16(unsigned long field) +{ + vmcs_check16(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read16(field); + return __vmcs_readl(field); +} + +static __always_inline u32 vmcs_read32(unsigned long field) +{ + vmcs_check32(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read32(field); + return __vmcs_readl(field); +} + +static __always_inline u64 vmcs_read64(unsigned long field) +{ + vmcs_check64(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read64(field); +#ifdef CONFIG_X86_64 + return __vmcs_readl(field); +#else + return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32); +#endif +} + +static __always_inline unsigned long vmcs_readl(unsigned long field) +{ + vmcs_checkl(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read64(field); + return __vmcs_readl(field); +} + +static noinline void vmwrite_error(unsigned long field, unsigned long value) +{ + printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); + dump_stack(); +} + +static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) +{ + bool error; + + asm volatile (__ex("vmwrite %2, %1") CC_SET(na) + : CC_OUT(na) (error) : "r"(field), "rm"(value)); + if (unlikely(error)) + vmwrite_error(field, value); +} + +static __always_inline void vmcs_write16(unsigned long field, u16 value) +{ + vmcs_check16(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write16(field, value); + + __vmcs_writel(field, value); +} + +static __always_inline void vmcs_write32(unsigned long field, u32 value) +{ + vmcs_check32(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, value); + + __vmcs_writel(field, value); +} + +static __always_inline void vmcs_write64(unsigned long field, u64 value) +{ + vmcs_check64(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write64(field, value); + + __vmcs_writel(field, value); +#ifndef CONFIG_X86_64 + asm volatile (""); + __vmcs_writel(field+1, value >> 32); +#endif +} + +static __always_inline void vmcs_writel(unsigned long field, unsigned long value) +{ + vmcs_checkl(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write64(field, value); + + __vmcs_writel(field, value); +} + +static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, + "vmcs_clear_bits does not support 64-bit fields"); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, evmcs_read32(field) & ~mask); + + __vmcs_writel(field, __vmcs_readl(field) & ~mask); +} + +static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, + "vmcs_set_bits does not support 64-bit fields"); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, evmcs_read32(field) | mask); + + __vmcs_writel(field, __vmcs_readl(field) | mask); +} + +static inline void vmcs_clear(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + bool error; + + asm volatile (__ex("vmclear %1") CC_SET(na) + : CC_OUT(na) (error) : "m"(phys_addr)); + if (unlikely(error)) + printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", + vmcs, phys_addr); +} + +static inline void vmcs_load(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + bool error; + + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_load(phys_addr); + + asm volatile (__ex("vmptrld %1") CC_SET(na) + : CC_OUT(na) (error) : "m"(phys_addr)); + if (unlikely(error)) + printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", + vmcs, phys_addr); +} + +static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) +{ + struct { + u64 vpid : 16; + u64 rsvd : 48; + u64 gva; + } operand = { vpid, 0, gva }; + bool error; + + asm volatile (__ex("invvpid %2, %1") CC_SET(na) + : CC_OUT(na) (error) : "r"(ext), "m"(operand)); + BUG_ON(error); +} + +static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) +{ + struct { + u64 eptp, gpa; + } operand = {eptp, gpa}; + bool error; + + asm volatile (__ex("invept %2, %1") CC_SET(na) + : CC_OUT(na) (error) : "r"(ext), "m"(operand)); + BUG_ON(error); +} + +static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) +{ + if (vpid == 0) + return true; + + if (cpu_has_vmx_invvpid_individual_addr()) { + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); + return true; + } + + return false; +} + +static inline void vpid_sync_vcpu_single(int vpid) +{ + if (vpid == 0) + return; + + if (cpu_has_vmx_invvpid_single()) + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); +} + +static inline void vpid_sync_vcpu_global(void) +{ + if (cpu_has_vmx_invvpid_global()) + __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); +} + +static inline void vpid_sync_context(int vpid) +{ + if (cpu_has_vmx_invvpid_single()) + vpid_sync_vcpu_single(vpid); + else + vpid_sync_vcpu_global(); +} + +static inline void ept_sync_global(void) +{ + __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); +} + +static inline void ept_sync_context(u64 eptp) +{ + if (cpu_has_vmx_invept_context()) + __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); + else + ept_sync_global(); +} + +#endif /* __KVM_X86_VMX_INSN_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 266fa6f324eb..cde40c52d2c6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -55,17 +55,15 @@ #include "kvm_cache_regs.h" #include "lapic.h" #include "mmu.h" +#include "ops.h" #include "pmu.h" #include "trace.h" #include "vmcs.h" #include "vmcs12.h" +#include "vmx.h" #include "x86.h" #include "vmx.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) -#define __ex_clear(x, reg) \ - ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -368,7 +366,6 @@ static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); -static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static void vmx_get_segment(struct kvm_vcpu *vcpu, @@ -701,32 +698,6 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) return -1; } -static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) -{ - struct { - u64 vpid : 16; - u64 rsvd : 48; - u64 gva; - } operand = { vpid, 0, gva }; - bool error; - - asm volatile (__ex("invvpid %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(ext), "m"(operand)); - BUG_ON(error); -} - -static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) -{ - struct { - u64 eptp, gpa; - } operand = {eptp, gpa}; - bool error; - - asm volatile (__ex("invept %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(ext), "m"(operand)); - BUG_ON(error); -} - static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -737,19 +708,7 @@ static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) return NULL; } -static void vmcs_clear(struct vmcs *vmcs) -{ - u64 phys_addr = __pa(vmcs); - bool error; - - asm volatile (__ex("vmclear %1") CC_SET(na) - : CC_OUT(na) (error) : "m"(phys_addr)); - if (unlikely(error)) - printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", - vmcs, phys_addr); -} - -static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) +void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) { vmcs_clear(loaded_vmcs->vmcs); if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) @@ -758,21 +717,6 @@ static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) loaded_vmcs->launched = 0; } -static void vmcs_load(struct vmcs *vmcs) -{ - u64 phys_addr = __pa(vmcs); - bool error; - - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_load(phys_addr); - - asm volatile (__ex("vmptrld %1") CC_SET(na) - : CC_OUT(na) (error) : "m"(phys_addr)); - if (unlikely(error)) - printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", - vmcs, phys_addr); -} - #ifdef CONFIG_KEXEC_CORE /* * This bitmap is used to indicate whether the vmclear @@ -837,7 +781,7 @@ static void __loaded_vmcs_clear(void *arg) crash_enable_local_vmclear(cpu); } -static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) +void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { int cpu = loaded_vmcs->cpu; @@ -846,287 +790,6 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } -static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) -{ - if (vpid == 0) - return true; - - if (cpu_has_vmx_invvpid_individual_addr()) { - __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); - return true; - } - - return false; -} - -static inline void vpid_sync_vcpu_single(int vpid) -{ - if (vpid == 0) - return; - - if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); -} - -static inline void vpid_sync_vcpu_global(void) -{ - if (cpu_has_vmx_invvpid_global()) - __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); -} - -static inline void vpid_sync_context(int vpid) -{ - if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_single(vpid); - else - vpid_sync_vcpu_global(); -} - -static inline void ept_sync_global(void) -{ - __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); -} - -static inline void ept_sync_context(u64 eptp) -{ - if (cpu_has_vmx_invept_context()) - __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); - else - ept_sync_global(); -} - -static __always_inline void vmcs_check16(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, - "16-bit accessor invalid for 64-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "16-bit accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "16-bit accessor invalid for 32-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "16-bit accessor invalid for natural width field"); -} - -static __always_inline void vmcs_check32(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "32-bit accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "32-bit accessor invalid for natural width field"); -} - -static __always_inline void vmcs_check64(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "64-bit accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "64-bit accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "64-bit accessor invalid for 32-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "64-bit accessor invalid for natural width field"); -} - -static __always_inline void vmcs_checkl(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "Natural width accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, - "Natural width accessor invalid for 64-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "Natural width accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "Natural width accessor invalid for 32-bit field"); -} - -static __always_inline unsigned long __vmcs_readl(unsigned long field) -{ - unsigned long value; - - asm volatile (__ex_clear("vmread %1, %0", "%k0") - : "=r"(value) : "r"(field)); - return value; -} - -static __always_inline u16 vmcs_read16(unsigned long field) -{ - vmcs_check16(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_read16(field); - return __vmcs_readl(field); -} - -static __always_inline u32 vmcs_read32(unsigned long field) -{ - vmcs_check32(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_read32(field); - return __vmcs_readl(field); -} - -static __always_inline u64 vmcs_read64(unsigned long field) -{ - vmcs_check64(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_read64(field); -#ifdef CONFIG_X86_64 - return __vmcs_readl(field); -#else - return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32); -#endif -} - -static __always_inline unsigned long vmcs_readl(unsigned long field) -{ - vmcs_checkl(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_read64(field); - return __vmcs_readl(field); -} - -static noinline void vmwrite_error(unsigned long field, unsigned long value) -{ - printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", - field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); - dump_stack(); -} - -static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) -{ - bool error; - - asm volatile (__ex("vmwrite %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(field), "rm"(value)); - if (unlikely(error)) - vmwrite_error(field, value); -} - -static __always_inline void vmcs_write16(unsigned long field, u16 value) -{ - vmcs_check16(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write16(field, value); - - __vmcs_writel(field, value); -} - -static __always_inline void vmcs_write32(unsigned long field, u32 value) -{ - vmcs_check32(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write32(field, value); - - __vmcs_writel(field, value); -} - -static __always_inline void vmcs_write64(unsigned long field, u64 value) -{ - vmcs_check64(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write64(field, value); - - __vmcs_writel(field, value); -#ifndef CONFIG_X86_64 - asm volatile (""); - __vmcs_writel(field+1, value >> 32); -#endif -} - -static __always_inline void vmcs_writel(unsigned long field, unsigned long value) -{ - vmcs_checkl(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write64(field, value); - - __vmcs_writel(field, value); -} - -static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, - "vmcs_clear_bits does not support 64-bit fields"); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write32(field, evmcs_read32(field) & ~mask); - - __vmcs_writel(field, __vmcs_readl(field) & ~mask); -} - -static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, - "vmcs_set_bits does not support 64-bit fields"); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write32(field, evmcs_read32(field) | mask); - - __vmcs_writel(field, __vmcs_readl(field) | mask); -} - -static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) -{ - vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); -} - -static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VM_ENTRY_CONTROLS, val); - vmx->vm_entry_controls_shadow = val; -} - -static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) -{ - if (vmx->vm_entry_controls_shadow != val) - vm_entry_controls_init(vmx, val); -} - -static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) -{ - return vmx->vm_entry_controls_shadow; -} - - -static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); -} - -static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); -} - -static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) -{ - vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); -} - -static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VM_EXIT_CONTROLS, val); - vmx->vm_exit_controls_shadow = val; -} - -static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) -{ - if (vmx->vm_exit_controls_shadow != val) - vm_exit_controls_init(vmx, val); -} - -static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) -{ - return vmx->vm_exit_controls_shadow; -} - - -static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); -} - -static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); -} - static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, unsigned field) { @@ -1672,12 +1335,6 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.control) != old.control); } -static void decache_tsc_multiplier(struct vcpu_vmx *vmx) -{ - vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; - vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); -} - /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -3312,7 +2969,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, return 0; } -static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) +struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) { int node = cpu_to_node(cpu); struct page *pages; @@ -3335,7 +2992,7 @@ static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) return vmcs; } -static void free_vmcs(struct vmcs *vmcs) +void free_vmcs(struct vmcs *vmcs) { free_pages((unsigned long)vmcs, vmcs_config.order); } @@ -3343,7 +3000,7 @@ static void free_vmcs(struct vmcs *vmcs) /* * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded */ -static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) +void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { if (!loaded_vmcs->vmcs) return; @@ -3355,12 +3012,7 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } -static struct vmcs *alloc_vmcs(bool shadow) -{ - return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); -} - -static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) +int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { loaded_vmcs->vmcs = alloc_vmcs(false); if (!loaded_vmcs->vmcs) @@ -3692,24 +3344,6 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, - bool invalidate_gpa) -{ - if (enable_ept && (invalidate_gpa || !enable_vpid)) { - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return; - ept_sync_context(construct_eptp(vcpu, - vcpu->arch.mmu->root_hpa)); - } else { - vpid_sync_context(vpid); - } -} - -static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) -{ - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); -} - static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) { int vpid = to_vmx(vcpu)->vpid; @@ -3889,7 +3523,7 @@ static int get_ept_level(struct kvm_vcpu *vcpu) return 4; } -static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) +u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) { u64 eptp = VMX_EPTP_MT_WB; @@ -4749,11 +4383,6 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) nested_mark_vmcs12_pages_dirty(vcpu); } -static u8 vmx_get_rvi(void) -{ - return vmcs_read16(GUEST_INTR_STATUS) & 0xff; -} - static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4973,6 +4602,33 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) vmx_update_msr_bitmap(vcpu); } +u32 vmx_exec_control(struct vcpu_vmx *vmx) +{ + u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) + exec_control &= ~CPU_BASED_MOV_DR_EXITING; + + if (!cpu_need_tpr_shadow(&vmx->vcpu)) { + exec_control &= ~CPU_BASED_TPR_SHADOW; +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_STORE_EXITING | + CPU_BASED_CR8_LOAD_EXITING; +#endif + } + if (!enable_ept) + exec_control |= CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_INVLPG_EXITING; + if (kvm_mwait_in_guest(vmx->vcpu.kvm)) + exec_control &= ~(CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING); + if (kvm_hlt_in_guest(vmx->vcpu.kvm)) + exec_control &= ~CPU_BASED_HLT_EXITING; + return exec_control; +} + + static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) { struct kvm_vcpu *vcpu = &vmx->vcpu; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index c9aa150aa014..4c7fcf1571c3 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -7,6 +7,7 @@ #include #include "capabilities.h" +#include "ops.h" #include "vmcs.h" #define MSR_TYPE_R 1 @@ -312,6 +313,75 @@ static inline int pi_test_sn(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline u8 vmx_get_rvi(void) +{ + return vmcs_read16(GUEST_INTR_STATUS) & 0xff; +} + +static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); +} + +static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VM_ENTRY_CONTROLS, val); + vmx->vm_entry_controls_shadow = val; +} + +static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) +{ + if (vmx->vm_entry_controls_shadow != val) + vm_entry_controls_init(vmx, val); +} + +static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) +{ + return vmx->vm_entry_controls_shadow; +} + +static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); +} + +static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); +} + +static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); +} + +static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VM_EXIT_CONTROLS, val); + vmx->vm_exit_controls_shadow = val; +} + +static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) +{ + if (vmx->vm_exit_controls_shadow != val) + vm_exit_controls_init(vmx, val); +} + +static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) +{ + return vmx->vm_exit_controls_shadow; +} + +static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); +} + +static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); +} + static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { vmx->segment_cache.bitmask = 0; @@ -348,4 +418,42 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } +struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu); +void free_vmcs(struct vmcs *vmcs); +int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); +void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); +void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs); +void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs); + +static inline struct vmcs *alloc_vmcs(bool shadow) +{ + return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); +} + +u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); + +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, + bool invalidate_gpa) +{ + if (enable_ept && (invalidate_gpa || !enable_vpid)) { + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + return; + ept_sync_context(construct_eptp(vcpu, + vcpu->arch.mmu->root_hpa)); + } else { + vpid_sync_context(vpid); + } +} + +static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +{ + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); +} + +static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) +{ + vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; + vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); +} + #endif /* __KVM_X86_VMX_H */ -- cgit v1.2.3 From 944c3464538de42b92284a922f0929b5fe6bd893 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:09 -0800 Subject: KVM: VMX: Move nested hardware/vcpu {un}setup to helper functions Eventually this will allow us to move the nested VMX code out of vmx.c. Note that this also effectively wraps @enable_shadow_vmcs with @nested so that it too can be moved out of vmx.c. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 85 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 31 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cde40c52d2c6..33e3de597d6d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4765,13 +4765,9 @@ static void ept_set_mmio_spte_mask(void) } #define VMX_XSS_EXIT_BITMAP 0 -/* - * Sets up the vmcs for emulated real mode. - */ -static void vmx_vcpu_setup(struct vcpu_vmx *vmx) -{ - int i; +static void nested_vmx_vcpu_setup(void) +{ if (enable_shadow_vmcs) { /* * At vCPU creation, "VMWRITE to any supported field @@ -4782,6 +4778,18 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap)); } +} + +/* + * Sets up the vmcs for emulated real mode. + */ +static void vmx_vcpu_setup(struct vcpu_vmx *vmx) +{ + int i; + + if (nested) + nested_vmx_vcpu_setup(); + if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); @@ -6059,10 +6067,40 @@ static void vmx_enable_tdp(void) kvm_enable_tdp(); } +static __exit void nested_vmx_hardware_unsetup(void) +{ + int i; + + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); + } +} + +static __init int nested_vmx_hardware_setup(void) +{ + int i; + + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) { + vmx_bitmap[i] = (unsigned long *) + __get_free_page(GFP_KERNEL); + if (!vmx_bitmap[i]) { + nested_vmx_hardware_unsetup(); + return -ENOMEM; + } + } + + init_vmcs_shadow_fields(); + } + + return 0; +} + static __init int hardware_setup(void) { unsigned long host_bndcfgs; - int r = -ENOMEM, i; + int r, i; rdmsrl_safe(MSR_EFER, &host_efer); @@ -6186,16 +6224,6 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_shadow_vmcs() || !nested) enable_shadow_vmcs = 0; - if (enable_shadow_vmcs) { - for (i = 0; i < VMX_BITMAP_NR; i++) { - vmx_bitmap[i] = (unsigned long *) - __get_free_page(GFP_KERNEL); - if (!vmx_bitmap[i]) - goto out; - } - - init_vmcs_shadow_fields(); - } kvm_set_posted_intr_wakeup_handler(wakeup_handler); nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept, @@ -6203,27 +6231,22 @@ static __init int hardware_setup(void) kvm_mce_cap_supported |= MCG_LMCE_P; + if (nested) { + r = nested_vmx_hardware_setup(); + if (r) + return r; + } + r = alloc_kvm_area(); if (r) - goto out; - return 0; - -out: - if (enable_shadow_vmcs) { - for (i = 0; i < VMX_BITMAP_NR; i++) - free_page((unsigned long)vmx_bitmap[i]); - } + nested_vmx_hardware_unsetup(); return r; } static __exit void hardware_unsetup(void) { - int i; - - if (enable_shadow_vmcs) { - for (i = 0; i < VMX_BITMAP_NR; i++) - free_page((unsigned long)vmx_bitmap[i]); - } + if (nested) + nested_vmx_hardware_unsetup(); free_kvm_area(); } -- cgit v1.2.3 From 5158917c7b0196aefc1a4e9bc4458777dd2c41ec Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:10 -0800 Subject: KVM: x86: nVMX: Allow nested_enable_evmcs to be NULL ...so that it can conditionally set by the VMX code, i.e. iff @nested is true. This will in turn allow it to be moved out of vmx.c and into a nested-specified file. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6af846c54660..54ef79421c6b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3827,6 +3827,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: + if (!kvm_x86_ops->nested_enable_evmcs) + return -ENOTTY; r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version); if (!r) { user_ptr = (void __user *)(uintptr_t)cap->args[0]; -- cgit v1.2.3 From a3203381ca95dcf1faa4053e84bd0cd10bc0ca8c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:11 -0800 Subject: KVM: VMX: Move the hardware {un}setup functions to the bottom ...so that future patches can reference e.g. @kvm_vmx_exit_handlers without having to simultaneously move a big chunk of code. Speaking from experience, resolving merge conflicts is an absolute nightmare without pre-moving the code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 368 ++++++++++++++++++++++++------------------------- 1 file changed, 184 insertions(+), 184 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 33e3de597d6d..b121daa868d7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6067,190 +6067,6 @@ static void vmx_enable_tdp(void) kvm_enable_tdp(); } -static __exit void nested_vmx_hardware_unsetup(void) -{ - int i; - - if (enable_shadow_vmcs) { - for (i = 0; i < VMX_BITMAP_NR; i++) - free_page((unsigned long)vmx_bitmap[i]); - } -} - -static __init int nested_vmx_hardware_setup(void) -{ - int i; - - if (enable_shadow_vmcs) { - for (i = 0; i < VMX_BITMAP_NR; i++) { - vmx_bitmap[i] = (unsigned long *) - __get_free_page(GFP_KERNEL); - if (!vmx_bitmap[i]) { - nested_vmx_hardware_unsetup(); - return -ENOMEM; - } - } - - init_vmcs_shadow_fields(); - } - - return 0; -} - -static __init int hardware_setup(void) -{ - unsigned long host_bndcfgs; - int r, i; - - rdmsrl_safe(MSR_EFER, &host_efer); - - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); - - if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) - return -EIO; - - if (boot_cpu_has(X86_FEATURE_NX)) - kvm_enable_efer_bits(EFER_NX); - - if (boot_cpu_has(X86_FEATURE_MPX)) { - rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); - WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); - } - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - - if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || - !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) - enable_vpid = 0; - - if (!cpu_has_vmx_ept() || - !cpu_has_vmx_ept_4levels() || - !cpu_has_vmx_ept_mt_wb() || - !cpu_has_vmx_invept_global()) - enable_ept = 0; - - if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) - enable_ept_ad_bits = 0; - - if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) - enable_unrestricted_guest = 0; - - if (!cpu_has_vmx_flexpriority()) - flexpriority_enabled = 0; - - if (!cpu_has_virtual_nmis()) - enable_vnmi = 0; - - /* - * set_apic_access_page_addr() is used to reload apic access - * page upon invalidation. No need to do anything if not - * using the APIC_ACCESS_ADDR VMCS field. - */ - if (!flexpriority_enabled) - kvm_x86_ops->set_apic_access_page_addr = NULL; - - if (!cpu_has_vmx_tpr_shadow()) - kvm_x86_ops->update_cr8_intercept = NULL; - - if (enable_ept && !cpu_has_vmx_ept_2m_page()) - kvm_disable_largepages(); - -#if IS_ENABLED(CONFIG_HYPERV) - if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH - && enable_ept) - kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; -#endif - - if (!cpu_has_vmx_ple()) { - ple_gap = 0; - ple_window = 0; - ple_window_grow = 0; - ple_window_max = 0; - ple_window_shrink = 0; - } - - if (!cpu_has_vmx_apicv()) { - enable_apicv = 0; - kvm_x86_ops->sync_pir_to_irr = NULL; - } - - if (cpu_has_vmx_tsc_scaling()) { - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - } - - set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - - if (enable_ept) - vmx_enable_tdp(); - else - kvm_disable_tdp(); - - if (!nested) { - kvm_x86_ops->get_nested_state = NULL; - kvm_x86_ops->set_nested_state = NULL; - } - - /* - * Only enable PML when hardware supports PML feature, and both EPT - * and EPT A/D bit features are enabled -- PML depends on them to work. - */ - if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) - enable_pml = 0; - - if (!enable_pml) { - kvm_x86_ops->slot_enable_log_dirty = NULL; - kvm_x86_ops->slot_disable_log_dirty = NULL; - kvm_x86_ops->flush_log_dirty = NULL; - kvm_x86_ops->enable_log_dirty_pt_masked = NULL; - } - - if (!cpu_has_vmx_preemption_timer()) - kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; - - if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { - u64 vmx_msr; - - rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); - cpu_preemption_timer_multi = - vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; - } else { - kvm_x86_ops->set_hv_timer = NULL; - kvm_x86_ops->cancel_hv_timer = NULL; - } - - if (!cpu_has_vmx_shadow_vmcs() || !nested) - enable_shadow_vmcs = 0; - - kvm_set_posted_intr_wakeup_handler(wakeup_handler); - nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept, - enable_apicv); - - kvm_mce_cap_supported |= MCG_LMCE_P; - - if (nested) { - r = nested_vmx_hardware_setup(); - if (r) - return r; - } - - r = alloc_kvm_area(); - if (r) - nested_vmx_hardware_unsetup(); - return r; -} - -static __exit void hardware_unsetup(void) -{ - if (nested) - nested_vmx_hardware_unsetup(); - - free_kvm_area(); -} - /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. @@ -13152,6 +12968,190 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return 0; } +static __exit void nested_vmx_hardware_unsetup(void) +{ + int i; + + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); + } +} + +static __init int nested_vmx_hardware_setup(void) +{ + int i; + + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) { + vmx_bitmap[i] = (unsigned long *) + __get_free_page(GFP_KERNEL); + if (!vmx_bitmap[i]) { + nested_vmx_hardware_unsetup(); + return -ENOMEM; + } + } + + init_vmcs_shadow_fields(); + } + + return 0; +} + +static __init int hardware_setup(void) +{ + unsigned long host_bndcfgs; + int r, i; + + rdmsrl_safe(MSR_EFER, &host_efer); + + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) + kvm_define_shared_msr(i, vmx_msr_index[i]); + + if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) + return -EIO; + + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + + if (boot_cpu_has(X86_FEATURE_MPX)) { + rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + } + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || + !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) + enable_vpid = 0; + + if (!cpu_has_vmx_ept() || + !cpu_has_vmx_ept_4levels() || + !cpu_has_vmx_ept_mt_wb() || + !cpu_has_vmx_invept_global()) + enable_ept = 0; + + if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) + enable_ept_ad_bits = 0; + + if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) + enable_unrestricted_guest = 0; + + if (!cpu_has_vmx_flexpriority()) + flexpriority_enabled = 0; + + if (!cpu_has_virtual_nmis()) + enable_vnmi = 0; + + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. No need to do anything if not + * using the APIC_ACCESS_ADDR VMCS field. + */ + if (!flexpriority_enabled) + kvm_x86_ops->set_apic_access_page_addr = NULL; + + if (!cpu_has_vmx_tpr_shadow()) + kvm_x86_ops->update_cr8_intercept = NULL; + + if (enable_ept && !cpu_has_vmx_ept_2m_page()) + kvm_disable_largepages(); + +#if IS_ENABLED(CONFIG_HYPERV) + if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH + && enable_ept) + kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; +#endif + + if (!cpu_has_vmx_ple()) { + ple_gap = 0; + ple_window = 0; + ple_window_grow = 0; + ple_window_max = 0; + ple_window_shrink = 0; + } + + if (!cpu_has_vmx_apicv()) { + enable_apicv = 0; + kvm_x86_ops->sync_pir_to_irr = NULL; + } + + if (cpu_has_vmx_tsc_scaling()) { + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_tsc_scaling_ratio_frac_bits = 48; + } + + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + + if (enable_ept) + vmx_enable_tdp(); + else + kvm_disable_tdp(); + + if (!nested) { + kvm_x86_ops->get_nested_state = NULL; + kvm_x86_ops->set_nested_state = NULL; + } + + /* + * Only enable PML when hardware supports PML feature, and both EPT + * and EPT A/D bit features are enabled -- PML depends on them to work. + */ + if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) + enable_pml = 0; + + if (!enable_pml) { + kvm_x86_ops->slot_enable_log_dirty = NULL; + kvm_x86_ops->slot_disable_log_dirty = NULL; + kvm_x86_ops->flush_log_dirty = NULL; + kvm_x86_ops->enable_log_dirty_pt_masked = NULL; + } + + if (!cpu_has_vmx_preemption_timer()) + kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; + + if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + cpu_preemption_timer_multi = + vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + } else { + kvm_x86_ops->set_hv_timer = NULL; + kvm_x86_ops->cancel_hv_timer = NULL; + } + + if (!cpu_has_vmx_shadow_vmcs() || !nested) + enable_shadow_vmcs = 0; + + kvm_set_posted_intr_wakeup_handler(wakeup_handler); + nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept, + enable_apicv); + + kvm_mce_cap_supported |= MCG_LMCE_P; + + if (nested) { + r = nested_vmx_hardware_setup(); + if (r) + return r; + } + + r = alloc_kvm_area(); + if (r) + nested_vmx_hardware_unsetup(); + return r; +} + +static __exit void hardware_unsetup(void) +{ + if (nested) + nested_vmx_hardware_unsetup(); + + free_kvm_area(); +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, -- cgit v1.2.3 From e4027cfafd7846cf795217ce6d1829728d95ad89 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:12 -0800 Subject: KVM: nVMX: Set callbacks for nested functions during hardware setup ...in nested-specific code so that they can eventually be moved out of vmx.c, e.g. into nested.c. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 75 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 26 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b121daa868d7..c445a90e200f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7810,6 +7810,16 @@ fail: return 1; } +/* + * When nested=0, all VMX instruction VM Exits filter here. The handlers + * are overwritten by nested_vmx_setup() when nested=1. + */ +static int handle_vmx_instruction(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + static int handle_encls(struct kvm_vcpu *vcpu) { /* @@ -7826,7 +7836,7 @@ static int handle_encls(struct kvm_vcpu *vcpu) * may resume. Otherwise they set the kvm_run parameter to indicate what needs * to be done to userspace and return 0. */ -static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EXCEPTION_NMI] = handle_exception, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, @@ -7843,15 +7853,15 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_RDPMC] = handle_rdpmc, [EXIT_REASON_VMCALL] = handle_vmcall, - [EXIT_REASON_VMCLEAR] = handle_vmclear, - [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, - [EXIT_REASON_VMPTRLD] = handle_vmptrld, - [EXIT_REASON_VMPTRST] = handle_vmptrst, - [EXIT_REASON_VMREAD] = handle_vmread, - [EXIT_REASON_VMRESUME] = handle_vmresume, - [EXIT_REASON_VMWRITE] = handle_vmwrite, - [EXIT_REASON_VMOFF] = handle_vmoff, - [EXIT_REASON_VMON] = handle_vmon, + [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, + [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, + [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, + [EXIT_REASON_VMPTRST] = handle_vmx_instruction, + [EXIT_REASON_VMREAD] = handle_vmx_instruction, + [EXIT_REASON_VMRESUME] = handle_vmx_instruction, + [EXIT_REASON_VMWRITE] = handle_vmx_instruction, + [EXIT_REASON_VMOFF] = handle_vmx_instruction, + [EXIT_REASON_VMON] = handle_vmx_instruction, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_APIC_WRITE] = handle_apic_write, @@ -7868,15 +7878,15 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, - [EXIT_REASON_INVEPT] = handle_invept, - [EXIT_REASON_INVVPID] = handle_invvpid, + [EXIT_REASON_INVEPT] = handle_vmx_instruction, + [EXIT_REASON_INVVPID] = handle_vmx_instruction, [EXIT_REASON_RDRAND] = handle_invalid_op, [EXIT_REASON_RDSEED] = handle_invalid_op, [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, - [EXIT_REASON_VMFUNC] = handle_vmfunc, + [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, }; @@ -12978,7 +12988,7 @@ static __exit void nested_vmx_hardware_unsetup(void) } } -static __init int nested_vmx_hardware_setup(void) +static __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; @@ -12995,6 +13005,25 @@ static __init int nested_vmx_hardware_setup(void) init_vmcs_shadow_fields(); } + exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear, + exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch, + exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld, + exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst, + exit_handlers[EXIT_REASON_VMREAD] = handle_vmread, + exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume, + exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite, + exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff, + exit_handlers[EXIT_REASON_VMON] = handle_vmon, + exit_handlers[EXIT_REASON_INVEPT] = handle_invept, + exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid, + exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc, + + kvm_x86_ops->check_nested_events = vmx_check_nested_events; + kvm_x86_ops->get_nested_state = vmx_get_nested_state; + kvm_x86_ops->set_nested_state = vmx_set_nested_state; + kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages, + kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; + return 0; } @@ -13090,11 +13119,6 @@ static __init int hardware_setup(void) else kvm_disable_tdp(); - if (!nested) { - kvm_x86_ops->get_nested_state = NULL; - kvm_x86_ops->set_nested_state = NULL; - } - /* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work. @@ -13133,7 +13157,7 @@ static __init int hardware_setup(void) kvm_mce_cap_supported |= MCG_LMCE_P; if (nested) { - r = nested_vmx_hardware_setup(); + r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); if (r) return r; } @@ -13264,7 +13288,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, - .check_nested_events = vmx_check_nested_events, .request_immediate_exit = vmx_request_immediate_exit, .sched_in = vmx_sched_in, @@ -13289,16 +13312,16 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .setup_mce = vmx_setup_mce, - .get_nested_state = vmx_get_nested_state, - .set_nested_state = vmx_set_nested_state, - .get_vmcs12_pages = nested_get_vmcs12_pages, - .smi_allowed = vmx_smi_allowed, .pre_enter_smm = vmx_pre_enter_smm, .pre_leave_smm = vmx_pre_leave_smm, .enable_smi_window = enable_smi_window, - .nested_enable_evmcs = nested_enable_evmcs, + .check_nested_events = NULL, + .get_nested_state = NULL, + .set_nested_state = NULL, + .get_vmcs12_pages = NULL, + .nested_enable_evmcs = NULL, }; static void vmx_cleanup_l1d_flush(void) -- cgit v1.2.3 From 3e8eacccae9d998e0f5074f4fe411c747d85dfd8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:13 -0800 Subject: KVM: nVMX: Call nested_vmx_setup_ctls_msrs() iff @nested is true ...so that it doesn't need access to @nested. The only case where the provided struct isn't already zeroed is the call from vmx_create_vcpu() as setup_vmcs_config() zeroes the struct in the other use cases. This will allow @nested to be statically defined in vmx.c, i.e. this removes the last direct reference from nested code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c445a90e200f..e67ff719dfe6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1759,11 +1759,6 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, bool apicv) { - if (!nested) { - memset(msrs, 0, sizeof(*msrs)); - return; - } - /* * Note that as a general rule, the high half of the MSRs (bits in * the control fields which may be 1) should be initialized by the @@ -9546,6 +9541,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, vmx_capability.ept, kvm_vcpu_apicv_active(&vmx->vcpu)); + else + memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -9619,7 +9616,9 @@ static void __init vmx_check_processor_compat(void *rtn) *(int *)rtn = 0; if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) *(int *)rtn = -EIO; - nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept, enable_apicv); + if (nested) + nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept, + enable_apicv); if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", smp_processor_id()); @@ -13151,12 +13150,13 @@ static __init int hardware_setup(void) enable_shadow_vmcs = 0; kvm_set_posted_intr_wakeup_handler(wakeup_handler); - nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept, - enable_apicv); kvm_mce_cap_supported |= MCG_LMCE_P; if (nested) { + nested_vmx_setup_ctls_msrs(&vmcs_config.nested, + vmx_capability.ept, enable_apicv); + r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); if (r) return r; -- cgit v1.2.3 From ff241486ac90671ddf7c18e38c474f32b832af3a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:14 -0800 Subject: KVM: nVMX: Move "vmcs12 to shadow/evmcs sync" to helper function ...so that the function doesn't need to be created when moving the nested code out of vmx.c. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e67ff719dfe6..7b3a539e18ad 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -9113,6 +9113,30 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->hv_timer_armed = false; } +static void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * hv_evmcs may end up being not mapped after migration (when + * L2 was running), map it here to make sure vmcs12 changes are + * properly reflected. + */ + if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) + nested_vmx_handle_enlightened_vmptrld(vcpu, false); + + if (vmx->nested.hv_evmcs) { + copy_vmcs12_to_enlightened(vmx); + /* All fields are clean */ + vmx->nested.hv_evmcs->hv_clean_fields |= + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + } else { + copy_vmcs12_to_shadow(vmx); + } + + vmx->nested.need_vmcs12_sync = false; +} + static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -9133,26 +9157,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_write32(PLE_WINDOW, vmx->ple_window); } - if (vmx->nested.need_vmcs12_sync) { - /* - * hv_evmcs may end up being not mapped after migration (when - * L2 was running), map it here to make sure vmcs12 changes are - * properly reflected. - */ - if (vmx->nested.enlightened_vmcs_enabled && - !vmx->nested.hv_evmcs) - nested_vmx_handle_enlightened_vmptrld(vcpu, false); - - if (vmx->nested.hv_evmcs) { - copy_vmcs12_to_enlightened(vmx); - /* All fields are clean */ - vmx->nested.hv_evmcs->hv_clean_fields |= - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - } else { - copy_vmcs12_to_shadow(vmx); - } - vmx->nested.need_vmcs12_sync = false; - } + if (vmx->nested.need_vmcs12_sync) + nested_sync_from_vmcs12(vcpu); if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); -- cgit v1.2.3 From cf3646eb3adfd9149e3c29fc765a59e8bd6ff82d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:15 -0800 Subject: KVM: VMX: Expose misc variables needed for nested VMX Exposed vmx_msr_index, vmx_return and host_efer via vmx.h so that the nested code can be moved out of vmx.c. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 5 ++--- arch/x86/kvm/vmx/vmx.h | 4 ++++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7b3a539e18ad..001c6e14c44b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -187,7 +187,6 @@ module_param(ple_window_shrink, uint, 0444); static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); -extern const ulong vmx_return; extern const ulong vmx_early_consistency_check_return; static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); @@ -437,7 +436,7 @@ static const struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; -static u64 host_efer; +u64 host_efer; static void ept_save_pdptrs(struct kvm_vcpu *vcpu); @@ -445,7 +444,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu); * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it * away by decrementing the array size. */ -static const u32 vmx_msr_index[] = { +const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 4c7fcf1571c3..5c105aa4dc35 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -10,6 +10,10 @@ #include "ops.h" #include "vmcs.h" +extern const u32 vmx_msr_index[]; +extern const ulong vmx_return; +extern u64 host_efer; + #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 #define MSR_TYPE_RW 3 -- cgit v1.2.3 From 97b7ead392637247569818b6603e54b0a6277dd0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:16 -0800 Subject: KVM: VMX: Expose various getters and setters to nested VMX ...as they're used directly by the nested code. This will allow moving the bulk of the nested code out of vmx.c without concurrent changes to vmx.h. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 74 ++++++++++++++++++-------------------------------- arch/x86/kvm/vmx/vmx.h | 27 ++++++++++++++++++ 2 files changed, 53 insertions(+), 48 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 001c6e14c44b..20d4a1325d08 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -365,18 +365,11 @@ static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); -static void vmx_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); -static void vmx_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); -static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); -static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, u16 error_code); -static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type); @@ -438,8 +431,6 @@ static const struct kvm_vmx_segment_field { u64 host_efer; -static void ept_save_pdptrs(struct kvm_vcpu *vcpu); - /* * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it * away by decrementing the array size. @@ -687,7 +678,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification); -static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) +static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -697,7 +688,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) return -1; } -static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -707,15 +698,6 @@ static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) return NULL; } -void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) -{ - vmcs_clear(loaded_vmcs->vmcs); - if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) - vmcs_clear(loaded_vmcs->shadow_vmcs); - loaded_vmcs->cpu = -1; - loaded_vmcs->launched = 0; -} - #ifdef CONFIG_KEXEC_CORE /* * This bitmap is used to indicate whether the vmclear @@ -840,7 +822,7 @@ static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) return *p; } -static void update_exception_bitmap(struct kvm_vcpu *vcpu) +void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; @@ -1140,7 +1122,7 @@ static unsigned long segment_base(u16 selector) } #endif -static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) +void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs_host_state *host_state; @@ -1338,7 +1320,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. */ -static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; @@ -1419,7 +1401,7 @@ static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) pi_set_sn(pi_desc); } -static void vmx_vcpu_put(struct kvm_vcpu *vcpu) +void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); @@ -1449,7 +1431,7 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields) (fields->cr4_read_shadow & fields->cr4_guest_host_mask); } -static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) +unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags, save_rflags; @@ -1466,7 +1448,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->rflags; } -static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { unsigned long old_rflags = vmx_get_rflags(vcpu); @@ -1482,7 +1464,7 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_vmx(vcpu)->emulation_required = emulation_required(vcpu); } -static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) +u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); int ret = 0; @@ -1495,7 +1477,7 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) return ret; } -static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) { u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); u32 interruptibility = interruptibility_old; @@ -3291,7 +3273,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } -static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); @@ -3391,7 +3373,7 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) } } -static void ept_save_pdptrs(struct kvm_vcpu *vcpu) +void ept_save_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -3442,8 +3424,6 @@ static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) #define nested_guest_cr4_valid nested_cr4_valid #define nested_host_cr4_valid nested_cr4_valid -static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); - static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, unsigned long cr0, struct kvm_vcpu *vcpu) @@ -3472,7 +3452,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, *hw_cr0 &= ~X86_CR0_WP; } -static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long hw_cr0; @@ -3531,7 +3511,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) return eptp; } -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { struct kvm *kvm = vcpu->kvm; unsigned long guest_cr3; @@ -3561,7 +3541,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vmcs_writel(GUEST_CR3, guest_cr3); } -static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* * Pass through host's Machine Check Enable value to hw_cr4, which @@ -3636,8 +3616,7 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 0; } -static void vmx_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) +void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 ar; @@ -3684,7 +3663,7 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) return vmx_read_guest_seg_base(to_vmx(vcpu), seg); } -static int vmx_get_cpl(struct kvm_vcpu *vcpu) +int vmx_get_cpl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3716,8 +3695,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) return ar; } -static void vmx_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) +void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -4111,7 +4089,7 @@ out: return r; } -static int allocate_vpid(void) +int allocate_vpid(void) { int vpid; @@ -4127,7 +4105,7 @@ static int allocate_vpid(void) return vpid; } -static void free_vpid(int vpid) +void free_vpid(int vpid) { if (!enable_vpid || vpid == 0) return; @@ -4302,7 +4280,7 @@ static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, } } -static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) +void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; @@ -4490,7 +4468,7 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) * Note that host-state that does change is set elsewhere. E.g., host-state * that is set differently for each CPU is set in vmx_vcpu_load(), not here. */ -static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) +void vmx_set_constant_host_state(struct vcpu_vmx *vmx) { u32 low32, high32; unsigned long tmpl; @@ -4550,7 +4528,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_write64(HOST_IA32_EFER, host_efer); } -static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) +void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; if (enable_ept) @@ -5080,7 +5058,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) vmx_clear_hlt(vcpu); } -static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) +bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool masked; @@ -5094,7 +5072,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) return masked; } -static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8688,7 +8666,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) vmcs_write32(TPR_THRESHOLD, irr); } -static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { u32 sec_exec_control; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 5c105aa4dc35..ee2cbe64813d 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -267,6 +267,33 @@ struct kvm_vmx { spinlock_t ept_pointer_lock; }; +void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void vmx_vcpu_put(struct kvm_vcpu *vcpu); +int allocate_vpid(void); +void free_vpid(int vpid); +void vmx_set_constant_host_state(struct vcpu_vmx *vmx); +void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); +int vmx_get_cpl(struct kvm_vcpu *vcpu); +unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); +void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); +u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); +void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); +void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); +void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); +void ept_save_pdptrs(struct kvm_vcpu *vcpu); +void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); +void update_exception_bitmap(struct kvm_vcpu *vcpu); +void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); +bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); +void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); +void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); +struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); + #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 -- cgit v1.2.3 From 7c97fcb3b68cd4d48a071bc1929c753d255dea47 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:17 -0800 Subject: KVM: VMX: Expose nested_vmx_allowed() to nested VMX as a non-inline Exposing only the function allows @nested, i.e. the module param, to be statically defined in vmx.c, ensuring we aren't unnecessarily checking said variable in the nested code. nested_vmx_allowed() is exposed due to the need to verify nested support in vmx_{get,set}_nested_state(). The downside is that nested_vmx_allowed() likely won't be inlined in vmx_{get,set}_nested_state(), but that should be a non-issue as they're not a hot path. Keeping vmx_{get,set}_nested_state() in vmx.c isn't a viable option as they need access to several nested-only functions. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 11 ++++++++++- arch/x86/kvm/vmx/vmx.h | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 20d4a1325d08..58bb8de04d0d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -698,6 +698,15 @@ struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) return NULL; } +void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) +{ + vmcs_clear(loaded_vmcs->vmcs); + if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) + vmcs_clear(loaded_vmcs->shadow_vmcs); + loaded_vmcs->cpu = -1; + loaded_vmcs->launched = 0; +} + #ifdef CONFIG_KEXEC_CORE /* * This bitmap is used to indicate whether the vmclear @@ -1722,7 +1731,7 @@ static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) * all guests if the "nested" module option is off, and can also be disabled * for a single guest by disabling its VMX cpuid bit. */ -static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) +bool nested_vmx_allowed(struct kvm_vcpu *vcpu) { return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index ee2cbe64813d..f932d7c971e9 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -267,6 +267,7 @@ struct kvm_vmx { spinlock_t ept_pointer_lock; }; +bool nested_vmx_allowed(struct kvm_vcpu *vcpu); void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_put(struct kvm_vcpu *vcpu); int allocate_vpid(void); -- cgit v1.2.3 From 55d2375e58a61be072431dd3d3c8a320f4a4a01b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Dec 2018 13:53:18 -0800 Subject: KVM: nVMX: Move nested code to dedicated files From a functional perspective, this is (supposed to be) a straight copy-paste of code. Code was moved piecemeal to nested.c as not all code that could/should be moved was obviously nested-only. The nested code was then re-ordered as needed to compile, i.e. stats may not show this is being a "pure" move despite there not being any intended changes in functionality. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/vmx/nested.c | 5674 +++++++++++++++++++++++++++++ arch/x86/kvm/vmx/nested.h | 282 ++ arch/x86/kvm/vmx/vmx.c | 8684 ++++++++------------------------------------- 4 files changed, 7340 insertions(+), 7302 deletions(-) create mode 100644 arch/x86/kvm/vmx/nested.c create mode 100644 arch/x86/kvm/vmx/nested.h (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 7f3f50aaa203..83dc7d6a0294 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -16,7 +16,7 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-intel-y += vmx/vmx.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o +kvm-intel-y += vmx/vmx.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o kvm-amd-y += svm.o pmu_amd.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c new file mode 100644 index 000000000000..70e6d604c2a0 --- /dev/null +++ b/arch/x86/kvm/vmx/nested.c @@ -0,0 +1,5674 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include +#include + +#include "cpuid.h" +#include "hyperv.h" +#include "mmu.h" +#include "nested.h" +#include "trace.h" +#include "x86.h" + +static bool __read_mostly enable_shadow_vmcs = 1; +module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); + +static bool __read_mostly nested_early_check = 0; +module_param(nested_early_check, bool, S_IRUGO); + +extern const ulong vmx_early_consistency_check_return; + +/* + * Hyper-V requires all of these, so mark them as supported even though + * they are just treated the same as all-context. + */ +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + +enum { + VMX_VMREAD_BITMAP, + VMX_VMWRITE_BITMAP, + VMX_BITMAP_NR +}; +static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; + +#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) +#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) + +static u16 shadow_read_only_fields[] = { +#define SHADOW_FIELD_RO(x) x, +#include "vmcs_shadow_fields.h" +}; +static int max_shadow_read_only_fields = + ARRAY_SIZE(shadow_read_only_fields); + +static u16 shadow_read_write_fields[] = { +#define SHADOW_FIELD_RW(x) x, +#include "vmcs_shadow_fields.h" +}; +static int max_shadow_read_write_fields = + ARRAY_SIZE(shadow_read_write_fields); + +void init_vmcs_shadow_fields(void) +{ + int i, j; + + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); + + for (i = j = 0; i < max_shadow_read_only_fields; i++) { + u16 field = shadow_read_only_fields[i]; + + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && + (i + 1 == max_shadow_read_only_fields || + shadow_read_only_fields[i + 1] != field + 1)) + pr_err("Missing field from shadow_read_only_field %x\n", + field + 1); + + clear_bit(field, vmx_vmread_bitmap); +#ifdef CONFIG_X86_64 + if (field & 1) + continue; +#endif + if (j < i) + shadow_read_only_fields[j] = field; + j++; + } + max_shadow_read_only_fields = j; + + for (i = j = 0; i < max_shadow_read_write_fields; i++) { + u16 field = shadow_read_write_fields[i]; + + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && + (i + 1 == max_shadow_read_write_fields || + shadow_read_write_fields[i + 1] != field + 1)) + pr_err("Missing field from shadow_read_write_field %x\n", + field + 1); + + /* + * PML and the preemption timer can be emulated, but the + * processor cannot vmwrite to fields that don't exist + * on bare metal. + */ + switch (field) { + case GUEST_PML_INDEX: + if (!cpu_has_vmx_pml()) + continue; + break; + case VMX_PREEMPTION_TIMER_VALUE: + if (!cpu_has_vmx_preemption_timer()) + continue; + break; + case GUEST_INTR_STATUS: + if (!cpu_has_vmx_apicv()) + continue; + break; + default: + break; + } + + clear_bit(field, vmx_vmwrite_bitmap); + clear_bit(field, vmx_vmread_bitmap); +#ifdef CONFIG_X86_64 + if (field & 1) + continue; +#endif + if (j < i) + shadow_read_write_fields[j] = field; + j++; + } + max_shadow_read_write_fields = j; +} + +/* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction (as specified + * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated + * instruction. + */ +static int nested_vmx_succeed(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); + return kvm_skip_emulated_instruction(vcpu); +} + +static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_CF); + return kvm_skip_emulated_instruction(vcpu); +} + +static int nested_vmx_failValid(struct kvm_vcpu *vcpu, + u32 vm_instruction_error) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * failValid writes the error number to the current VMCS, which + * can't be done if there isn't a current VMCS. + */ + if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) + return nested_vmx_failInvalid(vcpu); + + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_ZF); + get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; + /* + * We don't need to force a shadow sync because + * VM_INSTRUCTION_ERROR is not shadowed + */ + return kvm_skip_emulated_instruction(vcpu); +} + +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) +{ + /* TODO: not to reset guest simply here. */ + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); +} + +static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) +{ + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, -1ull); +} + +static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!vmx->nested.hv_evmcs) + return; + + kunmap(vmx->nested.hv_evmcs_page); + kvm_release_page_dirty(vmx->nested.hv_evmcs_page); + vmx->nested.hv_evmcs_vmptr = -1ull; + vmx->nested.hv_evmcs_page = NULL; + vmx->nested.hv_evmcs = NULL; +} + +/* + * Free whatever needs to be freed from vmx->nested when L1 goes down, or + * just stops using VMX. + */ +static void free_nested(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) + return; + + vmx->nested.vmxon = false; + vmx->nested.smm.vmxon = false; + free_vpid(vmx->nested.vpid02); + vmx->nested.posted_intr_nv = -1; + vmx->nested.current_vmptr = -1ull; + if (enable_shadow_vmcs) { + vmx_disable_shadow_vmcs(vmx); + vmcs_clear(vmx->vmcs01.shadow_vmcs); + free_vmcs(vmx->vmcs01.shadow_vmcs); + vmx->vmcs01.shadow_vmcs = NULL; + } + kfree(vmx->nested.cached_vmcs12); + kfree(vmx->nested.cached_shadow_vmcs12); + /* Unpin physical memory we referred to in the vmcs02 */ + if (vmx->nested.apic_access_page) { + kvm_release_page_dirty(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = NULL; + } + if (vmx->nested.virtual_apic_page) { + kvm_release_page_dirty(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; + } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + kvm_release_page_dirty(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; + } + + kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + + nested_release_evmcs(vcpu); + + free_loaded_vmcs(&vmx->nested.vmcs02); +} + +static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + + if (vmx->loaded_vmcs == vmcs) + return; + + cpu = get_cpu(); + vmx_vcpu_put(vcpu); + vmx->loaded_vmcs = vmcs; + vmx_vcpu_load(vcpu, cpu); + put_cpu(); + + vm_entry_controls_reset_shadow(vmx); + vm_exit_controls_reset_shadow(vmx); + vmx_segment_cache_clear(vmx); +} + +/* + * Ensure that the current vmcs of the logical processor is the + * vmcs01 of the vcpu before calling free_nested(). + */ +void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) +{ + vcpu_load(vcpu); + vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); + free_nested(vcpu); + vcpu_put(vcpu); +} + +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exit_reason; + unsigned long exit_qualification = vcpu->arch.exit_qualification; + + if (vmx->nested.pml_full) { + exit_reason = EXIT_REASON_PML_FULL; + vmx->nested.pml_full = false; + exit_qualification &= INTR_INFO_UNBLOCK_NMI; + } else if (fault->error_code & PFERR_RSVD_MASK) + exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + exit_reason = EXIT_REASON_EPT_VIOLATION; + + nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); + vmcs12->guest_physical_address = fault->address; +} + +static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +{ + WARN_ON(mmu_is_nested(vcpu)); + + vcpu->arch.mmu = &vcpu->arch.guest_mmu; + kvm_init_shadow_ept_mmu(vcpu, + to_vmx(vcpu)->nested.msrs.ept_caps & + VMX_EPT_EXECUTE_ONLY_BIT, + nested_ept_ad_enabled(vcpu), + nested_ept_get_cr3(vcpu)); + vcpu->arch.mmu->set_cr3 = vmx_set_cr3; + vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; + vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; + vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; + + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; +} + +static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.mmu = &vcpu->arch.root_mmu; + vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; +} + +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code) +{ + bool inequality, bit; + + bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; + inequality = + (error_code & vmcs12->page_fault_error_code_mask) != + vmcs12->page_fault_error_code_match; + return inequality ^ bit; +} + + +/* + * KVM wants to inject page-faults which it got to the guest. This function + * checks whether in a nested guest, we need to inject them to L1 or L2. + */ +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned int nr = vcpu->arch.exception.nr; + bool has_payload = vcpu->arch.exception.has_payload; + unsigned long payload = vcpu->arch.exception.payload; + + if (nr == PF_VECTOR) { + if (vcpu->arch.exception.nested_apf) { + *exit_qual = vcpu->arch.apf.nested_apf_token; + return 1; + } + if (nested_vmx_is_page_fault_vmexit(vmcs12, + vcpu->arch.exception.error_code)) { + *exit_qual = has_payload ? payload : vcpu->arch.cr2; + return 1; + } + } else if (vmcs12->exception_bitmap & (1u << nr)) { + if (nr == DB_VECTOR) { + if (!has_payload) { + payload = vcpu->arch.dr6; + payload &= ~(DR6_FIXED_1 | DR6_BT); + payload ^= DR6_RTM; + } + *exit_qual = payload; + } else + *exit_qual = 0; + return 1; + } + + return 0; +} + + +static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + WARN_ON(!is_guest_mode(vcpu)); + + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && + !to_vmx(vcpu)->nested.nested_run_pending) { + vmcs12->vm_exit_intr_error_code = fault->error_code; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | + INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, + fault->address); + } else { + kvm_inject_page_fault(vcpu, fault); + } +} + +static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); +} + +static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || + !page_address_valid(vcpu, vmcs12->io_bitmap_b)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)) + return -EINVAL; + + return 0; +} + +/* + * Check if MSR is intercepted for L01 MSR bitmap. + */ +static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) +{ + unsigned long *msr_bitmap; + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return true; + + msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; + + if (msr <= 0x1fff) { + return !!test_bit(msr, msr_bitmap + 0x800 / f); + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + return !!test_bit(msr, msr_bitmap + 0xc00 / f); + } + + return true; +} + +/* + * If a msr is allowed by L0, we should check whether it is allowed by L1. + * The corresponding bit will be cleared unless both of L0 and L1 allow it. + */ +static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_nested, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) + /* read-low */ + __clear_bit(msr, msr_bitmap_nested + 0x000 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) + /* write-low */ + __clear_bit(msr, msr_bitmap_nested + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) + /* read-high */ + __clear_bit(msr, msr_bitmap_nested + 0x400 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) + /* write-high */ + __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); + + } +} + +/* + * Merge L0's and L1's MSR bitmap, return false to indicate that + * we do not use the hardware. + */ +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int msr; + struct page *page; + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; + /* + * pred_cmd & spec_ctrl are trying to verify two things: + * + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This + * ensures that we do not accidentally generate an L02 MSR bitmap + * from the L12 MSR bitmap that is too permissive. + * 2. That L1 or L2s have actually used the MSR. This avoids + * unnecessarily merging of the bitmap if the MSR is unused. This + * works properly because we only update the L01 MSR bitmap lazily. + * So even if L0 should pass L1 these MSRs, the L01 bitmap is only + * updated to reflect this when L1 (or its L2s) actually write to + * the MSR. + */ + bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); + bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); + + /* Nothing to do if the MSR bitmap is not in use. */ + if (!cpu_has_vmx_msr_bitmap() || + !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return false; + + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !pred_cmd && !spec_ctrl) + return false; + + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); + if (is_error_page(page)) + return false; + + msr_bitmap_l1 = (unsigned long *)kmap(page); + if (nested_cpu_has_apic_reg_virt(vmcs12)) { + /* + * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it + * just lets the processor take the value from the virtual-APIC page; + * take those 256 bits directly from the L1 bitmap. + */ + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap_l0[word] = msr_bitmap_l1[word]; + msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; + } + } else { + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap_l0[word] = ~0; + msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; + } + } + + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_TASKPRI), + MSR_TYPE_W); + + if (nested_cpu_has_vid(vmcs12)) { + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_EOI), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_SELF_IPI), + MSR_TYPE_W); + } + + if (spec_ctrl) + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_R | MSR_TYPE_W); + + if (pred_cmd) + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PRED_CMD, + MSR_TYPE_W); + + kunmap(page); + kvm_release_page_clean(page); + + return true; +} + +static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vmcs12 *shadow; + struct page *page; + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + shadow = get_shadow_vmcs12(vcpu); + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + + memcpy(shadow, kmap(page), VMCS12_SIZE); + + kunmap(page); + kvm_release_page_clean(page); +} + +static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, + get_shadow_vmcs12(vcpu), VMCS12_SIZE); +} + +/* + * In nested virtualization, check if L1 has set + * VM_EXIT_ACK_INTR_ON_EXIT + */ +static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_ACK_INTR_ON_EXIT; +} + +static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) +{ + return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); +} + +static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && + !page_address_valid(vcpu, vmcs12->apic_access_addr)) + return -EINVAL; + else + return 0; +} + +static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !nested_cpu_has_apic_reg_virt(vmcs12) && + !nested_cpu_has_vid(vmcs12) && + !nested_cpu_has_posted_intr(vmcs12)) + return 0; + + /* + * If virtualize x2apic mode is enabled, + * virtualize apic access must be disabled. + */ + if (nested_cpu_has_virt_x2apic_mode(vmcs12) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + return -EINVAL; + + /* + * If virtual interrupt delivery is enabled, + * we must exit on external interrupts. + */ + if (nested_cpu_has_vid(vmcs12) && + !nested_exit_on_intr(vcpu)) + return -EINVAL; + + /* + * bits 15:8 should be zero in posted_intr_nv, + * the descriptor address has been already checked + * in nested_get_vmcs12_pages. + * + * bits 5:0 of posted_intr_desc_addr should be zero. + */ + if (nested_cpu_has_posted_intr(vmcs12) && + (!nested_cpu_has_vid(vmcs12) || + !nested_exit_intr_ack_set(vcpu) || + (vmcs12->posted_intr_nv & 0xff00) || + (vmcs12->posted_intr_desc_addr & 0x3f) || + (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))) + return -EINVAL; + + /* tpr shadow is needed by all apicv features. */ + if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, + unsigned long count_field, + unsigned long addr_field) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + int maxphyaddr; + u64 count, addr; + + if (vmcs12_read_any(vmcs12, count_field, &count) || + vmcs12_read_any(vmcs12, addr_field, &addr)) { + WARN_ON(1); + return -EINVAL; + } + if (count == 0) + return 0; + maxphyaddr = cpuid_maxphyaddr(vcpu); + if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || + (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { + pr_debug_ratelimited( + "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", + addr_field, maxphyaddr, count, addr); + return -EINVAL; + } + return 0; +} + +static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (vmcs12->vm_exit_msr_load_count == 0 && + vmcs12->vm_exit_msr_store_count == 0 && + vmcs12->vm_entry_msr_load_count == 0) + return 0; /* Fast path */ + if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, + VM_EXIT_MSR_LOAD_ADDR) || + nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, + VM_EXIT_MSR_STORE_ADDR) || + nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, + VM_ENTRY_MSR_LOAD_ADDR)) + return -EINVAL; + return 0; +} + +static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_pml(vmcs12)) + return 0; + + if (!nested_cpu_has_ept(vmcs12) || + !page_address_valid(vcpu, vmcs12->pml_address)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && + !nested_cpu_has_ept(vmcs12)) + return -EINVAL; + return 0; +} + +static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && + !nested_cpu_has_ept(vmcs12)) + return -EINVAL; + return 0; +} + +static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || + !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + /* x2APIC MSR accesses are not allowed */ + if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) + return -EINVAL; + if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ + e->index == MSR_IA32_UCODE_REV) + return -EINVAL; + if (e->reserved != 0) + return -EINVAL; + return 0; +} + +static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_FS_BASE || + e->index == MSR_GS_BASE || + e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +/* + * Load guest's/host's msr at nested entry/exit. + * return 0 for success, entry index for failure. + */ +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + struct msr_data msr; + + msr.host_initiated = false; + for (i = 0; i < count; i++) { + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), + &e, sizeof(e))) { + pr_debug_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + goto fail; + } + if (nested_vmx_load_msr_check(vcpu, &e)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + goto fail; + } + msr.index = e.index; + msr.data = e.value; + if (kvm_set_msr(vcpu, &msr)) { + pr_debug_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + goto fail; + } + } + return 0; +fail: + return i + 1; +} + +static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + + for (i = 0; i < count; i++) { + struct msr_data msr_info; + if (kvm_vcpu_read_guest(vcpu, + gpa + i * sizeof(e), + &e, 2 * sizeof(u32))) { + pr_debug_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + return -EINVAL; + } + if (nested_vmx_store_msr_check(vcpu, &e)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + return -EINVAL; + } + msr_info.host_initiated = false; + msr_info.index = e.index; + if (kvm_get_msr(vcpu, &msr_info)) { + pr_debug_ratelimited( + "%s cannot read MSR (%u, 0x%x)\n", + __func__, i, e.index); + return -EINVAL; + } + if (kvm_vcpu_write_guest(vcpu, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &msr_info.data, sizeof(msr_info.data))) { + pr_debug_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, msr_info.data); + return -EINVAL; + } + } + return 0; +} + +static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + unsigned long invalid_mask; + + invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + return (val & invalid_mask) == 0; +} + +/* + * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are + * emulating VM entry into a guest with EPT enabled. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, + u32 *entry_failure_code) +{ + if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { + if (!nested_cr3_valid(vcpu, cr3)) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } + + /* + * If PAE paging and EPT are both on, CR3 is not used by the CPU and + * must not be dereferenced. + */ + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && + !nested_ept) { + if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { + *entry_failure_code = ENTRY_FAIL_PDPTE; + return 1; + } + } + } + + if (!nested_ept) + kvm_mmu_new_cr3(vcpu, cr3, false); + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + kvm_init_mmu(vcpu, false); + + return 0; +} + +/* + * Returns if KVM is able to config CPU to tag TLB entries + * populated by L2 differently than TLB entries populated + * by L1. + * + * If L1 uses EPT, then TLB entries are tagged with different EPTP. + * + * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged + * with different VPID (L1 entries are tagged with vmx->vpid + * while L2 entries are tagged with vmx->nested.vpid02). + */ +static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + return nested_cpu_has_ept(vmcs12) || + (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); +} + +static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; +} + + +static inline bool vmx_control_verify(u32 control, u32 low, u32 high) +{ + return fixed_bits_valid(control, low, high); +} + +static inline u64 vmx_control_msr(u32 low, u32 high) +{ + return low | ((u64)high << 32); +} + +static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) +{ + superset &= mask; + subset &= mask; + + return (superset | subset) == superset; +} + +static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved = + /* feature (except bit 48; see below) */ + BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | + /* reserved */ + BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); + u64 vmx_basic = vmx->nested.msrs.basic; + + if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) + return -EINVAL; + + /* + * KVM does not emulate a version of VMX that constrains physical + * addresses of VMX structures (e.g. VMCS) to 32-bits. + */ + if (data & BIT_ULL(48)) + return -EINVAL; + + if (vmx_basic_vmcs_revision_id(vmx_basic) != + vmx_basic_vmcs_revision_id(data)) + return -EINVAL; + + if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) + return -EINVAL; + + vmx->nested.msrs.basic = data; + return 0; +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 supported; + u32 *lowp, *highp; + + switch (msr_index) { + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + lowp = &vmx->nested.msrs.pinbased_ctls_low; + highp = &vmx->nested.msrs.pinbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + lowp = &vmx->nested.msrs.procbased_ctls_low; + highp = &vmx->nested.msrs.procbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + lowp = &vmx->nested.msrs.exit_ctls_low; + highp = &vmx->nested.msrs.exit_ctls_high; + break; + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + lowp = &vmx->nested.msrs.entry_ctls_low; + highp = &vmx->nested.msrs.entry_ctls_high; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + lowp = &vmx->nested.msrs.secondary_ctls_low; + highp = &vmx->nested.msrs.secondary_ctls_high; + break; + default: + BUG(); + } + + supported = vmx_control_msr(*lowp, *highp); + + /* Check must-be-1 bits are still 1. */ + if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) + return -EINVAL; + + /* Check must-be-0 bits are still 0. */ + if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) + return -EINVAL; + + *lowp = data; + *highp = data >> 32; + return 0; +} + +static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved_bits = + /* feature */ + BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | + BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | + /* reserved */ + GENMASK_ULL(13, 9) | BIT_ULL(31); + u64 vmx_misc; + + vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); + + if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) + return -EINVAL; + + if ((vmx->nested.msrs.pinbased_ctls_high & + PIN_BASED_VMX_PREEMPTION_TIMER) && + vmx_misc_preemption_timer_rate(data) != + vmx_misc_preemption_timer_rate(vmx_misc)) + return -EINVAL; + + if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) + return -EINVAL; + + if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) + return -EINVAL; + + if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) + return -EINVAL; + + vmx->nested.msrs.misc_low = data; + vmx->nested.msrs.misc_high = data >> 32; + + /* + * If L1 has read-only VM-exit information fields, use the + * less permissive vmx_vmwrite_bitmap to specify write + * permissions for the shadow VMCS. + */ + if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); + + return 0; +} + +static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) +{ + u64 vmx_ept_vpid_cap; + + vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, + vmx->nested.msrs.vpid_caps); + + /* Every bit is either reserved or a feature bit. */ + if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) + return -EINVAL; + + vmx->nested.msrs.ept_caps = data; + vmx->nested.msrs.vpid_caps = data >> 32; + return 0; +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 *msr; + + switch (msr_index) { + case MSR_IA32_VMX_CR0_FIXED0: + msr = &vmx->nested.msrs.cr0_fixed0; + break; + case MSR_IA32_VMX_CR4_FIXED0: + msr = &vmx->nested.msrs.cr4_fixed0; + break; + default: + BUG(); + } + + /* + * 1 bits (which indicates bits which "must-be-1" during VMX operation) + * must be 1 in the restored value. + */ + if (!is_bitwise_subset(data, *msr, -1ULL)) + return -EINVAL; + + *msr = data; + return 0; +} + +/* + * Called when userspace is restoring VMX MSRs. + * + * Returns 0 on success, non-0 otherwise. + */ +int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Don't allow changes to the VMX capability MSRs while the vCPU + * is in VMX operation. + */ + if (vmx->nested.vmxon) + return -EBUSY; + + switch (msr_index) { + case MSR_IA32_VMX_BASIC: + return vmx_restore_vmx_basic(vmx, data); + case MSR_IA32_VMX_PINBASED_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS: + case MSR_IA32_VMX_EXIT_CTLS: + case MSR_IA32_VMX_ENTRY_CTLS: + /* + * The "non-true" VMX capability MSRs are generated from the + * "true" MSRs, so we do not support restoring them directly. + * + * If userspace wants to emulate VMX_BASIC[55]=0, userspace + * should restore the "true" MSRs with the must-be-1 bits + * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND + * DEFAULT SETTINGS". + */ + return -EINVAL; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS2: + return vmx_restore_control_msr(vmx, msr_index, data); + case MSR_IA32_VMX_MISC: + return vmx_restore_vmx_misc(vmx, data); + case MSR_IA32_VMX_CR0_FIXED0: + case MSR_IA32_VMX_CR4_FIXED0: + return vmx_restore_fixed0_msr(vmx, msr_index, data); + case MSR_IA32_VMX_CR0_FIXED1: + case MSR_IA32_VMX_CR4_FIXED1: + /* + * These MSRs are generated based on the vCPU's CPUID, so we + * do not support restoring them directly. + */ + return -EINVAL; + case MSR_IA32_VMX_EPT_VPID_CAP: + return vmx_restore_vmx_ept_vpid_cap(vmx, data); + case MSR_IA32_VMX_VMCS_ENUM: + vmx->nested.msrs.vmcs_enum = data; + return 0; + default: + /* + * The rest of the VMX capability MSRs do not support restore. + */ + return -EINVAL; + } +} + +/* Returns 0 on success, non-0 otherwise. */ +int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) +{ + switch (msr_index) { + case MSR_IA32_VMX_BASIC: + *pdata = msrs->basic; + break; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + case MSR_IA32_VMX_PINBASED_CTLS: + *pdata = vmx_control_msr( + msrs->pinbased_ctls_low, + msrs->pinbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) + *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + break; + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS: + *pdata = vmx_control_msr( + msrs->procbased_ctls_low, + msrs->procbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) + *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + break; + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + case MSR_IA32_VMX_EXIT_CTLS: + *pdata = vmx_control_msr( + msrs->exit_ctls_low, + msrs->exit_ctls_high); + if (msr_index == MSR_IA32_VMX_EXIT_CTLS) + *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + break; + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + case MSR_IA32_VMX_ENTRY_CTLS: + *pdata = vmx_control_msr( + msrs->entry_ctls_low, + msrs->entry_ctls_high); + if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) + *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + break; + case MSR_IA32_VMX_MISC: + *pdata = vmx_control_msr( + msrs->misc_low, + msrs->misc_high); + break; + case MSR_IA32_VMX_CR0_FIXED0: + *pdata = msrs->cr0_fixed0; + break; + case MSR_IA32_VMX_CR0_FIXED1: + *pdata = msrs->cr0_fixed1; + break; + case MSR_IA32_VMX_CR4_FIXED0: + *pdata = msrs->cr4_fixed0; + break; + case MSR_IA32_VMX_CR4_FIXED1: + *pdata = msrs->cr4_fixed1; + break; + case MSR_IA32_VMX_VMCS_ENUM: + *pdata = msrs->vmcs_enum; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + *pdata = vmx_control_msr( + msrs->secondary_ctls_low, + msrs->secondary_ctls_high); + break; + case MSR_IA32_VMX_EPT_VPID_CAP: + *pdata = msrs->ept_caps | + ((u64)msrs->vpid_caps << 32); + break; + case MSR_IA32_VMX_VMFUNC: + *pdata = msrs->vmfunc_controls; + break; + default: + return 1; + } + + return 0; +} + +/* + * Copy the writable VMCS shadow fields back to the VMCS12, in case + * they have been modified by the L1 guest. Note that the "read-only" + * VM-exit information fields are actually writable if the vCPU is + * configured to support "VMWRITE to any supported field in the VMCS." + */ +static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) +{ + const u16 *fields[] = { + shadow_read_write_fields, + shadow_read_only_fields + }; + const int max_fields[] = { + max_shadow_read_write_fields, + max_shadow_read_only_fields + }; + int i, q; + unsigned long field; + u64 field_value; + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + + preempt_disable(); + + vmcs_load(shadow_vmcs); + + for (q = 0; q < ARRAY_SIZE(fields); q++) { + for (i = 0; i < max_fields[q]; i++) { + field = fields[q][i]; + field_value = __vmcs_readl(field); + vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); + } + /* + * Skip the VM-exit information fields if they are read-only. + */ + if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) + break; + } + + vmcs_clear(shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); + + preempt_enable(); +} + +static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) +{ + const u16 *fields[] = { + shadow_read_write_fields, + shadow_read_only_fields + }; + const int max_fields[] = { + max_shadow_read_write_fields, + max_shadow_read_only_fields + }; + int i, q; + unsigned long field; + u64 field_value = 0; + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + + vmcs_load(shadow_vmcs); + + for (q = 0; q < ARRAY_SIZE(fields); q++) { + for (i = 0; i < max_fields[q]; i++) { + field = fields[q][i]; + vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); + __vmcs_writel(field, field_value); + } + } + + vmcs_clear(shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); +} + +static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) +{ + struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + + /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ + vmcs12->tpr_threshold = evmcs->tpr_threshold; + vmcs12->guest_rip = evmcs->guest_rip; + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { + vmcs12->guest_rsp = evmcs->guest_rsp; + vmcs12->guest_rflags = evmcs->guest_rflags; + vmcs12->guest_interruptibility_info = + evmcs->guest_interruptibility_info; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { + vmcs12->cpu_based_vm_exec_control = + evmcs->cpu_based_vm_exec_control; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { + vmcs12->exception_bitmap = evmcs->exception_bitmap; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { + vmcs12->vm_entry_controls = evmcs->vm_entry_controls; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { + vmcs12->vm_entry_intr_info_field = + evmcs->vm_entry_intr_info_field; + vmcs12->vm_entry_exception_error_code = + evmcs->vm_entry_exception_error_code; + vmcs12->vm_entry_instruction_len = + evmcs->vm_entry_instruction_len; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { + vmcs12->host_ia32_pat = evmcs->host_ia32_pat; + vmcs12->host_ia32_efer = evmcs->host_ia32_efer; + vmcs12->host_cr0 = evmcs->host_cr0; + vmcs12->host_cr3 = evmcs->host_cr3; + vmcs12->host_cr4 = evmcs->host_cr4; + vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; + vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; + vmcs12->host_rip = evmcs->host_rip; + vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; + vmcs12->host_es_selector = evmcs->host_es_selector; + vmcs12->host_cs_selector = evmcs->host_cs_selector; + vmcs12->host_ss_selector = evmcs->host_ss_selector; + vmcs12->host_ds_selector = evmcs->host_ds_selector; + vmcs12->host_fs_selector = evmcs->host_fs_selector; + vmcs12->host_gs_selector = evmcs->host_gs_selector; + vmcs12->host_tr_selector = evmcs->host_tr_selector; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { + vmcs12->pin_based_vm_exec_control = + evmcs->pin_based_vm_exec_control; + vmcs12->vm_exit_controls = evmcs->vm_exit_controls; + vmcs12->secondary_vm_exec_control = + evmcs->secondary_vm_exec_control; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { + vmcs12->io_bitmap_a = evmcs->io_bitmap_a; + vmcs12->io_bitmap_b = evmcs->io_bitmap_b; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { + vmcs12->msr_bitmap = evmcs->msr_bitmap; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { + vmcs12->guest_es_base = evmcs->guest_es_base; + vmcs12->guest_cs_base = evmcs->guest_cs_base; + vmcs12->guest_ss_base = evmcs->guest_ss_base; + vmcs12->guest_ds_base = evmcs->guest_ds_base; + vmcs12->guest_fs_base = evmcs->guest_fs_base; + vmcs12->guest_gs_base = evmcs->guest_gs_base; + vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; + vmcs12->guest_tr_base = evmcs->guest_tr_base; + vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; + vmcs12->guest_idtr_base = evmcs->guest_idtr_base; + vmcs12->guest_es_limit = evmcs->guest_es_limit; + vmcs12->guest_cs_limit = evmcs->guest_cs_limit; + vmcs12->guest_ss_limit = evmcs->guest_ss_limit; + vmcs12->guest_ds_limit = evmcs->guest_ds_limit; + vmcs12->guest_fs_limit = evmcs->guest_fs_limit; + vmcs12->guest_gs_limit = evmcs->guest_gs_limit; + vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; + vmcs12->guest_tr_limit = evmcs->guest_tr_limit; + vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; + vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; + vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; + vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; + vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; + vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; + vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; + vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; + vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; + vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; + vmcs12->guest_es_selector = evmcs->guest_es_selector; + vmcs12->guest_cs_selector = evmcs->guest_cs_selector; + vmcs12->guest_ss_selector = evmcs->guest_ss_selector; + vmcs12->guest_ds_selector = evmcs->guest_ds_selector; + vmcs12->guest_fs_selector = evmcs->guest_fs_selector; + vmcs12->guest_gs_selector = evmcs->guest_gs_selector; + vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; + vmcs12->guest_tr_selector = evmcs->guest_tr_selector; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { + vmcs12->tsc_offset = evmcs->tsc_offset; + vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; + vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { + vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; + vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; + vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; + vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; + vmcs12->guest_cr0 = evmcs->guest_cr0; + vmcs12->guest_cr3 = evmcs->guest_cr3; + vmcs12->guest_cr4 = evmcs->guest_cr4; + vmcs12->guest_dr7 = evmcs->guest_dr7; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { + vmcs12->host_fs_base = evmcs->host_fs_base; + vmcs12->host_gs_base = evmcs->host_gs_base; + vmcs12->host_tr_base = evmcs->host_tr_base; + vmcs12->host_gdtr_base = evmcs->host_gdtr_base; + vmcs12->host_idtr_base = evmcs->host_idtr_base; + vmcs12->host_rsp = evmcs->host_rsp; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { + vmcs12->ept_pointer = evmcs->ept_pointer; + vmcs12->virtual_processor_id = evmcs->virtual_processor_id; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { + vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; + vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; + vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; + vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; + vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; + vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; + vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; + vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; + vmcs12->guest_pending_dbg_exceptions = + evmcs->guest_pending_dbg_exceptions; + vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; + vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; + vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; + vmcs12->guest_activity_state = evmcs->guest_activity_state; + vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; + } + + /* + * Not used? + * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; + * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; + * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; + * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; + * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; + * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; + * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; + * vmcs12->page_fault_error_code_mask = + * evmcs->page_fault_error_code_mask; + * vmcs12->page_fault_error_code_match = + * evmcs->page_fault_error_code_match; + * vmcs12->cr3_target_count = evmcs->cr3_target_count; + * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; + * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; + * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; + */ + + /* + * Read only fields: + * vmcs12->guest_physical_address = evmcs->guest_physical_address; + * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; + * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; + * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; + * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; + * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; + * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; + * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; + * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; + * vmcs12->exit_qualification = evmcs->exit_qualification; + * vmcs12->guest_linear_address = evmcs->guest_linear_address; + * + * Not present in struct vmcs12: + * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; + * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; + * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; + * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; + */ + + return 0; +} + +static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) +{ + struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + + /* + * Should not be changed by KVM: + * + * evmcs->host_es_selector = vmcs12->host_es_selector; + * evmcs->host_cs_selector = vmcs12->host_cs_selector; + * evmcs->host_ss_selector = vmcs12->host_ss_selector; + * evmcs->host_ds_selector = vmcs12->host_ds_selector; + * evmcs->host_fs_selector = vmcs12->host_fs_selector; + * evmcs->host_gs_selector = vmcs12->host_gs_selector; + * evmcs->host_tr_selector = vmcs12->host_tr_selector; + * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; + * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; + * evmcs->host_cr0 = vmcs12->host_cr0; + * evmcs->host_cr3 = vmcs12->host_cr3; + * evmcs->host_cr4 = vmcs12->host_cr4; + * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; + * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; + * evmcs->host_rip = vmcs12->host_rip; + * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; + * evmcs->host_fs_base = vmcs12->host_fs_base; + * evmcs->host_gs_base = vmcs12->host_gs_base; + * evmcs->host_tr_base = vmcs12->host_tr_base; + * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; + * evmcs->host_idtr_base = vmcs12->host_idtr_base; + * evmcs->host_rsp = vmcs12->host_rsp; + * sync_vmcs12() doesn't read these: + * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; + * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; + * evmcs->msr_bitmap = vmcs12->msr_bitmap; + * evmcs->ept_pointer = vmcs12->ept_pointer; + * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; + * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; + * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; + * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; + * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; + * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; + * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; + * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; + * evmcs->tpr_threshold = vmcs12->tpr_threshold; + * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; + * evmcs->exception_bitmap = vmcs12->exception_bitmap; + * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; + * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; + * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; + * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; + * evmcs->page_fault_error_code_mask = + * vmcs12->page_fault_error_code_mask; + * evmcs->page_fault_error_code_match = + * vmcs12->page_fault_error_code_match; + * evmcs->cr3_target_count = vmcs12->cr3_target_count; + * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; + * evmcs->tsc_offset = vmcs12->tsc_offset; + * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; + * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; + * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; + * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; + * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; + * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; + * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; + * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; + * + * Not present in struct vmcs12: + * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; + * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; + * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; + * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; + */ + + evmcs->guest_es_selector = vmcs12->guest_es_selector; + evmcs->guest_cs_selector = vmcs12->guest_cs_selector; + evmcs->guest_ss_selector = vmcs12->guest_ss_selector; + evmcs->guest_ds_selector = vmcs12->guest_ds_selector; + evmcs->guest_fs_selector = vmcs12->guest_fs_selector; + evmcs->guest_gs_selector = vmcs12->guest_gs_selector; + evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; + evmcs->guest_tr_selector = vmcs12->guest_tr_selector; + + evmcs->guest_es_limit = vmcs12->guest_es_limit; + evmcs->guest_cs_limit = vmcs12->guest_cs_limit; + evmcs->guest_ss_limit = vmcs12->guest_ss_limit; + evmcs->guest_ds_limit = vmcs12->guest_ds_limit; + evmcs->guest_fs_limit = vmcs12->guest_fs_limit; + evmcs->guest_gs_limit = vmcs12->guest_gs_limit; + evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; + evmcs->guest_tr_limit = vmcs12->guest_tr_limit; + evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; + evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; + + evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; + evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; + evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; + evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; + evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; + evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; + evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; + evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; + + evmcs->guest_es_base = vmcs12->guest_es_base; + evmcs->guest_cs_base = vmcs12->guest_cs_base; + evmcs->guest_ss_base = vmcs12->guest_ss_base; + evmcs->guest_ds_base = vmcs12->guest_ds_base; + evmcs->guest_fs_base = vmcs12->guest_fs_base; + evmcs->guest_gs_base = vmcs12->guest_gs_base; + evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; + evmcs->guest_tr_base = vmcs12->guest_tr_base; + evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; + evmcs->guest_idtr_base = vmcs12->guest_idtr_base; + + evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; + evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; + + evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; + evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; + evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; + evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; + + evmcs->guest_pending_dbg_exceptions = + vmcs12->guest_pending_dbg_exceptions; + evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; + evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; + + evmcs->guest_activity_state = vmcs12->guest_activity_state; + evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; + + evmcs->guest_cr0 = vmcs12->guest_cr0; + evmcs->guest_cr3 = vmcs12->guest_cr3; + evmcs->guest_cr4 = vmcs12->guest_cr4; + evmcs->guest_dr7 = vmcs12->guest_dr7; + + evmcs->guest_physical_address = vmcs12->guest_physical_address; + + evmcs->vm_instruction_error = vmcs12->vm_instruction_error; + evmcs->vm_exit_reason = vmcs12->vm_exit_reason; + evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; + evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; + evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; + evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; + evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; + evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; + + evmcs->exit_qualification = vmcs12->exit_qualification; + + evmcs->guest_linear_address = vmcs12->guest_linear_address; + evmcs->guest_rsp = vmcs12->guest_rsp; + evmcs->guest_rflags = vmcs12->guest_rflags; + + evmcs->guest_interruptibility_info = + vmcs12->guest_interruptibility_info; + evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; + evmcs->vm_entry_controls = vmcs12->vm_entry_controls; + evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; + evmcs->vm_entry_exception_error_code = + vmcs12->vm_entry_exception_error_code; + evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; + + evmcs->guest_rip = vmcs12->guest_rip; + + evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; + + return 0; +} + +/* + * This is an equivalent of the nested hypervisor executing the vmptrld + * instruction. + */ +static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, + bool from_launch) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_vp_assist_page assist_page; + + if (likely(!vmx->nested.enlightened_vmcs_enabled)) + return 1; + + if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) + return 1; + + if (unlikely(!assist_page.enlighten_vmentry)) + return 1; + + if (unlikely(assist_page.current_nested_vmcs != + vmx->nested.hv_evmcs_vmptr)) { + + if (!vmx->nested.hv_evmcs) + vmx->nested.current_vmptr = -1ull; + + nested_release_evmcs(vcpu); + + vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( + vcpu, assist_page.current_nested_vmcs); + + if (unlikely(is_error_page(vmx->nested.hv_evmcs_page))) + return 0; + + vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); + + /* + * Currently, KVM only supports eVMCS version 1 + * (== KVM_EVMCS_VERSION) and thus we expect guest to set this + * value to first u32 field of eVMCS which should specify eVMCS + * VersionNumber. + * + * Guest should be aware of supported eVMCS versions by host by + * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is + * expected to set this CPUID leaf according to the value + * returned in vmcs_version from nested_enable_evmcs(). + * + * However, it turns out that Microsoft Hyper-V fails to comply + * to their own invented interface: When Hyper-V use eVMCS, it + * just sets first u32 field of eVMCS to revision_id specified + * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number + * which is one of the supported versions specified in + * CPUID.0x4000000A.EAX[0:15]. + * + * To overcome Hyper-V bug, we accept here either a supported + * eVMCS version or VMCS12 revision_id as valid values for first + * u32 field of eVMCS. + */ + if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && + (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { + nested_release_evmcs(vcpu); + return 0; + } + + vmx->nested.dirty_vmcs12 = true; + /* + * As we keep L2 state for one guest only 'hv_clean_fields' mask + * can't be used when we switch between them. Reset it here for + * simplicity. + */ + vmx->nested.hv_evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs; + + /* + * Unlike normal vmcs12, enlightened vmcs12 is not fully + * reloaded from guest's memory (read only fields, fields not + * present in struct hv_enlightened_vmcs, ...). Make sure there + * are no leftovers. + */ + if (from_launch) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + memset(vmcs12, 0, sizeof(*vmcs12)); + vmcs12->hdr.revision_id = VMCS12_REVISION; + } + + } + return 1; +} + +void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * hv_evmcs may end up being not mapped after migration (when + * L2 was running), map it here to make sure vmcs12 changes are + * properly reflected. + */ + if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) + nested_vmx_handle_enlightened_vmptrld(vcpu, false); + + if (vmx->nested.hv_evmcs) { + copy_vmcs12_to_enlightened(vmx); + /* All fields are clean */ + vmx->nested.hv_evmcs->hv_clean_fields |= + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + } else { + copy_vmcs12_to_shadow(vmx); + } + + vmx->nested.need_vmcs12_sync = false; +} + +static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) +{ + struct vcpu_vmx *vmx = + container_of(timer, struct vcpu_vmx, nested.preemption_timer); + + vmx->nested.preemption_timer_expired = true; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + kvm_vcpu_kick(&vmx->vcpu); + + return HRTIMER_NORESTART; +} + +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +{ + u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * A timer value of zero is architecturally guaranteed to cause + * a VMExit prior to executing any instructions in the guest. + */ + if (preemption_timeout == 0) { + vmx_preemption_timer_fn(&vmx->nested.preemption_timer); + return; + } + + if (vcpu->arch.virtual_tsc_khz == 0) + return; + + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + preemption_timeout *= 1000000; + do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); + hrtimer_start(&vmx->nested.preemption_timer, + ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); +} + +static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +{ + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) + return vmcs12->guest_ia32_efer; + else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) + return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); + else + return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); +} + +static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) +{ + /* + * If vmcs02 hasn't been initialized, set the constant vmcs02 state + * according to L0's settings (vmcs12 is irrelevant here). Host + * fields that come from L0 and are not constant, e.g. HOST_CR3, + * will be set as needed prior to VMLAUNCH/VMRESUME. + */ + if (vmx->nested.vmcs02_initialized) + return; + vmx->nested.vmcs02_initialized = true; + + /* + * We don't care what the EPTP value is we just need to guarantee + * it's valid so we don't get a false positive when doing early + * consistency checks. + */ + if (enable_ept && nested_early_check) + vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); + + /* All VMFUNCs are currently emulated through L0 vmexits. */ + if (cpu_has_vmx_vmfunc()) + vmcs_write64(VM_FUNCTION_CONTROL, 0); + + if (cpu_has_vmx_posted_intr()) + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); + + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); + + if (enable_pml) + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + + /* + * Set the MSR load/store lists to match L0's settings. Only the + * addresses are constant (for vmcs02), the counts can change based + * on L2's behavior, e.g. switching to/from long mode. + */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); + + vmx_set_constant_host_state(vmx); +} + +static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx, + struct vmcs12 *vmcs12) +{ + prepare_vmcs02_constant_state(vmx); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); + + if (enable_vpid) { + if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + else + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + } +} + +static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +{ + u32 exec_control, vmcs12_exec_ctrl; + u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); + + if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) + prepare_vmcs02_early_full(vmx, vmcs12); + + /* + * HOST_RSP is normally set correctly in vmx_vcpu_run() just before + * entry, but only if the current (host) sp changed from the value + * we wrote last (vmx->host_rsp). This cache is no longer relevant + * if we switch vmcs, and rather than hold a separate cache per vmcs, + * here we just force the write to happen on entry. host_rsp will + * also be written unconditionally by nested_vmx_check_vmentry_hw() + * if we are doing early consistency checks via hardware. + */ + vmx->host_rsp = 0; + + /* + * PIN CONTROLS + */ + exec_control = vmcs12->pin_based_vm_exec_control; + + /* Preemption timer setting is computed directly in vmx_vcpu_run. */ + exec_control |= vmcs_config.pin_based_exec_ctrl; + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + vmx->loaded_vmcs->hv_timer_armed = false; + + /* Posted interrupts setting is only taken from vmcs12. */ + if (nested_cpu_has_posted_intr(vmcs12)) { + vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; + vmx->nested.pi_pending = false; + } else { + exec_control &= ~PIN_BASED_POSTED_INTR; + } + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); + + /* + * EXEC CONTROLS + */ + exec_control = vmx_exec_control(vmx); /* L0's desires */ + exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + exec_control &= ~CPU_BASED_TPR_SHADOW; + exec_control |= vmcs12->cpu_based_vm_exec_control; + + /* + * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if + * nested_get_vmcs12_pages can't fix it up, the illegal value + * will result in a VM entry failure. + */ + if (exec_control & CPU_BASED_TPR_SHADOW) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } else { +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif + } + + /* + * A vmexit (to either L1 hypervisor or L0 userspace) is always needed + * for I/O port accesses. + */ + exec_control &= ~CPU_BASED_USE_IO_BITMAPS; + exec_control |= CPU_BASED_UNCOND_IO_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + + /* + * SECONDARY EXEC CONTROLS + */ + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmx->secondary_exec_control; + + /* Take the following fields only from vmcs12 */ + exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_ENABLE_VMFUNC); + if (nested_cpu_has(vmcs12, + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { + vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & + ~SECONDARY_EXEC_ENABLE_PML; + exec_control |= vmcs12_exec_ctrl; + } + + /* VMCS shadowing for L2 is emulated for now */ + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) + vmcs_write16(GUEST_INTR_STATUS, + vmcs12->guest_intr_status); + + /* + * Write an illegal value to APIC_ACCESS_ADDR. Later, + * nested_get_vmcs12_pages will either fix it up or + * remove the VM execution control. + */ + if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) + vmcs_write64(APIC_ACCESS_ADDR, -1ull); + + if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + } + + /* + * ENTRY CONTROLS + * + * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE + * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate + * on the related bits (if supported by the CPU) in the hope that + * we can avoid VMWrites during vmx_set_efer(). + */ + exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & + ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; + if (cpu_has_load_ia32_efer()) { + if (guest_efer & EFER_LMA) + exec_control |= VM_ENTRY_IA32E_MODE; + if (guest_efer != host_efer) + exec_control |= VM_ENTRY_LOAD_IA32_EFER; + } + vm_entry_controls_init(vmx, exec_control); + + /* + * EXIT CONTROLS + * + * L2->L1 exit controls are emulated - the hardware exit is to L0 so + * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER + * bits may be modified by vmx_set_efer() in prepare_vmcs02(). + */ + exec_control = vmx_vmexit_ctrl(); + if (cpu_has_load_ia32_efer() && guest_efer != host_efer) + exec_control |= VM_EXIT_LOAD_IA32_EFER; + vm_exit_controls_init(vmx, exec_control); + + /* + * Conceptually we want to copy the PML address and index from + * vmcs01 here, and then back to vmcs01 on nested vmexit. But, + * since we always flush the log on each vmexit and never change + * the PML address (once set), this happens to be equivalent to + * simply resetting the index in vmcs02. + */ + if (enable_pml) + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + + /* + * Interrupt/Exception Fields + */ + if (vmx->nested.nested_run_pending) { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs12->vm_entry_intr_info_field); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs12->vm_entry_exception_error_code); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs12->vm_entry_instruction_len); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs12->guest_interruptibility_info); + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); + } else { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); + } +} + +static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +{ + struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { + vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); + vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); + vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); + vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); + vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); + vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); + vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); + vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); + vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); + vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); + vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); + vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); + vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); + vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); + vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); + vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); + vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); + vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); + vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); + vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); + vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); + vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); + vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); + vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); + vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); + vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); + vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); + vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); + vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); + vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); + vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); + vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); + vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); + vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + } + + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs12->guest_pending_dbg_exceptions); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + + /* + * L1 may access the L2's PDPTR, so save them to construct + * vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } + } + + if (nested_cpu_has_xsaves(vmcs12)) + vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); + + /* + * Whether page-faults are trapped is determined by a combination of + * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. + * If enable_ept, L0 doesn't care about page faults and we should + * set all of these to L1's desires. However, if !enable_ept, L0 does + * care about (at least some) page faults, and because it is not easy + * (if at all possible?) to merge L0 and L1's desires, we simply ask + * to exit on each and every L2 page fault. This is done by setting + * MASK=MATCH=0 and (see below) EB.PF=1. + * Note that below we don't need special code to set EB.PF beyond the + * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, + * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when + * !enable_ept, EB.PF is 1, so the "or" will always be 1. + */ + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, + enable_ept ? vmcs12->page_fault_error_code_mask : 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, + enable_ept ? vmcs12->page_fault_error_code_match : 0); + + if (cpu_has_vmx_apicv()) { + vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); + vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); + vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); + vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); + } + + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + + set_cr4_guest_host_mask(vmx); + + if (kvm_mpx_supported()) { + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + else + vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + } +} + +/* + * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested + * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 + * guest in a way that will both be appropriate to L1's requests, and our + * needs. In addition to modifying the active vmcs (which is vmcs02), this + * function also has additional necessary side-effects, like setting various + * vcpu->arch fields. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 *entry_failure_code) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + + if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { + prepare_vmcs02_full(vmx, vmcs12); + vmx->nested.dirty_vmcs12 = false; + } + + /* + * First, the fields that are shadowed. This must be kept in sync + * with vmcs_shadow_fields.h. + */ + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { + vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); + } + + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + } + vmx_set_rflags(vcpu, vmcs12->guest_rflags); + + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); + + /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the + * bitwise-or of what L1 wants to trap for L2, and what we want to + * trap. Note that CR0.TS also needs updating - we do this later. + */ + update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { + vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); + vcpu->arch.pat = vmcs12->guest_ia32_pat; + } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); + } + + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); + + if (enable_vpid) { + /* + * There is no direct mapping between vpid02 and vpid12, the + * vpid02 is per-vCPU for L0 and reused while the value of + * vpid12 is changed w/ one invvpid during nested vmentry. + * The vpid12 is allocated by L1 for L2, so it will not + * influence global bitmap(for vpid01 and vpid02 allocation) + * even if spawn a lot of nested vCPUs. + */ + if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { + if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + vmx->nested.last_vpid = vmcs12->virtual_processor_id; + __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); + } + } else { + /* + * If L1 use EPT, then L0 needs to execute INVEPT on + * EPTP02 instead of EPTP01. Therefore, delay TLB + * flush until vmcs02->eptp is fully updated by + * KVM_REQ_LOAD_CR3. Note that this assumes + * KVM_REQ_TLB_FLUSH is evaluated after + * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). + */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + } + + if (nested_cpu_has_ept(vmcs12)) + nested_ept_init_mmu_context(vcpu); + else if (nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + vmx_flush_tlb(vcpu, true); + + /* + * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those + * bits which we consider mandatory enabled. + * The CR0_READ_SHADOW is what L2 should have expected to read given + * the specifications by L1; It's not enough to take + * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we + * have more bits than L1 expected. + */ + vmx_set_cr0(vcpu, vmcs12->guest_cr0); + vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); + + vmx_set_cr4(vcpu, vmcs12->guest_cr4); + vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); + + vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); + /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ + vmx_set_efer(vcpu, vcpu->arch.efer); + + /* + * Guest state is invalid and unrestricted guest is disabled, + * which means L1 attempted VMEntry to L2 with invalid state. + * Fail the VMEntry. + */ + if (vmx->emulation_required) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } + + /* Shadow page tables on either EPT or shadow page tables. */ + if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), + entry_failure_code)) + return 1; + + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; + + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); + kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); + return 0; +} + +static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_nmi_exiting(vmcs12) && + nested_cpu_has_virtual_nmis(vmcs12)) + return -EINVAL; + + if (!nested_cpu_has_virtual_nmis(vmcs12) && + nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)) + return -EINVAL; + + return 0; +} + +static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int maxphyaddr = cpuid_maxphyaddr(vcpu); + + /* Check for memory type validity */ + switch (address & VMX_EPTP_MT_MASK) { + case VMX_EPTP_MT_UC: + if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)) + return false; + break; + case VMX_EPTP_MT_WB: + if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)) + return false; + break; + default: + return false; + } + + /* only 4 levels page-walk length are valid */ + if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4) + return false; + + /* Reserved bits should not be set */ + if (address >> maxphyaddr || ((address >> 7) & 0x1f)) + return false; + + /* AD, if set, should be supported */ + if (address & VMX_EPTP_AD_ENABLE_BIT) { + if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)) + return false; + } + + return true; +} + +static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool ia32e; + + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_apic_access_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (!nested_cpu_has_preemption_timer(vmcs12) && + nested_cpu_has_save_preemption_timer(vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_pml_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, + vmx->nested.msrs.procbased_ctls_low, + vmx->nested.msrs.procbased_ctls_high) || + (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && + !vmx_control_verify(vmcs12->secondary_vm_exec_control, + vmx->nested.msrs.secondary_ctls_low, + vmx->nested.msrs.secondary_ctls_high)) || + !vmx_control_verify(vmcs12->pin_based_vm_exec_control, + vmx->nested.msrs.pinbased_ctls_low, + vmx->nested.msrs.pinbased_ctls_high) || + !vmx_control_verify(vmcs12->vm_exit_controls, + vmx->nested.msrs.exit_ctls_low, + vmx->nested.msrs.exit_ctls_high) || + !vmx_control_verify(vmcs12->vm_entry_controls, + vmx->nested.msrs.entry_ctls_low, + vmx->nested.msrs.entry_ctls_high)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_vmx_check_nmi_controls(vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_cpu_has_vmfunc(vmcs12)) { + if (vmcs12->vm_function_control & + ~vmx->nested.msrs.vmfunc_controls) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_cpu_has_eptp_switching(vmcs12)) { + if (!nested_cpu_has_ept(vmcs12) || + !page_address_valid(vcpu, vmcs12->eptp_list_address)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + } + } + + if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || + !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || + !nested_cr3_valid(vcpu, vmcs12->host_cr3)) + return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + + /* + * If the load IA32_EFER VM-exit control is 1, bits reserved in the + * IA32_EFER MSR must be 0 in the field for that register. In addition, + * the values of the LMA and LME bits in the field must each be that of + * the host address-space size VM-exit control. + */ + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { + ia32e = (vmcs12->vm_exit_controls & + VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; + if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) + return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + } + + /* + * From the Intel SDM, volume 3: + * Fields relevant to VM-entry event injection must be set properly. + * These fields are the VM-entry interruption-information field, the + * VM-entry exception error code, and the VM-entry instruction length. + */ + if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { + u32 intr_info = vmcs12->vm_entry_intr_info_field; + u8 vector = intr_info & INTR_INFO_VECTOR_MASK; + u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; + bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; + bool should_have_error_code; + bool urg = nested_cpu_has2(vmcs12, + SECONDARY_EXEC_UNRESTRICTED_GUEST); + bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; + + /* VM-entry interruption-info field: interruption type */ + if (intr_type == INTR_TYPE_RESERVED || + (intr_type == INTR_TYPE_OTHER_EVENT && + !nested_cpu_supports_monitor_trap_flag(vcpu))) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry interruption-info field: vector */ + if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || + (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || + (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry interruption-info field: deliver error code */ + should_have_error_code = + intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && + x86_exception_has_error_code(vector); + if (has_error_code != should_have_error_code) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry exception error code */ + if (has_error_code && + vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry interruption-info field: reserved bits */ + if (intr_info & INTR_INFO_RESVD_BITS_MASK) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry instruction length */ + switch (intr_type) { + case INTR_TYPE_SOFT_EXCEPTION: + case INTR_TYPE_SOFT_INTR: + case INTR_TYPE_PRIV_SW_EXCEPTION: + if ((vmcs12->vm_entry_instruction_len > 15) || + (vmcs12->vm_entry_instruction_len == 0 && + !nested_cpu_has_zero_length_injection(vcpu))) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + } + } + + if (nested_cpu_has_ept(vmcs12) && + !valid_ept_address(vcpu, vmcs12->ept_pointer)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + return 0; +} + +static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int r; + struct page *page; + struct vmcs12 *shadow; + + if (vmcs12->vmcs_link_pointer == -1ull) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) + return -EINVAL; + + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + if (is_error_page(page)) + return -EINVAL; + + r = 0; + shadow = kmap(page); + if (shadow->hdr.revision_id != VMCS12_REVISION || + shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) + r = -EINVAL; + kunmap(page); + kvm_release_page_clean(page); + return r; +} + +static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 *exit_qual) +{ + bool ia32e; + + *exit_qual = ENTRY_FAIL_DEFAULT; + + if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || + !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) + return 1; + + if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { + *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; + return 1; + } + + /* + * If the load IA32_EFER VM-entry control is 1, the following checks + * are performed on the field for the IA32_EFER MSR: + * - Bits reserved in the IA32_EFER MSR must be 0. + * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of + * the IA-32e mode guest VM-exit control. It must also be identical + * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to + * CR0.PG) is 1. + */ + if (to_vmx(vcpu)->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { + ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; + if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || + ((vmcs12->guest_cr0 & X86_CR0_PG) && + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) + return 1; + } + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && + (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || + (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) + return 1; + + return 0; +} + +static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4; + + if (!nested_early_check) + return 0; + + if (vmx->msr_autoload.host.nr) + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + if (vmx->msr_autoload.guest.nr) + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); + + preempt_disable(); + + vmx_prepare_switch_to_guest(vcpu); + + /* + * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, + * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to + * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e. + * there is no need to preserve other bits or save/restore the field. + */ + vmcs_writel(GUEST_RFLAGS, 0); + + vmcs_writel(HOST_RIP, vmx_early_consistency_check_return); + + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } + + cr4 = cr4_read_shadow(); + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { + vmcs_writel(HOST_CR4, cr4); + vmx->loaded_vmcs->host_state.cr4 = cr4; + } + + vmx->__launched = vmx->loaded_vmcs->launched; + + asm( + /* Set HOST_RSP */ + __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t" + + /* Check if vmlaunch or vmresume is needed */ + "cmpl $0, %c[launched](%0)\n\t" + "jne 1f\n\t" + __ex("vmlaunch") "\n\t" + "jmp 2f\n\t" + "1: " __ex("vmresume") "\n\t" + "2: " + /* Set vmx->fail accordingly */ + "setbe %c[fail](%0)\n\t" + + ".pushsection .rodata\n\t" + ".global vmx_early_consistency_check_return\n\t" + "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t" + ".popsection" + : + : "c"(vmx), "d"((unsigned long)HOST_RSP), + [launched]"i"(offsetof(struct vcpu_vmx, __launched)), + [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)) + : "rax", "cc", "memory" + ); + + vmcs_writel(HOST_RIP, vmx_return); + + preempt_enable(); + + if (vmx->msr_autoload.host.nr) + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + if (vmx->msr_autoload.guest.nr) + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + + if (vmx->fail) { + WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != + VMXERR_ENTRY_INVALID_CONTROL_FIELD); + vmx->fail = 0; + return 1; + } + + /* + * VMExit clears RFLAGS.IF and DR7, even on a consistency check. + */ + local_irq_enable(); + if (hw_breakpoint_active()) + set_debugreg(__this_cpu_read(cpu_dr7), 7); + + /* + * A non-failing VMEntry means we somehow entered guest mode with + * an illegal RIP, and that's just the tip of the iceberg. There + * is no telling what memory has been modified or what state has + * been exposed to unknown code. Hitting this all but guarantees + * a (very critical) hardware issue. + */ + WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & + VMX_EXIT_REASONS_FAILED_VMENTRY)); + + return 0; +} +STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw); + + +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12); + +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct page *page; + u64 hpa; + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + /* + * Translate L1 physical address to host physical + * address for vmcs02. Keep the page pinned, so this + * physical address remains valid. We keep a reference + * to it so we can release it later. + */ + if (vmx->nested.apic_access_page) { /* shouldn't happen */ + kvm_release_page_dirty(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = NULL; + } + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); + /* + * If translation failed, no matter: This feature asks + * to exit when accessing the given address, and if it + * can never be accessed, this feature won't do + * anything anyway. + */ + if (!is_error_page(page)) { + vmx->nested.apic_access_page = page; + hpa = page_to_phys(vmx->nested.apic_access_page); + vmcs_write64(APIC_ACCESS_ADDR, hpa); + } else { + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + } + } + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + if (vmx->nested.virtual_apic_page) { /* shouldn't happen */ + kvm_release_page_dirty(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; + } + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr); + + /* + * If translation failed, VM entry will fail because + * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. + * Failing the vm entry is _not_ what the processor + * does but it's basically the only possibility we + * have. We could still enter the guest if CR8 load + * exits are enabled, CR8 store exits are enabled, and + * virtualize APIC access is disabled; in this case + * the processor would never use the TPR shadow and we + * could simply clear the bit from the execution + * control. But such a configuration is useless, so + * let's keep the code simple. + */ + if (!is_error_page(page)) { + vmx->nested.virtual_apic_page = page; + hpa = page_to_phys(vmx->nested.virtual_apic_page); + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); + } + } + + if (nested_cpu_has_posted_intr(vmcs12)) { + if (vmx->nested.pi_desc_page) { /* shouldn't happen */ + kunmap(vmx->nested.pi_desc_page); + kvm_release_page_dirty(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + } + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr); + if (is_error_page(page)) + return; + vmx->nested.pi_desc_page = page; + vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page); + vmx->nested.pi_desc = + (struct pi_desc *)((void *)vmx->nested.pi_desc + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + vmcs_write64(POSTED_INTR_DESC_ADDR, + page_to_phys(vmx->nested.pi_desc_page) + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + } + if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_USE_MSR_BITMAPS); + else + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_USE_MSR_BITMAPS); +} + +/* + * Intel's VMX Instruction Reference specifies a common set of prerequisites + * for running VMX instructions (except VMXON, whose prerequisites are + * slightly different). It also specifies what exception to inject otherwise. + * Note that many of these exceptions have priority over VM exits, so they + * don't have to be checked again here. + */ +static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) +{ + if (!to_vmx(vcpu)->nested.vmxon) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); + return 0; + } + + return 1; +} + +static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) +{ + u8 rvi = vmx_get_rvi(); + u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + +static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12); + +/* + * If from_vmentry is false, this is being called from state restore (either RSM + * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. ++ * ++ * Returns: ++ * 0 - success, i.e. proceed with actual VMEnter ++ * 1 - consistency check VMExit ++ * -1 - consistency check VMFail + */ +int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + bool evaluate_pending_interrupts; + u32 exit_reason = EXIT_REASON_INVALID_STATE; + u32 exit_qual; + + evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & + (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); + if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) + evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); + + if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (kvm_mpx_supported() && + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + + vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); + + prepare_vmcs02_early(vmx, vmcs12); + + if (from_vmentry) { + nested_get_vmcs12_pages(vcpu); + + if (nested_vmx_check_vmentry_hw(vcpu)) { + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + return -1; + } + + if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + goto vmentry_fail_vmexit; + } + + enter_guest_mode(vcpu); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset += vmcs12->tsc_offset; + + if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) + goto vmentry_fail_vmexit_guest_mode; + + if (from_vmentry) { + exit_reason = EXIT_REASON_MSR_LOAD_FAIL; + exit_qual = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (exit_qual) + goto vmentry_fail_vmexit_guest_mode; + } else { + /* + * The MMU is not initialized to point at the right entities yet and + * "get pages" would need to read data from the guest (i.e. we will + * need to perform gpa to hpa translation). Request a call + * to nested_get_vmcs12_pages before the next VM-entry. The MSRs + * have already been set at vmentry time and should not be reset. + */ + kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + } + + /* + * If L1 had a pending IRQ/NMI until it executed + * VMLAUNCH/VMRESUME which wasn't delivered because it was + * disallowed (e.g. interrupts disabled), L0 needs to + * evaluate if this pending event should cause an exit from L2 + * to L1 or delivered directly to L2 (e.g. In case L1 don't + * intercept EXTERNAL_INTERRUPT). + * + * Usually this would be handled by the processor noticing an + * IRQ/NMI window request, or checking RVI during evaluation of + * pending virtual interrupts. However, this setting was done + * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 + * to perform pending event evaluation by requesting a KVM_REQ_EVENT. + */ + if (unlikely(evaluate_pending_interrupts)) + kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * Note no nested_vmx_succeed or nested_vmx_fail here. At this point + * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet + * returned as far as L1 is concerned. It will only return (and set + * the success flag) when L2 exits (see nested_vmx_vmexit()). + */ + return 0; + + /* + * A failed consistency check that leads to a VMExit during L1's + * VMEnter to L2 is a variation of a normal VMexit, as explained in + * 26.7 "VM-entry failures during or after loading guest state". + */ +vmentry_fail_vmexit_guest_mode: + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + leave_guest_mode(vcpu); + +vmentry_fail_vmexit: + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + + if (!from_vmentry) + return 1; + + load_vmcs12_host_state(vcpu, vmcs12); + vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; + vmcs12->exit_qualification = exit_qual; + if (enable_shadow_vmcs || vmx->nested.hv_evmcs) + vmx->nested.need_vmcs12_sync = true; + return 1; +} + +/* + * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 + * for running an L2 nested guest. + */ +static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); + int ret; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true)) + return 1; + + if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) + return nested_vmx_failInvalid(vcpu); + + vmcs12 = get_vmcs12(vcpu); + + /* + * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact + * that there *is* a valid VMCS pointer, RFLAGS.CF is set + * rather than RFLAGS.ZF, and no error number is stored to the + * VM-instruction error field. + */ + if (vmcs12->hdr.shadow_vmcs) + return nested_vmx_failInvalid(vcpu); + + if (vmx->nested.hv_evmcs) { + copy_enlightened_to_vmcs12(vmx); + /* Enlightened VMCS doesn't have launch state */ + vmcs12->launch_state = !launch; + } else if (enable_shadow_vmcs) { + copy_shadow_to_vmcs12(vmx); + } + + /* + * The nested entry process starts with enforcing various prerequisites + * on vmcs12 as required by the Intel SDM, and act appropriately when + * they fail: As the SDM explains, some conditions should cause the + * instruction to fail, while others will cause the instruction to seem + * to succeed, but return an EXIT_REASON_INVALID_STATE. + * To speed up the normal (success) code path, we should avoid checking + * for misconfigurations which will anyway be caught by the processor + * when using the merged vmcs02. + */ + if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) + return nested_vmx_failValid(vcpu, + VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); + + if (vmcs12->launch_state == launch) + return nested_vmx_failValid(vcpu, + launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS + : VMXERR_VMRESUME_NONLAUNCHED_VMCS); + + ret = check_vmentry_prereqs(vcpu, vmcs12); + if (ret) + return nested_vmx_failValid(vcpu, ret); + + /* + * We're finally done with prerequisite checking, and can start with + * the nested entry. + */ + vmx->nested.nested_run_pending = 1; + ret = nested_vmx_enter_non_root_mode(vcpu, true); + vmx->nested.nested_run_pending = !ret; + if (ret > 0) + return 1; + else if (ret) + return nested_vmx_failValid(vcpu, + VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + /* Hide L1D cache contents from the nested guest. */ + vmx->vcpu.arch.l1tf_flush_l1d = true; + + /* + * Must happen outside of nested_vmx_enter_non_root_mode() as it will + * also be used as part of restoring nVMX state for + * snapshot restore (migration). + * + * In this flow, it is assumed that vmcs12 cache was + * trasferred as part of captured nVMX state and should + * therefore not be read from guest memory (which may not + * exist on destination host yet). + */ + nested_cache_shadow_vmcs12(vcpu, vmcs12); + + /* + * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken + * by event injection, halt vcpu. + */ + if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && + !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) { + vmx->nested.nested_run_pending = 0; + return kvm_vcpu_halt(vcpu); + } + return 1; +} + +/* + * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date + * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK). + * This function returns the new value we should put in vmcs12.guest_cr0. + * It's not enough to just return the vmcs02 GUEST_CR0. Rather, + * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now + * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 + * didn't trap the bit, because if L1 did, so would L0). + * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have + * been modified by L2, and L1 knows it. So just leave the old value of + * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 + * isn't relevant, because if L0 traps this bit it can set it to anything. + * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have + * changed these bits, and therefore they need to be updated, but L0 + * didn't necessarily allow them to be changed in GUEST_CR0 - and rather + * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. + */ +static inline unsigned long +vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + return + /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | + /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | + /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | + vcpu->arch.cr0_guest_owned_bits)); +} + +static inline unsigned long +vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + return + /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | + /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | + /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | + vcpu->arch.cr4_guest_owned_bits)); +} + +static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 idt_vectoring; + unsigned int nr; + + if (vcpu->arch.exception.injected) { + nr = vcpu->arch.exception.nr; + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; + + if (kvm_exception_is_soft(nr)) { + vmcs12->vm_exit_instruction_len = + vcpu->arch.event_exit_inst_len; + idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; + } else + idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; + + if (vcpu->arch.exception.has_error_code) { + idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; + vmcs12->idt_vectoring_error_code = + vcpu->arch.exception.error_code; + } + + vmcs12->idt_vectoring_info_field = idt_vectoring; + } else if (vcpu->arch.nmi_injected) { + vmcs12->idt_vectoring_info_field = + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; + } else if (vcpu->arch.interrupt.injected) { + nr = vcpu->arch.interrupt.nr; + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; + + if (vcpu->arch.interrupt.soft) { + idt_vectoring |= INTR_TYPE_SOFT_INTR; + vmcs12->vm_entry_instruction_len = + vcpu->arch.event_exit_inst_len; + } else + idt_vectoring |= INTR_TYPE_EXT_INTR; + + vmcs12->idt_vectoring_info_field = idt_vectoring; + } +} + + +static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + gfn_t gfn; + + /* + * Don't need to mark the APIC access page dirty; it is never + * written to by the CPU during APIC virtualization. + */ + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } + + if (nested_cpu_has_posted_intr(vmcs12)) { + gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } +} + +static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; + void *vapic_page; + u16 status; + + if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) + return; + + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) + return; + + max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); + if (max_irr != 256) { + vapic_page = kmap(vmx->nested.virtual_apic_page); + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, + vapic_page, &max_irr); + kunmap(vmx->nested.virtual_apic_page); + + status = vmcs_read16(GUEST_INTR_STATUS); + if ((u8)max_irr > ((u8)status & 0xff)) { + status &= ~0xff; + status |= (u8)max_irr; + vmcs_write16(GUEST_INTR_STATUS, status); + } + } + + nested_mark_vmcs12_pages_dirty(vcpu); +} + +static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, + unsigned long exit_qual) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned int nr = vcpu->arch.exception.nr; + u32 intr_info = nr | INTR_INFO_VALID_MASK; + + if (vcpu->arch.exception.has_error_code) { + vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; + intr_info |= INTR_INFO_DELIVER_CODE_MASK; + } + + if (kvm_exception_is_soft(nr)) + intr_info |= INTR_TYPE_SOFT_EXCEPTION; + else + intr_info |= INTR_TYPE_HARD_EXCEPTION; + + if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && + vmx_get_nmi_mask(vcpu)) + intr_info |= INTR_INFO_UNBLOCK_NMI; + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); +} + +static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qual; + bool block_nested_events = + vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); + + if (vcpu->arch.exception.pending && + nested_vmx_check_exception(vcpu, &exit_qual)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_inject_exception_vmexit(vcpu, exit_qual); + return 0; + } + + if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + vmx->nested.preemption_timer_expired) { + if (block_nested_events) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); + return 0; + } + + if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + NMI_VECTOR | INTR_TYPE_NMI_INTR | + INTR_INFO_VALID_MASK, 0); + /* + * The NMI-triggered VM exit counts as injection: + * clear this one and block further NMIs. + */ + vcpu->arch.nmi_pending = 0; + vmx_set_nmi_mask(vcpu, true); + return 0; + } + + if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && + nested_exit_on_intr(vcpu)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + return 0; + } + + vmx_complete_nested_posted_interrupt(vcpu); + return 0; +} + +static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + ktime_t remaining = + hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); + u64 value; + + if (ktime_to_ns(remaining) <= 0) + return 0; + + value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; + do_div(value, 1000000); + return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; +} + +/* + * Update the guest state fields of vmcs12 to reflect changes that + * occurred while L2 was running. (The "IA-32e mode guest" bit of the + * VM-entry controls is also updated, since this is really a guest + * state bit.) + */ +static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); + vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); + + vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); + vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); + + vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); + vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); + vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); + vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); + vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); + vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); + vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); + vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); + vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); + vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); + vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); + vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); + vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); + vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); + vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); + vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); + vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); + vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); + vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); + vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); + vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); + vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); + vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); + vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); + vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); + vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); + vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); + vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); + vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); + vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); + vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); + vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); + vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); + vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); + vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); + vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); + + vmcs12->guest_interruptibility_info = + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + vmcs12->guest_pending_dbg_exceptions = + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; + else + vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; + + if (nested_cpu_has_preemption_timer(vmcs12)) { + if (vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + } + + /* + * In some cases (usually, nested EPT), L2 is allowed to change its + * own CR3 without exiting. If it has changed it, we must keep it. + * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined + * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. + * + * Additionally, restore L2's PDPTR to vmcs12. + */ + if (enable_ept) { + vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); + vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); + vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); + vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); + vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + } + + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + + if (nested_cpu_has_vid(vmcs12)) + vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); + + vmcs12->vm_entry_controls = + (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | + (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); + + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { + kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + } + + /* TODO: These cannot have changed unless we have MSR bitmaps and + * the relevant bit asks not to trap the change */ + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) + vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) + vmcs12->guest_ia32_efer = vcpu->arch.efer; + vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); + vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); + vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + if (kvm_mpx_supported()) + vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); +} + +/* + * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits + * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), + * and this function updates it to reflect the changes to the guest state while + * L2 was running (and perhaps made some exits which were handled directly by L0 + * without going back to L1), and to reflect the exit reason. + * Note that we do not have to copy here all VMCS fields, just those that + * could have changed by the L2 guest or the exit - i.e., the guest-state and + * exit-information fields only. Other fields are modified by L1 with VMWRITE, + * which already writes to vmcs12 directly. + */ +static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 exit_reason, u32 exit_intr_info, + unsigned long exit_qualification) +{ + /* update guest state fields: */ + sync_vmcs12(vcpu, vmcs12); + + /* update exit information fields: */ + + vmcs12->vm_exit_reason = exit_reason; + vmcs12->exit_qualification = exit_qualification; + vmcs12->vm_exit_intr_info = exit_intr_info; + + vmcs12->idt_vectoring_info_field = 0; + vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + + if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + vmcs12->launch_state = 1; + + /* vm_entry_intr_info_field is cleared on exit. Emulate this + * instead of reading the real value. */ + vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; + + /* + * Transfer the event that L0 or L1 may wanted to inject into + * L2 to IDT_VECTORING_INFO_FIELD. + */ + vmcs12_save_pending_event(vcpu, vmcs12); + } + + /* + * Drop what we picked up for L2 via vmx_complete_interrupts. It is + * preserved above and would only end up incorrectly in L1. + */ + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); +} + +/* + * A part of what we need to when the nested L2 guest exits and we want to + * run its L1 parent, is to reset L1's guest state to the host state specified + * in vmcs12. + * This function is to be called not only on normal nested exit, but also on + * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry + * Failures During or After Loading Guest State"). + * This function should be called when the active VMCS is L1's (vmcs01). + */ +static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct kvm_segment seg; + u32 entry_failure_code; + + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) + vcpu->arch.efer = vmcs12->host_ia32_efer; + else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + vcpu->arch.efer |= (EFER_LMA | EFER_LME); + else + vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); + vmx_set_efer(vcpu, vcpu->arch.efer); + + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); + kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); + vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); + vmx_set_interrupt_shadow(vcpu, 0); + + /* + * Note that calling vmx_set_cr0 is important, even if cr0 hasn't + * actually changed, because vmx_set_cr0 refers to efer set above. + * + * CR0_GUEST_HOST_MASK is already set in the original vmcs01 + * (KVM doesn't change it); + */ + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmx_set_cr0(vcpu, vmcs12->host_cr0); + + /* Same as above - no reason to call set_cr4_guest_host_mask(). */ + vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); + vmx_set_cr4(vcpu, vmcs12->host_cr4); + + nested_ept_uninit_mmu_context(vcpu); + + /* + * Only PDPTE load can fail as the value of cr3 was checked on entry and + * couldn't have changed. + */ + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); + + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + + /* + * If vmcs01 doesn't use VPID, CPU flushes TLB on every + * VMEntry/VMExit. Thus, no need to flush TLB. + * + * If vmcs12 doesn't use VPID, L1 expects TLB to be + * flushed on every VMEntry/VMExit. + * + * Otherwise, we can preserve TLB entries as long as we are + * able to tag L1 TLB entries differently than L2 TLB entries. + * + * If vmcs12 uses EPT, we need to execute this flush on EPTP01 + * and therefore we request the TLB flush to happen only after VMCS EPTP + * has been set by KVM_REQ_LOAD_CR3. + */ + if (enable_vpid && + (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); + vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); + vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); + vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); + + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, 0); + + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); + vcpu->arch.pat = vmcs12->host_ia32_pat; + } + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl); + + /* Set L1 segment info according to Intel SDM + 27.5.2 Loading Host Segment and Descriptor-Table Registers */ + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .selector = vmcs12->host_cs_selector, + .type = 11, + .present = 1, + .s = 1, + .g = 1 + }; + if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + seg.l = 1; + else + seg.db = 1; + vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .type = 3, + .present = 1, + .s = 1, + .db = 1, + .g = 1 + }; + seg.selector = vmcs12->host_ds_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + seg.selector = vmcs12->host_es_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + seg.selector = vmcs12->host_ss_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + seg.selector = vmcs12->host_fs_selector; + seg.base = vmcs12->host_fs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + seg.selector = vmcs12->host_gs_selector; + seg.base = vmcs12->host_gs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + seg = (struct kvm_segment) { + .base = vmcs12->host_tr_base, + .limit = 0x67, + .selector = vmcs12->host_tr_selector, + .type = 11, + .present = 1 + }; + vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + + kvm_set_dr(vcpu, 7, 0x400); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_msr_bitmap()) + vmx_update_msr_bitmap(vcpu); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); +} + +static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) +{ + struct shared_msr_entry *efer_msr; + unsigned int i; + + if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) + return vmcs_read64(GUEST_IA32_EFER); + + if (cpu_has_load_ia32_efer()) + return host_efer; + + for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { + if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) + return vmx->msr_autoload.guest.val[i].value; + } + + efer_msr = find_msr_entry(vmx, MSR_EFER); + if (efer_msr) + return efer_msr->data; + + return host_efer; +} + +static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmx_msr_entry g, h; + struct msr_data msr; + gpa_t gpa; + u32 i, j; + + vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); + + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { + /* + * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set + * as vmcs01.GUEST_DR7 contains a userspace defined value + * and vcpu->arch.dr7 is not squirreled away before the + * nested VMENTER (not worth adding a variable in nested_vmx). + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + kvm_set_dr(vcpu, 7, DR7_FIXED_1); + else + WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); + } + + /* + * Note that calling vmx_set_{efer,cr0,cr4} is important as they + * handle a variety of side effects to KVM's software model. + */ + vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); + + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); + + vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); + vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); + + nested_ept_uninit_mmu_context(vcpu); + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + /* + * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs + * from vmcs01 (if necessary). The PDPTRs are not loaded on + * VMFail, like everything else we just need to ensure our + * software model is up-to-date. + */ + ept_save_pdptrs(vcpu); + + kvm_mmu_reset_context(vcpu); + + if (cpu_has_vmx_msr_bitmap()) + vmx_update_msr_bitmap(vcpu); + + /* + * This nasty bit of open coding is a compromise between blindly + * loading L1's MSRs using the exit load lists (incorrect emulation + * of VMFail), leaving the nested VM's MSRs in the software model + * (incorrect behavior) and snapshotting the modified MSRs (too + * expensive since the lists are unbound by hardware). For each + * MSR that was (prematurely) loaded from the nested VMEntry load + * list, reload it from the exit load list if it exists and differs + * from the guest value. The intent is to stuff host state as + * silently as possible, not to fully process the exit load list. + */ + msr.host_initiated = false; + for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { + gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); + if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { + pr_debug_ratelimited( + "%s read MSR index failed (%u, 0x%08llx)\n", + __func__, i, gpa); + goto vmabort; + } + + for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { + gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); + if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { + pr_debug_ratelimited( + "%s read MSR failed (%u, 0x%08llx)\n", + __func__, j, gpa); + goto vmabort; + } + if (h.index != g.index) + continue; + if (h.value == g.value) + break; + + if (nested_vmx_load_msr_check(vcpu, &h)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, j, h.index, h.reserved); + goto vmabort; + } + + msr.index = h.index; + msr.data = h.value; + if (kvm_set_msr(vcpu, &msr)) { + pr_debug_ratelimited( + "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", + __func__, j, h.index, h.value); + goto vmabort; + } + } + } + + return; + +vmabort: + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); +} + +/* + * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 + * and modify vmcs12 to make it see what it would expect to see there if + * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) + */ +void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + u32 exit_intr_info, unsigned long exit_qualification) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + /* trying to cancel vmlaunch/vmresume is a bug */ + WARN_ON_ONCE(vmx->nested.nested_run_pending); + + leave_guest_mode(vcpu); + + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + + if (likely(!vmx->fail)) { + if (exit_reason == -1) + sync_vmcs12(vcpu, vmcs12); + else + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); + + /* + * Must happen outside of sync_vmcs12() as it will + * also be used to capture vmcs12 cache as part of + * capturing nVMX state for snapshot (migration). + * + * Otherwise, this flush will dirty guest memory at a + * point it is already assumed by user-space to be + * immutable. + */ + nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); + + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + } else { + /* + * The only expected VM-instruction error is "VM entry with + * invalid control field(s)." Anything else indicates a + * problem with L0. And we should never get here with a + * VMFail of any type if early consistency checks are enabled. + */ + WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != + VMXERR_ENTRY_INVALID_CONTROL_FIELD); + WARN_ON_ONCE(nested_early_check); + } + + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + + /* Update any VMCS fields that might have changed while L2 ran */ + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); + + if (vmx->nested.change_vmcs01_virtual_apic_mode) { + vmx->nested.change_vmcs01_virtual_apic_mode = false; + vmx_set_virtual_apic_mode(vcpu); + } else if (!nested_cpu_has_ept(vmcs12) && + nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + vmx_flush_tlb(vcpu, true); + } + + /* This is needed for same reason as it was needed in prepare_vmcs02 */ + vmx->host_rsp = 0; + + /* Unpin physical memory we referred to in vmcs02 */ + if (vmx->nested.apic_access_page) { + kvm_release_page_dirty(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = NULL; + } + if (vmx->nested.virtual_apic_page) { + kvm_release_page_dirty(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; + } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + kvm_release_page_dirty(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; + } + + /* + * We are now running in L2, mmu_notifier will force to reload the + * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. + */ + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + + if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) + vmx->nested.need_vmcs12_sync = true; + + /* in case we halted in L2 */ + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + if (likely(!vmx->fail)) { + /* + * TODO: SDM says that with acknowledge interrupt on + * exit, bit 31 of the VM-exit interrupt information + * (valid interrupt) is always set to 1 on + * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't + * need kvm_cpu_has_interrupt(). See the commit + * message for details. + */ + if (nested_exit_intr_ack_set(vcpu) && + exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + kvm_cpu_has_interrupt(vcpu)) { + int irq = kvm_cpu_get_interrupt(vcpu); + WARN_ON(irq < 0); + vmcs12->vm_exit_intr_info = irq | + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + } + + if (exit_reason != -1) + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); + + load_vmcs12_host_state(vcpu, vmcs12); + + return; + } + + /* + * After an early L2 VM-entry failure, we're now back + * in L1 which thinks it just finished a VMLAUNCH or + * VMRESUME instruction, so we need to set the failure + * flag and the VM-instruction error field of the VMCS + * accordingly, and skip the emulated instruction. + */ + (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + /* + * Restore L1's host state to KVM's software model. We're here + * because a consistency check was caught by hardware, which + * means some amount of guest state has been propagated to KVM's + * model and needs to be unwound to the host's state. + */ + nested_vmx_restore_host_state(vcpu); + + vmx->fail = 0; +} + +/* + * Decode the memory-address operand of a vmx instruction, as recorded on an + * exit caused by such an instruction (run by a guest hypervisor). + * On success, returns 0. When the operand is invalid, returns 1 and throws + * #UD or #GP. + */ +int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, + u32 vmx_instruction_info, bool wr, gva_t *ret) +{ + gva_t off; + bool exn; + struct kvm_segment s; + + /* + * According to Vol. 3B, "Information for VM Exits Due to Instruction + * Execution", on an exit, vmx_instruction_info holds most of the + * addressing components of the operand. Only the displacement part + * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). + * For how an actual address is calculated from all these components, + * refer to Vol. 1, "Operand Addressing". + */ + int scaling = vmx_instruction_info & 3; + int addr_size = (vmx_instruction_info >> 7) & 7; + bool is_reg = vmx_instruction_info & (1u << 10); + int seg_reg = (vmx_instruction_info >> 15) & 7; + int index_reg = (vmx_instruction_info >> 18) & 0xf; + bool index_is_valid = !(vmx_instruction_info & (1u << 22)); + int base_reg = (vmx_instruction_info >> 23) & 0xf; + bool base_is_valid = !(vmx_instruction_info & (1u << 27)); + + if (is_reg) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* Addr = segment_base + offset */ + /* offset = base + [index * scale] + displacement */ + off = exit_qualification; /* holds the displacement */ + if (base_is_valid) + off += kvm_register_read(vcpu, base_reg); + if (index_is_valid) + off += kvm_register_read(vcpu, index_reg)< s.limit); + } + if (exn) { + kvm_queue_exception_e(vcpu, + seg_reg == VCPU_SREG_SS ? + SS_VECTOR : GP_VECTOR, + 0); + return 1; + } + + return 0; +} + +static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) +{ + gva_t gva; + struct x86_exception e; + + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) + return 1; + + if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + return 0; +} + +/* + * Allocate a shadow VMCS and associate it with the currently loaded + * VMCS, unless such a shadow VMCS already exists. The newly allocated + * VMCS is also VMCLEARed, so that it is ready for use. + */ +static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; + + /* + * We should allocate a shadow vmcs for vmcs01 only when L1 + * executes VMXON and free it when L1 executes VMXOFF. + * As it is invalid to execute VMXON twice, we shouldn't reach + * here when vmcs01 already have an allocated shadow vmcs. + */ + WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); + + if (!loaded_vmcs->shadow_vmcs) { + loaded_vmcs->shadow_vmcs = alloc_vmcs(true); + if (loaded_vmcs->shadow_vmcs) + vmcs_clear(loaded_vmcs->shadow_vmcs); + } + return loaded_vmcs->shadow_vmcs; +} + +static int enter_vmx_operation(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int r; + + r = alloc_loaded_vmcs(&vmx->nested.vmcs02); + if (r < 0) + goto out_vmcs02; + + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_vmcs12) + goto out_cached_vmcs12; + + vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_shadow_vmcs12) + goto out_cached_shadow_vmcs12; + + if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) + goto out_shadow_vmcs; + + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + + vmx->nested.vpid02 = allocate_vpid(); + + vmx->nested.vmcs02_initialized = false; + vmx->nested.vmxon = true; + return 0; + +out_shadow_vmcs: + kfree(vmx->nested.cached_shadow_vmcs12); + +out_cached_shadow_vmcs12: + kfree(vmx->nested.cached_vmcs12); + +out_cached_vmcs12: + free_loaded_vmcs(&vmx->nested.vmcs02); + +out_vmcs02: + return -ENOMEM; +} + +/* + * Emulate the VMXON instruction. + * Currently, we just remember that VMX is active, and do not save or even + * inspect the argument to VMXON (the so-called "VMXON pointer") because we + * do not currently need to store anything in that guest-allocated memory + * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their + * argument is different from the VMXON pointer (which the spec says they do). + */ +static int handle_vmon(struct kvm_vcpu *vcpu) +{ + int ret; + gpa_t vmptr; + struct page *page; + struct vcpu_vmx *vmx = to_vmx(vcpu); + const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED + | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + + /* + * The Intel VMX Instruction Reference lists a bunch of bits that are + * prerequisite to running VMXON, most notably cr4.VMXE must be set to + * 1 (see vmx_set_cr4() for when we allow the guest to set this). + * Otherwise, we should fail with #UD. But most faulting conditions + * have already been checked by hardware, prior to the VM-exit for + * VMXON. We do test guest cr4.VMXE because processor CR4 always has + * that bit set to 1 in non-root mode. + */ + if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* CPL=0 must be checked manually. */ + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (vmx->nested.vmxon) + return nested_vmx_failValid(vcpu, + VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (nested_vmx_get_vmptr(vcpu, &vmptr)) + return 1; + + /* + * SDM 3: 24.11.5 + * The first 4 bytes of VMXON region contain the supported + * VMCS revision identifier + * + * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; + * which replaces physical address width with 32 + */ + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + return nested_vmx_failInvalid(vcpu); + + page = kvm_vcpu_gpa_to_page(vcpu, vmptr); + if (is_error_page(page)) + return nested_vmx_failInvalid(vcpu); + + if (*(u32 *)kmap(page) != VMCS12_REVISION) { + kunmap(page); + kvm_release_page_clean(page); + return nested_vmx_failInvalid(vcpu); + } + kunmap(page); + kvm_release_page_clean(page); + + vmx->nested.vmxon_ptr = vmptr; + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; + + return nested_vmx_succeed(vcpu); +} + +static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->nested.current_vmptr == -1ull) + return; + + if (enable_shadow_vmcs) { + /* copy to memory all shadowed fields in case + they were modified */ + copy_shadow_to_vmcs12(vmx); + vmx->nested.need_vmcs12_sync = false; + vmx_disable_shadow_vmcs(vmx); + } + vmx->nested.posted_intr_nv = -1; + + /* Flush VMCS12 to guest memory */ + kvm_vcpu_write_guest_page(vcpu, + vmx->nested.current_vmptr >> PAGE_SHIFT, + vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); + + kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + + vmx->nested.current_vmptr = -1ull; +} + +/* Emulate the VMXOFF instruction */ +static int handle_vmoff(struct kvm_vcpu *vcpu) +{ + if (!nested_vmx_check_permission(vcpu)) + return 1; + free_nested(vcpu); + return nested_vmx_succeed(vcpu); +} + +/* Emulate the VMCLEAR instruction */ +static int handle_vmclear(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 zero = 0; + gpa_t vmptr; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (nested_vmx_get_vmptr(vcpu, &vmptr)) + return 1; + + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + return nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_INVALID_ADDRESS); + + if (vmptr == vmx->nested.vmxon_ptr) + return nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); + + if (vmx->nested.hv_evmcs_page) { + if (vmptr == vmx->nested.hv_evmcs_vmptr) + nested_release_evmcs(vcpu); + } else { + if (vmptr == vmx->nested.current_vmptr) + nested_release_vmcs12(vcpu); + + kvm_vcpu_write_guest(vcpu, + vmptr + offsetof(struct vmcs12, + launch_state), + &zero, sizeof(zero)); + } + + return nested_vmx_succeed(vcpu); +} + +static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); + +/* Emulate the VMLAUNCH instruction */ +static int handle_vmlaunch(struct kvm_vcpu *vcpu) +{ + return nested_vmx_run(vcpu, true); +} + +/* Emulate the VMRESUME instruction */ +static int handle_vmresume(struct kvm_vcpu *vcpu) +{ + + return nested_vmx_run(vcpu, false); +} + +static int handle_vmread(struct kvm_vcpu *vcpu) +{ + unsigned long field; + u64 field_value; + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + gva_t gva = 0; + struct vmcs12 *vmcs12; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (to_vmx(vcpu)->nested.current_vmptr == -1ull) + return nested_vmx_failInvalid(vcpu); + + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMREAD + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) + return nested_vmx_failInvalid(vcpu); + vmcs12 = get_shadow_vmcs12(vcpu); + } + + /* Decode instruction info and find the field to read */ + field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + /* Read the field, zero-extended to a u64 field_value */ + if (vmcs12_read_any(vmcs12, field, &field_value) < 0) + return nested_vmx_failValid(vcpu, + VMXERR_UNSUPPORTED_VMCS_COMPONENT); + + /* + * Now copy part of this value to register or memory, as requested. + * Note that the number of bits actually copied is 32 or 64 depending + * on the guest's mode (32 or 64 bit), not on the given field's length. + */ + if (vmx_instruction_info & (1u << 10)) { + kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), + field_value); + } else { + if (get_vmx_mem_address(vcpu, exit_qualification, + vmx_instruction_info, true, &gva)) + return 1; + /* _system ok, nested_vmx_check_permission has verified cpl=0 */ + kvm_write_guest_virt_system(vcpu, gva, &field_value, + (is_long_mode(vcpu) ? 8 : 4), NULL); + } + + return nested_vmx_succeed(vcpu); +} + + +static int handle_vmwrite(struct kvm_vcpu *vcpu) +{ + unsigned long field; + gva_t gva; + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + + /* The value to write might be 32 or 64 bits, depending on L1's long + * mode, and eventually we need to write that into a field of several + * possible lengths. The code below first zero-extends the value to 64 + * bit (field_value), and then copies only the appropriate number of + * bits into the vmcs12 field. + */ + u64 field_value = 0; + struct x86_exception e; + struct vmcs12 *vmcs12; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (vmx->nested.current_vmptr == -1ull) + return nested_vmx_failInvalid(vcpu); + + if (vmx_instruction_info & (1u << 10)) + field_value = kvm_register_readl(vcpu, + (((vmx_instruction_info) >> 3) & 0xf)); + else { + if (get_vmx_mem_address(vcpu, exit_qualification, + vmx_instruction_info, false, &gva)) + return 1; + if (kvm_read_guest_virt(vcpu, gva, &field_value, + (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + } + + + field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + /* + * If the vCPU supports "VMWRITE to any supported field in the + * VMCS," then the "read-only" fields are actually read/write. + */ + if (vmcs_field_readonly(field) && + !nested_cpu_has_vmwrite_any_field(vcpu)) + return nested_vmx_failValid(vcpu, + VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); + + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) + return nested_vmx_failInvalid(vcpu); + vmcs12 = get_shadow_vmcs12(vcpu); + } + + if (vmcs12_write_any(vmcs12, field, field_value) < 0) + return nested_vmx_failValid(vcpu, + VMXERR_UNSUPPORTED_VMCS_COMPONENT); + + /* + * Do not track vmcs12 dirty-state if in guest-mode + * as we actually dirty shadow vmcs12 instead of vmcs12. + */ + if (!is_guest_mode(vcpu)) { + switch (field) { +#define SHADOW_FIELD_RW(x) case x: +#include "vmcs_shadow_fields.h" + /* + * The fields that can be updated by L1 without a vmexit are + * always updated in the vmcs02, the others go down the slow + * path of prepare_vmcs02. + */ + break; + default: + vmx->nested.dirty_vmcs12 = true; + break; + } + } + + return nested_vmx_succeed(vcpu); +} + +static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) +{ + vmx->nested.current_vmptr = vmptr; + if (enable_shadow_vmcs) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, + __pa(vmx->vmcs01.shadow_vmcs)); + vmx->nested.need_vmcs12_sync = true; + } + vmx->nested.dirty_vmcs12 = true; +} + +/* Emulate the VMPTRLD instruction */ +static int handle_vmptrld(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + gpa_t vmptr; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (nested_vmx_get_vmptr(vcpu, &vmptr)) + return 1; + + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + return nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INVALID_ADDRESS); + + if (vmptr == vmx->nested.vmxon_ptr) + return nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_VMXON_POINTER); + + /* Forbid normal VMPTRLD if Enlightened version was used */ + if (vmx->nested.hv_evmcs) + return 1; + + if (vmx->nested.current_vmptr != vmptr) { + struct vmcs12 *new_vmcs12; + struct page *page; + + page = kvm_vcpu_gpa_to_page(vcpu, vmptr); + if (is_error_page(page)) { + /* + * Reads from an unbacked page return all 1s, + * which means that the 32 bits located at the + * given physical address won't match the required + * VMCS12_REVISION identifier. + */ + nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + return kvm_skip_emulated_instruction(vcpu); + } + new_vmcs12 = kmap(page); + if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || + (new_vmcs12->hdr.shadow_vmcs && + !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { + kunmap(page); + kvm_release_page_clean(page); + return nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } + + nested_release_vmcs12(vcpu); + + /* + * Load VMCS12 from guest memory since it is not already + * cached. + */ + memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); + kunmap(page); + kvm_release_page_clean(page); + + set_current_vmptr(vmx, vmptr); + } + + return nested_vmx_succeed(vcpu); +} + +/* Emulate the VMPTRST instruction */ +static int handle_vmptrst(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; + struct x86_exception e; + gva_t gva; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) + return 1; + + if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) + return 1; + /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ + if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, + sizeof(gpa_t), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + return nested_vmx_succeed(vcpu); +} + +/* Emulate the INVEPT instruction */ +static int handle_invept(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmx_instruction_info, types; + unsigned long type; + gva_t gva; + struct x86_exception e; + struct { + u64 eptp, gpa; + } operand; + + if (!(vmx->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_ENABLE_EPT) || + !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + + if (type >= 32 || !(types & (1 << type))) + return nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + + /* According to the Intel VMX instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_EPT_EXTENT_GLOBAL: + /* + * TODO: track mappings and invalidate + * single context requests appropriately + */ + case VMX_EPT_EXTENT_CONTEXT: + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + break; + default: + BUG_ON(1); + break; + } + + return nested_vmx_succeed(vcpu); +} + +static int handle_invvpid(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmx_instruction_info; + unsigned long type, types; + gva_t gva; + struct x86_exception e; + struct { + u64 vpid; + u64 gla; + } operand; + u16 vpid02; + + if (!(vmx->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_ENABLE_VPID) || + !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (vmx->nested.msrs.vpid_caps & + VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; + + if (type >= 32 || !(types & (1 << type))) + return nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + + /* according to the intel vmx instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + if (operand.vpid >> 16) + return nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + + vpid02 = nested_get_vpid02(vcpu); + switch (type) { + case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: + if (!operand.vpid || + is_noncanonical_address(operand.gla, vcpu)) + return nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + if (cpu_has_vmx_invvpid_individual_addr()) { + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, + vpid02, operand.gla); + } else + __vmx_flush_tlb(vcpu, vpid02, false); + break; + case VMX_VPID_EXTENT_SINGLE_CONTEXT: + case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: + if (!operand.vpid) + return nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + __vmx_flush_tlb(vcpu, vpid02, false); + break; + case VMX_VPID_EXTENT_ALL_CONTEXT: + __vmx_flush_tlb(vcpu, vpid02, false); + break; + default: + WARN_ON_ONCE(1); + return kvm_skip_emulated_instruction(vcpu); + } + + return nested_vmx_succeed(vcpu); +} + +static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; + u64 address; + bool accessed_dirty; + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + if (!nested_cpu_has_eptp_switching(vmcs12) || + !nested_cpu_has_ept(vmcs12)) + return 1; + + if (index >= VMFUNC_EPTP_ENTRIES) + return 1; + + + if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, + &address, index * 8, 8)) + return 1; + + accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); + + /* + * If the (L2) guest does a vmfunc to the currently + * active ept pointer, we don't have to do anything else + */ + if (vmcs12->ept_pointer != address) { + if (!valid_ept_address(vcpu, address)) + return 1; + + kvm_mmu_unload(vcpu); + mmu->ept_ad = accessed_dirty; + mmu->mmu_role.base.ad_disabled = !accessed_dirty; + vmcs12->ept_pointer = address; + /* + * TODO: Check what's the correct approach in case + * mmu reload fails. Currently, we just let the next + * reload potentially fail + */ + kvm_mmu_reload(vcpu); + } + + return 0; +} + +static int handle_vmfunc(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12; + u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; + + /* + * VMFUNC is only supported for nested guests, but we always enable the + * secondary control for simplicity; for non-nested mode, fake that we + * didn't by injecting #UD. + */ + if (!is_guest_mode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmcs12 = get_vmcs12(vcpu); + if ((vmcs12->vm_function_control & (1 << function)) == 0) + goto fail; + + switch (function) { + case 0: + if (nested_vmx_eptp_switching(vcpu, vmcs12)) + goto fail; + break; + default: + goto fail; + } + return kvm_skip_emulated_instruction(vcpu); + +fail: + nested_vmx_vmexit(vcpu, vmx->exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); + return 1; +} + + +static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + unsigned long exit_qualification; + gpa_t bitmap, last_bitmap; + unsigned int port; + int size; + u8 b; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + port = exit_qualification >> 16; + size = (exit_qualification & 7) + 1; + + last_bitmap = (gpa_t)-1; + b = -1; + + while (size > 0) { + if (port < 0x8000) + bitmap = vmcs12->io_bitmap_a; + else if (port < 0x10000) + bitmap = vmcs12->io_bitmap_b; + else + return true; + bitmap += (port & 0x7fff) / 8; + + if (last_bitmap != bitmap) + if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) + return true; + if (b & (1 << (port & 7))) + return true; + + port++; + size--; + last_bitmap = bitmap; + } + + return false; +} + +/* + * Return 1 if we should exit from L2 to L1 to handle an MSR access access, + * rather than handle it ourselves in L0. I.e., check whether L1 expressed + * disinterest in the current event (read or write a specific MSR) by using an + * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. + */ +static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, u32 exit_reason) +{ + u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; + gpa_t bitmap; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return true; + + /* + * The MSR_BITMAP page is divided into four 1024-byte bitmaps, + * for the four combinations of read/write and low/high MSR numbers. + * First we need to figure out which of the four to use: + */ + bitmap = vmcs12->msr_bitmap; + if (exit_reason == EXIT_REASON_MSR_WRITE) + bitmap += 2048; + if (msr_index >= 0xc0000000) { + msr_index -= 0xc0000000; + bitmap += 1024; + } + + /* Then read the msr_index'th bit from this bitmap: */ + if (msr_index < 1024*8) { + unsigned char b; + if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) + return true; + return 1 & (b >> (msr_index & 7)); + } else + return true; /* let L1 handle the wrong parameter */ +} + +/* + * Return 1 if we should exit from L2 to L1 to handle a CR access exit, + * rather than handle it ourselves in L0. I.e., check if L1 wanted to + * intercept (via guest_host_mask etc.) the current event. + */ +static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + int cr = exit_qualification & 15; + int reg; + unsigned long val; + + switch ((exit_qualification >> 4) & 3) { + case 0: /* mov to cr */ + reg = (exit_qualification >> 8) & 15; + val = kvm_register_readl(vcpu, reg); + switch (cr) { + case 0: + if (vmcs12->cr0_guest_host_mask & + (val ^ vmcs12->cr0_read_shadow)) + return true; + break; + case 3: + if ((vmcs12->cr3_target_count >= 1 && + vmcs12->cr3_target_value0 == val) || + (vmcs12->cr3_target_count >= 2 && + vmcs12->cr3_target_value1 == val) || + (vmcs12->cr3_target_count >= 3 && + vmcs12->cr3_target_value2 == val) || + (vmcs12->cr3_target_count >= 4 && + vmcs12->cr3_target_value3 == val)) + return false; + if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) + return true; + break; + case 4: + if (vmcs12->cr4_guest_host_mask & + (vmcs12->cr4_read_shadow ^ val)) + return true; + break; + case 8: + if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) + return true; + break; + } + break; + case 2: /* clts */ + if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && + (vmcs12->cr0_read_shadow & X86_CR0_TS)) + return true; + break; + case 1: /* mov from cr */ + switch (cr) { + case 3: + if (vmcs12->cpu_based_vm_exec_control & + CPU_BASED_CR3_STORE_EXITING) + return true; + break; + case 8: + if (vmcs12->cpu_based_vm_exec_control & + CPU_BASED_CR8_STORE_EXITING) + return true; + break; + } + break; + case 3: /* lmsw */ + /* + * lmsw can change bits 1..3 of cr0, and only set bit 0 of + * cr0. Other attempted changes are ignored, with no exit. + */ + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + if (vmcs12->cr0_guest_host_mask & 0xe & + (val ^ vmcs12->cr0_read_shadow)) + return true; + if ((vmcs12->cr0_guest_host_mask & 0x1) && + !(vmcs12->cr0_read_shadow & 0x1) && + (val & 0x1)) + return true; + break; + } + return false; +} + +static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, gpa_t bitmap) +{ + u32 vmx_instruction_info; + unsigned long field; + u8 b; + + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return true; + + /* Decode instruction info and find the field to access */ + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + + /* Out-of-range fields always cause a VM exit from L2 to L1 */ + if (field >> 15) + return true; + + if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) + return true; + + return 1 & (b >> (field & 7)); +} + +/* + * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we + * should handle it ourselves in L0 (and then continue L2). Only call this + * when in is_guest_mode (L2). + */ +bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) +{ + u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (vmx->nested.nested_run_pending) + return false; + + if (unlikely(vmx->fail)) { + pr_info_ratelimited("%s failed vm entry %x\n", __func__, + vmcs_read32(VM_INSTRUCTION_ERROR)); + return true; + } + + /* + * The host physical addresses of some pages of guest memory + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC + * Page). The CPU may write to these pages via their host + * physical address while L2 is running, bypassing any + * address-translation-based dirty tracking (e.g. EPT write + * protection). + * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. + */ + nested_mark_vmcs12_pages_dirty(vcpu); + + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, + vmcs_readl(EXIT_QUALIFICATION), + vmx->idt_vectoring_info, + intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); + + switch (exit_reason) { + case EXIT_REASON_EXCEPTION_NMI: + if (is_nmi(intr_info)) + return false; + else if (is_page_fault(intr_info)) + return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; + else if (is_debug(intr_info) && + vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return false; + else if (is_breakpoint(intr_info) && + vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + return false; + return vmcs12->exception_bitmap & + (1u << (intr_info & INTR_INFO_VECTOR_MASK)); + case EXIT_REASON_EXTERNAL_INTERRUPT: + return false; + case EXIT_REASON_TRIPLE_FAULT: + return true; + case EXIT_REASON_PENDING_INTERRUPT: + return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); + case EXIT_REASON_NMI_WINDOW: + return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); + case EXIT_REASON_TASK_SWITCH: + return true; + case EXIT_REASON_CPUID: + return true; + case EXIT_REASON_HLT: + return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); + case EXIT_REASON_INVD: + return true; + case EXIT_REASON_INVLPG: + return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); + case EXIT_REASON_RDPMC: + return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); + case EXIT_REASON_RDRAND: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); + case EXIT_REASON_RDSEED: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); + case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: + return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); + case EXIT_REASON_VMREAD: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmread_bitmap); + case EXIT_REASON_VMWRITE: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmwrite_bitmap); + case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: + case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: + case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: + case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: + /* + * VMX instructions trap unconditionally. This allows L1 to + * emulate them for its L2 guest, i.e., allows 3-level nesting! + */ + return true; + case EXIT_REASON_CR_ACCESS: + return nested_vmx_exit_handled_cr(vcpu, vmcs12); + case EXIT_REASON_DR_ACCESS: + return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); + case EXIT_REASON_IO_INSTRUCTION: + return nested_vmx_exit_handled_io(vcpu, vmcs12); + case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); + case EXIT_REASON_MSR_READ: + case EXIT_REASON_MSR_WRITE: + return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); + case EXIT_REASON_INVALID_STATE: + return true; + case EXIT_REASON_MWAIT_INSTRUCTION: + return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); + case EXIT_REASON_MONITOR_TRAP_FLAG: + return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); + case EXIT_REASON_MONITOR_INSTRUCTION: + return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); + case EXIT_REASON_PAUSE_INSTRUCTION: + return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || + nested_cpu_has2(vmcs12, + SECONDARY_EXEC_PAUSE_LOOP_EXITING); + case EXIT_REASON_MCE_DURING_VMENTRY: + return false; + case EXIT_REASON_TPR_BELOW_THRESHOLD: + return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); + case EXIT_REASON_APIC_ACCESS: + case EXIT_REASON_APIC_WRITE: + case EXIT_REASON_EOI_INDUCED: + /* + * The controls for "virtualize APIC accesses," "APIC- + * register virtualization," and "virtual-interrupt + * delivery" only come from vmcs12. + */ + return true; + case EXIT_REASON_EPT_VIOLATION: + /* + * L0 always deals with the EPT violation. If nested EPT is + * used, and the nested mmu code discovers that the address is + * missing in the guest EPT table (EPT12), the EPT violation + * will be injected with nested_ept_inject_page_fault() + */ + return false; + case EXIT_REASON_EPT_MISCONFIG: + /* + * L2 never uses directly L1's EPT, but rather L0's own EPT + * table (shadow on EPT) or a merged EPT table that L0 built + * (EPT on EPT). So any problems with the structure of the + * table is L0's fault. + */ + return false; + case EXIT_REASON_INVPCID: + return + nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && + nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); + case EXIT_REASON_WBINVD: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); + case EXIT_REASON_XSETBV: + return true; + case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: + /* + * This should never happen, since it is not possible to + * set XSS to a non-zero value---neither in L1 nor in L2. + * If if it were, XSS would have to be checked against + * the XSS exit bitmap in vmcs12. + */ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + case EXIT_REASON_PREEMPTION_TIMER: + return false; + case EXIT_REASON_PML_FULL: + /* We emulate PML support to L1. */ + return false; + case EXIT_REASON_VMFUNC: + /* VM functions are emulated through L2->L0 vmexits. */ + return false; + case EXIT_REASON_ENCLS: + /* SGX is never exposed to L1 */ + return false; + default: + return true; + } +} + + +static int vmx_get_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + u32 user_data_size) +{ + struct vcpu_vmx *vmx; + struct vmcs12 *vmcs12; + struct kvm_nested_state kvm_state = { + .flags = 0, + .format = 0, + .size = sizeof(kvm_state), + .vmx.vmxon_pa = -1ull, + .vmx.vmcs_pa = -1ull, + }; + + if (!vcpu) + return kvm_state.size + 2 * VMCS12_SIZE; + + vmx = to_vmx(vcpu); + vmcs12 = get_vmcs12(vcpu); + + if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled) + kvm_state.flags |= KVM_STATE_NESTED_EVMCS; + + if (nested_vmx_allowed(vcpu) && + (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { + kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; + kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; + + if (vmx_has_valid_vmcs12(vcpu)) { + kvm_state.size += VMCS12_SIZE; + + if (is_guest_mode(vcpu) && + nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) + kvm_state.size += VMCS12_SIZE; + } + + if (vmx->nested.smm.vmxon) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; + + if (vmx->nested.smm.guest_mode) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; + + if (is_guest_mode(vcpu)) { + kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; + + if (vmx->nested.nested_run_pending) + kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + } + } + + if (user_data_size < kvm_state.size) + goto out; + + if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) + return -EFAULT; + + if (!vmx_has_valid_vmcs12(vcpu)) + goto out; + + /* + * When running L2, the authoritative vmcs12 state is in the + * vmcs02. When running L1, the authoritative vmcs12 state is + * in the shadow or enlightened vmcs linked to vmcs01, unless + * need_vmcs12_sync is set, in which case, the authoritative + * vmcs12 state is in the vmcs12 already. + */ + if (is_guest_mode(vcpu)) { + sync_vmcs12(vcpu, vmcs12); + } else if (!vmx->nested.need_vmcs12_sync) { + if (vmx->nested.hv_evmcs) + copy_enlightened_to_vmcs12(vmx); + else if (enable_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + } + + if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) + return -EFAULT; + + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, + get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) + return -EFAULT; + } + +out: + return kvm_state.size; +} + +/* + * Forcibly leave nested mode in order to be able to reset the VCPU later on. + */ +void vmx_leave_nested(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.nested_run_pending = 0; + nested_vmx_vmexit(vcpu, -1, 0, 0); + } + free_nested(vcpu); +} + +static int vmx_set_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12; + u32 exit_qual; + int ret; + + if (kvm_state->format != 0) + return -EINVAL; + + if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) + nested_enable_evmcs(vcpu, NULL); + + if (!nested_vmx_allowed(vcpu)) + return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL; + + if (kvm_state->vmx.vmxon_pa == -1ull) { + if (kvm_state->vmx.smm.flags) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa != -1ull) + return -EINVAL; + + vmx_leave_nested(vcpu); + return 0; + } + + if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + if (kvm_state->vmx.smm.flags & + ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + /* + * SMM temporarily disables VMX, so we cannot be in guest mode, + * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags + * must be zero. + */ + if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + vmx_leave_nested(vcpu); + if (kvm_state->vmx.vmxon_pa == -1ull) + return 0; + + vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; + + /* Empty 'VMXON' state is permitted */ + if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) + return 0; + + if (kvm_state->vmx.vmcs_pa != -1ull) { + if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || + !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) + return -EINVAL; + + set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); + } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { + /* + * Sync eVMCS upon entry as we may not have + * HV_X64_MSR_VP_ASSIST_PAGE set up yet. + */ + vmx->nested.need_vmcs12_sync = true; + } else { + return -EINVAL; + } + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { + vmx->nested.smm.vmxon = true; + vmx->nested.vmxon = false; + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) + vmx->nested.smm.guest_mode = true; + } + + vmcs12 = get_vmcs12(vcpu); + if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) + return -EFAULT; + + if (vmcs12->hdr.revision_id != VMCS12_REVISION) + return -EINVAL; + + if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return 0; + + vmx->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); + + if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) + return -EINVAL; + + if (copy_from_user(shadow_vmcs12, + user_kvm_nested_state->data + VMCS12_SIZE, + sizeof(*vmcs12))) + return -EFAULT; + + if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || + !shadow_vmcs12->hdr.shadow_vmcs) + return -EINVAL; + } + + if (check_vmentry_prereqs(vcpu, vmcs12) || + check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + return -EINVAL; + + vmx->nested.dirty_vmcs12 = true; + ret = nested_vmx_enter_non_root_mode(vcpu, false); + if (ret) + return -EINVAL; + + return 0; +} + +void nested_vmx_vcpu_setup(void) +{ + if (enable_shadow_vmcs) { + /* + * At vCPU creation, "VMWRITE to any supported field + * in the VMCS" is supported, so use the more + * permissive vmx_vmread_bitmap to specify both read + * and write permissions for the shadow VMCS. + */ + vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap)); + } +} + +/* + * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be + * returned for the various VMX controls MSRs when nested VMX is enabled. + * The same values should also be used to verify that vmcs12 control fields are + * valid during nested entry from L1 to L2. + * Each of these control msrs has a low and high 32-bit half: A low bit is on + * if the corresponding bit in the (32-bit) control field *must* be on, and a + * bit in the high half is on if the corresponding bit in the control field + * may be on. See also vmx_control_verify(). + */ +void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, + bool apicv) +{ + /* + * Note that as a general rule, the high half of the MSRs (bits in + * the control fields which may be 1) should be initialized by the + * intersection of the underlying hardware's MSR (i.e., features which + * can be supported) and the list of features we want to expose - + * because they are known to be properly supported in our code. + * Also, usually, the low half of the MSRs (bits which must be 1) can + * be set to 0, meaning that L1 may turn off any of these bits. The + * reason is that if one of these bits is necessary, it will appear + * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control + * fields of vmcs01 and vmcs02, will turn these bits off - and + * nested_vmx_exit_reflected() will not pass related exits to L1. + * These rules have exceptions below. + */ + + /* pin-based controls */ + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, + msrs->pinbased_ctls_low, + msrs->pinbased_ctls_high); + msrs->pinbased_ctls_low |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + msrs->pinbased_ctls_high &= + PIN_BASED_EXT_INTR_MASK | + PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS | + (apicv ? PIN_BASED_POSTED_INTR : 0); + msrs->pinbased_ctls_high |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + PIN_BASED_VMX_PREEMPTION_TIMER; + + /* exit controls */ + rdmsr(MSR_IA32_VMX_EXIT_CTLS, + msrs->exit_ctls_low, + msrs->exit_ctls_high); + msrs->exit_ctls_low = + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + + msrs->exit_ctls_high &= +#ifdef CONFIG_X86_64 + VM_EXIT_HOST_ADDR_SPACE_SIZE | +#endif + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + msrs->exit_ctls_high |= + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; + + /* We support free control of debug control saving. */ + msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; + + /* entry controls */ + rdmsr(MSR_IA32_VMX_ENTRY_CTLS, + msrs->entry_ctls_low, + msrs->entry_ctls_high); + msrs->entry_ctls_low = + VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + msrs->entry_ctls_high &= +#ifdef CONFIG_X86_64 + VM_ENTRY_IA32E_MODE | +#endif + VM_ENTRY_LOAD_IA32_PAT; + msrs->entry_ctls_high |= + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); + + /* We support free control of debug control loading. */ + msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; + + /* cpu-based controls */ + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, + msrs->procbased_ctls_low, + msrs->procbased_ctls_high); + msrs->procbased_ctls_low = + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + msrs->procbased_ctls_high &= + CPU_BASED_VIRTUAL_INTR_PENDING | + CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | + CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | +#ifdef CONFIG_X86_64 + CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | +#endif + CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | + CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | + CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | + CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | + CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + /* + * We can allow some features even when not supported by the + * hardware. For example, L1 can specify an MSR bitmap - and we + * can use it to avoid exits to L1 - even when L0 runs L2 + * without MSR bitmaps. + */ + msrs->procbased_ctls_high |= + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + CPU_BASED_USE_MSR_BITMAPS; + + /* We support free control of CR3 access interception. */ + msrs->procbased_ctls_low &= + ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); + + /* + * secondary cpu-based controls. Do not include those that + * depend on CPUID bits, they are added later by vmx_cpuid_update. + */ + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, + msrs->secondary_ctls_low, + msrs->secondary_ctls_high); + msrs->secondary_ctls_low = 0; + msrs->secondary_ctls_high &= + SECONDARY_EXEC_DESC | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_WBINVD_EXITING; + + /* + * We can emulate "VMCS shadowing," even if the hardware + * doesn't support it. + */ + msrs->secondary_ctls_high |= + SECONDARY_EXEC_SHADOW_VMCS; + + if (enable_ept) { + /* nested EPT: emulate EPT also to L1 */ + msrs->secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_EPT; + msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; + if (cpu_has_vmx_ept_execute_only()) + msrs->ept_caps |= + VMX_EPT_EXECUTE_ONLY_BIT; + msrs->ept_caps &= ept_caps; + msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | + VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | + VMX_EPT_1GB_PAGE_BIT; + if (enable_ept_ad_bits) { + msrs->secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_PML; + msrs->ept_caps |= VMX_EPT_AD_BIT; + } + } + + if (cpu_has_vmx_vmfunc()) { + msrs->secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_VMFUNC; + /* + * Advertise EPTP switching unconditionally + * since we emulate it + */ + if (enable_ept) + msrs->vmfunc_controls = + VMX_VMFUNC_EPTP_SWITCHING; + } + + /* + * Old versions of KVM use the single-context version without + * checking for support, so declare that it is supported even + * though it is treated as global context. The alternative is + * not failing the single-context invvpid, and it is worse. + */ + if (enable_vpid) { + msrs->secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_VPID; + msrs->vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_SUPPORTED_MASK; + } + + if (enable_unrestricted_guest) + msrs->secondary_ctls_high |= + SECONDARY_EXEC_UNRESTRICTED_GUEST; + + if (flexpriority_enabled) + msrs->secondary_ctls_high |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + + /* miscellaneous data */ + rdmsr(MSR_IA32_VMX_MISC, + msrs->misc_low, + msrs->misc_high); + msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; + msrs->misc_low |= + MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | + VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + VMX_MISC_ACTIVITY_HLT; + msrs->misc_high = 0; + + /* + * This MSR reports some information about VMX support. We + * should return information about the VMX we emulate for the + * guest, and the VMCS structure we give it - not about the + * VMX support of the underlying hardware. + */ + msrs->basic = + VMCS12_REVISION | + VMX_BASIC_TRUE_CTLS | + ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | + (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + + if (cpu_has_vmx_basic_inout()) + msrs->basic |= VMX_BASIC_INOUT; + + /* + * These MSRs specify bits which the guest must keep fixed on + * while L1 is in VMXON mode (in L1's root mode, or running an L2). + * We picked the standard core2 setting. + */ +#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) +#define VMXON_CR4_ALWAYSON X86_CR4_VMXE + msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; + msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; + + /* These MSRs specify bits which the guest must keep fixed off. */ + rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); + rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); + + /* highest index: VMX_PREEMPTION_TIMER_VALUE */ + msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; +} + +void nested_vmx_hardware_unsetup(void) +{ + int i; + + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); + } +} + +__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) +{ + int i; + + if (!cpu_has_vmx_shadow_vmcs()) + enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) { + vmx_bitmap[i] = (unsigned long *) + __get_free_page(GFP_KERNEL); + if (!vmx_bitmap[i]) { + nested_vmx_hardware_unsetup(); + return -ENOMEM; + } + } + + init_vmcs_shadow_fields(); + } + + exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear, + exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch, + exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld, + exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst, + exit_handlers[EXIT_REASON_VMREAD] = handle_vmread, + exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume, + exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite, + exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff, + exit_handlers[EXIT_REASON_VMON] = handle_vmon, + exit_handlers[EXIT_REASON_INVEPT] = handle_invept, + exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid, + exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc, + + kvm_x86_ops->check_nested_events = vmx_check_nested_events; + kvm_x86_ops->get_nested_state = vmx_get_nested_state; + kvm_x86_ops->set_nested_state = vmx_set_nested_state; + kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages, + kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; + + return 0; +} diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h new file mode 100644 index 000000000000..e847ff1019a2 --- /dev/null +++ b/arch/x86/kvm/vmx/nested.h @@ -0,0 +1,282 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_NESTED_H +#define __KVM_X86_VMX_NESTED_H + +#include "kvm_cache_regs.h" +#include "vmcs12.h" +#include "vmx.h" + +void vmx_leave_nested(struct kvm_vcpu *vcpu); +void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, + bool apicv); +void nested_vmx_hardware_unsetup(void); +__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); +void nested_vmx_vcpu_setup(void); +void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); +int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); +bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); +void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + u32 exit_intr_info, unsigned long exit_qualification); +void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu); +int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); +int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); +int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, + u32 vmx_instruction_info, bool wr, gva_t *ret); + +static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.cached_vmcs12; +} + +static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.cached_shadow_vmcs12; +} + +static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * In case we do two consecutive get/set_nested_state()s while L2 was + * running hv_evmcs may end up not being mapped (we map it from + * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode() as we always + * have vmcs12 if it is true. + */ + return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull || + vmx->nested.hv_evmcs; +} + +static inline unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) +{ + /* return the page table to be shadowed - in our case, EPT12 */ + return get_vmcs12(vcpu)->ept_pointer; +} + +static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) +{ + return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT; +} + +/* + * Reflect a VM Exit into L1. + */ +static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, + u32 exit_reason) +{ + u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + /* + * At this point, the exit interruption info in exit_intr_info + * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT + * we need to query the in-kernel LAPIC. + */ + WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT); + if ((exit_intr_info & + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + vmcs12->vm_exit_intr_error_code = + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + } + + nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, + vmcs_readl(EXIT_QUALIFICATION)); + return 1; +} + +/* + * Return the cr0 value that a nested guest would read. This is a combination + * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by + * its hypervisor (cr0_read_shadow). + */ +static inline unsigned long nested_read_cr0(struct vmcs12 *fields) +{ + return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | + (fields->cr0_read_shadow & fields->cr0_guest_host_mask); +} +static inline unsigned long nested_read_cr4(struct vmcs12 *fields) +{ + return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | + (fields->cr4_read_shadow & fields->cr4_guest_host_mask); +} + +static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) +{ + return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low); +} + +/* + * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE + * to modify any valid field of the VMCS, or are the VM-exit + * information fields read-only? + */ +static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.misc_low & + MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; +} + +static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS; +} + +static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.procbased_ctls_high & + CPU_BASED_MONITOR_TRAP_FLAG; +} + +static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_SHADOW_VMCS; +} + +static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) +{ + return vmcs12->cpu_based_vm_exec_control & bit; +} + +static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) +{ + return (vmcs12->cpu_based_vm_exec_control & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && + (vmcs12->secondary_vm_exec_control & bit); +} + +static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + +static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING; +} + +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; +} + +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); +} + +static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); +} + +static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML); +} + +static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); +} + +static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); +} + +static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); +} + +static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); +} + +static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; +} + +static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); +} + +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) +{ + return nested_cpu_has_vmfunc(vmcs12) && + (vmcs12->vm_function_control & + VMX_VMFUNC_EPTP_SWITCHING); +} + +static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); +} + +static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; +} + +/* + * In nested virtualization, check if L1 asked to exit on external interrupts. + * For most existing hypervisors, this will always return true. + */ +static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->pin_based_vm_exec_control & + PIN_BASED_EXT_INTR_MASK; +} + +/* + * if fixed0[i] == 1: val[i] must be 1 + * if fixed1[i] == 0: val[i] must be 0 + */ +static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) +{ + return ((val & fixed1) | fixed0) == val; +} + +static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_UNRESTRICTED_GUEST && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +/* No difference in the restrictions on guest and host CR4 in VMX operation. */ +#define nested_guest_cr4_valid nested_cr4_valid +#define nested_host_cr4_valid nested_cr4_valid + +#endif /* __KVM_X86_VMX_NESTED_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 58bb8de04d0d..47943d073d6d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -50,11 +50,11 @@ #include "capabilities.h" #include "cpuid.h" #include "evmcs.h" -#include "hyperv.h" #include "irq.h" #include "kvm_cache_regs.h" #include "lapic.h" #include "mmu.h" +#include "nested.h" #include "ops.h" #include "pmu.h" #include "trace.h" @@ -101,8 +101,6 @@ module_param(fasteoi, bool, S_IRUGO); static bool __read_mostly enable_apicv = 1; module_param(enable_apicv, bool, S_IRUGO); -static bool __read_mostly enable_shadow_vmcs = 1; -module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -111,9 +109,6 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); static bool __read_mostly nested = 1; module_param(nested, bool, S_IRUGO); -static bool __read_mostly nested_early_check = 0; -module_param(nested_early_check, bool, S_IRUGO); - static u64 __read_mostly host_xss; bool __read_mostly enable_pml = 1; @@ -146,18 +141,6 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) -#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 - -/* - * Hyper-V requires all of these, so mark them as supported even though - * they are just treated the same as all-context. - */ -#define VMX_VPID_EXTENT_SUPPORTED_MASK \ - (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ - VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ - VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ - VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) - /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -187,8 +170,6 @@ module_param(ple_window_shrink, uint, 0444); static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); -extern const ulong vmx_early_consistency_check_return; - static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); static DEFINE_MUTEX(vmx_l1d_flush_mutex); @@ -339,37 +320,8 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = { }; module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); -static u16 shadow_read_only_fields[] = { -#define SHADOW_FIELD_RO(x) x, -#include "vmcs_shadow_fields.h" -}; -static int max_shadow_read_only_fields = - ARRAY_SIZE(shadow_read_only_fields); - -static u16 shadow_read_write_fields[] = { -#define SHADOW_FIELD_RW(x) x, -#include "vmcs_shadow_fields.h" -}; -static int max_shadow_read_write_fields = - ARRAY_SIZE(shadow_read_write_fields); - -static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.cached_vmcs12; -} - -static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.cached_shadow_vmcs12; -} - -static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); -static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); -static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); -static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, - u16 error_code); static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type); @@ -388,17 +340,6 @@ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); -enum { - VMX_VMREAD_BITMAP, - VMX_VMWRITE_BITMAP, - VMX_BITMAP_NR -}; - -static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; - -#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) -#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) - static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); @@ -550,134 +491,6 @@ static inline bool report_flexpriority(void) return flexpriority_enabled; } -static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) -{ - return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low); -} - -/* - * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE - * to modify any valid field of the VMCS, or are the VM-exit - * information fields read-only? - */ -static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.msrs.misc_low & - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; -} - -static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS; -} - -static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.msrs.procbased_ctls_high & - CPU_BASED_MONITOR_TRAP_FLAG; -} - -static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & - SECONDARY_EXEC_SHADOW_VMCS; -} - -static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) -{ - return vmcs12->cpu_based_vm_exec_control & bit; -} - -static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) -{ - return (vmcs12->cpu_based_vm_exec_control & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && - (vmcs12->secondary_vm_exec_control & bit); -} - -static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER; -} - -static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING; -} - -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; -} - -static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); -} - -static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); -} - -static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML); -} - -static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); -} - -static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); -} - -static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); -} - -static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); -} - -static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; -} - -static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); -} - -static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) -{ - return nested_cpu_has_vmfunc(vmcs12) && - (vmcs12->vm_function_control & - VMX_VMFUNC_EPTP_SWITCHING); -} - -static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); -} - -static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12) -{ - return vmcs12->vm_exit_controls & - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; -} - -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, - u32 exit_intr_info, - unsigned long exit_qualification); - static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -888,29 +701,6 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) return true; } -/* - * Check if MSR is intercepted for L01 MSR bitmap. - */ -static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) -{ - unsigned long *msr_bitmap; - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return true; - - msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; - - if (msr <= 0x1fff) { - return !!test_bit(msr, msr_bitmap + 0x800 / f); - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - return !!test_bit(msr, msr_bitmap + 0xc00 / f); - } - - return true; -} - static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit) { @@ -1424,22 +1214,6 @@ static bool emulation_required(struct kvm_vcpu *vcpu) static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); -/* - * Return the cr0 value that a nested guest would read. This is a combination - * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by - * its hypervisor (cr0_read_shadow). - */ -static inline unsigned long nested_read_cr0(struct vmcs12 *fields) -{ - return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | - (fields->cr0_read_shadow & fields->cr0_guest_host_mask); -} -static inline unsigned long nested_read_cr4(struct vmcs12 *fields) -{ - return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | - (fields->cr4_read_shadow & fields->cr4_guest_host_mask); -} - unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags, save_rflags; @@ -1514,67 +1288,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) vmx_set_interrupt_shadow(vcpu, 0); } -static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, - unsigned long exit_qual) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned int nr = vcpu->arch.exception.nr; - u32 intr_info = nr | INTR_INFO_VALID_MASK; - - if (vcpu->arch.exception.has_error_code) { - vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; - intr_info |= INTR_INFO_DELIVER_CODE_MASK; - } - - if (kvm_exception_is_soft(nr)) - intr_info |= INTR_TYPE_SOFT_EXCEPTION; - else - intr_info |= INTR_TYPE_HARD_EXCEPTION; - - if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && - vmx_get_nmi_mask(vcpu)) - intr_info |= INTR_INFO_UNBLOCK_NMI; - - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); -} - -/* - * KVM wants to inject page-faults which it got to the guest. This function - * checks whether in a nested guest, we need to inject them to L1 or L2. - */ -static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned int nr = vcpu->arch.exception.nr; - bool has_payload = vcpu->arch.exception.has_payload; - unsigned long payload = vcpu->arch.exception.payload; - - if (nr == PF_VECTOR) { - if (vcpu->arch.exception.nested_apf) { - *exit_qual = vcpu->arch.apf.nested_apf_token; - return 1; - } - if (nested_vmx_is_page_fault_vmexit(vmcs12, - vcpu->arch.exception.error_code)) { - *exit_qual = has_payload ? payload : vcpu->arch.cr2; - return 1; - } - } else if (vmcs12->exception_bitmap & (1u << nr)) { - if (nr == DB_VECTOR) { - if (!has_payload) { - payload = vcpu->arch.dr6; - payload &= ~(DR6_FIXED_1 | DR6_BT); - payload ^= DR6_RTM; - } - *exit_qual = payload; - } else - *exit_qual = 0; - return 1; - } - - return 0; -} - static void vmx_clear_hlt(struct kvm_vcpu *vcpu) { /* @@ -1736,629 +1449,64 @@ bool nested_vmx_allowed(struct kvm_vcpu *vcpu) return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); } -/* - * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be - * returned for the various VMX controls MSRs when nested VMX is enabled. - * The same values should also be used to verify that vmcs12 control fields are - * valid during nested entry from L1 to L2. - * Each of these control msrs has a low and high 32-bit half: A low bit is on - * if the corresponding bit in the (32-bit) control field *must* be on, and a - * bit in the high half is on if the corresponding bit in the control field - * may be on. See also vmx_control_verify(). - */ -static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, - u32 ept_caps, bool apicv) -{ - /* - * Note that as a general rule, the high half of the MSRs (bits in - * the control fields which may be 1) should be initialized by the - * intersection of the underlying hardware's MSR (i.e., features which - * can be supported) and the list of features we want to expose - - * because they are known to be properly supported in our code. - * Also, usually, the low half of the MSRs (bits which must be 1) can - * be set to 0, meaning that L1 may turn off any of these bits. The - * reason is that if one of these bits is necessary, it will appear - * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control - * fields of vmcs01 and vmcs02, will turn these bits off - and - * nested_vmx_exit_reflected() will not pass related exits to L1. - * These rules have exceptions below. - */ - - /* pin-based controls */ - rdmsr(MSR_IA32_VMX_PINBASED_CTLS, - msrs->pinbased_ctls_low, - msrs->pinbased_ctls_high); - msrs->pinbased_ctls_low |= - PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - msrs->pinbased_ctls_high &= - PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | - PIN_BASED_VIRTUAL_NMIS | - (apicv ? PIN_BASED_POSTED_INTR : 0); - msrs->pinbased_ctls_high |= - PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | - PIN_BASED_VMX_PREEMPTION_TIMER; - - /* exit controls */ - rdmsr(MSR_IA32_VMX_EXIT_CTLS, - msrs->exit_ctls_low, - msrs->exit_ctls_high); - msrs->exit_ctls_low = - VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - - msrs->exit_ctls_high &= -#ifdef CONFIG_X86_64 - VM_EXIT_HOST_ADDR_SPACE_SIZE | -#endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; - msrs->exit_ctls_high |= - VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | - VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; - - /* We support free control of debug control saving. */ - msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; - - /* entry controls */ - rdmsr(MSR_IA32_VMX_ENTRY_CTLS, - msrs->entry_ctls_low, - msrs->entry_ctls_high); - msrs->entry_ctls_low = - VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - msrs->entry_ctls_high &= -#ifdef CONFIG_X86_64 - VM_ENTRY_IA32E_MODE | -#endif - VM_ENTRY_LOAD_IA32_PAT; - msrs->entry_ctls_high |= - (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); - - /* We support free control of debug control loading. */ - msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; - - /* cpu-based controls */ - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, - msrs->procbased_ctls_low, - msrs->procbased_ctls_high); - msrs->procbased_ctls_low = - CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - msrs->procbased_ctls_high &= - CPU_BASED_VIRTUAL_INTR_PENDING | - CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | - CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | -#ifdef CONFIG_X86_64 - CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | -#endif - CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | - CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | - CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | - CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | - CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; - /* - * We can allow some features even when not supported by the - * hardware. For example, L1 can specify an MSR bitmap - and we - * can use it to avoid exits to L1 - even when L0 runs L2 - * without MSR bitmaps. - */ - msrs->procbased_ctls_high |= - CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | - CPU_BASED_USE_MSR_BITMAPS; - - /* We support free control of CR3 access interception. */ - msrs->procbased_ctls_low &= - ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); - - /* - * secondary cpu-based controls. Do not include those that - * depend on CPUID bits, they are added later by vmx_cpuid_update. - */ - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - msrs->secondary_ctls_low, - msrs->secondary_ctls_high); - msrs->secondary_ctls_low = 0; - msrs->secondary_ctls_high &= - SECONDARY_EXEC_DESC | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_WBINVD_EXITING; - - /* - * We can emulate "VMCS shadowing," even if the hardware - * doesn't support it. - */ - msrs->secondary_ctls_high |= - SECONDARY_EXEC_SHADOW_VMCS; - - if (enable_ept) { - /* nested EPT: emulate EPT also to L1 */ - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_EPT; - msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; - if (cpu_has_vmx_ept_execute_only()) - msrs->ept_caps |= - VMX_EPT_EXECUTE_ONLY_BIT; - msrs->ept_caps &= ept_caps; - msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | - VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | - VMX_EPT_1GB_PAGE_BIT; - if (enable_ept_ad_bits) { - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_PML; - msrs->ept_caps |= VMX_EPT_AD_BIT; - } - } - - if (cpu_has_vmx_vmfunc()) { - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_VMFUNC; - /* - * Advertise EPTP switching unconditionally - * since we emulate it - */ - if (enable_ept) - msrs->vmfunc_controls = - VMX_VMFUNC_EPTP_SWITCHING; - } - - /* - * Old versions of KVM use the single-context version without - * checking for support, so declare that it is supported even - * though it is treated as global context. The alternative is - * not failing the single-context invvpid, and it is worse. - */ - if (enable_vpid) { - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_VPID; - msrs->vpid_caps = VMX_VPID_INVVPID_BIT | - VMX_VPID_EXTENT_SUPPORTED_MASK; - } - - if (enable_unrestricted_guest) - msrs->secondary_ctls_high |= - SECONDARY_EXEC_UNRESTRICTED_GUEST; - - if (flexpriority_enabled) - msrs->secondary_ctls_high |= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - - /* miscellaneous data */ - rdmsr(MSR_IA32_VMX_MISC, - msrs->misc_low, - msrs->misc_high); - msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; - msrs->misc_low |= - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | - VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | - VMX_MISC_ACTIVITY_HLT; - msrs->misc_high = 0; - - /* - * This MSR reports some information about VMX support. We - * should return information about the VMX we emulate for the - * guest, and the VMCS structure we give it - not about the - * VMX support of the underlying hardware. - */ - msrs->basic = - VMCS12_REVISION | - VMX_BASIC_TRUE_CTLS | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); - - if (cpu_has_vmx_basic_inout()) - msrs->basic |= VMX_BASIC_INOUT; - - /* - * These MSRs specify bits which the guest must keep fixed on - * while L1 is in VMXON mode (in L1's root mode, or running an L2). - * We picked the standard core2 setting. - */ -#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) -#define VMXON_CR4_ALWAYSON X86_CR4_VMXE - msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; - msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; - - /* These MSRs specify bits which the guest must keep fixed off. */ - rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); - rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); - - /* highest index: VMX_PREEMPTION_TIMER_VALUE */ - msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; -} - -/* - * if fixed0[i] == 1: val[i] must be 1 - * if fixed1[i] == 0: val[i] must be 0 - */ -static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) -{ - return ((val & fixed1) | fixed0) == val; -} - -static inline bool vmx_control_verify(u32 control, u32 low, u32 high) -{ - return fixed_bits_valid(control, low, high); -} - -static inline u64 vmx_control_msr(u32 low, u32 high) -{ - return low | ((u64)high << 32); -} - -static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) +static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, + uint64_t val) { - superset &= mask; - subset &= mask; + uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; - return (superset | subset) == superset; + return !(val & ~valid_bits); } -static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) +static int vmx_get_msr_feature(struct kvm_msr_entry *msr) { - const u64 feature_and_reserved = - /* feature (except bit 48; see below) */ - BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | - /* reserved */ - BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); - u64 vmx_basic = vmx->nested.msrs.basic; - - if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) - return -EINVAL; - - /* - * KVM does not emulate a version of VMX that constrains physical - * addresses of VMX structures (e.g. VMCS) to 32-bits. - */ - if (data & BIT_ULL(48)) - return -EINVAL; - - if (vmx_basic_vmcs_revision_id(vmx_basic) != - vmx_basic_vmcs_revision_id(data)) - return -EINVAL; - - if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) - return -EINVAL; + switch (msr->index) { + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + if (!nested) + return 1; + return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); + default: + return 1; + } - vmx->nested.msrs.basic = data; return 0; } -static int -vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - u64 supported; - u32 *lowp, *highp; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct shared_msr_entry *msr; - switch (msr_index) { - case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - lowp = &vmx->nested.msrs.pinbased_ctls_low; - highp = &vmx->nested.msrs.pinbased_ctls_high; + switch (msr_info->index) { +#ifdef CONFIG_X86_64 + case MSR_FS_BASE: + msr_info->data = vmcs_readl(GUEST_FS_BASE); break; - case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - lowp = &vmx->nested.msrs.procbased_ctls_low; - highp = &vmx->nested.msrs.procbased_ctls_high; + case MSR_GS_BASE: + msr_info->data = vmcs_readl(GUEST_GS_BASE); break; - case MSR_IA32_VMX_TRUE_EXIT_CTLS: - lowp = &vmx->nested.msrs.exit_ctls_low; - highp = &vmx->nested.msrs.exit_ctls_high; + case MSR_KERNEL_GS_BASE: + msr_info->data = vmx_read_guest_kernel_gs_base(vmx); break; - case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - lowp = &vmx->nested.msrs.entry_ctls_low; - highp = &vmx->nested.msrs.entry_ctls_high; +#endif + case MSR_EFER: + return kvm_get_msr_common(vcpu, msr_info); + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + return 1; + + msr_info->data = to_vmx(vcpu)->spec_ctrl; break; - case MSR_IA32_VMX_PROCBASED_CTLS2: - lowp = &vmx->nested.msrs.secondary_ctls_low; - highp = &vmx->nested.msrs.secondary_ctls_high; - break; - default: - BUG(); - } - - supported = vmx_control_msr(*lowp, *highp); - - /* Check must-be-1 bits are still 1. */ - if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) - return -EINVAL; - - /* Check must-be-0 bits are still 0. */ - if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) - return -EINVAL; - - *lowp = data; - *highp = data >> 32; - return 0; -} - -static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) -{ - const u64 feature_and_reserved_bits = - /* feature */ - BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | - BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | - /* reserved */ - GENMASK_ULL(13, 9) | BIT_ULL(31); - u64 vmx_misc; - - vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, - vmx->nested.msrs.misc_high); - - if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) - return -EINVAL; - - if ((vmx->nested.msrs.pinbased_ctls_high & - PIN_BASED_VMX_PREEMPTION_TIMER) && - vmx_misc_preemption_timer_rate(data) != - vmx_misc_preemption_timer_rate(vmx_misc)) - return -EINVAL; - - if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) - return -EINVAL; - - if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) - return -EINVAL; - - if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) - return -EINVAL; - - vmx->nested.msrs.misc_low = data; - vmx->nested.msrs.misc_high = data >> 32; - - /* - * If L1 has read-only VM-exit information fields, use the - * less permissive vmx_vmwrite_bitmap to specify write - * permissions for the shadow VMCS. - */ - if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) - vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); - - return 0; -} - -static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) -{ - u64 vmx_ept_vpid_cap; - - vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, - vmx->nested.msrs.vpid_caps); - - /* Every bit is either reserved or a feature bit. */ - if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) - return -EINVAL; - - vmx->nested.msrs.ept_caps = data; - vmx->nested.msrs.vpid_caps = data >> 32; - return 0; -} - -static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) -{ - u64 *msr; - - switch (msr_index) { - case MSR_IA32_VMX_CR0_FIXED0: - msr = &vmx->nested.msrs.cr0_fixed0; - break; - case MSR_IA32_VMX_CR4_FIXED0: - msr = &vmx->nested.msrs.cr4_fixed0; - break; - default: - BUG(); - } - - /* - * 1 bits (which indicates bits which "must-be-1" during VMX operation) - * must be 1 in the restored value. - */ - if (!is_bitwise_subset(data, *msr, -1ULL)) - return -EINVAL; - - *msr = data; - return 0; -} - -/* - * Called when userspace is restoring VMX MSRs. - * - * Returns 0 on success, non-0 otherwise. - */ -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * Don't allow changes to the VMX capability MSRs while the vCPU - * is in VMX operation. - */ - if (vmx->nested.vmxon) - return -EBUSY; - - switch (msr_index) { - case MSR_IA32_VMX_BASIC: - return vmx_restore_vmx_basic(vmx, data); - case MSR_IA32_VMX_PINBASED_CTLS: - case MSR_IA32_VMX_PROCBASED_CTLS: - case MSR_IA32_VMX_EXIT_CTLS: - case MSR_IA32_VMX_ENTRY_CTLS: - /* - * The "non-true" VMX capability MSRs are generated from the - * "true" MSRs, so we do not support restoring them directly. - * - * If userspace wants to emulate VMX_BASIC[55]=0, userspace - * should restore the "true" MSRs with the must-be-1 bits - * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND - * DEFAULT SETTINGS". - */ - return -EINVAL; - case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - case MSR_IA32_VMX_TRUE_EXIT_CTLS: - case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - case MSR_IA32_VMX_PROCBASED_CTLS2: - return vmx_restore_control_msr(vmx, msr_index, data); - case MSR_IA32_VMX_MISC: - return vmx_restore_vmx_misc(vmx, data); - case MSR_IA32_VMX_CR0_FIXED0: - case MSR_IA32_VMX_CR4_FIXED0: - return vmx_restore_fixed0_msr(vmx, msr_index, data); - case MSR_IA32_VMX_CR0_FIXED1: - case MSR_IA32_VMX_CR4_FIXED1: - /* - * These MSRs are generated based on the vCPU's CPUID, so we - * do not support restoring them directly. - */ - return -EINVAL; - case MSR_IA32_VMX_EPT_VPID_CAP: - return vmx_restore_vmx_ept_vpid_cap(vmx, data); - case MSR_IA32_VMX_VMCS_ENUM: - vmx->nested.msrs.vmcs_enum = data; - return 0; - default: - /* - * The rest of the VMX capability MSRs do not support restore. - */ - return -EINVAL; - } -} - -/* Returns 0 on success, non-0 otherwise. */ -static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) -{ - switch (msr_index) { - case MSR_IA32_VMX_BASIC: - *pdata = msrs->basic; - break; - case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - case MSR_IA32_VMX_PINBASED_CTLS: - *pdata = vmx_control_msr( - msrs->pinbased_ctls_low, - msrs->pinbased_ctls_high); - if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) - *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - break; - case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - case MSR_IA32_VMX_PROCBASED_CTLS: - *pdata = vmx_control_msr( - msrs->procbased_ctls_low, - msrs->procbased_ctls_high); - if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) - *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - break; - case MSR_IA32_VMX_TRUE_EXIT_CTLS: - case MSR_IA32_VMX_EXIT_CTLS: - *pdata = vmx_control_msr( - msrs->exit_ctls_low, - msrs->exit_ctls_high); - if (msr_index == MSR_IA32_VMX_EXIT_CTLS) - *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - break; - case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - case MSR_IA32_VMX_ENTRY_CTLS: - *pdata = vmx_control_msr( - msrs->entry_ctls_low, - msrs->entry_ctls_high); - if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) - *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - break; - case MSR_IA32_VMX_MISC: - *pdata = vmx_control_msr( - msrs->misc_low, - msrs->misc_high); - break; - case MSR_IA32_VMX_CR0_FIXED0: - *pdata = msrs->cr0_fixed0; - break; - case MSR_IA32_VMX_CR0_FIXED1: - *pdata = msrs->cr0_fixed1; - break; - case MSR_IA32_VMX_CR4_FIXED0: - *pdata = msrs->cr4_fixed0; - break; - case MSR_IA32_VMX_CR4_FIXED1: - *pdata = msrs->cr4_fixed1; - break; - case MSR_IA32_VMX_VMCS_ENUM: - *pdata = msrs->vmcs_enum; - break; - case MSR_IA32_VMX_PROCBASED_CTLS2: - *pdata = vmx_control_msr( - msrs->secondary_ctls_low, - msrs->secondary_ctls_high); - break; - case MSR_IA32_VMX_EPT_VPID_CAP: - *pdata = msrs->ept_caps | - ((u64)msrs->vpid_caps << 32); - break; - case MSR_IA32_VMX_VMFUNC: - *pdata = msrs->vmfunc_controls; - break; - default: - return 1; - } - - return 0; -} - -static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, - uint64_t val) -{ - uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; - - return !(val & ~valid_bits); -} - -static int vmx_get_msr_feature(struct kvm_msr_entry *msr) -{ - switch (msr->index) { - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - if (!nested) - return 1; - return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); - default: - return 1; - } - - return 0; -} - -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; - - switch (msr_info->index) { -#ifdef CONFIG_X86_64 - case MSR_FS_BASE: - msr_info->data = vmcs_readl(GUEST_FS_BASE); - break; - case MSR_GS_BASE: - msr_info->data = vmcs_readl(GUEST_GS_BASE); - break; - case MSR_KERNEL_GS_BASE: - msr_info->data = vmx_read_guest_kernel_gs_base(vmx); - break; -#endif - case MSR_EFER: - return kvm_get_msr_common(vcpu, msr_info); - case MSR_IA32_SPEC_CTRL: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) - return 1; - - msr_info->data = to_vmx(vcpu)->spec_ctrl; - break; - case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) - return 1; - msr_info->data = to_vmx(vcpu)->arch_capabilities; + case MSR_IA32_ARCH_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + return 1; + msr_info->data = to_vmx(vcpu)->arch_capabilities; break; case MSR_IA32_SYSENTER_CS: msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); @@ -2413,8 +1561,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } -static void vmx_leave_nested(struct kvm_vcpu *vcpu); - /* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -3041,75 +2187,6 @@ static void free_kvm_area(void) } } -static void init_vmcs_shadow_fields(void) -{ - int i, j; - - memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); - memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - - for (i = j = 0; i < max_shadow_read_only_fields; i++) { - u16 field = shadow_read_only_fields[i]; - if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && - (i + 1 == max_shadow_read_only_fields || - shadow_read_only_fields[i + 1] != field + 1)) - pr_err("Missing field from shadow_read_only_field %x\n", - field + 1); - - clear_bit(field, vmx_vmread_bitmap); -#ifdef CONFIG_X86_64 - if (field & 1) - continue; -#endif - if (j < i) - shadow_read_only_fields[j] = field; - j++; - } - max_shadow_read_only_fields = j; - - for (i = j = 0; i < max_shadow_read_write_fields; i++) { - u16 field = shadow_read_write_fields[i]; - if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && - (i + 1 == max_shadow_read_write_fields || - shadow_read_write_fields[i + 1] != field + 1)) - pr_err("Missing field from shadow_read_write_field %x\n", - field + 1); - - /* - * PML and the preemption timer can be emulated, but the - * processor cannot vmwrite to fields that don't exist - * on bare metal. - */ - switch (field) { - case GUEST_PML_INDEX: - if (!cpu_has_vmx_pml()) - continue; - break; - case VMX_PREEMPTION_TIMER_VALUE: - if (!cpu_has_vmx_preemption_timer()) - continue; - break; - case GUEST_INTR_STATUS: - if (!cpu_has_vmx_apicv()) - continue; - break; - default: - break; - } - - clear_bit(field, vmx_vmwrite_bitmap); - clear_bit(field, vmx_vmread_bitmap); -#ifdef CONFIG_X86_64 - if (field & 1) - continue; -#endif - if (j < i) - shadow_read_write_fields[j] = field; - j++; - } - max_shadow_read_write_fields = j; -} - static __init int alloc_kvm_area(void) { int cpu; @@ -3399,40 +2476,6 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_dirty); } -static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high & - SECONDARY_EXEC_UNRESTRICTED_GUEST && - nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) - fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); - - return fixed_bits_valid(val, fixed0, fixed1); -} - -static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; - - return fixed_bits_valid(val, fixed0, fixed1); -} - -static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; - - return fixed_bits_valid(val, fixed0, fixed1); -} - -/* No difference in the restrictions on guest and host CR4 in VMX operation. */ -#define nested_guest_cr4_valid nested_cr4_valid -#define nested_host_cr4_valid nested_cr4_valid - static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, unsigned long cr0, struct kvm_vcpu *vcpu) @@ -3972,11 +3015,6 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu) return true; } -static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); -} - static int init_rmode_tss(struct kvm *kvm) { gfn_t fn; @@ -4208,47 +3246,6 @@ static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, vmx_disable_intercept_for_msr(msr_bitmap, msr, type); } -/* - * If a msr is allowed by L0, we should check whether it is allowed by L1. - * The corresponding bit will be cleared unless both of L0 and L1 allow it. - */ -static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, - unsigned long *msr_bitmap_nested, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) - /* read-low */ - __clear_bit(msr, msr_bitmap_nested + 0x000 / f); - - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) - /* write-low */ - __clear_bit(msr, msr_bitmap_nested + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) - /* read-high */ - __clear_bit(msr, msr_bitmap_nested + 0x400 / f); - - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) - /* write-high */ - __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); - - } -} - static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) { u8 mode = 0; @@ -4310,60 +3307,6 @@ static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) return enable_apicv; } -static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - gfn_t gfn; - - /* - * Don't need to mark the APIC access page dirty; it is never - * written to by the CPU during APIC virtualization. - */ - - if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } - - if (nested_cpu_has_posted_intr(vmcs12)) { - gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } -} - - -static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int max_irr; - void *vapic_page; - u16 status; - - if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) - return; - - vmx->nested.pi_pending = false; - if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return; - - max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); - if (max_irr != 256) { - vapic_page = kmap(vmx->nested.virtual_apic_page); - __kvm_apic_update_irr(vmx->nested.pi_desc->pir, - vapic_page, &max_irr); - kunmap(vmx->nested.virtual_apic_page); - - status = vmcs_read16(GUEST_INTR_STATUS); - if ((u8)max_irr > ((u8)status & 0xff)) { - status &= ~0xff; - status |= (u8)max_irr; - vmcs_write16(GUEST_INTR_STATUS, status); - } - } - - nested_mark_vmcs12_pages_dirty(vcpu); -} - static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4747,20 +3690,6 @@ static void ept_set_mmio_spte_mask(void) #define VMX_XSS_EXIT_BITMAP 0 -static void nested_vmx_vcpu_setup(void) -{ - if (enable_shadow_vmcs) { - /* - * At vCPU creation, "VMWRITE to any supported field - * in the VMCS" is supported, so use the more - * permissive vmx_vmread_bitmap to specify both read - * and write permissions for the shadow VMCS. - */ - vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); - vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap)); - } -} - /* * Sets up the vmcs for emulated real mode. */ @@ -4963,31 +3892,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_clear_hlt(vcpu); } -/* - * In nested virtualization, check if L1 asked to exit on external interrupts. - * For most existing hypervisors, this will always return true. - */ -static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) -{ - return get_vmcs12(vcpu)->pin_based_vm_exec_control & - PIN_BASED_EXT_INTR_MASK; -} - -/* - * In nested virtualization, check if L1 has set - * VM_EXIT_ACK_INTR_ON_EXIT - */ -static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) -{ - return get_vmcs12(vcpu)->vm_exit_controls & - VM_EXIT_ACK_INTR_ON_EXIT; -} - -static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) -{ - return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); -} - static void enable_irq_window(struct kvm_vcpu *vcpu) { vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, @@ -6095,6206 +4999,1670 @@ static int handle_monitor(struct kvm_vcpu *vcpu) return handle_nop(vcpu); } -/* - * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), - * set the success or error code of an emulated VMX instruction (as specified - * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated - * instruction. - */ -static int nested_vmx_succeed(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); - return kvm_skip_emulated_instruction(vcpu); -} - -static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_CF); - return kvm_skip_emulated_instruction(vcpu); -} - -static int nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * failValid writes the error number to the current VMCS, which - * can't be done if there isn't a current VMCS. - */ - if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) - return nested_vmx_failInvalid(vcpu); - - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_ZF); - get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; - /* - * We don't need to force a shadow sync because - * VM_INSTRUCTION_ERROR is not shadowed - */ - return kvm_skip_emulated_instruction(vcpu); -} - -static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) -{ - /* TODO: not to reset guest simply here. */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); -} - -static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) -{ - struct vcpu_vmx *vmx = - container_of(timer, struct vcpu_vmx, nested.preemption_timer); - - vmx->nested.preemption_timer_expired = true; - kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); - kvm_vcpu_kick(&vmx->vcpu); - - return HRTIMER_NORESTART; -} - -/* - * Decode the memory-address operand of a vmx instruction, as recorded on an - * exit caused by such an instruction (run by a guest hypervisor). - * On success, returns 0. When the operand is invalid, returns 1 and throws - * #UD or #GP. - */ -static int get_vmx_mem_address(struct kvm_vcpu *vcpu, - unsigned long exit_qualification, - u32 vmx_instruction_info, bool wr, gva_t *ret) +static int handle_invpcid(struct kvm_vcpu *vcpu) { - gva_t off; - bool exn; - struct kvm_segment s; + u32 vmx_instruction_info; + unsigned long type; + bool pcid_enabled; + gva_t gva; + struct x86_exception e; + unsigned i; + unsigned long roots_to_free = 0; + struct { + u64 pcid; + u64 gla; + } operand; - /* - * According to Vol. 3B, "Information for VM Exits Due to Instruction - * Execution", on an exit, vmx_instruction_info holds most of the - * addressing components of the operand. Only the displacement part - * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). - * For how an actual address is calculated from all these components, - * refer to Vol. 1, "Operand Addressing". - */ - int scaling = vmx_instruction_info & 3; - int addr_size = (vmx_instruction_info >> 7) & 7; - bool is_reg = vmx_instruction_info & (1u << 10); - int seg_reg = (vmx_instruction_info >> 15) & 7; - int index_reg = (vmx_instruction_info >> 18) & 0xf; - bool index_is_valid = !(vmx_instruction_info & (1u << 22)); - int base_reg = (vmx_instruction_info >> 23) & 0xf; - bool base_is_valid = !(vmx_instruction_info & (1u << 27)); - - if (is_reg) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - /* Addr = segment_base + offset */ - /* offset = base + [index * scale] + displacement */ - off = exit_qualification; /* holds the displacement */ - if (base_is_valid) - off += kvm_register_read(vcpu, base_reg); - if (index_is_valid) - off += kvm_register_read(vcpu, index_reg)< s.limit); - } - if (exn) { - kvm_queue_exception_e(vcpu, - seg_reg == VCPU_SREG_SS ? - SS_VECTOR : GP_VECTOR, - 0); + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + if (type > 3) { + kvm_inject_gp(vcpu, 0); return 1; } - return 0; -} - -static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) -{ - gva_t gva; - struct x86_exception e; - + /* According to the Intel instruction reference, the memory operand + * is read even if it isn't needed (e.g., for type==all) + */ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) + vmx_instruction_info, false, &gva)) return 1; - if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } - return 0; -} - -/* - * Allocate a shadow VMCS and associate it with the currently loaded - * VMCS, unless such a shadow VMCS already exists. The newly allocated - * VMCS is also VMCLEARed, so that it is ready for use. - */ -static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; - - /* - * We should allocate a shadow vmcs for vmcs01 only when L1 - * executes VMXON and free it when L1 executes VMXOFF. - * As it is invalid to execute VMXON twice, we shouldn't reach - * here when vmcs01 already have an allocated shadow vmcs. - */ - WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); - - if (!loaded_vmcs->shadow_vmcs) { - loaded_vmcs->shadow_vmcs = alloc_vmcs(true); - if (loaded_vmcs->shadow_vmcs) - vmcs_clear(loaded_vmcs->shadow_vmcs); + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; } - return loaded_vmcs->shadow_vmcs; -} - -static int enter_vmx_operation(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int r; - r = alloc_loaded_vmcs(&vmx->nested.vmcs02); - if (r < 0) - goto out_vmcs02; - - vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); - if (!vmx->nested.cached_vmcs12) - goto out_cached_vmcs12; + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); - if (!vmx->nested.cached_shadow_vmcs12) - goto out_cached_shadow_vmcs12; + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); - if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) - goto out_shadow_vmcs; + case INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + if (kvm_get_active_pcid(vcpu) == operand.pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } - vmx->nested.vpid02 = allocate_vpid(); + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) + == operand.pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); - vmx->nested.vmcs02_initialized = false; - vmx->nested.vmxon = true; - return 0; + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); + /* + * If neither the current cr3 nor any of the prev_roots use the + * given PCID, then nothing needs to be done here because a + * resync will happen anyway before switching to any other CR3. + */ -out_shadow_vmcs: - kfree(vmx->nested.cached_shadow_vmcs12); + return kvm_skip_emulated_instruction(vcpu); -out_cached_shadow_vmcs12: - kfree(vmx->nested.cached_vmcs12); + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. + */ -out_cached_vmcs12: - free_loaded_vmcs(&vmx->nested.vmcs02); + /* fall-through */ + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_mmu_unload(vcpu); + return kvm_skip_emulated_instruction(vcpu); -out_vmcs02: - return -ENOMEM; + default: + BUG(); /* We have already checked above that type <= 3 */ + } } -/* - * Emulate the VMXON instruction. - * Currently, we just remember that VMX is active, and do not save or even - * inspect the argument to VMXON (the so-called "VMXON pointer") because we - * do not currently need to store anything in that guest-allocated memory - * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their - * argument is different from the VMXON pointer (which the spec says they do). - */ -static int handle_vmon(struct kvm_vcpu *vcpu) +static int handle_pml_full(struct kvm_vcpu *vcpu) { - int ret; - gpa_t vmptr; - struct page *page; - struct vcpu_vmx *vmx = to_vmx(vcpu); - const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED - | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - - /* - * The Intel VMX Instruction Reference lists a bunch of bits that are - * prerequisite to running VMXON, most notably cr4.VMXE must be set to - * 1 (see vmx_set_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. But most faulting conditions - * have already been checked by hardware, prior to the VM-exit for - * VMXON. We do test guest cr4.VMXE because processor CR4 always has - * that bit set to 1 in non-root mode. - */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - /* CPL=0 must be checked manually. */ - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - if (vmx->nested.vmxon) - return nested_vmx_failValid(vcpu, - VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + unsigned long exit_qualification; - if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) - != VMXON_NEEDED_FEATURES) { - kvm_inject_gp(vcpu, 0); - return 1; - } + trace_kvm_pml_full(vcpu->vcpu_id); - if (nested_vmx_get_vmptr(vcpu, &vmptr)) - return 1; + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); /* - * SDM 3: 24.11.5 - * The first 4 bytes of VMXON region contain the supported - * VMCS revision identifier - * - * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; - * which replaces physical address width with 32 + * PML buffer FULL happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. */ - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) - return nested_vmx_failInvalid(vcpu); - - page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) - return nested_vmx_failInvalid(vcpu); - - if (*(u32 *)kmap(page) != VMCS12_REVISION) { - kunmap(page); - kvm_release_page_clean(page); - return nested_vmx_failInvalid(vcpu); - } - kunmap(page); - kvm_release_page_clean(page); - - vmx->nested.vmxon_ptr = vmptr; - ret = enter_vmx_operation(vcpu); - if (ret) - return ret; - - return nested_vmx_succeed(vcpu); -} - -/* - * Intel's VMX Instruction Reference specifies a common set of prerequisites - * for running VMX instructions (except VMXON, whose prerequisites are - * slightly different). It also specifies what exception to inject otherwise. - * Note that many of these exceptions have priority over VM exits, so they - * don't have to be checked again here. - */ -static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) -{ - if (!to_vmx(vcpu)->nested.vmxon) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 0; - } + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + enable_vnmi && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + /* + * PML buffer already flushed at beginning of VMEXIT. Nothing to do + * here.., and there's no userspace involvement needed for PML. + */ return 1; } -static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) +static int handle_preemption_timer(struct kvm_vcpu *vcpu) { - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + if (!to_vmx(vcpu)->req_immediate_exit) + kvm_lapic_expired_hv_timer(vcpu); + return 1; } -static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) +/* + * When nested=0, all VMX instruction VM Exits filter here. The handlers + * are overwritten by nested_vmx_setup() when nested=1. + */ +static int handle_vmx_instruction(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!vmx->nested.hv_evmcs) - return; - - kunmap(vmx->nested.hv_evmcs_page); - kvm_release_page_dirty(vmx->nested.hv_evmcs_page); - vmx->nested.hv_evmcs_vmptr = -1ull; - vmx->nested.hv_evmcs_page = NULL; - vmx->nested.hv_evmcs = NULL; + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; } -static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) +static int handle_encls(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vmx->nested.current_vmptr == -1ull) - return; - - if (enable_shadow_vmcs) { - /* copy to memory all shadowed fields in case - they were modified */ - copy_shadow_to_vmcs12(vmx); - vmx->nested.need_vmcs12_sync = false; - vmx_disable_shadow_vmcs(vmx); - } - vmx->nested.posted_intr_nv = -1; - - /* Flush VMCS12 to guest memory */ - kvm_vcpu_write_guest_page(vcpu, - vmx->nested.current_vmptr >> PAGE_SHIFT, - vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); - - kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); - - vmx->nested.current_vmptr = -1ull; + /* + * SGX virtualization is not yet supported. There is no software + * enable bit for SGX, so we have to trap ENCLS and inject a #UD + * to prevent the guest from executing ENCLS. + */ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; } /* - * Free whatever needs to be freed from vmx->nested when L1 goes down, or - * just stops using VMX. + * The exit handlers return 1 if the exit was handled fully and guest execution + * may resume. Otherwise they set the kvm_run parameter to indicate what needs + * to be done to userspace and return 0. */ -static void free_nested(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) - return; - - vmx->nested.vmxon = false; - vmx->nested.smm.vmxon = false; - free_vpid(vmx->nested.vpid02); - vmx->nested.posted_intr_nv = -1; - vmx->nested.current_vmptr = -1ull; - if (enable_shadow_vmcs) { - vmx_disable_shadow_vmcs(vmx); - vmcs_clear(vmx->vmcs01.shadow_vmcs); - free_vmcs(vmx->vmcs01.shadow_vmcs); - vmx->vmcs01.shadow_vmcs = NULL; - } - kfree(vmx->nested.cached_vmcs12); - kfree(vmx->nested.cached_shadow_vmcs12); - /* Unpin physical memory we referred to in the vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_dirty(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - if (vmx->nested.virtual_apic_page) { - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; - } - - kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); - - nested_release_evmcs(vcpu); +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { + [EXIT_REASON_EXCEPTION_NMI] = handle_exception, + [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, + [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, + [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, + [EXIT_REASON_IO_INSTRUCTION] = handle_io, + [EXIT_REASON_CR_ACCESS] = handle_cr, + [EXIT_REASON_DR_ACCESS] = handle_dr, + [EXIT_REASON_CPUID] = handle_cpuid, + [EXIT_REASON_MSR_READ] = handle_rdmsr, + [EXIT_REASON_MSR_WRITE] = handle_wrmsr, + [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, + [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_INVD] = handle_invd, + [EXIT_REASON_INVLPG] = handle_invlpg, + [EXIT_REASON_RDPMC] = handle_rdpmc, + [EXIT_REASON_VMCALL] = handle_vmcall, + [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, + [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, + [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, + [EXIT_REASON_VMPTRST] = handle_vmx_instruction, + [EXIT_REASON_VMREAD] = handle_vmx_instruction, + [EXIT_REASON_VMRESUME] = handle_vmx_instruction, + [EXIT_REASON_VMWRITE] = handle_vmx_instruction, + [EXIT_REASON_VMOFF] = handle_vmx_instruction, + [EXIT_REASON_VMON] = handle_vmx_instruction, + [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, + [EXIT_REASON_APIC_ACCESS] = handle_apic_access, + [EXIT_REASON_APIC_WRITE] = handle_apic_write, + [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, + [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_XSETBV] = handle_xsetbv, + [EXIT_REASON_TASK_SWITCH] = handle_task_switch, + [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, + [EXIT_REASON_GDTR_IDTR] = handle_desc, + [EXIT_REASON_LDTR_TR] = handle_desc, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, + [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, + [EXIT_REASON_INVEPT] = handle_vmx_instruction, + [EXIT_REASON_INVVPID] = handle_vmx_instruction, + [EXIT_REASON_RDRAND] = handle_invalid_op, + [EXIT_REASON_RDSEED] = handle_invalid_op, + [EXIT_REASON_XSAVES] = handle_xsaves, + [EXIT_REASON_XRSTORS] = handle_xrstors, + [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_INVPCID] = handle_invpcid, + [EXIT_REASON_VMFUNC] = handle_vmx_instruction, + [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, + [EXIT_REASON_ENCLS] = handle_encls, +}; - free_loaded_vmcs(&vmx->nested.vmcs02); -} +static const int kvm_vmx_max_exit_handlers = + ARRAY_SIZE(kvm_vmx_exit_handlers); -/* Emulate the VMXOFF instruction */ -static int handle_vmoff(struct kvm_vcpu *vcpu) +static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) { - if (!nested_vmx_check_permission(vcpu)) - return 1; - free_nested(vcpu); - return nested_vmx_succeed(vcpu); + *info1 = vmcs_readl(EXIT_QUALIFICATION); + *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -/* Emulate the VMCLEAR instruction */ -static int handle_vmclear(struct kvm_vcpu *vcpu) +static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 zero = 0; - gpa_t vmptr; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (nested_vmx_get_vmptr(vcpu, &vmptr)) - return 1; - - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) - return nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_INVALID_ADDRESS); - - if (vmptr == vmx->nested.vmxon_ptr) - return nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_VMXON_POINTER); - - if (vmx->nested.hv_evmcs_page) { - if (vmptr == vmx->nested.hv_evmcs_vmptr) - nested_release_evmcs(vcpu); - } else { - if (vmptr == vmx->nested.current_vmptr) - nested_release_vmcs12(vcpu); - - kvm_vcpu_write_guest(vcpu, - vmptr + offsetof(struct vmcs12, - launch_state), - &zero, sizeof(zero)); + if (vmx->pml_pg) { + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; } - - return nested_vmx_succeed(vcpu); } -static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); - -/* Emulate the VMLAUNCH instruction */ -static int handle_vmlaunch(struct kvm_vcpu *vcpu) +static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) { - return nested_vmx_run(vcpu, true); -} + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 *pml_buf; + u16 pml_idx; -/* Emulate the VMRESUME instruction */ -static int handle_vmresume(struct kvm_vcpu *vcpu) -{ + pml_idx = vmcs_read16(GUEST_PML_INDEX); - return nested_vmx_run(vcpu, false); -} + /* Do nothing if PML buffer is empty */ + if (pml_idx == (PML_ENTITY_NUM - 1)) + return; -static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) -{ - struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + /* PML index always points to next available PML buffer entity */ + if (pml_idx >= PML_ENTITY_NUM) + pml_idx = 0; + else + pml_idx++; - /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ - vmcs12->tpr_threshold = evmcs->tpr_threshold; - vmcs12->guest_rip = evmcs->guest_rip; + pml_buf = page_address(vmx->pml_pg); + for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { + u64 gpa; - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { - vmcs12->guest_rsp = evmcs->guest_rsp; - vmcs12->guest_rflags = evmcs->guest_rflags; - vmcs12->guest_interruptibility_info = - evmcs->guest_interruptibility_info; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { - vmcs12->cpu_based_vm_exec_control = - evmcs->cpu_based_vm_exec_control; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { - vmcs12->exception_bitmap = evmcs->exception_bitmap; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { - vmcs12->vm_entry_controls = evmcs->vm_entry_controls; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { - vmcs12->vm_entry_intr_info_field = - evmcs->vm_entry_intr_info_field; - vmcs12->vm_entry_exception_error_code = - evmcs->vm_entry_exception_error_code; - vmcs12->vm_entry_instruction_len = - evmcs->vm_entry_instruction_len; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { - vmcs12->host_ia32_pat = evmcs->host_ia32_pat; - vmcs12->host_ia32_efer = evmcs->host_ia32_efer; - vmcs12->host_cr0 = evmcs->host_cr0; - vmcs12->host_cr3 = evmcs->host_cr3; - vmcs12->host_cr4 = evmcs->host_cr4; - vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; - vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; - vmcs12->host_rip = evmcs->host_rip; - vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; - vmcs12->host_es_selector = evmcs->host_es_selector; - vmcs12->host_cs_selector = evmcs->host_cs_selector; - vmcs12->host_ss_selector = evmcs->host_ss_selector; - vmcs12->host_ds_selector = evmcs->host_ds_selector; - vmcs12->host_fs_selector = evmcs->host_fs_selector; - vmcs12->host_gs_selector = evmcs->host_gs_selector; - vmcs12->host_tr_selector = evmcs->host_tr_selector; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { - vmcs12->pin_based_vm_exec_control = - evmcs->pin_based_vm_exec_control; - vmcs12->vm_exit_controls = evmcs->vm_exit_controls; - vmcs12->secondary_vm_exec_control = - evmcs->secondary_vm_exec_control; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { - vmcs12->io_bitmap_a = evmcs->io_bitmap_a; - vmcs12->io_bitmap_b = evmcs->io_bitmap_b; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { - vmcs12->msr_bitmap = evmcs->msr_bitmap; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { - vmcs12->guest_es_base = evmcs->guest_es_base; - vmcs12->guest_cs_base = evmcs->guest_cs_base; - vmcs12->guest_ss_base = evmcs->guest_ss_base; - vmcs12->guest_ds_base = evmcs->guest_ds_base; - vmcs12->guest_fs_base = evmcs->guest_fs_base; - vmcs12->guest_gs_base = evmcs->guest_gs_base; - vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; - vmcs12->guest_tr_base = evmcs->guest_tr_base; - vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; - vmcs12->guest_idtr_base = evmcs->guest_idtr_base; - vmcs12->guest_es_limit = evmcs->guest_es_limit; - vmcs12->guest_cs_limit = evmcs->guest_cs_limit; - vmcs12->guest_ss_limit = evmcs->guest_ss_limit; - vmcs12->guest_ds_limit = evmcs->guest_ds_limit; - vmcs12->guest_fs_limit = evmcs->guest_fs_limit; - vmcs12->guest_gs_limit = evmcs->guest_gs_limit; - vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; - vmcs12->guest_tr_limit = evmcs->guest_tr_limit; - vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; - vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; - vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; - vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; - vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; - vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; - vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; - vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; - vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; - vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; - vmcs12->guest_es_selector = evmcs->guest_es_selector; - vmcs12->guest_cs_selector = evmcs->guest_cs_selector; - vmcs12->guest_ss_selector = evmcs->guest_ss_selector; - vmcs12->guest_ds_selector = evmcs->guest_ds_selector; - vmcs12->guest_fs_selector = evmcs->guest_fs_selector; - vmcs12->guest_gs_selector = evmcs->guest_gs_selector; - vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; - vmcs12->guest_tr_selector = evmcs->guest_tr_selector; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { - vmcs12->tsc_offset = evmcs->tsc_offset; - vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; - vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { - vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; - vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; - vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; - vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; - vmcs12->guest_cr0 = evmcs->guest_cr0; - vmcs12->guest_cr3 = evmcs->guest_cr3; - vmcs12->guest_cr4 = evmcs->guest_cr4; - vmcs12->guest_dr7 = evmcs->guest_dr7; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { - vmcs12->host_fs_base = evmcs->host_fs_base; - vmcs12->host_gs_base = evmcs->host_gs_base; - vmcs12->host_tr_base = evmcs->host_tr_base; - vmcs12->host_gdtr_base = evmcs->host_gdtr_base; - vmcs12->host_idtr_base = evmcs->host_idtr_base; - vmcs12->host_rsp = evmcs->host_rsp; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { - vmcs12->ept_pointer = evmcs->ept_pointer; - vmcs12->virtual_processor_id = evmcs->virtual_processor_id; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { - vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; - vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; - vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; - vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; - vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; - vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; - vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; - vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; - vmcs12->guest_pending_dbg_exceptions = - evmcs->guest_pending_dbg_exceptions; - vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; - vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; - vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; - vmcs12->guest_activity_state = evmcs->guest_activity_state; - vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; + gpa = pml_buf[pml_idx]; + WARN_ON(gpa & (PAGE_SIZE - 1)); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); } - /* - * Not used? - * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; - * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; - * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; - * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; - * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; - * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; - * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; - * vmcs12->page_fault_error_code_mask = - * evmcs->page_fault_error_code_mask; - * vmcs12->page_fault_error_code_match = - * evmcs->page_fault_error_code_match; - * vmcs12->cr3_target_count = evmcs->cr3_target_count; - * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; - * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; - * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; - */ - - /* - * Read only fields: - * vmcs12->guest_physical_address = evmcs->guest_physical_address; - * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; - * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; - * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; - * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; - * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; - * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; - * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; - * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; - * vmcs12->exit_qualification = evmcs->exit_qualification; - * vmcs12->guest_linear_address = evmcs->guest_linear_address; - * - * Not present in struct vmcs12: - * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; - * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; - * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; - * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; - */ - - return 0; -} - -static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) -{ - struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; - - /* - * Should not be changed by KVM: - * - * evmcs->host_es_selector = vmcs12->host_es_selector; - * evmcs->host_cs_selector = vmcs12->host_cs_selector; - * evmcs->host_ss_selector = vmcs12->host_ss_selector; - * evmcs->host_ds_selector = vmcs12->host_ds_selector; - * evmcs->host_fs_selector = vmcs12->host_fs_selector; - * evmcs->host_gs_selector = vmcs12->host_gs_selector; - * evmcs->host_tr_selector = vmcs12->host_tr_selector; - * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; - * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; - * evmcs->host_cr0 = vmcs12->host_cr0; - * evmcs->host_cr3 = vmcs12->host_cr3; - * evmcs->host_cr4 = vmcs12->host_cr4; - * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; - * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; - * evmcs->host_rip = vmcs12->host_rip; - * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; - * evmcs->host_fs_base = vmcs12->host_fs_base; - * evmcs->host_gs_base = vmcs12->host_gs_base; - * evmcs->host_tr_base = vmcs12->host_tr_base; - * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; - * evmcs->host_idtr_base = vmcs12->host_idtr_base; - * evmcs->host_rsp = vmcs12->host_rsp; - * sync_vmcs12() doesn't read these: - * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; - * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; - * evmcs->msr_bitmap = vmcs12->msr_bitmap; - * evmcs->ept_pointer = vmcs12->ept_pointer; - * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; - * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; - * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; - * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; - * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; - * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; - * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; - * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; - * evmcs->tpr_threshold = vmcs12->tpr_threshold; - * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; - * evmcs->exception_bitmap = vmcs12->exception_bitmap; - * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; - * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; - * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; - * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; - * evmcs->page_fault_error_code_mask = - * vmcs12->page_fault_error_code_mask; - * evmcs->page_fault_error_code_match = - * vmcs12->page_fault_error_code_match; - * evmcs->cr3_target_count = vmcs12->cr3_target_count; - * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; - * evmcs->tsc_offset = vmcs12->tsc_offset; - * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; - * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; - * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; - * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; - * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; - * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; - * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; - * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; - * - * Not present in struct vmcs12: - * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; - * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; - * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; - * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; - */ - - evmcs->guest_es_selector = vmcs12->guest_es_selector; - evmcs->guest_cs_selector = vmcs12->guest_cs_selector; - evmcs->guest_ss_selector = vmcs12->guest_ss_selector; - evmcs->guest_ds_selector = vmcs12->guest_ds_selector; - evmcs->guest_fs_selector = vmcs12->guest_fs_selector; - evmcs->guest_gs_selector = vmcs12->guest_gs_selector; - evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; - evmcs->guest_tr_selector = vmcs12->guest_tr_selector; - - evmcs->guest_es_limit = vmcs12->guest_es_limit; - evmcs->guest_cs_limit = vmcs12->guest_cs_limit; - evmcs->guest_ss_limit = vmcs12->guest_ss_limit; - evmcs->guest_ds_limit = vmcs12->guest_ds_limit; - evmcs->guest_fs_limit = vmcs12->guest_fs_limit; - evmcs->guest_gs_limit = vmcs12->guest_gs_limit; - evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; - evmcs->guest_tr_limit = vmcs12->guest_tr_limit; - evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; - evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; - - evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; - evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; - evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; - evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; - evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; - evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; - evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; - evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; - - evmcs->guest_es_base = vmcs12->guest_es_base; - evmcs->guest_cs_base = vmcs12->guest_cs_base; - evmcs->guest_ss_base = vmcs12->guest_ss_base; - evmcs->guest_ds_base = vmcs12->guest_ds_base; - evmcs->guest_fs_base = vmcs12->guest_fs_base; - evmcs->guest_gs_base = vmcs12->guest_gs_base; - evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; - evmcs->guest_tr_base = vmcs12->guest_tr_base; - evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; - evmcs->guest_idtr_base = vmcs12->guest_idtr_base; - - evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; - evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; - - evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; - evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; - evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; - evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; - - evmcs->guest_pending_dbg_exceptions = - vmcs12->guest_pending_dbg_exceptions; - evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; - evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; - - evmcs->guest_activity_state = vmcs12->guest_activity_state; - evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; - - evmcs->guest_cr0 = vmcs12->guest_cr0; - evmcs->guest_cr3 = vmcs12->guest_cr3; - evmcs->guest_cr4 = vmcs12->guest_cr4; - evmcs->guest_dr7 = vmcs12->guest_dr7; - - evmcs->guest_physical_address = vmcs12->guest_physical_address; - - evmcs->vm_instruction_error = vmcs12->vm_instruction_error; - evmcs->vm_exit_reason = vmcs12->vm_exit_reason; - evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; - evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; - evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; - evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; - evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; - evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; - - evmcs->exit_qualification = vmcs12->exit_qualification; - - evmcs->guest_linear_address = vmcs12->guest_linear_address; - evmcs->guest_rsp = vmcs12->guest_rsp; - evmcs->guest_rflags = vmcs12->guest_rflags; - - evmcs->guest_interruptibility_info = - vmcs12->guest_interruptibility_info; - evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; - evmcs->vm_entry_controls = vmcs12->vm_entry_controls; - evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; - evmcs->vm_entry_exception_error_code = - vmcs12->vm_entry_exception_error_code; - evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; - - evmcs->guest_rip = vmcs12->guest_rip; - - evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; - - return 0; + /* reset PML index */ + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } /* - * Copy the writable VMCS shadow fields back to the VMCS12, in case - * they have been modified by the L1 guest. Note that the "read-only" - * VM-exit information fields are actually writable if the vCPU is - * configured to support "VMWRITE to any supported field in the VMCS." + * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. + * Called before reporting dirty_bitmap to userspace. */ -static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) -{ - const u16 *fields[] = { - shadow_read_write_fields, - shadow_read_only_fields - }; - const int max_fields[] = { - max_shadow_read_write_fields, - max_shadow_read_only_fields - }; - int i, q; - unsigned long field; - u64 field_value; - struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; - - preempt_disable(); - - vmcs_load(shadow_vmcs); - - for (q = 0; q < ARRAY_SIZE(fields); q++) { - for (i = 0; i < max_fields[q]; i++) { - field = fields[q][i]; - field_value = __vmcs_readl(field); - vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); - } - /* - * Skip the VM-exit information fields if they are read-only. - */ - if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) - break; - } - - vmcs_clear(shadow_vmcs); - vmcs_load(vmx->loaded_vmcs->vmcs); - - preempt_enable(); -} - -static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) -{ - const u16 *fields[] = { - shadow_read_write_fields, - shadow_read_only_fields - }; - const int max_fields[] = { - max_shadow_read_write_fields, - max_shadow_read_only_fields - }; - int i, q; - unsigned long field; - u64 field_value = 0; - struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; - - vmcs_load(shadow_vmcs); - - for (q = 0; q < ARRAY_SIZE(fields); q++) { - for (i = 0; i < max_fields[q]; i++) { - field = fields[q][i]; - vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); - __vmcs_writel(field, field_value); - } - } - - vmcs_clear(shadow_vmcs); - vmcs_load(vmx->loaded_vmcs->vmcs); -} - -static int handle_vmread(struct kvm_vcpu *vcpu) +static void kvm_flush_pml_buffers(struct kvm *kvm) { - unsigned long field; - u64 field_value; - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - gva_t gva = 0; - struct vmcs12 *vmcs12; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) - return nested_vmx_failInvalid(vcpu); - - if (!is_guest_mode(vcpu)) - vmcs12 = get_vmcs12(vcpu); - else { - /* - * When vmcs->vmcs_link_pointer is -1ull, any VMREAD - * to shadowed-field sets the ALU flags for VMfailInvalid. - */ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) - return nested_vmx_failInvalid(vcpu); - vmcs12 = get_shadow_vmcs12(vcpu); - } - - /* Decode instruction info and find the field to read */ - field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); - /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(vmcs12, field, &field_value) < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); - + int i; + struct kvm_vcpu *vcpu; /* - * Now copy part of this value to register or memory, as requested. - * Note that the number of bits actually copied is 32 or 64 depending - * on the guest's mode (32 or 64 bit), not on the given field's length. + * We only need to kick vcpu out of guest mode here, as PML buffer + * is flushed at beginning of all VMEXITs, and it's obvious that only + * vcpus running in guest are possible to have unflushed GPAs in PML + * buffer. */ - if (vmx_instruction_info & (1u << 10)) { - kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), - field_value); - } else { - if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, true, &gva)) - return 1; - /* _system ok, nested_vmx_check_permission has verified cpl=0 */ - kvm_write_guest_virt_system(vcpu, gva, &field_value, - (is_long_mode(vcpu) ? 8 : 4), NULL); - } - - return nested_vmx_succeed(vcpu); + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); } - -static int handle_vmwrite(struct kvm_vcpu *vcpu) +static void vmx_dump_sel(char *name, uint32_t sel) { - unsigned long field; - gva_t gva; - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - - /* The value to write might be 32 or 64 bits, depending on L1's long - * mode, and eventually we need to write that into a field of several - * possible lengths. The code below first zero-extends the value to 64 - * bit (field_value), and then copies only the appropriate number of - * bits into the vmcs12 field. - */ - u64 field_value = 0; - struct x86_exception e; - struct vmcs12 *vmcs12; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (vmx->nested.current_vmptr == -1ull) - return nested_vmx_failInvalid(vcpu); - - if (vmx_instruction_info & (1u << 10)) - field_value = kvm_register_readl(vcpu, - (((vmx_instruction_info) >> 3) & 0xf)); - else { - if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, false, &gva)) - return 1; - if (kvm_read_guest_virt(vcpu, gva, &field_value, - (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - } - - - field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); - /* - * If the vCPU supports "VMWRITE to any supported field in the - * VMCS," then the "read-only" fields are actually read/write. - */ - if (vmcs_field_readonly(field) && - !nested_cpu_has_vmwrite_any_field(vcpu)) - return nested_vmx_failValid(vcpu, - VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - - if (!is_guest_mode(vcpu)) - vmcs12 = get_vmcs12(vcpu); - else { - /* - * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE - * to shadowed-field sets the ALU flags for VMfailInvalid. - */ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) - return nested_vmx_failInvalid(vcpu); - vmcs12 = get_shadow_vmcs12(vcpu); - } - - if (vmcs12_write_any(vmcs12, field, field_value) < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); - - /* - * Do not track vmcs12 dirty-state if in guest-mode - * as we actually dirty shadow vmcs12 instead of vmcs12. - */ - if (!is_guest_mode(vcpu)) { - switch (field) { -#define SHADOW_FIELD_RW(x) case x: -#include "vmcs_shadow_fields.h" - /* - * The fields that can be updated by L1 without a vmexit are - * always updated in the vmcs02, the others go down the slow - * path of prepare_vmcs02. - */ - break; - default: - vmx->nested.dirty_vmcs12 = true; - break; - } - } - - return nested_vmx_succeed(vcpu); + pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", + name, vmcs_read16(sel), + vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), + vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), + vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); } -static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) +static void vmx_dump_dtsel(char *name, uint32_t limit) { - vmx->nested.current_vmptr = vmptr; - if (enable_shadow_vmcs) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, - __pa(vmx->vmcs01.shadow_vmcs)); - vmx->nested.need_vmcs12_sync = true; - } - vmx->nested.dirty_vmcs12 = true; + pr_err("%s limit=0x%08x, base=0x%016lx\n", + name, vmcs_read32(limit), + vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); } -/* Emulate the VMPTRLD instruction */ -static int handle_vmptrld(struct kvm_vcpu *vcpu) +static void dump_vmcs(void) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - gpa_t vmptr; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (nested_vmx_get_vmptr(vcpu, &vmptr)) - return 1; - - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) - return nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_INVALID_ADDRESS); - - if (vmptr == vmx->nested.vmxon_ptr) - return nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_VMXON_POINTER); - - /* Forbid normal VMPTRLD if Enlightened version was used */ - if (vmx->nested.hv_evmcs) - return 1; + u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); + u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); + u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); + u32 secondary_exec_control = 0; + unsigned long cr4 = vmcs_readl(GUEST_CR4); + u64 efer = vmcs_read64(GUEST_IA32_EFER); + int i, n; - if (vmx->nested.current_vmptr != vmptr) { - struct vmcs12 *new_vmcs12; - struct page *page; - page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) { - /* - * Reads from an unbacked page return all 1s, - * which means that the 32 bits located at the - * given physical address won't match the required - * VMCS12_REVISION identifier. - */ - nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); - return kvm_skip_emulated_instruction(vcpu); - } - new_vmcs12 = kmap(page); - if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || - (new_vmcs12->hdr.shadow_vmcs && - !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { - kunmap(page); - kvm_release_page_clean(page); - return nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); - } + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - nested_release_vmcs12(vcpu); - - /* - * Load VMCS12 from guest memory since it is not already - * cached. - */ - memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); - kunmap(page); - kvm_release_page_clean(page); - - set_current_vmptr(vmx, vmptr); + pr_err("*** Guest State ***\n"); + pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), + vmcs_readl(CR0_GUEST_HOST_MASK)); + pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); + pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && + (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) + { + pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); + pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); } + pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", + vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); + pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", + vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(GUEST_SYSENTER_ESP), + vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); + vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); + vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); + vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); + vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); + vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); + vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); + vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); + vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); + vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); + vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); + if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || + (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) + pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", + efer, vmcs_read64(GUEST_IA32_PAT)); + pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", + vmcs_read64(GUEST_IA32_DEBUGCTL), + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); + if (cpu_has_load_perf_global_ctrl() && + vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + pr_err("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); + if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) + pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); + pr_err("Interruptibility = %08x ActivityState = %08x\n", + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), + vmcs_read32(GUEST_ACTIVITY_STATE)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) + pr_err("InterruptStatus = %04x\n", + vmcs_read16(GUEST_INTR_STATUS)); + + pr_err("*** Host State ***\n"); + pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", + vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); + pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", + vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), + vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), + vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), + vmcs_read16(HOST_TR_SELECTOR)); + pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", + vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), + vmcs_readl(HOST_TR_BASE)); + pr_err("GDTBase=%016lx IDTBase=%016lx\n", + vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); + pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", + vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), + vmcs_readl(HOST_CR4)); + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(HOST_IA32_SYSENTER_ESP), + vmcs_read32(HOST_IA32_SYSENTER_CS), + vmcs_readl(HOST_IA32_SYSENTER_EIP)); + if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) + pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", + vmcs_read64(HOST_IA32_EFER), + vmcs_read64(HOST_IA32_PAT)); + if (cpu_has_load_perf_global_ctrl() && + vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + pr_err("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); - return nested_vmx_succeed(vcpu); + pr_err("*** Control State ***\n"); + pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", + pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); + pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", + vmcs_read32(EXCEPTION_BITMAP), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); + pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), + vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); + pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); + pr_err(" reason=%08x qualification=%016lx\n", + vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); + pr_err("IDTVectoring: info=%08x errcode=%08x\n", + vmcs_read32(IDT_VECTORING_INFO_FIELD), + vmcs_read32(IDT_VECTORING_ERROR_CODE)); + pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); + if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) + pr_err("TSC Multiplier = 0x%016llx\n", + vmcs_read64(TSC_MULTIPLIER)); + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) + pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) + pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) + pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); + n = vmcs_read32(CR3_TARGET_COUNT); + for (i = 0; i + 1 < n; i += 4) + pr_err("CR3 target%u=%016lx target%u=%016lx\n", + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), + i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); + if (i < n) + pr_err("CR3 target%u=%016lx\n", + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); + if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) + pr_err("PLE Gap=%08x Window=%08x\n", + vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); + if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) + pr_err("Virtual processor ID = 0x%04x\n", + vmcs_read16(VIRTUAL_PROCESSOR_ID)); } /* - * This is an equivalent of the nested hypervisor executing the vmptrld - * instruction. + * The guest has exited. See if we can fix it or if we need userspace + * assistance. */ -static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, - bool from_launch) +static int vmx_handle_exit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct hv_vp_assist_page assist_page; - - if (likely(!vmx->nested.enlightened_vmcs_enabled)) - return 1; - - if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) - return 1; - - if (unlikely(!assist_page.enlighten_vmentry)) - return 1; + u32 exit_reason = vmx->exit_reason; + u32 vectoring_info = vmx->idt_vectoring_info; - if (unlikely(assist_page.current_nested_vmcs != - vmx->nested.hv_evmcs_vmptr)) { + trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); - if (!vmx->nested.hv_evmcs) - vmx->nested.current_vmptr = -1ull; + /* + * Flush logged GPAs PML buffer, this will make dirty_bitmap more + * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before + * querying dirty_bitmap, we only need to kick all vcpus out of guest + * mode as if vcpus is in root mode, the PML buffer must has been + * flushed already. + */ + if (enable_pml) + vmx_flush_pml_buffer(vcpu); - nested_release_evmcs(vcpu); + /* If guest state is invalid, start emulating */ + if (vmx->emulation_required) + return handle_invalid_guest_state(vcpu); - vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( - vcpu, assist_page.current_nested_vmcs); + if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) + return nested_vmx_reflect_vmexit(vcpu, exit_reason); - if (unlikely(is_error_page(vmx->nested.hv_evmcs_page))) - return 0; + if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + dump_vmcs(); + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = exit_reason; + return 0; + } - vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); + if (unlikely(vmx->fail)) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = vmcs_read32(VM_INSTRUCTION_ERROR); + return 0; + } - /* - * Currently, KVM only supports eVMCS version 1 - * (== KVM_EVMCS_VERSION) and thus we expect guest to set this - * value to first u32 field of eVMCS which should specify eVMCS - * VersionNumber. - * - * Guest should be aware of supported eVMCS versions by host by - * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is - * expected to set this CPUID leaf according to the value - * returned in vmcs_version from nested_enable_evmcs(). - * - * However, it turns out that Microsoft Hyper-V fails to comply - * to their own invented interface: When Hyper-V use eVMCS, it - * just sets first u32 field of eVMCS to revision_id specified - * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number - * which is one of the supported versions specified in - * CPUID.0x4000000A.EAX[0:15]. - * - * To overcome Hyper-V bug, we accept here either a supported - * eVMCS version or VMCS12 revision_id as valid values for first - * u32 field of eVMCS. - */ - if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && - (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { - nested_release_evmcs(vcpu); - return 0; + /* + * Note: + * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by + * delivery event since it indicates guest is accessing MMIO. + * The vm-exit can be triggered again after return to guest that + * will cause infinite loop. + */ + if ((vectoring_info & VECTORING_INFO_VALID_MASK) && + (exit_reason != EXIT_REASON_EXCEPTION_NMI && + exit_reason != EXIT_REASON_EPT_VIOLATION && + exit_reason != EXIT_REASON_PML_FULL && + exit_reason != EXIT_REASON_TASK_SWITCH)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + vcpu->run->internal.ndata = 3; + vcpu->run->internal.data[0] = vectoring_info; + vcpu->run->internal.data[1] = exit_reason; + vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; + if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { + vcpu->run->internal.ndata++; + vcpu->run->internal.data[3] = + vmcs_read64(GUEST_PHYSICAL_ADDRESS); } + return 0; + } - vmx->nested.dirty_vmcs12 = true; - /* - * As we keep L2 state for one guest only 'hv_clean_fields' mask - * can't be used when we switch between them. Reset it here for - * simplicity. - */ - vmx->nested.hv_evmcs->hv_clean_fields &= - ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs; - - /* - * Unlike normal vmcs12, enlightened vmcs12 is not fully - * reloaded from guest's memory (read only fields, fields not - * present in struct hv_enlightened_vmcs, ...). Make sure there - * are no leftovers. - */ - if (from_launch) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - memset(vmcs12, 0, sizeof(*vmcs12)); - vmcs12->hdr.revision_id = VMCS12_REVISION; + if (unlikely(!enable_vnmi && + vmx->loaded_vmcs->soft_vnmi_blocked)) { + if (vmx_interrupt_allowed(vcpu)) { + vmx->loaded_vmcs->soft_vnmi_blocked = 0; + } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && + vcpu->arch.nmi_pending) { + /* + * This CPU don't support us in finding the end of an + * NMI-blocked window if the guest runs with IRQs + * disabled. So we pull the trigger after 1 s of + * futile waiting, but inform the user about this. + */ + printk(KERN_WARNING "%s: Breaking out of NMI-blocked " + "state on VCPU %d after 1 s timeout\n", + __func__, vcpu->vcpu_id); + vmx->loaded_vmcs->soft_vnmi_blocked = 0; } - } - return 1; -} -/* Emulate the VMPTRST instruction */ -static int handle_vmptrst(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); - u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); - gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; - struct x86_exception e; - gva_t gva; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) - return 1; - - if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) - return 1; - /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ - if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, - sizeof(gpa_t), &e)) { - kvm_inject_page_fault(vcpu, &e); + if (exit_reason < kvm_vmx_max_exit_handlers + && kvm_vmx_exit_handlers[exit_reason]) + return kvm_vmx_exit_handlers[exit_reason](vcpu); + else { + vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", + exit_reason); + kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - return nested_vmx_succeed(vcpu); } -/* Emulate the INVEPT instruction */ -static int handle_invept(struct kvm_vcpu *vcpu) +/* + * Software based L1D cache flush which is used when microcode providing + * the cache control MSR is not loaded. + * + * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to + * flush it is required to read in 64 KiB because the replacement algorithm + * is not exactly LRU. This could be sized at runtime via topology + * information but as all relevant affected CPUs have 32KiB L1D cache size + * there is no point in doing so. + */ +static void vmx_l1d_flush(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 vmx_instruction_info, types; - unsigned long type; - gva_t gva; - struct x86_exception e; - struct { - u64 eptp, gpa; - } operand; - - if (!(vmx->nested.msrs.secondary_ctls_high & - SECONDARY_EXEC_ENABLE_EPT) || - !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - if (!nested_vmx_check_permission(vcpu)) - return 1; + int size = PAGE_SIZE << L1D_CACHE_ORDER; - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + /* + * This code is only executed when the the flush mode is 'cond' or + * 'always' + */ + if (static_branch_likely(&vmx_l1d_flush_cond)) { + bool flush_l1d; - types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + /* + * Clear the per-vcpu flush bit, it gets set again + * either from vcpu_run() or from one of the unsafe + * VMEXIT handlers. + */ + flush_l1d = vcpu->arch.l1tf_flush_l1d; + vcpu->arch.l1tf_flush_l1d = false; - if (type >= 32 || !(types & (1 << type))) - return nested_vmx_failValid(vcpu, - VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + /* + * Clear the per-cpu flush bit, it gets set again from + * the interrupt handlers. + */ + flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); + kvm_clear_cpu_l1tf_flush_l1d(); - /* According to the Intel VMX instruction reference, the memory - * operand is read even if it isn't needed (e.g., for type==global) - */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, false, &gva)) - return 1; - if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; + if (!flush_l1d) + return; } - switch (type) { - case VMX_EPT_EXTENT_GLOBAL: - /* - * TODO: track mappings and invalidate - * single context requests appropriately - */ - case VMX_EPT_EXTENT_CONTEXT: - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - break; - default: - BUG_ON(1); - break; + vcpu->stat.l1d_flush++; + + if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + return; } - return nested_vmx_succeed(vcpu); + asm volatile( + /* First ensure the pages are in the TLB */ + "xorl %%eax, %%eax\n" + ".Lpopulate_tlb:\n\t" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $4096, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lpopulate_tlb\n\t" + "xorl %%eax, %%eax\n\t" + "cpuid\n\t" + /* Now fill the cache */ + "xorl %%eax, %%eax\n" + ".Lfill_cache:\n" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $64, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lfill_cache\n\t" + "lfence\n" + :: [flush_pages] "r" (vmx_l1d_flush_pages), + [size] "r" (size) + : "eax", "ebx", "ecx", "edx"); } -static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) +static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; -} + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); -static int handle_invvpid(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 vmx_instruction_info; - unsigned long type, types; - gva_t gva; - struct x86_exception e; - struct { - u64 vpid; - u64 gla; - } operand; - u16 vpid02; + if (is_guest_mode(vcpu) && + nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return; - if (!(vmx->nested.msrs.secondary_ctls_high & - SECONDARY_EXEC_ENABLE_VPID) || - !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; + if (irr == -1 || tpr < irr) { + vmcs_write32(TPR_THRESHOLD, 0); + return; } - if (!nested_vmx_check_permission(vcpu)) - return 1; + vmcs_write32(TPR_THRESHOLD, irr); +} - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); +void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + u32 sec_exec_control; - types = (vmx->nested.msrs.vpid_caps & - VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; + if (!lapic_in_kernel(vcpu)) + return; - if (type >= 32 || !(types & (1 << type))) - return nested_vmx_failValid(vcpu, - VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + if (!flexpriority_enabled && + !cpu_has_vmx_virtualize_x2apic_mode()) + return; - /* according to the intel vmx instruction reference, the memory - * operand is read even if it isn't needed (e.g., for type==global) - */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, false, &gva)) - return 1; - if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; + /* Postpone execution until vmcs01 is the current VMCS. */ + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; + return; } - if (operand.vpid >> 16) - return nested_vmx_failValid(vcpu, - VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - vpid02 = nested_get_vpid02(vcpu); - switch (type) { - case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: - if (!operand.vpid || - is_noncanonical_address(operand.gla, vcpu)) - return nested_vmx_failValid(vcpu, - VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - if (cpu_has_vmx_invvpid_individual_addr()) { - __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, - vpid02, operand.gla); - } else - __vmx_flush_tlb(vcpu, vpid02, false); + sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); + + switch (kvm_get_apic_mode(vcpu)) { + case LAPIC_MODE_INVALID: + WARN_ONCE(true, "Invalid local APIC state"); + case LAPIC_MODE_DISABLED: break; - case VMX_VPID_EXTENT_SINGLE_CONTEXT: - case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: - if (!operand.vpid) - return nested_vmx_failValid(vcpu, - VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - __vmx_flush_tlb(vcpu, vpid02, false); + case LAPIC_MODE_XAPIC: + if (flexpriority_enabled) { + sec_exec_control |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmx_flush_tlb(vcpu, true); + } break; - case VMX_VPID_EXTENT_ALL_CONTEXT: - __vmx_flush_tlb(vcpu, vpid02, false); + case LAPIC_MODE_X2APIC: + if (cpu_has_vmx_virtualize_x2apic_mode()) + sec_exec_control |= + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; break; - default: - WARN_ON_ONCE(1); - return kvm_skip_emulated_instruction(vcpu); } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); - return nested_vmx_succeed(vcpu); + vmx_update_msr_bitmap(vcpu); } -static int handle_invpcid(struct kvm_vcpu *vcpu) +static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) { - u32 vmx_instruction_info; - unsigned long type; - bool pcid_enabled; - gva_t gva; - struct x86_exception e; - unsigned i; - unsigned long roots_to_free = 0; - struct { - u64 pcid; - u64 gla; - } operand; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - - if (type > 3) { - kvm_inject_gp(vcpu, 0); - return 1; + if (!is_guest_mode(vcpu)) { + vmcs_write64(APIC_ACCESS_ADDR, hpa); + vmx_flush_tlb(vcpu, true); } +} - /* According to the Intel instruction reference, the memory operand - * is read even if it isn't needed (e.g., for type==all) - */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, false, &gva)) - return 1; +static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +{ + u16 status; + u8 old; - if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } + if (max_isr == -1) + max_isr = 0; - if (operand.pcid >> 12 != 0) { - kvm_inject_gp(vcpu, 0); - return 1; + status = vmcs_read16(GUEST_INTR_STATUS); + old = status >> 8; + if (max_isr != old) { + status &= 0xff; + status |= max_isr << 8; + vmcs_write16(GUEST_INTR_STATUS, status); } +} - pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); +static void vmx_set_rvi(int vector) +{ + u16 status; + u8 old; - switch (type) { - case INVPCID_TYPE_INDIV_ADDR: - if ((!pcid_enabled && (operand.pcid != 0)) || - is_noncanonical_address(operand.gla, vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); - return kvm_skip_emulated_instruction(vcpu); + if (vector == -1) + vector = 0; - case INVPCID_TYPE_SINGLE_CTXT: - if (!pcid_enabled && (operand.pcid != 0)) { - kvm_inject_gp(vcpu, 0); - return 1; - } + status = vmcs_read16(GUEST_INTR_STATUS); + old = (u8)status & 0xff; + if ((u8)vector != old) { + status &= ~0xff; + status |= (u8)vector; + vmcs_write16(GUEST_INTR_STATUS, status); + } +} - if (kvm_get_active_pcid(vcpu) == operand.pcid) { - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } +static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +{ + /* + * When running L2, updating RVI is only relevant when + * vmcs12 virtual-interrupt-delivery enabled. + * However, it can be enabled only when L1 also + * intercepts external-interrupts and in that case + * we should not update vmcs02 RVI but instead intercept + * interrupt. Therefore, do nothing when running L2. + */ + if (!is_guest_mode(vcpu)) + vmx_set_rvi(max_irr); +} - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) - == operand.pcid) - roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); +static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; + bool max_irr_updated; - kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); + WARN_ON(!vcpu->arch.apicv_active); + if (pi_test_on(&vmx->pi_desc)) { + pi_clear_on(&vmx->pi_desc); /* - * If neither the current cr3 nor any of the prev_roots use the - * given PCID, then nothing needs to be done here because a - * resync will happen anyway before switching to any other CR3. + * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * But on x86 this is just a compiler barrier anyway. */ + smp_mb__after_atomic(); + max_irr_updated = + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); - return kvm_skip_emulated_instruction(vcpu); - - case INVPCID_TYPE_ALL_NON_GLOBAL: /* - * Currently, KVM doesn't mark global entries in the shadow - * page tables, so a non-global flush just degenerates to a - * global flush. If needed, we could optimize this later by - * keeping track of global entries in shadow page tables. + * If we are running L2 and L1 has a new pending interrupt + * which can be injected, we should re-evaluate + * what should be done with this new L1 interrupt. + * If L1 intercepts external-interrupts, we should + * exit from L2 to L1. Otherwise, interrupt should be + * delivered directly to L2. */ - - /* fall-through */ - case INVPCID_TYPE_ALL_INCL_GLOBAL: - kvm_mmu_unload(vcpu); - return kvm_skip_emulated_instruction(vcpu); - - default: - BUG(); /* We have already checked above that type <= 3 */ + if (is_guest_mode(vcpu) && max_irr_updated) { + if (nested_exit_on_intr(vcpu)) + kvm_vcpu_exiting_guest_mode(vcpu); + else + kvm_make_request(KVM_REQ_EVENT, vcpu); + } + } else { + max_irr = kvm_lapic_find_highest_irr(vcpu); } + vmx_hwapic_irr_update(vcpu, max_irr); + return max_irr; } -static int handle_pml_full(struct kvm_vcpu *vcpu) +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { - unsigned long exit_qualification; - - trace_kvm_pml_full(vcpu->vcpu_id); - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - /* - * PML buffer FULL happened while executing iret from NMI, - * "blocked by NMI" bit has to be set before next VM entry. - */ - if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - enable_vnmi && - (exit_qualification & INTR_INFO_UNBLOCK_NMI)) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); + if (!kvm_vcpu_apicv_active(vcpu)) + return; - /* - * PML buffer already flushed at beginning of VMEXIT. Nothing to do - * here.., and there's no userspace involvement needed for PML. - */ - return 1; + vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); + vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); + vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); + vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } -static int handle_preemption_timer(struct kvm_vcpu *vcpu) +static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) { - if (!to_vmx(vcpu)->req_immediate_exit) - kvm_lapic_expired_hv_timer(vcpu); - return 1; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + pi_clear_on(&vmx->pi_desc); + memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } -static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) +static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - int maxphyaddr = cpuid_maxphyaddr(vcpu); + u32 exit_intr_info = 0; + u16 basic_exit_reason = (u16)vmx->exit_reason; - /* Check for memory type validity */ - switch (address & VMX_EPTP_MT_MASK) { - case VMX_EPTP_MT_UC: - if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)) - return false; - break; - case VMX_EPTP_MT_WB: - if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)) - return false; - break; - default: - return false; - } + if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY + || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) + return; - /* only 4 levels page-walk length are valid */ - if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4) - return false; + if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmx->exit_intr_info = exit_intr_info; - /* Reserved bits should not be set */ - if (address >> maxphyaddr || ((address >> 7) & 0x1f)) - return false; + /* if exit due to PF check for async PF */ + if (is_page_fault(exit_intr_info)) + vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); - /* AD, if set, should be supported */ - if (address & VMX_EPTP_AD_ENABLE_BIT) { - if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)) - return false; - } + /* Handle machine checks before interrupts are enabled */ + if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || + is_machine_check(exit_intr_info)) + kvm_machine_check(); - return true; + /* We need to handle NMIs before interrupts are enabled */ + if (is_nmi(exit_intr_info)) { + kvm_before_interrupt(&vmx->vcpu); + asm("int $2"); + kvm_after_interrupt(&vmx->vcpu); + } } -static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) { - u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 address; - bool accessed_dirty; - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - - if (!nested_cpu_has_eptp_switching(vmcs12) || - !nested_cpu_has_ept(vmcs12)) - return 1; + u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - if (index >= VMFUNC_EPTP_ENTRIES) - return 1; + if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) + == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { + unsigned int vector; + unsigned long entry; + gate_desc *desc; + struct vcpu_vmx *vmx = to_vmx(vcpu); +#ifdef CONFIG_X86_64 + unsigned long tmp; +#endif + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + desc = (gate_desc *)vmx->host_idt_base + vector; + entry = gate_offset(desc); + asm volatile( +#ifdef CONFIG_X86_64 + "mov %%" _ASM_SP ", %[sp]\n\t" + "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" + "push $%c[ss]\n\t" + "push %[sp]\n\t" +#endif + "pushf\n\t" + __ASM_SIZE(push) " $%c[cs]\n\t" + CALL_NOSPEC + : +#ifdef CONFIG_X86_64 + [sp]"=&r"(tmp), +#endif + ASM_CALL_CONSTRAINT + : + THUNK_TARGET(entry), + [ss]"i"(__KERNEL_DS), + [cs]"i"(__KERNEL_CS) + ); + } +} +STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); - if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, - &address, index * 8, 8)) - return 1; +static bool vmx_has_emulated_msr(int index) +{ + switch (index) { + case MSR_IA32_SMBASE: + /* + * We cannot do SMM unless we can run the guest in big + * real mode. + */ + return enable_unrestricted_guest || emulate_invalid_guest_state; + case MSR_AMD64_VIRT_SPEC_CTRL: + /* This is AMD only. */ + return false; + default: + return true; + } +} - accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); +static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info; + bool unblock_nmi; + u8 vector; + bool idtv_info_valid; - /* - * If the (L2) guest does a vmfunc to the currently - * active ept pointer, we don't have to do anything else - */ - if (vmcs12->ept_pointer != address) { - if (!valid_ept_address(vcpu, address)) - return 1; + idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - kvm_mmu_unload(vcpu); - mmu->ept_ad = accessed_dirty; - mmu->mmu_role.base.ad_disabled = !accessed_dirty; - vmcs12->ept_pointer = address; + if (enable_vnmi) { + if (vmx->loaded_vmcs->nmi_known_unmasked) + return; /* - * TODO: Check what's the correct approach in case - * mmu reload fails. Currently, we just let the next - * reload potentially fail + * Can't use vmx->exit_intr_info since we're not sure what + * the exit reason is. */ - kvm_mmu_reload(vcpu); - } - - return 0; + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. + */ + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); + } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) + vmx->loaded_vmcs->vnmi_blocked_time += + ktime_to_ns(ktime_sub(ktime_get(), + vmx->loaded_vmcs->entry_time)); } -static int handle_vmfunc(struct kvm_vcpu *vcpu) +static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, + u32 idt_vectoring_info, + int instr_len_field, + int error_code_field) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12; - u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; + u8 vector; + int type; + bool idtv_info_valid; - /* - * VMFUNC is only supported for nested guests, but we always enable the - * secondary control for simplicity; for non-nested mode, fake that we - * didn't by injecting #UD. - */ - if (!is_guest_mode(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; - vmcs12 = get_vmcs12(vcpu); - if ((vmcs12->vm_function_control & (1 << function)) == 0) - goto fail; + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); - switch (function) { - case 0: - if (nested_vmx_eptp_switching(vcpu, vmcs12)) - goto fail; - break; - default: - goto fail; - } - return kvm_skip_emulated_instruction(vcpu); + if (!idtv_info_valid) + return; -fail: - nested_vmx_vmexit(vcpu, vmx->exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); - return 1; -} + kvm_make_request(KVM_REQ_EVENT, vcpu); -/* - * When nested=0, all VMX instruction VM Exits filter here. The handlers - * are overwritten by nested_vmx_setup() when nested=1. - */ -static int handle_vmx_instruction(struct kvm_vcpu *vcpu) -{ - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; -} + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; + type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; -static int handle_encls(struct kvm_vcpu *vcpu) -{ - /* - * SGX virtualization is not yet supported. There is no software - * enable bit for SGX, so we have to trap ENCLS and inject a #UD - * to prevent the guest from executing ENCLS. - */ - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; + switch (type) { + case INTR_TYPE_NMI_INTR: + vcpu->arch.nmi_injected = true; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Clear bit "block by NMI" before VM entry if a NMI + * delivery faulted. + */ + vmx_set_nmi_mask(vcpu, false); + break; + case INTR_TYPE_SOFT_EXCEPTION: + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); + /* fall through */ + case INTR_TYPE_HARD_EXCEPTION: + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { + u32 err = vmcs_read32(error_code_field); + kvm_requeue_exception_e(vcpu, vector, err); + } else + kvm_requeue_exception(vcpu, vector); + break; + case INTR_TYPE_SOFT_INTR: + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); + /* fall through */ + case INTR_TYPE_EXT_INTR: + kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); + break; + default: + break; + } } -/* - * The exit handlers return 1 if the exit was handled fully and guest execution - * may resume. Otherwise they set the kvm_run parameter to indicate what needs - * to be done to userspace and return 0. - */ -static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { - [EXIT_REASON_EXCEPTION_NMI] = handle_exception, - [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, - [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, - [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, - [EXIT_REASON_IO_INSTRUCTION] = handle_io, - [EXIT_REASON_CR_ACCESS] = handle_cr, - [EXIT_REASON_DR_ACCESS] = handle_dr, - [EXIT_REASON_CPUID] = handle_cpuid, - [EXIT_REASON_MSR_READ] = handle_rdmsr, - [EXIT_REASON_MSR_WRITE] = handle_wrmsr, - [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, - [EXIT_REASON_HLT] = handle_halt, - [EXIT_REASON_INVD] = handle_invd, - [EXIT_REASON_INVLPG] = handle_invlpg, - [EXIT_REASON_RDPMC] = handle_rdpmc, - [EXIT_REASON_VMCALL] = handle_vmcall, - [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, - [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, - [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, - [EXIT_REASON_VMPTRST] = handle_vmx_instruction, - [EXIT_REASON_VMREAD] = handle_vmx_instruction, - [EXIT_REASON_VMRESUME] = handle_vmx_instruction, - [EXIT_REASON_VMWRITE] = handle_vmx_instruction, - [EXIT_REASON_VMOFF] = handle_vmx_instruction, - [EXIT_REASON_VMON] = handle_vmx_instruction, - [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, - [EXIT_REASON_APIC_ACCESS] = handle_apic_access, - [EXIT_REASON_APIC_WRITE] = handle_apic_write, - [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, - [EXIT_REASON_WBINVD] = handle_wbinvd, - [EXIT_REASON_XSETBV] = handle_xsetbv, - [EXIT_REASON_TASK_SWITCH] = handle_task_switch, - [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, - [EXIT_REASON_GDTR_IDTR] = handle_desc, - [EXIT_REASON_LDTR_TR] = handle_desc, - [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, - [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, - [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, - [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, - [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, - [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, - [EXIT_REASON_INVEPT] = handle_vmx_instruction, - [EXIT_REASON_INVVPID] = handle_vmx_instruction, - [EXIT_REASON_RDRAND] = handle_invalid_op, - [EXIT_REASON_RDSEED] = handle_invalid_op, - [EXIT_REASON_XSAVES] = handle_xsaves, - [EXIT_REASON_XRSTORS] = handle_xrstors, - [EXIT_REASON_PML_FULL] = handle_pml_full, - [EXIT_REASON_INVPCID] = handle_invpcid, - [EXIT_REASON_VMFUNC] = handle_vmx_instruction, - [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, - [EXIT_REASON_ENCLS] = handle_encls, -}; - -static const int kvm_vmx_max_exit_handlers = - ARRAY_SIZE(kvm_vmx_exit_handlers); +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, + VM_EXIT_INSTRUCTION_LEN, + IDT_VECTORING_ERROR_CODE); +} -static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static void vmx_cancel_injection(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; - gpa_t bitmap, last_bitmap; - unsigned int port; - int size; - u8 b; + __vmx_complete_interrupts(vcpu, + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + VM_ENTRY_INSTRUCTION_LEN, + VM_ENTRY_EXCEPTION_ERROR_CODE); - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); +} - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); +static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) +{ + int i, nr_msrs; + struct perf_guest_switch_msr *msrs; - port = exit_qualification >> 16; - size = (exit_qualification & 7) + 1; + msrs = perf_guest_get_msrs(&nr_msrs); - last_bitmap = (gpa_t)-1; - b = -1; + if (!msrs) + return; - while (size > 0) { - if (port < 0x8000) - bitmap = vmcs12->io_bitmap_a; - else if (port < 0x10000) - bitmap = vmcs12->io_bitmap_b; + for (i = 0; i < nr_msrs; i++) + if (msrs[i].host == msrs[i].guest) + clear_atomic_switch_msr(vmx, msrs[i].msr); else - return true; - bitmap += (port & 0x7fff) / 8; - - if (last_bitmap != bitmap) - if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) - return true; - if (b & (1 << (port & 7))) - return true; - - port++; - size--; - last_bitmap = bitmap; - } - - return false; + add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, + msrs[i].host, false); } -/* - * Return 1 if we should exit from L2 to L1 to handle an MSR access access, - * rather than handle it ourselves in L0. I.e., check whether L1 expressed - * disinterest in the current event (read or write a specific MSR) by using an - * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. - */ -static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, u32 exit_reason) +static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) { - u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; - gpa_t bitmap; - - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) - return true; - - /* - * The MSR_BITMAP page is divided into four 1024-byte bitmaps, - * for the four combinations of read/write and low/high MSR numbers. - * First we need to figure out which of the four to use: - */ - bitmap = vmcs12->msr_bitmap; - if (exit_reason == EXIT_REASON_MSR_WRITE) - bitmap += 2048; - if (msr_index >= 0xc0000000) { - msr_index -= 0xc0000000; - bitmap += 1024; - } - - /* Then read the msr_index'th bit from this bitmap: */ - if (msr_index < 1024*8) { - unsigned char b; - if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) - return true; - return 1 & (b >> (msr_index & 7)); - } else - return true; /* let L1 handle the wrong parameter */ + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); + if (!vmx->loaded_vmcs->hv_timer_armed) + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = true; } -/* - * Return 1 if we should exit from L2 to L1 to handle a CR access exit, - * rather than handle it ourselves in L0. I.e., check if L1 wanted to - * intercept (via guest_host_mask etc.) the current event. - */ -static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - int cr = exit_qualification & 15; - int reg; - unsigned long val; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 tscl; + u32 delta_tsc; - switch ((exit_qualification >> 4) & 3) { - case 0: /* mov to cr */ - reg = (exit_qualification >> 8) & 15; - val = kvm_register_readl(vcpu, reg); - switch (cr) { - case 0: - if (vmcs12->cr0_guest_host_mask & - (val ^ vmcs12->cr0_read_shadow)) - return true; - break; - case 3: - if ((vmcs12->cr3_target_count >= 1 && - vmcs12->cr3_target_value0 == val) || - (vmcs12->cr3_target_count >= 2 && - vmcs12->cr3_target_value1 == val) || - (vmcs12->cr3_target_count >= 3 && - vmcs12->cr3_target_value2 == val) || - (vmcs12->cr3_target_count >= 4 && - vmcs12->cr3_target_value3 == val)) - return false; - if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) - return true; - break; - case 4: - if (vmcs12->cr4_guest_host_mask & - (vmcs12->cr4_read_shadow ^ val)) - return true; - break; - case 8: - if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) - return true; - break; - } - break; - case 2: /* clts */ - if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && - (vmcs12->cr0_read_shadow & X86_CR0_TS)) - return true; - break; - case 1: /* mov from cr */ - switch (cr) { - case 3: - if (vmcs12->cpu_based_vm_exec_control & - CPU_BASED_CR3_STORE_EXITING) - return true; - break; - case 8: - if (vmcs12->cpu_based_vm_exec_control & - CPU_BASED_CR8_STORE_EXITING) - return true; - break; - } - break; - case 3: /* lmsw */ - /* - * lmsw can change bits 1..3 of cr0, and only set bit 0 of - * cr0. Other attempted changes are ignored, with no exit. - */ - val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; - if (vmcs12->cr0_guest_host_mask & 0xe & - (val ^ vmcs12->cr0_read_shadow)) - return true; - if ((vmcs12->cr0_guest_host_mask & 0x1) && - !(vmcs12->cr0_read_shadow & 0x1) && - (val & 0x1)) - return true; - break; + if (vmx->req_immediate_exit) { + vmx_arm_hv_timer(vmx, 0); + return; } - return false; + + if (vmx->hv_deadline_tsc != -1) { + tscl = rdtsc(); + if (vmx->hv_deadline_tsc > tscl) + /* set_hv_timer ensures the delta fits in 32-bits */ + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> + cpu_preemption_timer_multi); + else + delta_tsc = 0; + + vmx_arm_hv_timer(vmx, delta_tsc); + return; + } + + if (vmx->loaded_vmcs->hv_timer_armed) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = false; } -static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, gpa_t bitmap) +static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { - u32 vmx_instruction_info; - unsigned long field; - u8 b; - - if (!nested_cpu_has_shadow_vmcs(vmcs12)) - return true; + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4, evmcs_rsp; - /* Decode instruction info and find the field to access */ - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + /* Record the guest's net vcpu time for enforced NMI injections. */ + if (unlikely(!enable_vnmi && + vmx->loaded_vmcs->soft_vnmi_blocked)) + vmx->loaded_vmcs->entry_time = ktime_get(); - /* Out-of-range fields always cause a VM exit from L2 to L1 */ - if (field >> 15) - return true; + /* Don't enter VMX if guest state is invalid, let the exit handler + start emulation until we arrive back to a valid state */ + if (vmx->emulation_required) + return; - if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) - return true; + if (vmx->ple_window_dirty) { + vmx->ple_window_dirty = false; + vmcs_write32(PLE_WINDOW, vmx->ple_window); + } - return 1 & (b >> (field & 7)); -} + if (vmx->nested.need_vmcs12_sync) + nested_sync_from_vmcs12(vcpu); -/* - * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we - * should handle it ourselves in L0 (and then continue L2). Only call this - * when in is_guest_mode (L2). - */ -static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) -{ - u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); - if (vmx->nested.nested_run_pending) - return false; + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } - if (unlikely(vmx->fail)) { - pr_info_ratelimited("%s failed vm entry %x\n", __func__, - vmcs_read32(VM_INSTRUCTION_ERROR)); - return true; + cr4 = cr4_read_shadow(); + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { + vmcs_writel(HOST_CR4, cr4); + vmx->loaded_vmcs->host_state.cr4 = cr4; } - /* - * The host physical addresses of some pages of guest memory - * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC - * Page). The CPU may write to these pages via their host - * physical address while L2 is running, bypassing any - * address-translation-based dirty tracking (e.g. EPT write - * protection). - * - * Mark them dirty on every exit from L2 to prevent them from - * getting out of sync with dirty tracking. - */ - nested_mark_vmcs12_pages_dirty(vcpu); - - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, - vmcs_readl(EXIT_QUALIFICATION), - vmx->idt_vectoring_info, - intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); - - switch (exit_reason) { - case EXIT_REASON_EXCEPTION_NMI: - if (is_nmi(intr_info)) - return false; - else if (is_page_fault(intr_info)) - return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; - else if (is_debug(intr_info) && - vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return false; - else if (is_breakpoint(intr_info) && - vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - return false; - return vmcs12->exception_bitmap & - (1u << (intr_info & INTR_INFO_VECTOR_MASK)); - case EXIT_REASON_EXTERNAL_INTERRUPT: - return false; - case EXIT_REASON_TRIPLE_FAULT: - return true; - case EXIT_REASON_PENDING_INTERRUPT: - return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); - case EXIT_REASON_NMI_WINDOW: - return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); - case EXIT_REASON_TASK_SWITCH: - return true; - case EXIT_REASON_CPUID: - return true; - case EXIT_REASON_HLT: - return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); - case EXIT_REASON_INVD: - return true; - case EXIT_REASON_INVLPG: - return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); - case EXIT_REASON_RDPMC: - return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); - case EXIT_REASON_RDRAND: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); - case EXIT_REASON_RDSEED: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); - case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: - return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); - case EXIT_REASON_VMREAD: - return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, - vmcs12->vmread_bitmap); - case EXIT_REASON_VMWRITE: - return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, - vmcs12->vmwrite_bitmap); - case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: - case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: - case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: - case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: - case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: - /* - * VMX instructions trap unconditionally. This allows L1 to - * emulate them for its L2 guest, i.e., allows 3-level nesting! - */ - return true; - case EXIT_REASON_CR_ACCESS: - return nested_vmx_exit_handled_cr(vcpu, vmcs12); - case EXIT_REASON_DR_ACCESS: - return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); - case EXIT_REASON_IO_INSTRUCTION: - return nested_vmx_exit_handled_io(vcpu, vmcs12); - case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); - case EXIT_REASON_MSR_READ: - case EXIT_REASON_MSR_WRITE: - return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); - case EXIT_REASON_INVALID_STATE: - return true; - case EXIT_REASON_MWAIT_INSTRUCTION: - return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); - case EXIT_REASON_MONITOR_TRAP_FLAG: - return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); - case EXIT_REASON_MONITOR_INSTRUCTION: - return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); - case EXIT_REASON_PAUSE_INSTRUCTION: - return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || - nested_cpu_has2(vmcs12, - SECONDARY_EXEC_PAUSE_LOOP_EXITING); - case EXIT_REASON_MCE_DURING_VMENTRY: - return false; - case EXIT_REASON_TPR_BELOW_THRESHOLD: - return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); - case EXIT_REASON_APIC_ACCESS: - case EXIT_REASON_APIC_WRITE: - case EXIT_REASON_EOI_INDUCED: - /* - * The controls for "virtualize APIC accesses," "APIC- - * register virtualization," and "virtual-interrupt - * delivery" only come from vmcs12. - */ - return true; - case EXIT_REASON_EPT_VIOLATION: - /* - * L0 always deals with the EPT violation. If nested EPT is - * used, and the nested mmu code discovers that the address is - * missing in the guest EPT table (EPT12), the EPT violation - * will be injected with nested_ept_inject_page_fault() - */ - return false; - case EXIT_REASON_EPT_MISCONFIG: - /* - * L2 never uses directly L1's EPT, but rather L0's own EPT - * table (shadow on EPT) or a merged EPT table that L0 built - * (EPT on EPT). So any problems with the structure of the - * table is L0's fault. - */ - return false; - case EXIT_REASON_INVPCID: - return - nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && - nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); - case EXIT_REASON_WBINVD: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); - case EXIT_REASON_XSETBV: - return true; - case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: - /* - * This should never happen, since it is not possible to - * set XSS to a non-zero value---neither in L1 nor in L2. - * If if it were, XSS would have to be checked against - * the XSS exit bitmap in vmcs12. - */ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); - case EXIT_REASON_PREEMPTION_TIMER: - return false; - case EXIT_REASON_PML_FULL: - /* We emulate PML support to L1. */ - return false; - case EXIT_REASON_VMFUNC: - /* VM functions are emulated through L2->L0 vmexits. */ - return false; - case EXIT_REASON_ENCLS: - /* SGX is never exposed to L1 */ - return false; - default: - return true; - } -} - -static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason) -{ - u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - /* - * At this point, the exit interruption info in exit_intr_info - * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT - * we need to query the in-kernel LAPIC. - */ - WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT); - if ((exit_intr_info & - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs12->vm_exit_intr_error_code = - vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - } - - nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, - vmcs_readl(EXIT_QUALIFICATION)); - return 1; -} - -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) -{ - *info1 = vmcs_readl(EXIT_QUALIFICATION); - *info2 = vmcs_read32(VM_EXIT_INTR_INFO); -} - -static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) -{ - if (vmx->pml_pg) { - __free_page(vmx->pml_pg); - vmx->pml_pg = NULL; - } -} - -static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 *pml_buf; - u16 pml_idx; - - pml_idx = vmcs_read16(GUEST_PML_INDEX); - - /* Do nothing if PML buffer is empty */ - if (pml_idx == (PML_ENTITY_NUM - 1)) - return; - - /* PML index always points to next available PML buffer entity */ - if (pml_idx >= PML_ENTITY_NUM) - pml_idx = 0; - else - pml_idx++; - - pml_buf = page_address(vmx->pml_pg); - for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { - u64 gpa; - - gpa = pml_buf[pml_idx]; - WARN_ON(gpa & (PAGE_SIZE - 1)); - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); - } - - /* reset PML index */ - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); -} - -/* - * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. - * Called before reporting dirty_bitmap to userspace. - */ -static void kvm_flush_pml_buffers(struct kvm *kvm) -{ - int i; - struct kvm_vcpu *vcpu; - /* - * We only need to kick vcpu out of guest mode here, as PML buffer - * is flushed at beginning of all VMEXITs, and it's obvious that only - * vcpus running in guest are possible to have unflushed GPAs in PML - * buffer. - */ - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vcpu_kick(vcpu); -} - -static void vmx_dump_sel(char *name, uint32_t sel) -{ - pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", - name, vmcs_read16(sel), - vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), - vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), - vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); -} - -static void vmx_dump_dtsel(char *name, uint32_t limit) -{ - pr_err("%s limit=0x%08x, base=0x%016lx\n", - name, vmcs_read32(limit), - vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); -} - -static void dump_vmcs(void) -{ - u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); - u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); - u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); - u32 secondary_exec_control = 0; - unsigned long cr4 = vmcs_readl(GUEST_CR4); - u64 efer = vmcs_read64(GUEST_IA32_EFER); - int i, n; - - if (cpu_has_secondary_exec_ctrls()) - secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - - pr_err("*** Guest State ***\n"); - pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", - vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), - vmcs_readl(CR0_GUEST_HOST_MASK)); - pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", - cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); - pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); - if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && - (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) - { - pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", - vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); - pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", - vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); - } - pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", - vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); - pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", - vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); - pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", - vmcs_readl(GUEST_SYSENTER_ESP), - vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); - vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); - vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); - vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); - vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); - vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); - vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); - vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); - vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); - vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); - vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); - if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || - (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) - pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", - efer, vmcs_read64(GUEST_IA32_PAT)); - pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", - vmcs_read64(GUEST_IA32_DEBUGCTL), - vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); - if (cpu_has_load_perf_global_ctrl() && - vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) - pr_err("PerfGlobCtl = 0x%016llx\n", - vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); - if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) - pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); - pr_err("Interruptibility = %08x ActivityState = %08x\n", - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), - vmcs_read32(GUEST_ACTIVITY_STATE)); - if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) - pr_err("InterruptStatus = %04x\n", - vmcs_read16(GUEST_INTR_STATUS)); - - pr_err("*** Host State ***\n"); - pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", - vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); - pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", - vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), - vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), - vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), - vmcs_read16(HOST_TR_SELECTOR)); - pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", - vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), - vmcs_readl(HOST_TR_BASE)); - pr_err("GDTBase=%016lx IDTBase=%016lx\n", - vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); - pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", - vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), - vmcs_readl(HOST_CR4)); - pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", - vmcs_readl(HOST_IA32_SYSENTER_ESP), - vmcs_read32(HOST_IA32_SYSENTER_CS), - vmcs_readl(HOST_IA32_SYSENTER_EIP)); - if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) - pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", - vmcs_read64(HOST_IA32_EFER), - vmcs_read64(HOST_IA32_PAT)); - if (cpu_has_load_perf_global_ctrl() && - vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - pr_err("PerfGlobCtl = 0x%016llx\n", - vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); - - pr_err("*** Control State ***\n"); - pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", - pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); - pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); - pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", - vmcs_read32(EXCEPTION_BITMAP), - vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), - vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); - pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", - vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), - vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), - vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); - pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); - pr_err(" reason=%08x qualification=%016lx\n", - vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); - pr_err("IDTVectoring: info=%08x errcode=%08x\n", - vmcs_read32(IDT_VECTORING_INFO_FIELD), - vmcs_read32(IDT_VECTORING_ERROR_CODE)); - pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); - if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) - pr_err("TSC Multiplier = 0x%016llx\n", - vmcs_read64(TSC_MULTIPLIER)); - if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) - pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); - if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) - pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); - if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) - pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); - n = vmcs_read32(CR3_TARGET_COUNT); - for (i = 0; i + 1 < n; i += 4) - pr_err("CR3 target%u=%016lx target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), - i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); - if (i < n) - pr_err("CR3 target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); - if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) - pr_err("PLE Gap=%08x Window=%08x\n", - vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); - if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) - pr_err("Virtual processor ID = 0x%04x\n", - vmcs_read16(VIRTUAL_PROCESSOR_ID)); -} - -/* - * The guest has exited. See if we can fix it or if we need userspace - * assistance. - */ -static int vmx_handle_exit(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason = vmx->exit_reason; - u32 vectoring_info = vmx->idt_vectoring_info; - - trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); - - /* - * Flush logged GPAs PML buffer, this will make dirty_bitmap more - * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before - * querying dirty_bitmap, we only need to kick all vcpus out of guest - * mode as if vcpus is in root mode, the PML buffer must has been - * flushed already. - */ - if (enable_pml) - vmx_flush_pml_buffer(vcpu); - - /* If guest state is invalid, start emulating */ - if (vmx->emulation_required) - return handle_invalid_guest_state(vcpu); - - if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) - return nested_vmx_reflect_vmexit(vcpu, exit_reason); - - if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { - dump_vmcs(); - vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; - vcpu->run->fail_entry.hardware_entry_failure_reason - = exit_reason; - return 0; - } - - if (unlikely(vmx->fail)) { - vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; - vcpu->run->fail_entry.hardware_entry_failure_reason - = vmcs_read32(VM_INSTRUCTION_ERROR); - return 0; - } - - /* - * Note: - * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by - * delivery event since it indicates guest is accessing MMIO. - * The vm-exit can be triggered again after return to guest that - * will cause infinite loop. - */ - if ((vectoring_info & VECTORING_INFO_VALID_MASK) && - (exit_reason != EXIT_REASON_EXCEPTION_NMI && - exit_reason != EXIT_REASON_EPT_VIOLATION && - exit_reason != EXIT_REASON_PML_FULL && - exit_reason != EXIT_REASON_TASK_SWITCH)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; - vcpu->run->internal.ndata = 3; - vcpu->run->internal.data[0] = vectoring_info; - vcpu->run->internal.data[1] = exit_reason; - vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; - if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { - vcpu->run->internal.ndata++; - vcpu->run->internal.data[3] = - vmcs_read64(GUEST_PHYSICAL_ADDRESS); - } - return 0; - } - - if (unlikely(!enable_vnmi && - vmx->loaded_vmcs->soft_vnmi_blocked)) { - if (vmx_interrupt_allowed(vcpu)) { - vmx->loaded_vmcs->soft_vnmi_blocked = 0; - } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && - vcpu->arch.nmi_pending) { - /* - * This CPU don't support us in finding the end of an - * NMI-blocked window if the guest runs with IRQs - * disabled. So we pull the trigger after 1 s of - * futile waiting, but inform the user about this. - */ - printk(KERN_WARNING "%s: Breaking out of NMI-blocked " - "state on VCPU %d after 1 s timeout\n", - __func__, vcpu->vcpu_id); - vmx->loaded_vmcs->soft_vnmi_blocked = 0; - } - } - - if (exit_reason < kvm_vmx_max_exit_handlers - && kvm_vmx_exit_handlers[exit_reason]) - return kvm_vmx_exit_handlers[exit_reason](vcpu); - else { - vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", - exit_reason); - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } -} - -/* - * Software based L1D cache flush which is used when microcode providing - * the cache control MSR is not loaded. - * - * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to - * flush it is required to read in 64 KiB because the replacement algorithm - * is not exactly LRU. This could be sized at runtime via topology - * information but as all relevant affected CPUs have 32KiB L1D cache size - * there is no point in doing so. - */ -static void vmx_l1d_flush(struct kvm_vcpu *vcpu) -{ - int size = PAGE_SIZE << L1D_CACHE_ORDER; - - /* - * This code is only executed when the the flush mode is 'cond' or - * 'always' - */ - if (static_branch_likely(&vmx_l1d_flush_cond)) { - bool flush_l1d; - - /* - * Clear the per-vcpu flush bit, it gets set again - * either from vcpu_run() or from one of the unsafe - * VMEXIT handlers. - */ - flush_l1d = vcpu->arch.l1tf_flush_l1d; - vcpu->arch.l1tf_flush_l1d = false; - - /* - * Clear the per-cpu flush bit, it gets set again from - * the interrupt handlers. - */ - flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); - kvm_clear_cpu_l1tf_flush_l1d(); - - if (!flush_l1d) - return; - } - - vcpu->stat.l1d_flush++; - - if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); - return; - } - - asm volatile( - /* First ensure the pages are in the TLB */ - "xorl %%eax, %%eax\n" - ".Lpopulate_tlb:\n\t" - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" - "addl $4096, %%eax\n\t" - "cmpl %%eax, %[size]\n\t" - "jne .Lpopulate_tlb\n\t" - "xorl %%eax, %%eax\n\t" - "cpuid\n\t" - /* Now fill the cache */ - "xorl %%eax, %%eax\n" - ".Lfill_cache:\n" - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" - "addl $64, %%eax\n\t" - "cmpl %%eax, %[size]\n\t" - "jne .Lfill_cache\n\t" - "lfence\n" - :: [flush_pages] "r" (vmx_l1d_flush_pages), - [size] "r" (size) - : "eax", "ebx", "ecx", "edx"); -} - -static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (is_guest_mode(vcpu) && - nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) - return; - - if (irr == -1 || tpr < irr) { - vmcs_write32(TPR_THRESHOLD, 0); - return; - } - - vmcs_write32(TPR_THRESHOLD, irr); -} - -void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) -{ - u32 sec_exec_control; - - if (!lapic_in_kernel(vcpu)) - return; - - if (!flexpriority_enabled && - !cpu_has_vmx_virtualize_x2apic_mode()) - return; - - /* Postpone execution until vmcs01 is the current VMCS. */ - if (is_guest_mode(vcpu)) { - to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; - return; - } - - sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); - - switch (kvm_get_apic_mode(vcpu)) { - case LAPIC_MODE_INVALID: - WARN_ONCE(true, "Invalid local APIC state"); - case LAPIC_MODE_DISABLED: - break; - case LAPIC_MODE_XAPIC: - if (flexpriority_enabled) { - sec_exec_control |= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmx_flush_tlb(vcpu, true); - } - break; - case LAPIC_MODE_X2APIC: - if (cpu_has_vmx_virtualize_x2apic_mode()) - sec_exec_control |= - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; - break; - } - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); - - vmx_update_msr_bitmap(vcpu); -} - -static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) -{ - if (!is_guest_mode(vcpu)) { - vmcs_write64(APIC_ACCESS_ADDR, hpa); - vmx_flush_tlb(vcpu, true); - } -} - -static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) -{ - u16 status; - u8 old; - - if (max_isr == -1) - max_isr = 0; - - status = vmcs_read16(GUEST_INTR_STATUS); - old = status >> 8; - if (max_isr != old) { - status &= 0xff; - status |= max_isr << 8; - vmcs_write16(GUEST_INTR_STATUS, status); - } -} - -static void vmx_set_rvi(int vector) -{ - u16 status; - u8 old; - - if (vector == -1) - vector = 0; - - status = vmcs_read16(GUEST_INTR_STATUS); - old = (u8)status & 0xff; - if ((u8)vector != old) { - status &= ~0xff; - status |= (u8)vector; - vmcs_write16(GUEST_INTR_STATUS, status); - } -} - -static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) -{ - /* - * When running L2, updating RVI is only relevant when - * vmcs12 virtual-interrupt-delivery enabled. - * However, it can be enabled only when L1 also - * intercepts external-interrupts and in that case - * we should not update vmcs02 RVI but instead intercept - * interrupt. Therefore, do nothing when running L2. - */ - if (!is_guest_mode(vcpu)) - vmx_set_rvi(max_irr); -} - -static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int max_irr; - bool max_irr_updated; - - WARN_ON(!vcpu->arch.apicv_active); - if (pi_test_on(&vmx->pi_desc)) { - pi_clear_on(&vmx->pi_desc); - /* - * IOMMU can write to PIR.ON, so the barrier matters even on UP. - * But on x86 this is just a compiler barrier anyway. - */ - smp_mb__after_atomic(); - max_irr_updated = - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); - - /* - * If we are running L2 and L1 has a new pending interrupt - * which can be injected, we should re-evaluate - * what should be done with this new L1 interrupt. - * If L1 intercepts external-interrupts, we should - * exit from L2 to L1. Otherwise, interrupt should be - * delivered directly to L2. - */ - if (is_guest_mode(vcpu) && max_irr_updated) { - if (nested_exit_on_intr(vcpu)) - kvm_vcpu_exiting_guest_mode(vcpu); - else - kvm_make_request(KVM_REQ_EVENT, vcpu); - } - } else { - max_irr = kvm_lapic_find_highest_irr(vcpu); - } - vmx_hwapic_irr_update(vcpu, max_irr); - return max_irr; -} - -static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) -{ - u8 rvi = vmx_get_rvi(); - u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); - - return ((rvi & 0xf0) > (vppr & 0xf0)); -} - -static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) -{ - if (!kvm_vcpu_apicv_active(vcpu)) - return; - - vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); - vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); - vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); - vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); -} - -static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - pi_clear_on(&vmx->pi_desc); - memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); -} - -static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) -{ - u32 exit_intr_info = 0; - u16 basic_exit_reason = (u16)vmx->exit_reason; - - if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY - || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) - return; - - if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - vmx->exit_intr_info = exit_intr_info; - - /* if exit due to PF check for async PF */ - if (is_page_fault(exit_intr_info)) - vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); - - /* Handle machine checks before interrupts are enabled */ - if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || - is_machine_check(exit_intr_info)) - kvm_machine_check(); - - /* We need to handle NMIs before interrupts are enabled */ - if (is_nmi(exit_intr_info)) { - kvm_before_interrupt(&vmx->vcpu); - asm("int $2"); - kvm_after_interrupt(&vmx->vcpu); - } -} - -static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) -{ - u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) - == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { - unsigned int vector; - unsigned long entry; - gate_desc *desc; - struct vcpu_vmx *vmx = to_vmx(vcpu); -#ifdef CONFIG_X86_64 - unsigned long tmp; -#endif - - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - desc = (gate_desc *)vmx->host_idt_base + vector; - entry = gate_offset(desc); - asm volatile( -#ifdef CONFIG_X86_64 - "mov %%" _ASM_SP ", %[sp]\n\t" - "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" - "push $%c[ss]\n\t" - "push %[sp]\n\t" -#endif - "pushf\n\t" - __ASM_SIZE(push) " $%c[cs]\n\t" - CALL_NOSPEC - : -#ifdef CONFIG_X86_64 - [sp]"=&r"(tmp), -#endif - ASM_CALL_CONSTRAINT - : - THUNK_TARGET(entry), - [ss]"i"(__KERNEL_DS), - [cs]"i"(__KERNEL_CS) - ); - } -} -STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); - -static bool vmx_has_emulated_msr(int index) -{ - switch (index) { - case MSR_IA32_SMBASE: - /* - * We cannot do SMM unless we can run the guest in big - * real mode. - */ - return enable_unrestricted_guest || emulate_invalid_guest_state; - case MSR_AMD64_VIRT_SPEC_CTRL: - /* This is AMD only. */ - return false; - default: - return true; - } -} - -static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) -{ - u32 exit_intr_info; - bool unblock_nmi; - u8 vector; - bool idtv_info_valid; - - idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - - if (enable_vnmi) { - if (vmx->loaded_vmcs->nmi_known_unmasked) - return; - /* - * Can't use vmx->exit_intr_info since we're not sure what - * the exit reason is. - */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - /* - * SDM 3: 27.7.1.2 (September 2008) - * Re-set bit "block by NMI" before VM entry if vmexit caused by - * a guest IRET fault. - * SDM 3: 23.2.2 (September 2008) - * Bit 12 is undefined in any of the following cases: - * If the VM exit sets the valid bit in the IDT-vectoring - * information field. - * If the VM exit is due to a double fault. - */ - if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && - vector != DF_VECTOR && !idtv_info_valid) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmx->loaded_vmcs->nmi_known_unmasked = - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) - & GUEST_INTR_STATE_NMI); - } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) - vmx->loaded_vmcs->vnmi_blocked_time += - ktime_to_ns(ktime_sub(ktime_get(), - vmx->loaded_vmcs->entry_time)); -} - -static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, - u32 idt_vectoring_info, - int instr_len_field, - int error_code_field) -{ - u8 vector; - int type; - bool idtv_info_valid; - - idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; - - vcpu->arch.nmi_injected = false; - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); - - if (!idtv_info_valid) - return; - - kvm_make_request(KVM_REQ_EVENT, vcpu); - - vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; - type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; - - switch (type) { - case INTR_TYPE_NMI_INTR: - vcpu->arch.nmi_injected = true; - /* - * SDM 3: 27.7.1.2 (September 2008) - * Clear bit "block by NMI" before VM entry if a NMI - * delivery faulted. - */ - vmx_set_nmi_mask(vcpu, false); - break; - case INTR_TYPE_SOFT_EXCEPTION: - vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); - /* fall through */ - case INTR_TYPE_HARD_EXCEPTION: - if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(error_code_field); - kvm_requeue_exception_e(vcpu, vector, err); - } else - kvm_requeue_exception(vcpu, vector); - break; - case INTR_TYPE_SOFT_INTR: - vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); - /* fall through */ - case INTR_TYPE_EXT_INTR: - kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); - break; - default: - break; - } -} - -static void vmx_complete_interrupts(struct vcpu_vmx *vmx) -{ - __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, - VM_EXIT_INSTRUCTION_LEN, - IDT_VECTORING_ERROR_CODE); -} - -static void vmx_cancel_injection(struct kvm_vcpu *vcpu) -{ - __vmx_complete_interrupts(vcpu, - vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), - VM_ENTRY_INSTRUCTION_LEN, - VM_ENTRY_EXCEPTION_ERROR_CODE); - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); -} - -static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) -{ - int i, nr_msrs; - struct perf_guest_switch_msr *msrs; - - msrs = perf_guest_get_msrs(&nr_msrs); - - if (!msrs) - return; - - for (i = 0; i < nr_msrs; i++) - if (msrs[i].host == msrs[i].guest) - clear_atomic_switch_msr(vmx, msrs[i].msr); - else - add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, - msrs[i].host, false); -} - -static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); - if (!vmx->loaded_vmcs->hv_timer_armed) - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - vmx->loaded_vmcs->hv_timer_armed = true; -} - -static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tscl; - u32 delta_tsc; - - if (vmx->req_immediate_exit) { - vmx_arm_hv_timer(vmx, 0); - return; - } - - if (vmx->hv_deadline_tsc != -1) { - tscl = rdtsc(); - if (vmx->hv_deadline_tsc > tscl) - /* set_hv_timer ensures the delta fits in 32-bits */ - delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> - cpu_preemption_timer_multi); - else - delta_tsc = 0; - - vmx_arm_hv_timer(vmx, delta_tsc); - return; - } - - if (vmx->loaded_vmcs->hv_timer_armed) - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - vmx->loaded_vmcs->hv_timer_armed = false; -} - -static void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * hv_evmcs may end up being not mapped after migration (when - * L2 was running), map it here to make sure vmcs12 changes are - * properly reflected. - */ - if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) - nested_vmx_handle_enlightened_vmptrld(vcpu, false); - - if (vmx->nested.hv_evmcs) { - copy_vmcs12_to_enlightened(vmx); - /* All fields are clean */ - vmx->nested.hv_evmcs->hv_clean_fields |= - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - } else { - copy_vmcs12_to_shadow(vmx); - } - - vmx->nested.need_vmcs12_sync = false; -} - -static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4, evmcs_rsp; - - /* Record the guest's net vcpu time for enforced NMI injections. */ - if (unlikely(!enable_vnmi && - vmx->loaded_vmcs->soft_vnmi_blocked)) - vmx->loaded_vmcs->entry_time = ktime_get(); - - /* Don't enter VMX if guest state is invalid, let the exit handler - start emulation until we arrive back to a valid state */ - if (vmx->emulation_required) - return; - - if (vmx->ple_window_dirty) { - vmx->ple_window_dirty = false; - vmcs_write32(PLE_WINDOW, vmx->ple_window); - } - - if (vmx->nested.need_vmcs12_sync) - nested_sync_from_vmcs12(vcpu); - - if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) - vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); - - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { - vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->host_state.cr3 = cr3; - } - - cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { - vmcs_writel(HOST_CR4, cr4); - vmx->loaded_vmcs->host_state.cr4 = cr4; - } - - /* When single-stepping over STI and MOV SS, we must clear the - * corresponding interruptibility bits in the guest state. Otherwise - * vmentry fails as it then expects bit 14 (BS) in pending debug - * exceptions being set, but that's not correct for the guest debugging - * case. */ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vmx_set_interrupt_shadow(vcpu, 0); - - if (static_cpu_has(X86_FEATURE_PKU) && - kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && - vcpu->arch.pkru != vmx->host_pkru) - __write_pkru(vcpu->arch.pkru); - - atomic_switch_perf_msrs(vmx); - - vmx_update_hv_timer(vcpu); - - /* - * If this vCPU has touched SPEC_CTRL, restore the guest's value if - * it's non-zero. Since vmentry is serialising on affected CPUs, there - * is no need to worry about the conditional branch over the wrmsr - * being speculatively taken. - */ - x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); - - vmx->__launched = vmx->loaded_vmcs->launched; - - evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? - (unsigned long)¤t_evmcs->host_rsp : 0; - - if (static_branch_unlikely(&vmx_l1d_should_flush)) - vmx_l1d_flush(vcpu); - - asm( - /* Store host registers */ - "push %%" _ASM_DX "; push %%" _ASM_BP ";" - "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ - "push %%" _ASM_CX " \n\t" - "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" - "je 1f \n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" - /* Avoid VMWRITE when Enlightened VMCS is in use */ - "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" - "jz 2f \n\t" - "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" - "jmp 1f \n\t" - "2: \n\t" - __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "1: \n\t" - /* Reload cr2 if changed */ - "mov %c[cr2](%0), %%" _ASM_AX " \n\t" - "mov %%cr2, %%" _ASM_DX " \n\t" - "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" - "je 3f \n\t" - "mov %%" _ASM_AX", %%cr2 \n\t" - "3: \n\t" - /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%0) \n\t" - /* Load guest registers. Don't clobber flags. */ - "mov %c[rax](%0), %%" _ASM_AX " \n\t" - "mov %c[rbx](%0), %%" _ASM_BX " \n\t" - "mov %c[rdx](%0), %%" _ASM_DX " \n\t" - "mov %c[rsi](%0), %%" _ASM_SI " \n\t" - "mov %c[rdi](%0), %%" _ASM_DI " \n\t" - "mov %c[rbp](%0), %%" _ASM_BP " \n\t" -#ifdef CONFIG_X86_64 - "mov %c[r8](%0), %%r8 \n\t" - "mov %c[r9](%0), %%r9 \n\t" - "mov %c[r10](%0), %%r10 \n\t" - "mov %c[r11](%0), %%r11 \n\t" - "mov %c[r12](%0), %%r12 \n\t" - "mov %c[r13](%0), %%r13 \n\t" - "mov %c[r14](%0), %%r14 \n\t" - "mov %c[r15](%0), %%r15 \n\t" -#endif - "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ - - /* Enter guest mode */ - "jne 1f \n\t" - __ex("vmlaunch") "\n\t" - "jmp 2f \n\t" - "1: " __ex("vmresume") "\n\t" - "2: " - /* Save guest registers, load host registers, keep flags */ - "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" - "pop %0 \n\t" - "setbe %c[fail](%0)\n\t" - "mov %%" _ASM_AX ", %c[rax](%0) \n\t" - "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" - __ASM_SIZE(pop) " %c[rcx](%0) \n\t" - "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" - "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" - "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" - "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" -#ifdef CONFIG_X86_64 - "mov %%r8, %c[r8](%0) \n\t" - "mov %%r9, %c[r9](%0) \n\t" - "mov %%r10, %c[r10](%0) \n\t" - "mov %%r11, %c[r11](%0) \n\t" - "mov %%r12, %c[r12](%0) \n\t" - "mov %%r13, %c[r13](%0) \n\t" - "mov %%r14, %c[r14](%0) \n\t" - "mov %%r15, %c[r15](%0) \n\t" - /* - * Clear host registers marked as clobbered to prevent - * speculative use. - */ - "xor %%r8d, %%r8d \n\t" - "xor %%r9d, %%r9d \n\t" - "xor %%r10d, %%r10d \n\t" - "xor %%r11d, %%r11d \n\t" - "xor %%r12d, %%r12d \n\t" - "xor %%r13d, %%r13d \n\t" - "xor %%r14d, %%r14d \n\t" - "xor %%r15d, %%r15d \n\t" -#endif - "mov %%cr2, %%" _ASM_AX " \n\t" - "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" - - "xor %%eax, %%eax \n\t" - "xor %%ebx, %%ebx \n\t" - "xor %%esi, %%esi \n\t" - "xor %%edi, %%edi \n\t" - "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" - ".pushsection .rodata \n\t" - ".global vmx_return \n\t" - "vmx_return: " _ASM_PTR " 2b \n\t" - ".popsection" - : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), - [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), - [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), -#ifdef CONFIG_X86_64 - [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), -#endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), - [wordsize]"i"(sizeof(ulong)) - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rax", "rbx", "rdi" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "eax", "ebx", "edi" -#endif - ); - - /* - * We do not use IBRS in the kernel. If this vCPU has used the - * SPEC_CTRL MSR it may have left it on; save the value and - * turn it off. This is much more efficient than blindly adding - * it to the atomic save/restore list. Especially as the former - * (Saving guest MSRs on vmexit) doesn't even exist in KVM. - * - * For non-nested case: - * If the L01 MSR bitmap does not intercept the MSR, then we need to - * save it. - * - * For nested case: - * If the L02 MSR bitmap does not intercept the MSR, then we need to - * save it. - */ - if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) - vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - - x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); - - /* Eliminate branch target predictions from guest mode */ - vmexit_fill_RSB(); - - /* All fields are clean at this point */ - if (static_branch_unlikely(&enable_evmcs)) - current_evmcs->hv_clean_fields |= - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - - /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (vmx->host_debugctlmsr) - update_debugctlmsr(vmx->host_debugctlmsr); - -#ifndef CONFIG_X86_64 - /* - * The sysexit path does not restore ds/es, so we must set them to - * a reasonable value ourselves. - * - * We can't defer this to vmx_prepare_switch_to_host() since that - * function may be executed in interrupt context, which saves and - * restore segments around it, nullifying its effect. - */ - loadsegment(ds, __USER_DS); - loadsegment(es, __USER_DS); -#endif - - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) - | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_PDPTR) - | (1 << VCPU_EXREG_SEGMENTS) - | (1 << VCPU_EXREG_CR3)); - vcpu->arch.regs_dirty = 0; - - /* - * eager fpu is enabled if PKEY is supported and CR4 is switched - * back on host, so it is safe to read guest PKRU from current - * XSAVE. - */ - if (static_cpu_has(X86_FEATURE_PKU) && - kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { - vcpu->arch.pkru = __read_pkru(); - if (vcpu->arch.pkru != vmx->host_pkru) - __write_pkru(vmx->host_pkru); - } - - vmx->nested.nested_run_pending = 0; - vmx->idt_vectoring_info = 0; - - vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); - if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) - return; - - vmx->loaded_vmcs->launched = 1; - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - - vmx_complete_atomic_exit(vmx); - vmx_recover_nmi_blocking(vmx); - vmx_complete_interrupts(vmx); -} -STACK_FRAME_NON_STANDARD(vmx_vcpu_run); - -static struct kvm *vmx_vm_alloc(void) -{ - struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx)); - return &kvm_vmx->kvm; -} - -static void vmx_vm_free(struct kvm *kvm) -{ - vfree(to_kvm_vmx(kvm)); -} - -static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; - - if (vmx->loaded_vmcs == vmcs) - return; - - cpu = get_cpu(); - vmx_vcpu_put(vcpu); - vmx->loaded_vmcs = vmcs; - vmx_vcpu_load(vcpu, cpu); - put_cpu(); - - vm_entry_controls_reset_shadow(vmx); - vm_exit_controls_reset_shadow(vmx); - vmx_segment_cache_clear(vmx); -} - -/* - * Ensure that the current vmcs of the logical processor is the - * vmcs01 of the vcpu before calling free_nested(). - */ -static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); - free_nested(vcpu); - vcpu_put(vcpu); -} - -static void vmx_free_vcpu(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (enable_pml) - vmx_destroy_pml_buffer(vmx); - free_vpid(vmx->vpid); - leave_guest_mode(vcpu); - vmx_free_vcpu_nested(vcpu); - free_loaded_vmcs(vmx->loaded_vmcs); - kfree(vmx->guest_msrs); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vmx); -} - -static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) -{ - int err; - struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - unsigned long *msr_bitmap; - int cpu; - - if (!vmx) - return ERR_PTR(-ENOMEM); - - vmx->vpid = allocate_vpid(); - - err = kvm_vcpu_init(&vmx->vcpu, kvm, id); - if (err) - goto free_vcpu; - - err = -ENOMEM; - - /* - * If PML is turned on, failure on enabling PML just results in failure - * of creating the vcpu, therefore we can simplify PML logic (by - * avoiding dealing with cases, such as enabling PML partially on vcpus - * for the guest, etc. - */ - if (enable_pml) { - vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!vmx->pml_pg) - goto uninit_vcpu; - } - - vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) - > PAGE_SIZE); - - if (!vmx->guest_msrs) - goto free_pml; - - err = alloc_loaded_vmcs(&vmx->vmcs01); - if (err < 0) - goto free_msrs; - - msr_bitmap = vmx->vmcs01.msr_bitmap; - vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); - vmx->msr_bitmap_mode = 0; - - vmx->loaded_vmcs = &vmx->vmcs01; - cpu = get_cpu(); - vmx_vcpu_load(&vmx->vcpu, cpu); - vmx->vcpu.cpu = cpu; - vmx_vcpu_setup(vmx); - vmx_vcpu_put(&vmx->vcpu); - put_cpu(); - if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { - err = alloc_apic_access_page(kvm); - if (err) - goto free_vmcs; - } - - if (enable_ept && !enable_unrestricted_guest) { - err = init_rmode_identity_map(kvm); - if (err) - goto free_vmcs; - } - - if (nested) - nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, - vmx_capability.ept, - kvm_vcpu_apicv_active(&vmx->vcpu)); - else - memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); - - vmx->nested.posted_intr_nv = -1; - vmx->nested.current_vmptr = -1ull; - - vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; - - /* - * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR - * or POSTED_INTR_WAKEUP_VECTOR. - */ - vmx->pi_desc.nv = POSTED_INTR_VECTOR; - vmx->pi_desc.sn = 1; - - return &vmx->vcpu; - -free_vmcs: - free_loaded_vmcs(vmx->loaded_vmcs); -free_msrs: - kfree(vmx->guest_msrs); -free_pml: - vmx_destroy_pml_buffer(vmx); -uninit_vcpu: - kvm_vcpu_uninit(&vmx->vcpu); -free_vcpu: - free_vpid(vmx->vpid); - kmem_cache_free(kvm_vcpu_cache, vmx); - return ERR_PTR(err); -} - -#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" -#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" - -static int vmx_vm_init(struct kvm *kvm) -{ - spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); - - if (!ple_gap) - kvm->arch.pause_in_guest = true; - - if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { - switch (l1tf_mitigation) { - case L1TF_MITIGATION_OFF: - case L1TF_MITIGATION_FLUSH_NOWARN: - /* 'I explicitly don't care' is set */ - break; - case L1TF_MITIGATION_FLUSH: - case L1TF_MITIGATION_FLUSH_NOSMT: - case L1TF_MITIGATION_FULL: - /* - * Warn upon starting the first VM in a potentially - * insecure environment. - */ - if (cpu_smt_control == CPU_SMT_ENABLED) - pr_warn_once(L1TF_MSG_SMT); - if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) - pr_warn_once(L1TF_MSG_L1D); - break; - case L1TF_MITIGATION_FULL_FORCE: - /* Flush is enforced */ - break; - } - } - return 0; -} - -static void __init vmx_check_processor_compat(void *rtn) -{ - struct vmcs_config vmcs_conf; - struct vmx_capability vmx_cap; - - *(int *)rtn = 0; - if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) - *(int *)rtn = -EIO; - if (nested) - nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept, - enable_apicv); - if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { - printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", - smp_processor_id()); - *(int *)rtn = -EIO; - } -} - -static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - u8 cache; - u64 ipat = 0; - - /* For VT-d and EPT combination - * 1. MMIO: always map as UC - * 2. EPT with VT-d: - * a. VT-d without snooping control feature: can't guarantee the - * result, try to trust guest. - * b. VT-d with snooping control feature: snooping control feature of - * VT-d engine can guarantee the cache correctness. Just set it - * to WB to keep consistent with host. So the same as item 3. - * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep - * consistent with host MTRR - */ - if (is_mmio) { - cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } - - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { - ipat = VMX_EPT_IPAT_BIT; - cache = MTRR_TYPE_WRBACK; - goto exit; - } - - if (kvm_read_cr0(vcpu) & X86_CR0_CD) { - ipat = VMX_EPT_IPAT_BIT; - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - cache = MTRR_TYPE_WRBACK; - else - cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } - - cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); - -exit: - return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; -} - -static int vmx_get_lpage_level(void) -{ - if (enable_ept && !cpu_has_vmx_ept_1g_page()) - return PT_DIRECTORY_LEVEL; - else - /* For shadow and EPT supported 1GB page */ - return PT_PDPE_LEVEL; -} - -static void vmcs_set_secondary_exec_control(u32 new_ctl) -{ - /* - * These bits in the secondary execution controls field - * are dynamic, the others are mostly based on the hypervisor - * architecture and the guest's CPUID. Do not touch the - * dynamic bits. - */ - u32 mask = - SECONDARY_EXEC_SHADOW_VMCS | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_DESC; - - u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - (new_ctl & ~mask) | (cur_ctl & mask)); -} - -/* - * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits - * (indicating "allowed-1") if they are supported in the guest's CPUID. - */ -static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_cpuid_entry2 *entry; - - vmx->nested.msrs.cr0_fixed1 = 0xffffffff; - vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; - -#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ - if (entry && (entry->_reg & (_cpuid_mask))) \ - vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ -} while (0) - - entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); - cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); - cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); - cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); - cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); - cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); - cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); - cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); - cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); - cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); - cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); - cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); - cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); - - entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); - cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); - cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); - cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); - cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); - cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); - -#undef cr4_fixed1_update -} - -static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (kvm_mpx_supported()) { - bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); - - if (mpx_enabled) { - vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; - } else { - vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; - } - } -} - -static void vmx_cpuid_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - vmcs_set_secondary_exec_control(vmx->secondary_exec_control); - } - - if (nested_vmx_allowed(vcpu)) - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= - FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - else - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= - ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - - if (nested_vmx_allowed(vcpu)) { - nested_vmx_cr_fixed1_bits_update(vcpu); - nested_vmx_entry_exit_ctls_update(vcpu); - } -} - -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -{ - if (func == 1 && nested) - entry->ecx |= bit(X86_FEATURE_VMX); -} - -static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, - struct x86_exception *fault) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason; - unsigned long exit_qualification = vcpu->arch.exit_qualification; - - if (vmx->nested.pml_full) { - exit_reason = EXIT_REASON_PML_FULL; - vmx->nested.pml_full = false; - exit_qualification &= INTR_INFO_UNBLOCK_NMI; - } else if (fault->error_code & PFERR_RSVD_MASK) - exit_reason = EXIT_REASON_EPT_MISCONFIG; - else - exit_reason = EXIT_REASON_EPT_VIOLATION; - - nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); - vmcs12->guest_physical_address = fault->address; -} - -static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) -{ - return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT; -} - -/* Callbacks for nested_ept_init_mmu_context: */ - -static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) -{ - /* return the page table to be shadowed - in our case, EPT12 */ - return get_vmcs12(vcpu)->ept_pointer; -} - -static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) -{ - WARN_ON(mmu_is_nested(vcpu)); - - vcpu->arch.mmu = &vcpu->arch.guest_mmu; - kvm_init_shadow_ept_mmu(vcpu, - to_vmx(vcpu)->nested.msrs.ept_caps & - VMX_EPT_EXECUTE_ONLY_BIT, - nested_ept_ad_enabled(vcpu), - nested_ept_get_cr3(vcpu)); - vcpu->arch.mmu->set_cr3 = vmx_set_cr3; - vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; - vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; - vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; - - vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; -} - -static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) -{ - vcpu->arch.mmu = &vcpu->arch.root_mmu; - vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; -} - -static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, - u16 error_code) -{ - bool inequality, bit; - - bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; - inequality = - (error_code & vmcs12->page_fault_error_code_mask) != - vmcs12->page_fault_error_code_match; - return inequality ^ bit; -} - -static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, - struct x86_exception *fault) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - WARN_ON(!is_guest_mode(vcpu)); - - if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && - !to_vmx(vcpu)->nested.nested_run_pending) { - vmcs12->vm_exit_intr_error_code = fault->error_code; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | - INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, - fault->address); - } else { - kvm_inject_page_fault(vcpu, fault); - } -} - -static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12); - -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct page *page; - u64 hpa; - - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - /* - * Translate L1 physical address to host physical - * address for vmcs02. Keep the page pinned, so this - * physical address remains valid. We keep a reference - * to it so we can release it later. - */ - if (vmx->nested.apic_access_page) { /* shouldn't happen */ - kvm_release_page_dirty(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); - /* - * If translation failed, no matter: This feature asks - * to exit when accessing the given address, and if it - * can never be accessed, this feature won't do - * anything anyway. - */ - if (!is_error_page(page)) { - vmx->nested.apic_access_page = page; - hpa = page_to_phys(vmx->nested.apic_access_page); - vmcs_write64(APIC_ACCESS_ADDR, hpa); - } else { - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); - } - } - - if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - if (vmx->nested.virtual_apic_page) { /* shouldn't happen */ - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr); - - /* - * If translation failed, VM entry will fail because - * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. - * Failing the vm entry is _not_ what the processor - * does but it's basically the only possibility we - * have. We could still enter the guest if CR8 load - * exits are enabled, CR8 store exits are enabled, and - * virtualize APIC access is disabled; in this case - * the processor would never use the TPR shadow and we - * could simply clear the bit from the execution - * control. But such a configuration is useless, so - * let's keep the code simple. - */ - if (!is_error_page(page)) { - vmx->nested.virtual_apic_page = page; - hpa = page_to_phys(vmx->nested.virtual_apic_page); - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); - } - } - - if (nested_cpu_has_posted_intr(vmcs12)) { - if (vmx->nested.pi_desc_page) { /* shouldn't happen */ - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr); - if (is_error_page(page)) - return; - vmx->nested.pi_desc_page = page; - vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page); - vmx->nested.pi_desc = - (struct pi_desc *)((void *)vmx->nested.pi_desc + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - vmcs_write64(POSTED_INTR_DESC_ADDR, - page_to_phys(vmx->nested.pi_desc_page) + - (unsigned long)(vmcs12->posted_intr_desc_addr & - (PAGE_SIZE - 1))); - } - if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_USE_MSR_BITMAPS); - else - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_USE_MSR_BITMAPS); -} - -static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) -{ - u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * A timer value of zero is architecturally guaranteed to cause - * a VMExit prior to executing any instructions in the guest. - */ - if (preemption_timeout == 0) { - vmx_preemption_timer_fn(&vmx->nested.preemption_timer); - return; - } - - if (vcpu->arch.virtual_tsc_khz == 0) - return; - - preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; - preemption_timeout *= 1000000; - do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); - hrtimer_start(&vmx->nested.preemption_timer, - ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); -} - -static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - return 0; - - if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || - !page_address_valid(vcpu, vmcs12->io_bitmap_b)) - return -EINVAL; - - return 0; -} - -static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) - return 0; - - if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) - return -EINVAL; - - return 0; -} - -static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) - return 0; - - if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)) - return -EINVAL; - - return 0; -} - -/* - * Merge L0's and L1's MSR bitmap, return false to indicate that - * we do not use the hardware. - */ -static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - int msr; - struct page *page; - unsigned long *msr_bitmap_l1; - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; - /* - * pred_cmd & spec_ctrl are trying to verify two things: - * - * 1. L0 gave a permission to L1 to actually passthrough the MSR. This - * ensures that we do not accidentally generate an L02 MSR bitmap - * from the L12 MSR bitmap that is too permissive. - * 2. That L1 or L2s have actually used the MSR. This avoids - * unnecessarily merging of the bitmap if the MSR is unused. This - * works properly because we only update the L01 MSR bitmap lazily. - * So even if L0 should pass L1 these MSRs, the L01 bitmap is only - * updated to reflect this when L1 (or its L2s) actually write to - * the MSR. - */ - bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); - bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); - - /* Nothing to do if the MSR bitmap is not in use. */ - if (!cpu_has_vmx_msr_bitmap() || - !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) - return false; - - if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && - !pred_cmd && !spec_ctrl) - return false; - - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); - if (is_error_page(page)) - return false; - - msr_bitmap_l1 = (unsigned long *)kmap(page); - if (nested_cpu_has_apic_reg_virt(vmcs12)) { - /* - * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it - * just lets the processor take the value from the virtual-APIC page; - * take those 256 bits directly from the L1 bitmap. - */ - for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned word = msr / BITS_PER_LONG; - msr_bitmap_l0[word] = msr_bitmap_l1[word]; - msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; - } - } else { - for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned word = msr / BITS_PER_LONG; - msr_bitmap_l0[word] = ~0; - msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; - } - } - - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - X2APIC_MSR(APIC_TASKPRI), - MSR_TYPE_W); - - if (nested_cpu_has_vid(vmcs12)) { - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - X2APIC_MSR(APIC_EOI), - MSR_TYPE_W); - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - X2APIC_MSR(APIC_SELF_IPI), - MSR_TYPE_W); - } - - if (spec_ctrl) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_SPEC_CTRL, - MSR_TYPE_R | MSR_TYPE_W); - - if (pred_cmd) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_PRED_CMD, - MSR_TYPE_W); - - kunmap(page); - kvm_release_page_clean(page); - - return true; -} - -static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - struct vmcs12 *shadow; - struct page *page; - - if (!nested_cpu_has_shadow_vmcs(vmcs12) || - vmcs12->vmcs_link_pointer == -1ull) - return; - - shadow = get_shadow_vmcs12(vcpu); - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); - - memcpy(shadow, kmap(page), VMCS12_SIZE); - - kunmap(page); - kvm_release_page_clean(page); -} - -static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!nested_cpu_has_shadow_vmcs(vmcs12) || - vmcs12->vmcs_link_pointer == -1ull) - return; - - kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, - get_shadow_vmcs12(vcpu), VMCS12_SIZE); -} - -static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && - !page_address_valid(vcpu, vmcs12->apic_access_addr)) - return -EINVAL; - else - return 0; -} - -static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && - !nested_cpu_has_apic_reg_virt(vmcs12) && - !nested_cpu_has_vid(vmcs12) && - !nested_cpu_has_posted_intr(vmcs12)) - return 0; - - /* - * If virtualize x2apic mode is enabled, - * virtualize apic access must be disabled. - */ - if (nested_cpu_has_virt_x2apic_mode(vmcs12) && - nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - return -EINVAL; - - /* - * If virtual interrupt delivery is enabled, - * we must exit on external interrupts. - */ - if (nested_cpu_has_vid(vmcs12) && - !nested_exit_on_intr(vcpu)) - return -EINVAL; - - /* - * bits 15:8 should be zero in posted_intr_nv, - * the descriptor address has been already checked - * in nested_get_vmcs12_pages. - * - * bits 5:0 of posted_intr_desc_addr should be zero. - */ - if (nested_cpu_has_posted_intr(vmcs12) && - (!nested_cpu_has_vid(vmcs12) || - !nested_exit_intr_ack_set(vcpu) || - (vmcs12->posted_intr_nv & 0xff00) || - (vmcs12->posted_intr_desc_addr & 0x3f) || - (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))) - return -EINVAL; - - /* tpr shadow is needed by all apicv features. */ - if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) - return -EINVAL; - - return 0; -} - -static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, - unsigned long count_field, - unsigned long addr_field) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - int maxphyaddr; - u64 count, addr; - - if (vmcs12_read_any(vmcs12, count_field, &count) || - vmcs12_read_any(vmcs12, addr_field, &addr)) { - WARN_ON(1); - return -EINVAL; - } - if (count == 0) - return 0; - maxphyaddr = cpuid_maxphyaddr(vcpu); - if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || - (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { - pr_debug_ratelimited( - "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", - addr_field, maxphyaddr, count, addr); - return -EINVAL; - } - return 0; -} - -static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (vmcs12->vm_exit_msr_load_count == 0 && - vmcs12->vm_exit_msr_store_count == 0 && - vmcs12->vm_entry_msr_load_count == 0) - return 0; /* Fast path */ - if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, - VM_EXIT_MSR_LOAD_ADDR) || - nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, - VM_EXIT_MSR_STORE_ADDR) || - nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, - VM_ENTRY_MSR_LOAD_ADDR)) - return -EINVAL; - return 0; -} - -static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has_pml(vmcs12)) - return 0; - - if (!nested_cpu_has_ept(vmcs12) || - !page_address_valid(vcpu, vmcs12->pml_address)) - return -EINVAL; - - return 0; -} - -static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && - !nested_cpu_has_ept(vmcs12)) - return -EINVAL; - return 0; -} - -static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && - !nested_cpu_has_ept(vmcs12)) - return -EINVAL; - return 0; -} - -static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has_shadow_vmcs(vmcs12)) - return 0; - - if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || - !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) - return -EINVAL; - - return 0; -} - -static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, - struct vmx_msr_entry *e) -{ - /* x2APIC MSR accesses are not allowed */ - if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) - return -EINVAL; - if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ - e->index == MSR_IA32_UCODE_REV) - return -EINVAL; - if (e->reserved != 0) - return -EINVAL; - return 0; -} - -static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, - struct vmx_msr_entry *e) -{ - if (e->index == MSR_FS_BASE || - e->index == MSR_GS_BASE || - e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ - nested_vmx_msr_check_common(vcpu, e)) - return -EINVAL; - return 0; -} - -static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, - struct vmx_msr_entry *e) -{ - if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ - nested_vmx_msr_check_common(vcpu, e)) - return -EINVAL; - return 0; -} - -/* - * Load guest's/host's msr at nested entry/exit. - * return 0 for success, entry index for failure. - */ -static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) -{ - u32 i; - struct vmx_msr_entry e; - struct msr_data msr; - - msr.host_initiated = false; - for (i = 0; i < count; i++) { - if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), - &e, sizeof(e))) { - pr_debug_ratelimited( - "%s cannot read MSR entry (%u, 0x%08llx)\n", - __func__, i, gpa + i * sizeof(e)); - goto fail; - } - if (nested_vmx_load_msr_check(vcpu, &e)) { - pr_debug_ratelimited( - "%s check failed (%u, 0x%x, 0x%x)\n", - __func__, i, e.index, e.reserved); - goto fail; - } - msr.index = e.index; - msr.data = e.value; - if (kvm_set_msr(vcpu, &msr)) { - pr_debug_ratelimited( - "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", - __func__, i, e.index, e.value); - goto fail; - } - } - return 0; -fail: - return i + 1; -} - -static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) -{ - u32 i; - struct vmx_msr_entry e; - - for (i = 0; i < count; i++) { - struct msr_data msr_info; - if (kvm_vcpu_read_guest(vcpu, - gpa + i * sizeof(e), - &e, 2 * sizeof(u32))) { - pr_debug_ratelimited( - "%s cannot read MSR entry (%u, 0x%08llx)\n", - __func__, i, gpa + i * sizeof(e)); - return -EINVAL; - } - if (nested_vmx_store_msr_check(vcpu, &e)) { - pr_debug_ratelimited( - "%s check failed (%u, 0x%x, 0x%x)\n", - __func__, i, e.index, e.reserved); - return -EINVAL; - } - msr_info.host_initiated = false; - msr_info.index = e.index; - if (kvm_get_msr(vcpu, &msr_info)) { - pr_debug_ratelimited( - "%s cannot read MSR (%u, 0x%x)\n", - __func__, i, e.index); - return -EINVAL; - } - if (kvm_vcpu_write_guest(vcpu, - gpa + i * sizeof(e) + - offsetof(struct vmx_msr_entry, value), - &msr_info.data, sizeof(msr_info.data))) { - pr_debug_ratelimited( - "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", - __func__, i, e.index, msr_info.data); - return -EINVAL; - } - } - return 0; -} - -static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - unsigned long invalid_mask; - - invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); - return (val & invalid_mask) == 0; -} - -/* - * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are - * emulating VM entry into a guest with EPT enabled. - * Returns 0 on success, 1 on failure. Invalid state exit qualification code - * is assigned to entry_failure_code on failure. - */ -static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, - u32 *entry_failure_code) -{ - if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { - if (!nested_cr3_valid(vcpu, cr3)) { - *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; - } - - /* - * If PAE paging and EPT are both on, CR3 is not used by the CPU and - * must not be dereferenced. - */ - if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && - !nested_ept) { - if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { - *entry_failure_code = ENTRY_FAIL_PDPTE; - return 1; - } - } - } - - if (!nested_ept) - kvm_mmu_new_cr3(vcpu, cr3, false); - - vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - - kvm_init_mmu(vcpu, false); - - return 0; -} - -/* - * Returns if KVM is able to config CPU to tag TLB entries - * populated by L2 differently than TLB entries populated - * by L1. - * - * If L1 uses EPT, then TLB entries are tagged with different EPTP. - * - * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged - * with different VPID (L1 entries are tagged with vmx->vpid - * while L2 entries are tagged with vmx->nested.vpid02). - */ -static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - return nested_cpu_has_ept(vmcs12) || - (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); -} - -static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) -{ - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) - return vmcs12->guest_ia32_efer; - else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) - return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); - else - return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); -} - -static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) -{ - /* - * If vmcs02 hasn't been initialized, set the constant vmcs02 state - * according to L0's settings (vmcs12 is irrelevant here). Host - * fields that come from L0 and are not constant, e.g. HOST_CR3, - * will be set as needed prior to VMLAUNCH/VMRESUME. - */ - if (vmx->nested.vmcs02_initialized) - return; - vmx->nested.vmcs02_initialized = true; - - /* - * We don't care what the EPTP value is we just need to guarantee - * it's valid so we don't get a false positive when doing early - * consistency checks. - */ - if (enable_ept && nested_early_check) - vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); - - /* All VMFUNCs are currently emulated through L0 vmexits. */ - if (cpu_has_vmx_vmfunc()) - vmcs_write64(VM_FUNCTION_CONTROL, 0); - - if (cpu_has_vmx_posted_intr()) - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); - - if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); - - if (enable_pml) - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - - /* - * Set the MSR load/store lists to match L0's settings. Only the - * addresses are constant (for vmcs02), the counts can change based - * on L2's behavior, e.g. switching to/from long mode. - */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); - - vmx_set_constant_host_state(vmx); -} - -static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx, - struct vmcs12 *vmcs12) -{ - prepare_vmcs02_constant_state(vmx); - - vmcs_write64(VMCS_LINK_POINTER, -1ull); - - if (enable_vpid) { - if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); - else - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - } -} - -static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) -{ - u32 exec_control, vmcs12_exec_ctrl; - u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); - - if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) - prepare_vmcs02_early_full(vmx, vmcs12); - - /* - * HOST_RSP is normally set correctly in vmx_vcpu_run() just before - * entry, but only if the current (host) sp changed from the value - * we wrote last (vmx->host_rsp). This cache is no longer relevant - * if we switch vmcs, and rather than hold a separate cache per vmcs, - * here we just force the write to happen on entry. host_rsp will - * also be written unconditionally by nested_vmx_check_vmentry_hw() - * if we are doing early consistency checks via hardware. - */ - vmx->host_rsp = 0; - - /* - * PIN CONTROLS - */ - exec_control = vmcs12->pin_based_vm_exec_control; - - /* Preemption timer setting is computed directly in vmx_vcpu_run. */ - exec_control |= vmcs_config.pin_based_exec_ctrl; - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - vmx->loaded_vmcs->hv_timer_armed = false; - - /* Posted interrupts setting is only taken from vmcs12. */ - if (nested_cpu_has_posted_intr(vmcs12)) { - vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - vmx->nested.pi_pending = false; - } else { - exec_control &= ~PIN_BASED_POSTED_INTR; - } - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); - - /* - * EXEC CONTROLS - */ - exec_control = vmx_exec_control(vmx); /* L0's desires */ - exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; - exec_control &= ~CPU_BASED_TPR_SHADOW; - exec_control |= vmcs12->cpu_based_vm_exec_control; - - /* - * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if - * nested_get_vmcs12_pages can't fix it up, the illegal value - * will result in a VM entry failure. - */ - if (exec_control & CPU_BASED_TPR_SHADOW) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); - vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); - } else { -#ifdef CONFIG_X86_64 - exec_control |= CPU_BASED_CR8_LOAD_EXITING | - CPU_BASED_CR8_STORE_EXITING; -#endif - } - - /* - * A vmexit (to either L1 hypervisor or L0 userspace) is always needed - * for I/O port accesses. - */ - exec_control &= ~CPU_BASED_USE_IO_BITMAPS; - exec_control |= CPU_BASED_UNCOND_IO_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); - - /* - * SECONDARY EXEC CONTROLS - */ - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmx->secondary_exec_control; - - /* Take the following fields only from vmcs12 */ - exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_ENABLE_INVPCID | - SECONDARY_EXEC_RDTSCP | - SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_ENABLE_VMFUNC); - if (nested_cpu_has(vmcs12, - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { - vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & - ~SECONDARY_EXEC_ENABLE_PML; - exec_control |= vmcs12_exec_ctrl; - } - - /* VMCS shadowing for L2 is emulated for now */ - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - - if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) - vmcs_write16(GUEST_INTR_STATUS, - vmcs12->guest_intr_status); - - /* - * Write an illegal value to APIC_ACCESS_ADDR. Later, - * nested_get_vmcs12_pages will either fix it up or - * remove the VM execution control. - */ - if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) - vmcs_write64(APIC_ACCESS_ADDR, -1ull); - - if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); - - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - } - - /* - * ENTRY CONTROLS - * - * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE - * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate - * on the related bits (if supported by the CPU) in the hope that - * we can avoid VMWrites during vmx_set_efer(). - */ - exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & - ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; - if (cpu_has_load_ia32_efer()) { - if (guest_efer & EFER_LMA) - exec_control |= VM_ENTRY_IA32E_MODE; - if (guest_efer != host_efer) - exec_control |= VM_ENTRY_LOAD_IA32_EFER; - } - vm_entry_controls_init(vmx, exec_control); - - /* - * EXIT CONTROLS - * - * L2->L1 exit controls are emulated - the hardware exit is to L0 so - * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER - * bits may be modified by vmx_set_efer() in prepare_vmcs02(). - */ - exec_control = vmx_vmexit_ctrl(); - if (cpu_has_load_ia32_efer() && guest_efer != host_efer) - exec_control |= VM_EXIT_LOAD_IA32_EFER; - vm_exit_controls_init(vmx, exec_control); - - /* - * Conceptually we want to copy the PML address and index from - * vmcs01 here, and then back to vmcs01 on nested vmexit. But, - * since we always flush the log on each vmexit and never change - * the PML address (once set), this happens to be equivalent to - * simply resetting the index in vmcs02. - */ - if (enable_pml) - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - - /* - * Interrupt/Exception Fields - */ - if (vmx->nested.nested_run_pending) { - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - vmcs12->vm_entry_intr_info_field); - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs12->vm_entry_exception_error_code); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs12->vm_entry_instruction_len); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs12->guest_interruptibility_info); - vmx->loaded_vmcs->nmi_known_unmasked = - !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); - } else { - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); - } -} - -static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) -{ - struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; - - if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); - vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); - vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); - vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); - vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); - vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); - vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); - vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); - vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); - vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); - vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); - vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); - vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); - vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); - vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); - vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); - vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); - vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); - vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); - vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); - vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); - vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); - vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); - vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); - vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - } - - if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, - vmcs12->guest_pending_dbg_exceptions); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); - - /* - * L1 may access the L2's PDPTR, so save them to construct - * vmcs12 - */ - if (enable_ept) { - vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); - vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); - vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); - vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); - } - } - - if (nested_cpu_has_xsaves(vmcs12)) - vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); - - /* - * Whether page-faults are trapped is determined by a combination of - * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. - * If enable_ept, L0 doesn't care about page faults and we should - * set all of these to L1's desires. However, if !enable_ept, L0 does - * care about (at least some) page faults, and because it is not easy - * (if at all possible?) to merge L0 and L1's desires, we simply ask - * to exit on each and every L2 page fault. This is done by setting - * MASK=MATCH=0 and (see below) EB.PF=1. - * Note that below we don't need special code to set EB.PF beyond the - * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, - * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when - * !enable_ept, EB.PF is 1, so the "or" will always be 1. - */ - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, - enable_ept ? vmcs12->page_fault_error_code_mask : 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, - enable_ept ? vmcs12->page_fault_error_code_match : 0); - - if (cpu_has_vmx_apicv()) { - vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); - vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); - vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); - vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); - } - - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - - set_cr4_guest_host_mask(vmx); - - if (kvm_mpx_supported()) { - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); - else - vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); - } -} - -/* - * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested - * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it - * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 - * guest in a way that will both be appropriate to L1's requests, and our - * needs. In addition to modifying the active vmcs (which is vmcs02), this - * function also has additional necessary side-effects, like setting various - * vcpu->arch fields. - * Returns 0 on success, 1 on failure. Invalid state exit qualification code - * is assigned to entry_failure_code on failure. - */ -static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 *entry_failure_code) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; - - if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { - prepare_vmcs02_full(vmx, vmcs12); - vmx->nested.dirty_vmcs12 = false; - } - - /* - * First, the fields that are shadowed. This must be kept in sync - * with vmcs_shadow_fields.h. - */ - if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { - vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); - vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); - } - - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { - kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); - } else { - kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); - } - vmx_set_rflags(vcpu, vmcs12->guest_rflags); - - vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); - - /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the - * bitwise-or of what L1 wants to trap for L2, and what we want to - * trap. Note that CR0.TS also needs updating - we do this later. - */ - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { - vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); - vcpu->arch.pat = vmcs12->guest_ia32_pat; - } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - } - - vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); - - if (enable_vpid) { - /* - * There is no direct mapping between vpid02 and vpid12, the - * vpid02 is per-vCPU for L0 and reused while the value of - * vpid12 is changed w/ one invvpid during nested vmentry. - * The vpid12 is allocated by L1 for L2, so it will not - * influence global bitmap(for vpid01 and vpid02 allocation) - * even if spawn a lot of nested vCPUs. - */ - if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { - if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { - vmx->nested.last_vpid = vmcs12->virtual_processor_id; - __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); - } - } else { - /* - * If L1 use EPT, then L0 needs to execute INVEPT on - * EPTP02 instead of EPTP01. Therefore, delay TLB - * flush until vmcs02->eptp is fully updated by - * KVM_REQ_LOAD_CR3. Note that this assumes - * KVM_REQ_TLB_FLUSH is evaluated after - * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). - */ - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } - } - - if (nested_cpu_has_ept(vmcs12)) - nested_ept_init_mmu_context(vcpu); - else if (nested_cpu_has2(vmcs12, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - vmx_flush_tlb(vcpu, true); - - /* - * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those - * bits which we consider mandatory enabled. - * The CR0_READ_SHADOW is what L2 should have expected to read given - * the specifications by L1; It's not enough to take - * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we - * have more bits than L1 expected. - */ - vmx_set_cr0(vcpu, vmcs12->guest_cr0); - vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); - - vmx_set_cr4(vcpu, vmcs12->guest_cr4); - vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); - - vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); - /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ - vmx_set_efer(vcpu, vcpu->arch.efer); - - /* - * Guest state is invalid and unrestricted guest is disabled, - * which means L1 attempted VMEntry to L2 with invalid state. - * Fail the VMEntry. - */ - if (vmx->emulation_required) { - *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; - } - - /* Shadow page tables on either EPT or shadow page tables. */ - if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), - entry_failure_code)) - return 1; - - if (!enable_ept) - vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; - - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); - kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); - return 0; -} - -static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) -{ - if (!nested_cpu_has_nmi_exiting(vmcs12) && - nested_cpu_has_virtual_nmis(vmcs12)) - return -EINVAL; - - if (!nested_cpu_has_virtual_nmis(vmcs12) && - nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)) - return -EINVAL; - - return 0; -} - -static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - bool ia32e; - - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_apic_access_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (!nested_cpu_has_preemption_timer(vmcs12) && - nested_cpu_has_save_preemption_timer(vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_pml_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - vmx->nested.msrs.procbased_ctls_low, - vmx->nested.msrs.procbased_ctls_high) || - (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && - !vmx_control_verify(vmcs12->secondary_vm_exec_control, - vmx->nested.msrs.secondary_ctls_low, - vmx->nested.msrs.secondary_ctls_high)) || - !vmx_control_verify(vmcs12->pin_based_vm_exec_control, - vmx->nested.msrs.pinbased_ctls_low, - vmx->nested.msrs.pinbased_ctls_high) || - !vmx_control_verify(vmcs12->vm_exit_controls, - vmx->nested.msrs.exit_ctls_low, - vmx->nested.msrs.exit_ctls_high) || - !vmx_control_verify(vmcs12->vm_entry_controls, - vmx->nested.msrs.entry_ctls_low, - vmx->nested.msrs.entry_ctls_high)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_nmi_controls(vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_cpu_has_vmfunc(vmcs12)) { - if (vmcs12->vm_function_control & - ~vmx->nested.msrs.vmfunc_controls) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_cpu_has_eptp_switching(vmcs12)) { - if (!nested_cpu_has_ept(vmcs12) || - !page_address_valid(vcpu, vmcs12->eptp_list_address)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - } - } - - if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || - !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || - !nested_cr3_valid(vcpu, vmcs12->host_cr3)) - return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; - - /* - * If the load IA32_EFER VM-exit control is 1, bits reserved in the - * IA32_EFER MSR must be 0 in the field for that register. In addition, - * the values of the LMA and LME bits in the field must each be that of - * the host address-space size VM-exit control. - */ - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { - ia32e = (vmcs12->vm_exit_controls & - VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; - if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) - return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; - } - - /* - * From the Intel SDM, volume 3: - * Fields relevant to VM-entry event injection must be set properly. - * These fields are the VM-entry interruption-information field, the - * VM-entry exception error code, and the VM-entry instruction length. - */ - if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { - u32 intr_info = vmcs12->vm_entry_intr_info_field; - u8 vector = intr_info & INTR_INFO_VECTOR_MASK; - u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; - bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; - bool should_have_error_code; - bool urg = nested_cpu_has2(vmcs12, - SECONDARY_EXEC_UNRESTRICTED_GUEST); - bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; - - /* VM-entry interruption-info field: interruption type */ - if (intr_type == INTR_TYPE_RESERVED || - (intr_type == INTR_TYPE_OTHER_EVENT && - !nested_cpu_supports_monitor_trap_flag(vcpu))) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - /* VM-entry interruption-info field: vector */ - if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || - (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || - (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - /* VM-entry interruption-info field: deliver error code */ - should_have_error_code = - intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && - x86_exception_has_error_code(vector); - if (has_error_code != should_have_error_code) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - /* VM-entry exception error code */ - if (has_error_code && - vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - /* VM-entry interruption-info field: reserved bits */ - if (intr_info & INTR_INFO_RESVD_BITS_MASK) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - /* VM-entry instruction length */ - switch (intr_type) { - case INTR_TYPE_SOFT_EXCEPTION: - case INTR_TYPE_SOFT_INTR: - case INTR_TYPE_PRIV_SW_EXCEPTION: - if ((vmcs12->vm_entry_instruction_len > 15) || - (vmcs12->vm_entry_instruction_len == 0 && - !nested_cpu_has_zero_length_injection(vcpu))) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - } - } - - if (nested_cpu_has_ept(vmcs12) && - !valid_ept_address(vcpu, vmcs12->ept_pointer)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - return 0; -} - -static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - int r; - struct page *page; - struct vmcs12 *shadow; - - if (vmcs12->vmcs_link_pointer == -1ull) - return 0; - - if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) - return -EINVAL; - - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); - if (is_error_page(page)) - return -EINVAL; - - r = 0; - shadow = kmap(page); - if (shadow->hdr.revision_id != VMCS12_REVISION || - shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) - r = -EINVAL; - kunmap(page); - kvm_release_page_clean(page); - return r; -} - -static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 *exit_qual) -{ - bool ia32e; - - *exit_qual = ENTRY_FAIL_DEFAULT; - - if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || - !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) - return 1; - - if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { - *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; - return 1; - } - - /* - * If the load IA32_EFER VM-entry control is 1, the following checks - * are performed on the field for the IA32_EFER MSR: - * - Bits reserved in the IA32_EFER MSR must be 0. - * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of - * the IA-32e mode guest VM-exit control. It must also be identical - * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to - * CR0.PG) is 1. - */ - if (to_vmx(vcpu)->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { - ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; - if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || - ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || - ((vmcs12->guest_cr0 & X86_CR0_PG) && - ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) - return 1; - } - - if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && - (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || - (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) - return 1; - - return 0; -} - -static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4; - - if (!nested_early_check) - return 0; - - if (vmx->msr_autoload.host.nr) - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - if (vmx->msr_autoload.guest.nr) - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - - preempt_disable(); - - vmx_prepare_switch_to_guest(vcpu); - - /* - * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, - * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to - * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e. - * there is no need to preserve other bits or save/restore the field. - */ - vmcs_writel(GUEST_RFLAGS, 0); - - vmcs_writel(HOST_RIP, vmx_early_consistency_check_return); - - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { - vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->host_state.cr3 = cr3; - } - - cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { - vmcs_writel(HOST_CR4, cr4); - vmx->loaded_vmcs->host_state.cr4 = cr4; - } - - vmx->__launched = vmx->loaded_vmcs->launched; - - asm( - /* Set HOST_RSP */ - __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t" - - /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%0)\n\t" - "jne 1f\n\t" - __ex("vmlaunch") "\n\t" - "jmp 2f\n\t" - "1: " __ex("vmresume") "\n\t" - "2: " - /* Set vmx->fail accordingly */ - "setbe %c[fail](%0)\n\t" - - ".pushsection .rodata\n\t" - ".global vmx_early_consistency_check_return\n\t" - "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t" - ".popsection" - : - : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)) - : "rax", "cc", "memory" - ); - - vmcs_writel(HOST_RIP, vmx_return); - - preempt_enable(); - - if (vmx->msr_autoload.host.nr) - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); - if (vmx->msr_autoload.guest.nr) - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - - if (vmx->fail) { - WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != - VMXERR_ENTRY_INVALID_CONTROL_FIELD); - vmx->fail = 0; - return 1; - } - - /* - * VMExit clears RFLAGS.IF and DR7, even on a consistency check. - */ - local_irq_enable(); - if (hw_breakpoint_active()) - set_debugreg(__this_cpu_read(cpu_dr7), 7); - - /* - * A non-failing VMEntry means we somehow entered guest mode with - * an illegal RIP, and that's just the tip of the iceberg. There - * is no telling what memory has been modified or what state has - * been exposed to unknown code. Hitting this all but guarantees - * a (very critical) hardware issue. - */ - WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & - VMX_EXIT_REASONS_FAILED_VMENTRY)); - - return 0; -} -STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw); - -static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12); - -/* - * If from_vmentry is false, this is being called from state restore (either RSM - * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. -+ * -+ * Returns: -+ * 0 - success, i.e. proceed with actual VMEnter -+ * 1 - consistency check VMExit -+ * -1 - consistency check VMFail - */ -static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, - bool from_vmentry) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - bool evaluate_pending_interrupts; - u32 exit_reason = EXIT_REASON_INVALID_STATE; - u32 exit_qual; - - evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & - (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); - if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) - evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); - - if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - if (kvm_mpx_supported() && - !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) - vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); - - vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); - - prepare_vmcs02_early(vmx, vmcs12); - - if (from_vmentry) { - nested_get_vmcs12_pages(vcpu); - - if (nested_vmx_check_vmentry_hw(vcpu)) { - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - return -1; - } - - if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) - goto vmentry_fail_vmexit; - } - - enter_guest_mode(vcpu); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) - vcpu->arch.tsc_offset += vmcs12->tsc_offset; - - if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) - goto vmentry_fail_vmexit_guest_mode; - - if (from_vmentry) { - exit_reason = EXIT_REASON_MSR_LOAD_FAIL; - exit_qual = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (exit_qual) - goto vmentry_fail_vmexit_guest_mode; - } else { - /* - * The MMU is not initialized to point at the right entities yet and - * "get pages" would need to read data from the guest (i.e. we will - * need to perform gpa to hpa translation). Request a call - * to nested_get_vmcs12_pages before the next VM-entry. The MSRs - * have already been set at vmentry time and should not be reset. - */ - kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); - } - - /* - * If L1 had a pending IRQ/NMI until it executed - * VMLAUNCH/VMRESUME which wasn't delivered because it was - * disallowed (e.g. interrupts disabled), L0 needs to - * evaluate if this pending event should cause an exit from L2 - * to L1 or delivered directly to L2 (e.g. In case L1 don't - * intercept EXTERNAL_INTERRUPT). - * - * Usually this would be handled by the processor noticing an - * IRQ/NMI window request, or checking RVI during evaluation of - * pending virtual interrupts. However, this setting was done - * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 - * to perform pending event evaluation by requesting a KVM_REQ_EVENT. - */ - if (unlikely(evaluate_pending_interrupts)) - kvm_make_request(KVM_REQ_EVENT, vcpu); - - /* - * Note no nested_vmx_succeed or nested_vmx_fail here. At this point - * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet - * returned as far as L1 is concerned. It will only return (and set - * the success flag) when L2 exits (see nested_vmx_vmexit()). - */ - return 0; - - /* - * A failed consistency check that leads to a VMExit during L1's - * VMEnter to L2 is a variation of a normal VMexit, as explained in - * 26.7 "VM-entry failures during or after loading guest state". - */ -vmentry_fail_vmexit_guest_mode: - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) - vcpu->arch.tsc_offset -= vmcs12->tsc_offset; - leave_guest_mode(vcpu); - -vmentry_fail_vmexit: - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - - if (!from_vmentry) - return 1; - - load_vmcs12_host_state(vcpu, vmcs12); - vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; - vmcs12->exit_qualification = exit_qual; - if (enable_shadow_vmcs || vmx->nested.hv_evmcs) - vmx->nested.need_vmcs12_sync = true; - return 1; -} - -/* - * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 - * for running an L2 nested guest. - */ -static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) -{ - struct vmcs12 *vmcs12; - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); - int ret; - - if (!nested_vmx_check_permission(vcpu)) - return 1; + /* When single-stepping over STI and MOV SS, we must clear the + * corresponding interruptibility bits in the guest state. Otherwise + * vmentry fails as it then expects bit 14 (BS) in pending debug + * exceptions being set, but that's not correct for the guest debugging + * case. */ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmx_set_interrupt_shadow(vcpu, 0); - if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true)) - return 1; + if (static_cpu_has(X86_FEATURE_PKU) && + kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && + vcpu->arch.pkru != vmx->host_pkru) + __write_pkru(vcpu->arch.pkru); - if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) - return nested_vmx_failInvalid(vcpu); + atomic_switch_perf_msrs(vmx); - vmcs12 = get_vmcs12(vcpu); + vmx_update_hv_timer(vcpu); /* - * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact - * that there *is* a valid VMCS pointer, RFLAGS.CF is set - * rather than RFLAGS.ZF, and no error number is stored to the - * VM-instruction error field. + * If this vCPU has touched SPEC_CTRL, restore the guest's value if + * it's non-zero. Since vmentry is serialising on affected CPUs, there + * is no need to worry about the conditional branch over the wrmsr + * being speculatively taken. */ - if (vmcs12->hdr.shadow_vmcs) - return nested_vmx_failInvalid(vcpu); + x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); - if (vmx->nested.hv_evmcs) { - copy_enlightened_to_vmcs12(vmx); - /* Enlightened VMCS doesn't have launch state */ - vmcs12->launch_state = !launch; - } else if (enable_shadow_vmcs) { - copy_shadow_to_vmcs12(vmx); - } + vmx->__launched = vmx->loaded_vmcs->launched; - /* - * The nested entry process starts with enforcing various prerequisites - * on vmcs12 as required by the Intel SDM, and act appropriately when - * they fail: As the SDM explains, some conditions should cause the - * instruction to fail, while others will cause the instruction to seem - * to succeed, but return an EXIT_REASON_INVALID_STATE. - * To speed up the normal (success) code path, we should avoid checking - * for misconfigurations which will anyway be caught by the processor - * when using the merged vmcs02. - */ - if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) - return nested_vmx_failValid(vcpu, - VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); + evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? + (unsigned long)¤t_evmcs->host_rsp : 0; - if (vmcs12->launch_state == launch) - return nested_vmx_failValid(vcpu, - launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS - : VMXERR_VMRESUME_NONLAUNCHED_VMCS); + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); - ret = check_vmentry_prereqs(vcpu, vmcs12); - if (ret) - return nested_vmx_failValid(vcpu, ret); + asm( + /* Store host registers */ + "push %%" _ASM_DX "; push %%" _ASM_BP ";" + "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ + "push %%" _ASM_CX " \n\t" + "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" + "je 1f \n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" + /* Avoid VMWRITE when Enlightened VMCS is in use */ + "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" + "jz 2f \n\t" + "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" + "jmp 1f \n\t" + "2: \n\t" + __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" + "1: \n\t" + /* Reload cr2 if changed */ + "mov %c[cr2](%0), %%" _ASM_AX " \n\t" + "mov %%cr2, %%" _ASM_DX " \n\t" + "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" + "je 3f \n\t" + "mov %%" _ASM_AX", %%cr2 \n\t" + "3: \n\t" + /* Check if vmlaunch or vmresume is needed */ + "cmpl $0, %c[launched](%0) \n\t" + /* Load guest registers. Don't clobber flags. */ + "mov %c[rax](%0), %%" _ASM_AX " \n\t" + "mov %c[rbx](%0), %%" _ASM_BX " \n\t" + "mov %c[rdx](%0), %%" _ASM_DX " \n\t" + "mov %c[rsi](%0), %%" _ASM_SI " \n\t" + "mov %c[rdi](%0), %%" _ASM_DI " \n\t" + "mov %c[rbp](%0), %%" _ASM_BP " \n\t" +#ifdef CONFIG_X86_64 + "mov %c[r8](%0), %%r8 \n\t" + "mov %c[r9](%0), %%r9 \n\t" + "mov %c[r10](%0), %%r10 \n\t" + "mov %c[r11](%0), %%r11 \n\t" + "mov %c[r12](%0), %%r12 \n\t" + "mov %c[r13](%0), %%r13 \n\t" + "mov %c[r14](%0), %%r14 \n\t" + "mov %c[r15](%0), %%r15 \n\t" +#endif + "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ - /* - * We're finally done with prerequisite checking, and can start with - * the nested entry. - */ - vmx->nested.nested_run_pending = 1; - ret = nested_vmx_enter_non_root_mode(vcpu, true); - vmx->nested.nested_run_pending = !ret; - if (ret > 0) - return 1; - else if (ret) - return nested_vmx_failValid(vcpu, - VMXERR_ENTRY_INVALID_CONTROL_FIELD); + /* Enter guest mode */ + "jne 1f \n\t" + __ex("vmlaunch") "\n\t" + "jmp 2f \n\t" + "1: " __ex("vmresume") "\n\t" + "2: " + /* Save guest registers, load host registers, keep flags */ + "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" + "pop %0 \n\t" + "setbe %c[fail](%0)\n\t" + "mov %%" _ASM_AX ", %c[rax](%0) \n\t" + "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" + __ASM_SIZE(pop) " %c[rcx](%0) \n\t" + "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" + "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" + "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" + "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" +#ifdef CONFIG_X86_64 + "mov %%r8, %c[r8](%0) \n\t" + "mov %%r9, %c[r9](%0) \n\t" + "mov %%r10, %c[r10](%0) \n\t" + "mov %%r11, %c[r11](%0) \n\t" + "mov %%r12, %c[r12](%0) \n\t" + "mov %%r13, %c[r13](%0) \n\t" + "mov %%r14, %c[r14](%0) \n\t" + "mov %%r15, %c[r15](%0) \n\t" + /* + * Clear host registers marked as clobbered to prevent + * speculative use. + */ + "xor %%r8d, %%r8d \n\t" + "xor %%r9d, %%r9d \n\t" + "xor %%r10d, %%r10d \n\t" + "xor %%r11d, %%r11d \n\t" + "xor %%r12d, %%r12d \n\t" + "xor %%r13d, %%r13d \n\t" + "xor %%r14d, %%r14d \n\t" + "xor %%r15d, %%r15d \n\t" +#endif + "mov %%cr2, %%" _ASM_AX " \n\t" + "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" - /* Hide L1D cache contents from the nested guest. */ - vmx->vcpu.arch.l1tf_flush_l1d = true; + "xor %%eax, %%eax \n\t" + "xor %%ebx, %%ebx \n\t" + "xor %%esi, %%esi \n\t" + "xor %%edi, %%edi \n\t" + "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" + ".pushsection .rodata \n\t" + ".global vmx_return \n\t" + "vmx_return: " _ASM_PTR " 2b \n\t" + ".popsection" + : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), + [launched]"i"(offsetof(struct vcpu_vmx, __launched)), + [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), + [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), +#ifdef CONFIG_X86_64 + [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), + [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), + [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), +#endif + [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), + [wordsize]"i"(sizeof(ulong)) + : "cc", "memory" +#ifdef CONFIG_X86_64 + , "rax", "rbx", "rdi" + , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#else + , "eax", "ebx", "edi" +#endif + ); /* - * Must happen outside of nested_vmx_enter_non_root_mode() as it will - * also be used as part of restoring nVMX state for - * snapshot restore (migration). + * We do not use IBRS in the kernel. If this vCPU has used the + * SPEC_CTRL MSR it may have left it on; save the value and + * turn it off. This is much more efficient than blindly adding + * it to the atomic save/restore list. Especially as the former + * (Saving guest MSRs on vmexit) doesn't even exist in KVM. * - * In this flow, it is assumed that vmcs12 cache was - * trasferred as part of captured nVMX state and should - * therefore not be read from guest memory (which may not - * exist on destination host yet). - */ - nested_cache_shadow_vmcs12(vcpu, vmcs12); - - /* - * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken - * by event injection, halt vcpu. + * For non-nested case: + * If the L01 MSR bitmap does not intercept the MSR, then we need to + * save it. + * + * For nested case: + * If the L02 MSR bitmap does not intercept the MSR, then we need to + * save it. */ - if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && - !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) { - vmx->nested.nested_run_pending = 0; - return kvm_vcpu_halt(vcpu); - } - return 1; -} - -/* - * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date - * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK). - * This function returns the new value we should put in vmcs12.guest_cr0. - * It's not enough to just return the vmcs02 GUEST_CR0. Rather, - * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now - * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 - * didn't trap the bit, because if L1 did, so would L0). - * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have - * been modified by L2, and L1 knows it. So just leave the old value of - * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 - * isn't relevant, because if L0 traps this bit it can set it to anything. - * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have - * changed these bits, and therefore they need to be updated, but L0 - * didn't necessarily allow them to be changed in GUEST_CR0 - and rather - * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. - */ -static inline unsigned long -vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - return - /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | - /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | - /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | - vcpu->arch.cr0_guest_owned_bits)); -} - -static inline unsigned long -vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - return - /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | - /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | - /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | - vcpu->arch.cr4_guest_owned_bits)); -} - -static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - u32 idt_vectoring; - unsigned int nr; - - if (vcpu->arch.exception.injected) { - nr = vcpu->arch.exception.nr; - idt_vectoring = nr | VECTORING_INFO_VALID_MASK; - - if (kvm_exception_is_soft(nr)) { - vmcs12->vm_exit_instruction_len = - vcpu->arch.event_exit_inst_len; - idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; - } else - idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; - - if (vcpu->arch.exception.has_error_code) { - idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; - vmcs12->idt_vectoring_error_code = - vcpu->arch.exception.error_code; - } - - vmcs12->idt_vectoring_info_field = idt_vectoring; - } else if (vcpu->arch.nmi_injected) { - vmcs12->idt_vectoring_info_field = - INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; - } else if (vcpu->arch.interrupt.injected) { - nr = vcpu->arch.interrupt.nr; - idt_vectoring = nr | VECTORING_INFO_VALID_MASK; - - if (vcpu->arch.interrupt.soft) { - idt_vectoring |= INTR_TYPE_SOFT_INTR; - vmcs12->vm_entry_instruction_len = - vcpu->arch.event_exit_inst_len; - } else - idt_vectoring |= INTR_TYPE_EXT_INTR; - - vmcs12->idt_vectoring_info_field = idt_vectoring; - } -} - -static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qual; - bool block_nested_events = - vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); - - if (vcpu->arch.exception.pending && - nested_vmx_check_exception(vcpu, &exit_qual)) { - if (block_nested_events) - return -EBUSY; - nested_vmx_inject_exception_vmexit(vcpu, exit_qual); - return 0; - } - - if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && - vmx->nested.preemption_timer_expired) { - if (block_nested_events) - return -EBUSY; - nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); - return 0; - } - - if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { - if (block_nested_events) - return -EBUSY; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - NMI_VECTOR | INTR_TYPE_NMI_INTR | - INTR_INFO_VALID_MASK, 0); - /* - * The NMI-triggered VM exit counts as injection: - * clear this one and block further NMIs. - */ - vcpu->arch.nmi_pending = 0; - vmx_set_nmi_mask(vcpu, true); - return 0; - } - - if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && - nested_exit_on_intr(vcpu)) { - if (block_nested_events) - return -EBUSY; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); - return 0; - } + if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) + vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - vmx_complete_nested_posted_interrupt(vcpu); - return 0; -} + x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); -static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) -{ - to_vmx(vcpu)->req_immediate_exit = true; -} + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); -static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) -{ - ktime_t remaining = - hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); - u64 value; + /* All fields are clean at this point */ + if (static_branch_unlikely(&enable_evmcs)) + current_evmcs->hv_clean_fields |= + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - if (ktime_to_ns(remaining) <= 0) - return 0; + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ + if (vmx->host_debugctlmsr) + update_debugctlmsr(vmx->host_debugctlmsr); - value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; - do_div(value, 1000000); - return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; -} +#ifndef CONFIG_X86_64 + /* + * The sysexit path does not restore ds/es, so we must set them to + * a reasonable value ourselves. + * + * We can't defer this to vmx_prepare_switch_to_host() since that + * function may be executed in interrupt context, which saves and + * restore segments around it, nullifying its effect. + */ + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#endif -/* - * Update the guest state fields of vmcs12 to reflect changes that - * occurred while L2 was running. (The "IA-32e mode guest" bit of the - * VM-entry controls is also updated, since this is really a guest - * state bit.) - */ -static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); - vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); - - vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); - vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); - - vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); - vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); - vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); - vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); - vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); - vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); - vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); - vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); - vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); - vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); - vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); - vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); - vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); - vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); - vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); - vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); - vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); - vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); - vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); - vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); - vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); - vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); - vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); - vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); - vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); - vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); - vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); - vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); - vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); - vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); - vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); - vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); - vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); - vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); - vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); - vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); - - vmcs12->guest_interruptibility_info = - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - vmcs12->guest_pending_dbg_exceptions = - vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); - if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) - vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; - else - vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_RFLAGS) + | (1 << VCPU_EXREG_PDPTR) + | (1 << VCPU_EXREG_SEGMENTS) + | (1 << VCPU_EXREG_CR3)); + vcpu->arch.regs_dirty = 0; - if (nested_cpu_has_preemption_timer(vmcs12)) { - if (vmcs12->vm_exit_controls & - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) - vmcs12->vmx_preemption_timer_value = - vmx_get_preemption_timer_value(vcpu); - hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + /* + * eager fpu is enabled if PKEY is supported and CR4 is switched + * back on host, so it is safe to read guest PKRU from current + * XSAVE. + */ + if (static_cpu_has(X86_FEATURE_PKU) && + kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { + vcpu->arch.pkru = __read_pkru(); + if (vcpu->arch.pkru != vmx->host_pkru) + __write_pkru(vmx->host_pkru); } - /* - * In some cases (usually, nested EPT), L2 is allowed to change its - * own CR3 without exiting. If it has changed it, we must keep it. - * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined - * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. - * - * Additionally, restore L2's PDPTR to vmcs12. - */ - if (enable_ept) { - vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); - vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); - vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); - vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); - vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); - } + vmx->nested.nested_run_pending = 0; + vmx->idt_vectoring_info = 0; - vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); + if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + return; - if (nested_cpu_has_vid(vmcs12)) - vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); + vmx->loaded_vmcs->launched = 1; + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - vmcs12->vm_entry_controls = - (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | - (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); + vmx_complete_atomic_exit(vmx); + vmx_recover_nmi_blocking(vmx); + vmx_complete_interrupts(vmx); +} +STACK_FRAME_NON_STANDARD(vmx_vcpu_run); - if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); - vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - } +static struct kvm *vmx_vm_alloc(void) +{ + struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx)); + return &kvm_vmx->kvm; +} - /* TODO: These cannot have changed unless we have MSR bitmaps and - * the relevant bit asks not to trap the change */ - if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) - vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); - if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) - vmcs12->guest_ia32_efer = vcpu->arch.efer; - vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); - vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); - vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); - if (kvm_mpx_supported()) - vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); +static void vmx_vm_free(struct kvm *kvm) +{ + vfree(to_kvm_vmx(kvm)); } -/* - * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits - * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), - * and this function updates it to reflect the changes to the guest state while - * L2 was running (and perhaps made some exits which were handled directly by L0 - * without going back to L1), and to reflect the exit reason. - * Note that we do not have to copy here all VMCS fields, just those that - * could have changed by the L2 guest or the exit - i.e., the guest-state and - * exit-information fields only. Other fields are modified by L1 with VMWRITE, - * which already writes to vmcs12 directly. - */ -static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) +static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { - /* update guest state fields: */ - sync_vmcs12(vcpu, vmcs12); + struct vcpu_vmx *vmx = to_vmx(vcpu); - /* update exit information fields: */ + if (enable_pml) + vmx_destroy_pml_buffer(vmx); + free_vpid(vmx->vpid); + leave_guest_mode(vcpu); + nested_vmx_free_vcpu(vcpu); + free_loaded_vmcs(vmx->loaded_vmcs); + kfree(vmx->guest_msrs); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vmx); +} - vmcs12->vm_exit_reason = exit_reason; - vmcs12->exit_qualification = exit_qualification; - vmcs12->vm_exit_intr_info = exit_intr_info; +static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) +{ + int err; + struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + unsigned long *msr_bitmap; + int cpu; - vmcs12->idt_vectoring_info_field = 0; - vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + if (!vmx) + return ERR_PTR(-ENOMEM); - if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { - vmcs12->launch_state = 1; + vmx->vpid = allocate_vpid(); - /* vm_entry_intr_info_field is cleared on exit. Emulate this - * instead of reading the real value. */ - vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; + err = kvm_vcpu_init(&vmx->vcpu, kvm, id); + if (err) + goto free_vcpu; - /* - * Transfer the event that L0 or L1 may wanted to inject into - * L2 to IDT_VECTORING_INFO_FIELD. - */ - vmcs12_save_pending_event(vcpu, vmcs12); - } + err = -ENOMEM; /* - * Drop what we picked up for L2 via vmx_complete_interrupts. It is - * preserved above and would only end up incorrectly in L1. + * If PML is turned on, failure on enabling PML just results in failure + * of creating the vcpu, therefore we can simplify PML logic (by + * avoiding dealing with cases, such as enabling PML partially on vcpus + * for the guest, etc. */ - vcpu->arch.nmi_injected = false; - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); -} + if (enable_pml) { + vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!vmx->pml_pg) + goto uninit_vcpu; + } -/* - * A part of what we need to when the nested L2 guest exits and we want to - * run its L1 parent, is to reset L1's guest state to the host state specified - * in vmcs12. - * This function is to be called not only on normal nested exit, but also on - * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry - * Failures During or After Loading Guest State"). - * This function should be called when the active VMCS is L1's (vmcs01). - */ -static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - struct kvm_segment seg; - u32 entry_failure_code; + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) + > PAGE_SIZE); - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) - vcpu->arch.efer = vmcs12->host_ia32_efer; - else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) - vcpu->arch.efer |= (EFER_LMA | EFER_LME); - else - vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); - vmx_set_efer(vcpu, vcpu->arch.efer); + if (!vmx->guest_msrs) + goto free_pml; - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); - kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); - vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); - vmx_set_interrupt_shadow(vcpu, 0); + err = alloc_loaded_vmcs(&vmx->vmcs01); + if (err < 0) + goto free_msrs; - /* - * Note that calling vmx_set_cr0 is important, even if cr0 hasn't - * actually changed, because vmx_set_cr0 refers to efer set above. - * - * CR0_GUEST_HOST_MASK is already set in the original vmcs01 - * (KVM doesn't change it); - */ - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; - vmx_set_cr0(vcpu, vmcs12->host_cr0); + msr_bitmap = vmx->vmcs01.msr_bitmap; + vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + vmx->msr_bitmap_mode = 0; + + vmx->loaded_vmcs = &vmx->vmcs01; + cpu = get_cpu(); + vmx_vcpu_load(&vmx->vcpu, cpu); + vmx->vcpu.cpu = cpu; + vmx_vcpu_setup(vmx); + vmx_vcpu_put(&vmx->vcpu); + put_cpu(); + if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { + err = alloc_apic_access_page(kvm); + if (err) + goto free_vmcs; + } - /* Same as above - no reason to call set_cr4_guest_host_mask(). */ - vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); - vmx_set_cr4(vcpu, vmcs12->host_cr4); + if (enable_ept && !enable_unrestricted_guest) { + err = init_rmode_identity_map(kvm); + if (err) + goto free_vmcs; + } - nested_ept_uninit_mmu_context(vcpu); + if (nested) + nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, + vmx_capability.ept, + kvm_vcpu_apicv_active(&vmx->vcpu)); + else + memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); - /* - * Only PDPTE load can fail as the value of cr3 was checked on entry and - * couldn't have changed. - */ - if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) - nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); + vmx->nested.posted_intr_nv = -1; + vmx->nested.current_vmptr = -1ull; - if (!enable_ept) - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; /* - * If vmcs01 doesn't use VPID, CPU flushes TLB on every - * VMEntry/VMExit. Thus, no need to flush TLB. - * - * If vmcs12 doesn't use VPID, L1 expects TLB to be - * flushed on every VMEntry/VMExit. - * - * Otherwise, we can preserve TLB entries as long as we are - * able to tag L1 TLB entries differently than L2 TLB entries. - * - * If vmcs12 uses EPT, we need to execute this flush on EPTP01 - * and therefore we request the TLB flush to happen only after VMCS EPTP - * has been set by KVM_REQ_LOAD_CR3. + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. */ - if (enable_vpid && - (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); - vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); - vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); - vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); - vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); + return &vmx->vcpu; - /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ - if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) - vmcs_write64(GUEST_BNDCFGS, 0); +free_vmcs: + free_loaded_vmcs(vmx->loaded_vmcs); +free_msrs: + kfree(vmx->guest_msrs); +free_pml: + vmx_destroy_pml_buffer(vmx); +uninit_vcpu: + kvm_vcpu_uninit(&vmx->vcpu); +free_vcpu: + free_vpid(vmx->vpid); + kmem_cache_free(kvm_vcpu_cache, vmx); + return ERR_PTR(err); +} - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { - vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); - vcpu->arch.pat = vmcs12->host_ia32_pat; - } - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, - vmcs12->host_ia32_perf_global_ctrl); - - /* Set L1 segment info according to Intel SDM - 27.5.2 Loading Host Segment and Descriptor-Table Registers */ - seg = (struct kvm_segment) { - .base = 0, - .limit = 0xFFFFFFFF, - .selector = vmcs12->host_cs_selector, - .type = 11, - .present = 1, - .s = 1, - .g = 1 - }; - if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) - seg.l = 1; - else - seg.db = 1; - vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); - seg = (struct kvm_segment) { - .base = 0, - .limit = 0xFFFFFFFF, - .type = 3, - .present = 1, - .s = 1, - .db = 1, - .g = 1 - }; - seg.selector = vmcs12->host_ds_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); - seg.selector = vmcs12->host_es_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); - seg.selector = vmcs12->host_ss_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); - seg.selector = vmcs12->host_fs_selector; - seg.base = vmcs12->host_fs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); - seg.selector = vmcs12->host_gs_selector; - seg.base = vmcs12->host_gs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); - seg = (struct kvm_segment) { - .base = vmcs12->host_tr_base, - .limit = 0x67, - .selector = vmcs12->host_tr_selector, - .type = 11, - .present = 1 - }; - vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" - kvm_set_dr(vcpu, 7, 0x400); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); +static int vmx_vm_init(struct kvm *kvm) +{ + spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); + if (!ple_gap) + kvm->arch.pause_in_guest = true; - if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, - vmcs12->vm_exit_msr_load_count)) - nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); + if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + case L1TF_MITIGATION_FLUSH_NOWARN: + /* 'I explicitly don't care' is set */ + break; + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + case L1TF_MITIGATION_FULL: + /* + * Warn upon starting the first VM in a potentially + * insecure environment. + */ + if (cpu_smt_control == CPU_SMT_ENABLED) + pr_warn_once(L1TF_MSG_SMT); + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) + pr_warn_once(L1TF_MSG_L1D); + break; + case L1TF_MITIGATION_FULL_FORCE: + /* Flush is enforced */ + break; + } + } + return 0; } -static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) +static void __init vmx_check_processor_compat(void *rtn) { - struct shared_msr_entry *efer_msr; - unsigned int i; - - if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) - return vmcs_read64(GUEST_IA32_EFER); - - if (cpu_has_load_ia32_efer()) - return host_efer; + struct vmcs_config vmcs_conf; + struct vmx_capability vmx_cap; - for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { - if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) - return vmx->msr_autoload.guest.val[i].value; + *(int *)rtn = 0; + if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) + *(int *)rtn = -EIO; + if (nested) + nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept, + enable_apicv); + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { + printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", + smp_processor_id()); + *(int *)rtn = -EIO; } - - efer_msr = find_msr_entry(vmx, MSR_EFER); - if (efer_msr) - return efer_msr->data; - - return host_efer; } -static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) +static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmx_msr_entry g, h; - struct msr_data msr; - gpa_t gpa; - u32 i, j; - - vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); - - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { - /* - * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set - * as vmcs01.GUEST_DR7 contains a userspace defined value - * and vcpu->arch.dr7 is not squirreled away before the - * nested VMENTER (not worth adding a variable in nested_vmx). - */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - kvm_set_dr(vcpu, 7, DR7_FIXED_1); - else - WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); - } + u8 cache; + u64 ipat = 0; - /* - * Note that calling vmx_set_{efer,cr0,cr4} is important as they - * handle a variety of side effects to KVM's software model. + /* For VT-d and EPT combination + * 1. MMIO: always map as UC + * 2. EPT with VT-d: + * a. VT-d without snooping control feature: can't guarantee the + * result, try to trust guest. + * b. VT-d with snooping control feature: snooping control feature of + * VT-d engine can guarantee the cache correctness. Just set it + * to WB to keep consistent with host. So the same as item 3. + * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep + * consistent with host MTRR */ - vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); - - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; - vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); + if (is_mmio) { + cache = MTRR_TYPE_UNCACHABLE; + goto exit; + } - vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); - vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + ipat = VMX_EPT_IPAT_BIT; + cache = MTRR_TYPE_WRBACK; + goto exit; + } - nested_ept_uninit_mmu_context(vcpu); - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + if (kvm_read_cr0(vcpu) & X86_CR0_CD) { + ipat = VMX_EPT_IPAT_BIT; + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + cache = MTRR_TYPE_WRBACK; + else + cache = MTRR_TYPE_UNCACHABLE; + goto exit; + } - /* - * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs - * from vmcs01 (if necessary). The PDPTRs are not loaded on - * VMFail, like everything else we just need to ensure our - * software model is up-to-date. - */ - ept_save_pdptrs(vcpu); + cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); - kvm_mmu_reset_context(vcpu); +exit: + return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; +} - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); +static int vmx_get_lpage_level(void) +{ + if (enable_ept && !cpu_has_vmx_ept_1g_page()) + return PT_DIRECTORY_LEVEL; + else + /* For shadow and EPT supported 1GB page */ + return PT_PDPE_LEVEL; +} +static void vmcs_set_secondary_exec_control(u32 new_ctl) +{ /* - * This nasty bit of open coding is a compromise between blindly - * loading L1's MSRs using the exit load lists (incorrect emulation - * of VMFail), leaving the nested VM's MSRs in the software model - * (incorrect behavior) and snapshotting the modified MSRs (too - * expensive since the lists are unbound by hardware). For each - * MSR that was (prematurely) loaded from the nested VMEntry load - * list, reload it from the exit load list if it exists and differs - * from the guest value. The intent is to stuff host state as - * silently as possible, not to fully process the exit load list. + * These bits in the secondary execution controls field + * are dynamic, the others are mostly based on the hypervisor + * architecture and the guest's CPUID. Do not touch the + * dynamic bits. */ - msr.host_initiated = false; - for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { - gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); - if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { - pr_debug_ratelimited( - "%s read MSR index failed (%u, 0x%08llx)\n", - __func__, i, gpa); - goto vmabort; - } - - for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { - gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); - if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { - pr_debug_ratelimited( - "%s read MSR failed (%u, 0x%08llx)\n", - __func__, j, gpa); - goto vmabort; - } - if (h.index != g.index) - continue; - if (h.value == g.value) - break; - - if (nested_vmx_load_msr_check(vcpu, &h)) { - pr_debug_ratelimited( - "%s check failed (%u, 0x%x, 0x%x)\n", - __func__, j, h.index, h.reserved); - goto vmabort; - } - - msr.index = h.index; - msr.data = h.value; - if (kvm_set_msr(vcpu, &msr)) { - pr_debug_ratelimited( - "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", - __func__, j, h.index, h.value); - goto vmabort; - } - } - } + u32 mask = + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_DESC; - return; + u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); -vmabort: - nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + (new_ctl & ~mask) | (cur_ctl & mask)); } /* - * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 - * and modify vmcs12 to make it see what it would expect to see there if - * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) + * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits + * (indicating "allowed-1") if they are supported in the guest's CPUID. */ -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, - u32 exit_intr_info, - unsigned long exit_qualification) +static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - /* trying to cancel vmlaunch/vmresume is a bug */ - WARN_ON_ONCE(vmx->nested.nested_run_pending); - - leave_guest_mode(vcpu); + struct kvm_cpuid_entry2 *entry; - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) - vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + vmx->nested.msrs.cr0_fixed1 = 0xffffffff; + vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; - if (likely(!vmx->fail)) { - if (exit_reason == -1) - sync_vmcs12(vcpu, vmcs12); - else - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, - exit_qualification); +#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ + if (entry && (entry->_reg & (_cpuid_mask))) \ + vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ +} while (0) - /* - * Must happen outside of sync_vmcs12() as it will - * also be used to capture vmcs12 cache as part of - * capturing nVMX state for snapshot (migration). - * - * Otherwise, this flush will dirty guest memory at a - * point it is already assumed by user-space to be - * immutable. - */ - nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); + entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); + cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); + cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); + cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); + cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); + cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); + cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); + cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); + cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); + cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); - if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, - vmcs12->vm_exit_msr_store_count)) - nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); - } else { - /* - * The only expected VM-instruction error is "VM entry with - * invalid control field(s)." Anything else indicates a - * problem with L0. And we should never get here with a - * VMFail of any type if early consistency checks are enabled. - */ - WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != - VMXERR_ENTRY_INVALID_CONTROL_FIELD); - WARN_ON_ONCE(nested_early_check); - } + entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); + cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); + cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); + cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); + cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); - vmx_switch_vmcs(vcpu, &vmx->vmcs01); +#undef cr4_fixed1_update +} - /* Update any VMCS fields that might have changed while L2 ran */ - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); +static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); + if (kvm_mpx_supported()) { + bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); - if (vmx->nested.change_vmcs01_virtual_apic_mode) { - vmx->nested.change_vmcs01_virtual_apic_mode = false; - vmx_set_virtual_apic_mode(vcpu); - } else if (!nested_cpu_has_ept(vmcs12) && - nested_cpu_has2(vmcs12, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - vmx_flush_tlb(vcpu, true); + if (mpx_enabled) { + vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + } else { + vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; + } } +} - /* This is needed for same reason as it was needed in prepare_vmcs02 */ - vmx->host_rsp = 0; +static void vmx_cpuid_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); - /* Unpin physical memory we referred to in vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_dirty(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - if (vmx->nested.virtual_apic_page) { - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; + if (cpu_has_secondary_exec_ctrls()) { + vmx_compute_secondary_exec_control(vmx); + vmcs_set_secondary_exec_control(vmx->secondary_exec_control); } - /* - * We are now running in L2, mmu_notifier will force to reload the - * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. - */ - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - - if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) - vmx->nested.need_vmcs12_sync = true; - - /* in case we halted in L2 */ - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - - if (likely(!vmx->fail)) { - /* - * TODO: SDM says that with acknowledge interrupt on - * exit, bit 31 of the VM-exit interrupt information - * (valid interrupt) is always set to 1 on - * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't - * need kvm_cpu_has_interrupt(). See the commit - * message for details. - */ - if (nested_exit_intr_ack_set(vcpu) && - exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - kvm_cpu_has_interrupt(vcpu)) { - int irq = kvm_cpu_get_interrupt(vcpu); - WARN_ON(irq < 0); - vmcs12->vm_exit_intr_info = irq | - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; - } - - if (exit_reason != -1) - trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, - vmcs12->exit_qualification, - vmcs12->idt_vectoring_info_field, - vmcs12->vm_exit_intr_info, - vmcs12->vm_exit_intr_error_code, - KVM_ISA_VMX); - - load_vmcs12_host_state(vcpu, vmcs12); + if (nested_vmx_allowed(vcpu)) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - return; + if (nested_vmx_allowed(vcpu)) { + nested_vmx_cr_fixed1_bits_update(vcpu); + nested_vmx_entry_exit_ctls_update(vcpu); } +} - /* - * After an early L2 VM-entry failure, we're now back - * in L1 which thinks it just finished a VMLAUNCH or - * VMRESUME instruction, so we need to set the failure - * flag and the VM-instruction error field of the VMCS - * accordingly, and skip the emulated instruction. - */ - (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - - /* - * Restore L1's host state to KVM's software model. We're here - * because a consistency check was caught by hardware, which - * means some amount of guest state has been propagated to KVM's - * model and needs to be unwound to the host's state. - */ - nested_vmx_restore_host_state(vcpu); - - vmx->fail = 0; +static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +{ + if (func == 1 && nested) + entry->ecx |= bit(X86_FEATURE_VMX); } -/* - * Forcibly leave nested mode in order to be able to reset the VCPU later on. - */ -static void vmx_leave_nested(struct kvm_vcpu *vcpu) +static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - to_vmx(vcpu)->nested.nested_run_pending = 0; - nested_vmx_vmexit(vcpu, -1, 0, 0); - } - free_nested(vcpu); + to_vmx(vcpu)->req_immediate_exit = true; } static int vmx_check_intercept(struct kvm_vcpu *vcpu, @@ -12735,289 +7103,6 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } -static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * In case we do two consecutive get/set_nested_state()s while L2 was - * running hv_evmcs may end up not being mapped (we map it from - * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode() as we always - * have vmcs12 if it is true. - */ - return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull || - vmx->nested.hv_evmcs; -} - -static int vmx_get_nested_state(struct kvm_vcpu *vcpu, - struct kvm_nested_state __user *user_kvm_nested_state, - u32 user_data_size) -{ - struct vcpu_vmx *vmx; - struct vmcs12 *vmcs12; - struct kvm_nested_state kvm_state = { - .flags = 0, - .format = 0, - .size = sizeof(kvm_state), - .vmx.vmxon_pa = -1ull, - .vmx.vmcs_pa = -1ull, - }; - - if (!vcpu) - return kvm_state.size + 2 * VMCS12_SIZE; - - vmx = to_vmx(vcpu); - vmcs12 = get_vmcs12(vcpu); - - if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled) - kvm_state.flags |= KVM_STATE_NESTED_EVMCS; - - if (nested_vmx_allowed(vcpu) && - (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { - kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; - kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; - - if (vmx_has_valid_vmcs12(vcpu)) { - kvm_state.size += VMCS12_SIZE; - - if (is_guest_mode(vcpu) && - nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) - kvm_state.size += VMCS12_SIZE; - } - - if (vmx->nested.smm.vmxon) - kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; - - if (vmx->nested.smm.guest_mode) - kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; - - if (is_guest_mode(vcpu)) { - kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; - - if (vmx->nested.nested_run_pending) - kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; - } - } - - if (user_data_size < kvm_state.size) - goto out; - - if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) - return -EFAULT; - - if (!vmx_has_valid_vmcs12(vcpu)) - goto out; - - /* - * When running L2, the authoritative vmcs12 state is in the - * vmcs02. When running L1, the authoritative vmcs12 state is - * in the shadow or enlightened vmcs linked to vmcs01, unless - * need_vmcs12_sync is set, in which case, the authoritative - * vmcs12 state is in the vmcs12 already. - */ - if (is_guest_mode(vcpu)) { - sync_vmcs12(vcpu, vmcs12); - } else if (!vmx->nested.need_vmcs12_sync) { - if (vmx->nested.hv_evmcs) - copy_enlightened_to_vmcs12(vmx); - else if (enable_shadow_vmcs) - copy_shadow_to_vmcs12(vmx); - } - - if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) - return -EFAULT; - - if (nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { - if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, - get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) - return -EFAULT; - } - -out: - return kvm_state.size; -} - -static int vmx_set_nested_state(struct kvm_vcpu *vcpu, - struct kvm_nested_state __user *user_kvm_nested_state, - struct kvm_nested_state *kvm_state) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12; - u32 exit_qual; - int ret; - - if (kvm_state->format != 0) - return -EINVAL; - - if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) - nested_enable_evmcs(vcpu, NULL); - - if (!nested_vmx_allowed(vcpu)) - return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL; - - if (kvm_state->vmx.vmxon_pa == -1ull) { - if (kvm_state->vmx.smm.flags) - return -EINVAL; - - if (kvm_state->vmx.vmcs_pa != -1ull) - return -EINVAL; - - vmx_leave_nested(vcpu); - return 0; - } - - if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) - return -EINVAL; - - if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && - (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) - return -EINVAL; - - if (kvm_state->vmx.smm.flags & - ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) - return -EINVAL; - - /* - * SMM temporarily disables VMX, so we cannot be in guest mode, - * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags - * must be zero. - */ - if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) - return -EINVAL; - - if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && - !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) - return -EINVAL; - - vmx_leave_nested(vcpu); - if (kvm_state->vmx.vmxon_pa == -1ull) - return 0; - - vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; - ret = enter_vmx_operation(vcpu); - if (ret) - return ret; - - /* Empty 'VMXON' state is permitted */ - if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) - return 0; - - if (kvm_state->vmx.vmcs_pa != -1ull) { - if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || - !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) - return -EINVAL; - - set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); - } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { - /* - * Sync eVMCS upon entry as we may not have - * HV_X64_MSR_VP_ASSIST_PAGE set up yet. - */ - vmx->nested.need_vmcs12_sync = true; - } else { - return -EINVAL; - } - - if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { - vmx->nested.smm.vmxon = true; - vmx->nested.vmxon = false; - - if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) - vmx->nested.smm.guest_mode = true; - } - - vmcs12 = get_vmcs12(vcpu); - if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) - return -EFAULT; - - if (vmcs12->hdr.revision_id != VMCS12_REVISION) - return -EINVAL; - - if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) - return 0; - - vmx->nested.nested_run_pending = - !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); - - if (nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { - struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); - if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) - return -EINVAL; - - if (copy_from_user(shadow_vmcs12, - user_kvm_nested_state->data + VMCS12_SIZE, - sizeof(*vmcs12))) - return -EFAULT; - - if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || - !shadow_vmcs12->hdr.shadow_vmcs) - return -EINVAL; - } - - if (check_vmentry_prereqs(vcpu, vmcs12) || - check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) - return -EINVAL; - - vmx->nested.dirty_vmcs12 = true; - ret = nested_vmx_enter_non_root_mode(vcpu, false); - if (ret) - return -EINVAL; - - return 0; -} - -static __exit void nested_vmx_hardware_unsetup(void) -{ - int i; - - if (enable_shadow_vmcs) { - for (i = 0; i < VMX_BITMAP_NR; i++) - free_page((unsigned long)vmx_bitmap[i]); - } -} - -static __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) -{ - int i; - - if (enable_shadow_vmcs) { - for (i = 0; i < VMX_BITMAP_NR; i++) { - vmx_bitmap[i] = (unsigned long *) - __get_free_page(GFP_KERNEL); - if (!vmx_bitmap[i]) { - nested_vmx_hardware_unsetup(); - return -ENOMEM; - } - } - - init_vmcs_shadow_fields(); - } - - exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear, - exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch, - exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld, - exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst, - exit_handlers[EXIT_REASON_VMREAD] = handle_vmread, - exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume, - exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite, - exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff, - exit_handlers[EXIT_REASON_VMON] = handle_vmon, - exit_handlers[EXIT_REASON_INVEPT] = handle_invept, - exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid, - exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc, - - kvm_x86_ops->check_nested_events = vmx_check_nested_events; - kvm_x86_ops->get_nested_state = vmx_get_nested_state; - kvm_x86_ops->set_nested_state = vmx_set_nested_state; - kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages, - kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; - - return 0; -} - static __init int hardware_setup(void) { unsigned long host_bndcfgs; @@ -13138,9 +7223,6 @@ static __init int hardware_setup(void) kvm_x86_ops->cancel_hv_timer = NULL; } - if (!cpu_has_vmx_shadow_vmcs() || !nested) - enable_shadow_vmcs = 0; - kvm_set_posted_intr_wakeup_handler(wakeup_handler); kvm_mce_cap_supported |= MCG_LMCE_P; -- cgit v1.2.3 From 0023ef39dc35c773c436eaa46ca539a26b308b55 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 5 Dec 2018 15:28:58 -0800 Subject: kvm: vmx: Set IA32_TSC_AUX for legacy mode guests RDTSCP is supported in legacy mode as well as long mode. The IA32_TSC_AUX MSR should be set to the correct guest value before entering any guest that supports RDTSCP. Fixes: 4e47c7a6d714 ("KVM: VMX: Add instruction rdtscp support for guest") Signed-off-by: Jim Mattson Reviewed-by: Peter Shier Reviewed-by: Marc Orr Reviewed-by: Liran Alon Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 47943d073d6d..c1feb62faac4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1382,9 +1382,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) - move_msr_up(vmx, index, save_nmsrs++); /* * MSR_STAR is only needed on long mode guests, and only * if efer.sce is enabled. @@ -1397,6 +1394,9 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_EFER); if (index >= 0 && update_transition_efer(vmx, index)) move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_TSC_AUX); + if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) + move_msr_up(vmx, index, save_nmsrs++); vmx->save_nmsrs = save_nmsrs; vmx->guest_msrs_dirty = true; -- cgit v1.2.3 From 898a811f1486df3209ce857027d0564d6a616cf2 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 5 Dec 2018 15:28:59 -0800 Subject: kvm: vmx: Document the need for MSR_STAR in i386 builds Add a comment explaining why MSR_STAR must be included in vmx_msr_index[] even for i386 builds. The elided comment has not been relevant since move_msr_up() was introduced in commit a75beee6e4f5d ("KVM: VMX: Avoid saving and restoring msrs on lightweight vmexit"). Signed-off-by: Jim Mattson Reviewed-by: Peter Shier Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c1feb62faac4..d754fd0c8674 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -373,8 +373,11 @@ static const struct kvm_vmx_segment_field { u64 host_efer; /* - * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it - * away by decrementing the array size. + * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm + * will emulate SYSCALL in legacy mode if the vendor string in guest + * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To + * support this emulation, IA32_STAR must always be included in + * vmx_msr_index[], even in i386 builds. */ const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 -- cgit v1.2.3 From db31c8f5af7de0e78f63fd4c255cf3868d667bf4 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 5 Dec 2018 15:29:00 -0800 Subject: kvm: vmx: Don't set hardware IA32_CSTAR MSR on VM-entry SYSCALL raises #UD in compatibility mode on Intel CPUs, so it's pointless to load the guest's IA32_CSTAR value into the hardware MSR. IA32_CSTAR still provides 48 bits of storage on Intel CPUs that have CPUID.80000001:EDX.LM[bit 29] set, so we cannot remove it from the vmx_msr_index[] array. Signed-off-by: Jim Mattson Reviewed-by: Peter Shier Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d754fd0c8674..2084db097622 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1380,9 +1380,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); index = __find_msr_index(vmx, MSR_LSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); /* -- cgit v1.2.3 From 84c8c5b8f82f9939b1bcb3b5e25da2b5fc47ab50 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 5 Dec 2018 15:29:01 -0800 Subject: kvm: vmx: Skip all SYSCALL MSRs in setup_msrs() when !EFER.SCE Like IA32_STAR, IA32_LSTAR and IA32_FMASK only need to contain guest values on VM-entry when the guest is in long mode and EFER.SCE is set. Signed-off-by: Jim Mattson Reviewed-by: Peter Shier Reviewed-by: Marc Orr Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2084db097622..579f4c4144d2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1375,19 +1375,19 @@ static void setup_msrs(struct vcpu_vmx *vmx) save_nmsrs = 0; #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) { - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); + /* + * The SYSCALL MSRs are only needed on long mode guests, and only + * when EFER.SCE is set. + */ + if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) { + index = __find_msr_index(vmx, MSR_STAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); index = __find_msr_index(vmx, MSR_LSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); - /* - * MSR_STAR is only needed on long mode guests, and only - * if efer.sce is enabled. - */ - index = __find_msr_index(vmx, MSR_STAR); - if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) + index = __find_msr_index(vmx, MSR_SYSCALL_MASK); + if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); } #endif -- cgit v1.2.3 From b2227ddec1cca6c52f49a19ec1c7a40cd27e0566 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Fri, 14 Dec 2018 16:18:03 +0800 Subject: kvm: svm: remove unused struct definition structure svm_init_data is never used. So remove it. Signed-off-by: Peng Hao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d7fec9f4af43..3cbe67a31c54 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -675,11 +675,6 @@ struct svm_cpu_data { static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); -struct svm_init_data { - int cpu; - int r; -}; - static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) -- cgit v1.2.3 From eb1ff0a913ca092ae5a4e7c9277b0a634c645eee Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Tue, 4 Dec 2018 17:42:50 +0800 Subject: kvm: x86: remove unnecessary recalculate_apic_map In the previous code, the variable apic_sw_disabled influences recalculate_apic_map. But in "KVM: x86: simplify kvm_apic_map" (commit: 3b5a5ffa928a3f875b0d5dd284eeb7c322e1688a), the access to apic_sw_disabled in recalculate_apic_map has been deleted. Signed-off-by: Peng Hao Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c4533d05c214..9f089e2e09d0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -251,10 +251,9 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) if (enabled != apic->sw_enabled) { apic->sw_enabled = enabled; - if (enabled) { + if (enabled) static_key_slow_dec_deferred(&apic_sw_disabled); - recalculate_apic_map(apic->vcpu->kvm); - } else + else static_key_slow_inc(&apic_sw_disabled.key); } } -- cgit v1.2.3 From 3a0e7731724f6ac684129f1763eef4d2d3e7dec7 Mon Sep 17 00:00:00 2001 From: Roman Kagan Date: Mon, 10 Dec 2018 18:47:26 +0000 Subject: x86: kvm: hyperv: simplify SynIC message delivery SynIC message delivery is somewhat overengineered: it pretends to follow the ordering rules when grabbing the message slot, using atomic operations and all that, but does it incorrectly and unnecessarily. The correct order would be to first set .msg_pending, then atomically replace .message_type if it was zero, and then clear .msg_pending if the previous step was successful. But this all is done in vcpu context so the whole update looks atomic to the guest (it's assumed to only access the message page from this cpu), and therefore can be done in whatever order is most convenient (and is also the reason why the incorrect order didn't trigger any bugs so far). While at this, also switch to kvm_vcpu_{read,write}_guest_page, and drop the no longer needed synic_clear_sint_msg_pending. Signed-off-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 98 +++++++++++++++++++++------------------------------ 1 file changed, 41 insertions(+), 57 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4e80080f277a..17d04d2c6d4f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -158,32 +158,6 @@ static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) return (synic->active) ? synic : NULL; } -static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic, - u32 sint) -{ - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct page *page; - gpa_t gpa; - struct hv_message *msg; - struct hv_message_page *msg_page; - - gpa = synic->msg_page & PAGE_MASK; - page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) { - vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n", - gpa); - return; - } - msg_page = kmap_atomic(page); - - msg = &msg_page->sint_message[sint]; - msg->header.message_flags.msg_pending = 0; - - kunmap_atomic(msg_page); - kvm_release_page_dirty(page); - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); -} - static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) { struct kvm *kvm = vcpu->kvm; @@ -194,9 +168,6 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); - if (synic->msg_page & HV_SYNIC_SIMP_ENABLE) - synic_clear_sint_msg_pending(synic, sint); - /* Try to deliver pending Hyper-V SynIC timers messages */ stimers_pending = 0; for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { @@ -589,41 +560,54 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, struct hv_message *src_msg) { struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct page *page; - gpa_t gpa; - struct hv_message *dst_msg; + int msg_off = offsetof(struct hv_message_page, sint_message[sint]); + gfn_t msg_page_gfn; + struct hv_message_header hv_hdr; int r; - struct hv_message_page *msg_page; if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE)) return -ENOENT; - gpa = synic->msg_page & PAGE_MASK; - page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) - return -EFAULT; + msg_page_gfn = synic->msg_page >> PAGE_SHIFT; - msg_page = kmap_atomic(page); - dst_msg = &msg_page->sint_message[sint]; - if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE, - src_msg->header.message_type) != HVMSG_NONE) { - dst_msg->header.message_flags.msg_pending = 1; - r = -EAGAIN; - } else { - memcpy(&dst_msg->u.payload, &src_msg->u.payload, - src_msg->header.payload_size); - dst_msg->header.message_type = src_msg->header.message_type; - dst_msg->header.payload_size = src_msg->header.payload_size; - r = synic_set_irq(synic, sint); - if (r >= 1) - r = 0; - else if (r == 0) - r = -EFAULT; + /* + * Strictly following the spec-mandated ordering would assume setting + * .msg_pending before checking .message_type. However, this function + * is only called in vcpu context so the entire update is atomic from + * guest POV and thus the exact order here doesn't matter. + */ + r = kvm_vcpu_read_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_type, + msg_off + offsetof(struct hv_message, + header.message_type), + sizeof(hv_hdr.message_type)); + if (r < 0) + return r; + + if (hv_hdr.message_type != HVMSG_NONE) { + hv_hdr.message_flags.msg_pending = 1; + r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, + &hv_hdr.message_flags, + msg_off + + offsetof(struct hv_message, + header.message_flags), + sizeof(hv_hdr.message_flags)); + if (r < 0) + return r; + return -EAGAIN; } - kunmap_atomic(msg_page); - kvm_release_page_dirty(page); - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); - return r; + + r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, src_msg, msg_off, + sizeof(src_msg->header) + + src_msg->header.payload_size); + if (r < 0) + return r; + + r = synic_set_irq(synic, sint); + if (r < 0) + return r; + if (r == 0) + return -EFAULT; + return 0; } static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) -- cgit v1.2.3 From 7deec5e0df741102c9b54156c88cd1476d272ae5 Mon Sep 17 00:00:00 2001 From: Roman Kagan Date: Mon, 10 Dec 2018 18:47:27 +0000 Subject: x86: kvm: hyperv: don't retry message delivery for periodic timers The SynIC message delivery protocol allows the message originator to request, should the message slot be busy, to be notified when it's free. However, this is unnecessary and even undesirable for messages generated by SynIC timers in periodic mode: if the period is short enough compared to the time the guest spends in the timer interrupt handler, so the timer ticks start piling up, the excessive interactions due to this notification and retried message delivery only makes the things worse. [This was observed, in particular, with Windows L2 guests setting (temporarily) the periodic timer to 2 kHz, and spending hundreds of microseconds in the timer interrupt handler due to several L2->L1 exits; under some load in L0 this could exceed 500 us so the timer ticks started to pile up and the guest livelocked.] Relieve the situation somewhat by not retrying message delivery for periodic SynIC timers. This appears to remain within the "lazy" lost ticks policy for SynIC timers as implemented in KVM. Note that it doesn't solve the fundamental problem of livelocking the guest with a periodic timer whose period is smaller than the time needed to process a tick, but it makes it a bit less likely to be triggered. Signed-off-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 17d04d2c6d4f..ded4b0c90984 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -557,7 +557,7 @@ static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) } static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, - struct hv_message *src_msg) + struct hv_message *src_msg, bool no_retry) { struct kvm_vcpu *vcpu = synic_to_vcpu(synic); int msg_off = offsetof(struct hv_message_page, sint_message[sint]); @@ -584,6 +584,9 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, return r; if (hv_hdr.message_type != HVMSG_NONE) { + if (no_retry) + return 0; + hv_hdr.message_flags.msg_pending = 1; r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_flags, @@ -617,10 +620,17 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) struct hv_timer_message_payload *payload = (struct hv_timer_message_payload *)&msg->u.payload; + /* + * To avoid piling up periodic ticks, don't retry message + * delivery for them (within "lazy" lost ticks policy). + */ + bool no_retry = stimer->config & HV_STIMER_PERIODIC; + payload->expiration_time = stimer->exp_time; payload->delivery_time = get_time_ref_counter(vcpu->kvm); return synic_deliver_msg(vcpu_to_synic(vcpu), - HV_STIMER_SINT(stimer->config), msg); + HV_STIMER_SINT(stimer->config), msg, + no_retry); } static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) -- cgit v1.2.3 From a4987defc1e66c5b88c2a97e3a0ef386631587fa Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 10 Dec 2018 18:21:53 +0100 Subject: x86/hyper-v: Do some housekeeping in hyperv-tlfs.h hyperv-tlfs.h is a bit messy: CPUID feature bits are not always sorted, it's hard to get which CPUID they belong to, some items are duplicated (e.g. HV_X64_MSR_CRASH_CTL_NOTIFY/HV_CRASH_CTL_CRASH_NOTIFY). Do some housekeeping work. While on it, replace all (1 << X) with BIT(X) macro. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Michael Kelley Acked-by: Thomas Gleixner Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 186 ++++++++++++++++++------------------- arch/x86/kvm/hyperv.c | 4 +- 2 files changed, 93 insertions(+), 97 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 1e0d08af4050..bb9a3f1a3f88 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -30,158 +30,151 @@ /* * Feature identification. EAX indicates which features are available * to the partition based upon the current partition privileges. + * These are HYPERV_CPUID_FEATURES.EAX bits. */ /* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ -#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE BIT(0) /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ -#define HV_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) -/* Partition reference TSC MSR is available */ -#define HV_MSR_REFERENCE_TSC_AVAILABLE (1 << 9) -/* Partition Guest IDLE MSR is available */ -#define HV_X64_MSR_GUEST_IDLE_AVAILABLE (1 << 10) - -/* A partition's reference time stamp counter (TSC) page */ -#define HV_X64_MSR_REFERENCE_TSC 0x40000021 - -/* - * There is a single feature flag that signifies if the partition has access - * to MSRs with local APIC and TSC frequencies. - */ -#define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11) - -/* AccessReenlightenmentControls privilege */ -#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) - +#define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) /* * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available */ -#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2) +#define HV_X64_MSR_SYNIC_AVAILABLE BIT(2) /* * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through * HV_X64_MSR_STIMER3_COUNT) available */ -#define HV_MSR_SYNTIMER_AVAILABLE (1 << 3) +#define HV_MSR_SYNTIMER_AVAILABLE BIT(3) /* * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) * are available */ -#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4) +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE BIT(4) /* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ -#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5) +#define HV_X64_MSR_HYPERCALL_AVAILABLE BIT(5) /* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ -#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6) +#define HV_X64_MSR_VP_INDEX_AVAILABLE BIT(6) /* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ -#define HV_X64_MSR_RESET_AVAILABLE (1 << 7) - /* - * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, - * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, - * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available - */ -#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) - -/* Frequency MSRs available */ -#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE (1 << 8) - -/* Crash MSR available */ -#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10) - -/* stimer Direct Mode is available */ -#define HV_STIMER_DIRECT_MODE_AVAILABLE (1 << 19) +#define HV_X64_MSR_RESET_AVAILABLE BIT(7) +/* + * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, + * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, + * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available + */ +#define HV_X64_MSR_STAT_PAGES_AVAILABLE BIT(8) +/* Partition reference TSC MSR is available */ +#define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) +/* Partition Guest IDLE MSR is available */ +#define HV_X64_MSR_GUEST_IDLE_AVAILABLE BIT(10) +/* + * There is a single feature flag that signifies if the partition has access + * to MSRs with local APIC and TSC frequencies. + */ +#define HV_X64_ACCESS_FREQUENCY_MSRS BIT(11) +/* AccessReenlightenmentControls privilege */ +#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) /* - * Feature identification: EBX indicates which flags were specified at - * partition creation. The format is the same as the partition creation - * flag structure defined in section Partition Creation Flags. + * Feature identification: indicates which flags were specified at partition + * creation. The format is the same as the partition creation flag structure + * defined in section Partition Creation Flags. + * These are HYPERV_CPUID_FEATURES.EBX bits. */ -#define HV_X64_CREATE_PARTITIONS (1 << 0) -#define HV_X64_ACCESS_PARTITION_ID (1 << 1) -#define HV_X64_ACCESS_MEMORY_POOL (1 << 2) -#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3) -#define HV_X64_POST_MESSAGES (1 << 4) -#define HV_X64_SIGNAL_EVENTS (1 << 5) -#define HV_X64_CREATE_PORT (1 << 6) -#define HV_X64_CONNECT_PORT (1 << 7) -#define HV_X64_ACCESS_STATS (1 << 8) -#define HV_X64_DEBUGGING (1 << 11) -#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12) -#define HV_X64_CONFIGURE_PROFILER (1 << 13) +#define HV_X64_CREATE_PARTITIONS BIT(0) +#define HV_X64_ACCESS_PARTITION_ID BIT(1) +#define HV_X64_ACCESS_MEMORY_POOL BIT(2) +#define HV_X64_ADJUST_MESSAGE_BUFFERS BIT(3) +#define HV_X64_POST_MESSAGES BIT(4) +#define HV_X64_SIGNAL_EVENTS BIT(5) +#define HV_X64_CREATE_PORT BIT(6) +#define HV_X64_CONNECT_PORT BIT(7) +#define HV_X64_ACCESS_STATS BIT(8) +#define HV_X64_DEBUGGING BIT(11) +#define HV_X64_CPU_POWER_MANAGEMENT BIT(12) +#define HV_X64_CONFIGURE_PROFILER BIT(13) /* * Feature identification. EDX indicates which miscellaneous features * are available to the partition. + * These are HYPERV_CPUID_FEATURES.EDX bits. */ /* The MWAIT instruction is available (per section MONITOR / MWAIT) */ -#define HV_X64_MWAIT_AVAILABLE (1 << 0) +#define HV_X64_MWAIT_AVAILABLE BIT(0) /* Guest debugging support is available */ -#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1) +#define HV_X64_GUEST_DEBUGGING_AVAILABLE BIT(1) /* Performance Monitor support is available*/ -#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2) +#define HV_X64_PERF_MONITOR_AVAILABLE BIT(2) /* Support for physical CPU dynamic partitioning events is available*/ -#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3) +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE BIT(3) /* * Support for passing hypercall input parameter block via XMM * registers is available */ -#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) +#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE BIT(4) /* Support for a virtual guest idle state is available */ -#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) -/* Guest crash data handler available */ -#define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10) +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE BIT(5) +/* Frequency MSRs available */ +#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE BIT(8) +/* Crash MSR available */ +#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) +/* stimer Direct Mode is available */ +#define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19) /* * Implementation recommendations. Indicates which behaviors the hypervisor * recommends the OS implement for optimal performance. + * These are HYPERV_CPUID_ENLIGHTMENT_INFO.EAX bits. + */ +/* + * Recommend using hypercall for address space switches rather + * than MOV to CR3 instruction */ - /* - * Recommend using hypercall for address space switches rather - * than MOV to CR3 instruction - */ -#define HV_X64_AS_SWITCH_RECOMMENDED (1 << 0) +#define HV_X64_AS_SWITCH_RECOMMENDED BIT(0) /* Recommend using hypercall for local TLB flushes rather * than INVLPG or MOV to CR3 instructions */ -#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED BIT(1) /* * Recommend using hypercall for remote TLB flushes rather * than inter-processor interrupts */ -#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2) +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED BIT(2) /* * Recommend using MSRs for accessing APIC registers * EOI, ICR and TPR rather than their memory-mapped counterparts */ -#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3) +#define HV_X64_APIC_ACCESS_RECOMMENDED BIT(3) /* Recommend using the hypervisor-provided MSR to initiate a system RESET */ -#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4) +#define HV_X64_SYSTEM_RESET_RECOMMENDED BIT(4) /* * Recommend using relaxed timing for this partition. If used, * the VM should disable any watchdog timeouts that rely on the * timely delivery of external interrupts */ -#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) +#define HV_X64_RELAXED_TIMING_RECOMMENDED BIT(5) /* * Recommend not using Auto End-Of-Interrupt feature */ -#define HV_DEPRECATING_AEOI_RECOMMENDED (1 << 9) +#define HV_DEPRECATING_AEOI_RECOMMENDED BIT(9) /* * Recommend using cluster IPI hypercalls. */ -#define HV_X64_CLUSTER_IPI_RECOMMENDED (1 << 10) +#define HV_X64_CLUSTER_IPI_RECOMMENDED BIT(10) /* Recommend using the newer ExProcessorMasks interface */ -#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED BIT(11) /* Recommend using enlightened VMCS */ -#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED (1 << 14) +#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) -/* - * Crash notification flags. - */ -#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) -#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) +/* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */ +#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) +#define HV_X64_NESTED_MSR_BITMAP BIT(19) + +/* Hyper-V specific model specific registers (MSRs) */ /* MSR used to identify the guest OS. */ #define HV_X64_MSR_GUEST_OS_ID 0x40000000 @@ -201,6 +194,9 @@ /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 +/* A partition's reference time stamp counter (TSC) page */ +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 + /* MSR used to retrieve the TSC frequency */ #define HV_X64_MSR_TSC_FREQUENCY 0x40000022 @@ -258,9 +254,11 @@ #define HV_X64_MSR_CRASH_P3 0x40000103 #define HV_X64_MSR_CRASH_P4 0x40000104 #define HV_X64_MSR_CRASH_CTL 0x40000105 -#define HV_X64_MSR_CRASH_CTL_NOTIFY (1ULL << 63) -#define HV_X64_MSR_CRASH_PARAMS \ - (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) + +/* TSC emulation after migration */ +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 /* * Declare the MSR used to setup pages used to communicate with the hypervisor. @@ -311,13 +309,6 @@ struct ms_hyperv_tsc_page { #define HV_LINUX_VENDOR_ID 0x8100 -/* TSC emulation after migration */ -#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 - -/* Nested features (CPUID 0x4000000A) EAX */ -#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) -#define HV_X64_NESTED_MSR_BITMAP BIT(19) - struct hv_reenlightenment_control { __u64 vector:8; __u64 reserved1:8; @@ -326,9 +317,6 @@ struct hv_reenlightenment_control { __u64 target_vp:32; } __packed; -#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 -#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 - struct hv_tsc_emulation_control { __u64 enabled:1; __u64 reserved:63; @@ -344,6 +332,14 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) +/* + * Crash notification (HV_X64_MSR_CRASH_CTL) flags. + */ +#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) +#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) +#define HV_X64_MSR_CRASH_PARAMS \ + (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) + #define HV_IPI_LOW_VECTOR 0x10 #define HV_IPI_HIGH_VECTOR 0xff diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index ded4b0c90984..a9b5d6c4e3eb 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -809,9 +809,9 @@ static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host) struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; if (host) - hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY; + hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY; - if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) { + if (!host && (data & HV_CRASH_CTL_CRASH_NOTIFY)) { vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", hv->hv_crash_param[0], -- cgit v1.2.3 From e2e871ab2f02dc9ca5f06065234475393dcec38b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 10 Dec 2018 18:21:55 +0100 Subject: x86/kvm/hyper-v: Introduce nested_get_evmcs_version() helper The upcoming KVM_GET_SUPPORTED_HV_CPUID ioctl will need to return Enlightened VMCS version in HYPERV_CPUID_NESTED_FEATURES.EAX when it was enabled. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 7 +++++++ arch/x86/kvm/vmx/evmcs.c | 23 ++++++++++++++++------- arch/x86/kvm/vmx/evmcs.h | 1 + arch/x86/kvm/vmx/nested.c | 1 + 5 files changed, 26 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fbda5a917c5b..f5c2ce4f01e8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1186,6 +1186,7 @@ struct kvm_x86_ops { int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); + uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3cbe67a31c54..0762e81e0498 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -7047,6 +7047,12 @@ failed: return ret; } +static uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) +{ + /* Not supported */ + return 0; +} + static int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { @@ -7185,6 +7191,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .mem_enc_unreg_region = svm_unregister_enc_region, .nested_enable_evmcs = nested_enable_evmcs, + .nested_get_evmcs_version = nested_get_evmcs_version, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 155fb4a39472..95bc2247478d 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -313,19 +313,28 @@ void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) } #endif +uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * vmcs_version represents the range of supported Enlightened VMCS + * versions: lower 8 bits is the minimal version, higher 8 bits is the + * maximum supported version. KVM supports versions from 1 to + * KVM_EVMCS_VERSION. + */ + if (vmx->nested.enlightened_vmcs_enabled) + return (KVM_EVMCS_VERSION << 8) | 1; + + return 0; +} + int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * vmcs_version represents the range of supported Enlightened VMCS - * versions: lower 8 bits is the minimal version, higher 8 bits is the - * maximum supported version. KVM supports versions from 1 to - * KVM_EVMCS_VERSION. - */ if (vmcs_version) - *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1; + *vmcs_version = nested_get_evmcs_version(vcpu); /* We don't support disabling the feature for simplicity. */ if (vmx->nested.enlightened_vmcs_enabled) diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 1572a3759d65..e0fcef85b332 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -195,6 +195,7 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ +uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 70e6d604c2a0..bb3a7030eb39 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5669,6 +5669,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) kvm_x86_ops->set_nested_state = vmx_set_nested_state; kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages, kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; + kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version; return 0; } -- cgit v1.2.3 From 2bc39970e9327ceb06cb210f86ba35f81d00e350 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 10 Dec 2018 18:21:56 +0100 Subject: x86/kvm/hyper-v: Introduce KVM_GET_SUPPORTED_HV_CPUID With every new Hyper-V Enlightenment we implement we're forced to add a KVM_CAP_HYPERV_* capability. While this approach works it is fairly inconvenient: the majority of the enlightenments we do have corresponding CPUID feature bit(s) and userspace has to know this anyways to be able to expose the feature to the guest. Add KVM_GET_SUPPORTED_HV_CPUID ioctl (backed by KVM_CAP_HYPERV_CPUID, "one cap to rule them all!") returning all Hyper-V CPUID feature leaves. Using the existing KVM_GET_SUPPORTED_CPUID doesn't seem to be possible: Hyper-V CPUID feature leaves intersect with KVM's (e.g. 0x40000000, 0x40000001) and we would probably confuse userspace in case we decide to return these twice. KVM_CAP_HYPERV_CPUID's number is interim: we're intended to drop KVM_CAP_HYPERV_STIMER_DIRECT and use its number instead. Suggested-by: Paolo Bonzini Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 56 ++++++++++++++++++ arch/x86/kvm/hyperv.c | 121 ++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/hyperv.h | 2 + arch/x86/kvm/x86.c | 20 +++++++ include/uapi/linux/kvm.h | 4 ++ 5 files changed, 203 insertions(+) (limited to 'arch/x86/kvm') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f2c345f7b630..356156f5c52d 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3800,6 +3800,62 @@ is enabled; for more information, see the description of the capability. However, it can always be used as long as KVM_CHECK_EXTENSION confirms that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is present. +4.118 KVM_GET_SUPPORTED_HV_CPUID + +Capability: KVM_CAP_HYPERV_CPUID +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_cpuid2 (in/out) +Returns: 0 on success, -1 on error + +struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; +}; + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +This ioctl returns x86 cpuid features leaves related to Hyper-V emulation in +KVM. Userspace can use the information returned by this ioctl to construct +cpuid information presented to guests consuming Hyper-V enlightenments (e.g. +Windows or Hyper-V guests). + +CPUID feature leaves returned by this ioctl are defined by Hyper-V Top Level +Functional Specification (TLFS). These leaves can't be obtained with +KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature +leaves (0x40000000, 0x40000001). + +Currently, the following list of CPUID leaves are returned: + HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS + HYPERV_CPUID_INTERFACE + HYPERV_CPUID_VERSION + HYPERV_CPUID_FEATURES + HYPERV_CPUID_ENLIGHTMENT_INFO + HYPERV_CPUID_IMPLEMENT_LIMITS + HYPERV_CPUID_NESTED_FEATURES + +HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was +enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS). + +Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure +with the 'nent' field indicating the number of entries in the variable-size +array 'entries'. If the number of entries is too low to describe all Hyper-V +feature leaves, an error (E2BIG) is returned. If the number is more or equal +to the number of Hyper-V feature leaves, the 'nent' field is adjusted to the +number of valid entries in the 'entries' array, which is then filled. + +'index' and 'flags' fields in 'struct kvm_cpuid_entry2' are currently reserved, +userspace should not expect to get any particular value there. 5. The kvm_run structure ------------------------ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a9b5d6c4e3eb..e3c076020d4e 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1752,3 +1752,124 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args) return kvm_hv_eventfd_deassign(kvm, args->conn_id); return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd); } + +int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + uint16_t evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu); + struct kvm_cpuid_entry2 cpuid_entries[] = { + { .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS }, + { .function = HYPERV_CPUID_INTERFACE }, + { .function = HYPERV_CPUID_VERSION }, + { .function = HYPERV_CPUID_FEATURES }, + { .function = HYPERV_CPUID_ENLIGHTMENT_INFO }, + { .function = HYPERV_CPUID_IMPLEMENT_LIMITS }, + { .function = HYPERV_CPUID_NESTED_FEATURES }, + }; + int i, nent = ARRAY_SIZE(cpuid_entries); + + /* Skip NESTED_FEATURES if eVMCS is not supported */ + if (!evmcs_ver) + --nent; + + if (cpuid->nent < nent) + return -E2BIG; + + if (cpuid->nent > nent) + cpuid->nent = nent; + + for (i = 0; i < nent; i++) { + struct kvm_cpuid_entry2 *ent = &cpuid_entries[i]; + u32 signature[3]; + + switch (ent->function) { + case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: + memcpy(signature, "Linux KVM Hv", 12); + + ent->eax = HYPERV_CPUID_NESTED_FEATURES; + ent->ebx = signature[0]; + ent->ecx = signature[1]; + ent->edx = signature[2]; + break; + + case HYPERV_CPUID_INTERFACE: + memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12); + ent->eax = signature[0]; + break; + + case HYPERV_CPUID_VERSION: + /* + * We implement some Hyper-V 2016 functions so let's use + * this version. + */ + ent->eax = 0x00003839; + ent->ebx = 0x000A0000; + break; + + case HYPERV_CPUID_FEATURES: + ent->eax |= HV_X64_MSR_VP_RUNTIME_AVAILABLE; + ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; + ent->eax |= HV_X64_MSR_SYNIC_AVAILABLE; + ent->eax |= HV_MSR_SYNTIMER_AVAILABLE; + ent->eax |= HV_X64_MSR_APIC_ACCESS_AVAILABLE; + ent->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE; + ent->eax |= HV_X64_MSR_VP_INDEX_AVAILABLE; + ent->eax |= HV_X64_MSR_RESET_AVAILABLE; + ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; + ent->eax |= HV_X64_MSR_GUEST_IDLE_AVAILABLE; + ent->eax |= HV_X64_ACCESS_FREQUENCY_MSRS; + ent->eax |= HV_X64_ACCESS_REENLIGHTENMENT; + + ent->ebx |= HV_X64_POST_MESSAGES; + ent->ebx |= HV_X64_SIGNAL_EVENTS; + + ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; + ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; + + break; + + case HYPERV_CPUID_ENLIGHTMENT_INFO: + ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; + ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; + ent->eax |= HV_X64_SYSTEM_RESET_RECOMMENDED; + ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; + ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; + ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; + + /* + * Default number of spinlock retry attempts, matches + * HyperV 2016. + */ + ent->ebx = 0x00000FFF; + + break; + + case HYPERV_CPUID_IMPLEMENT_LIMITS: + /* Maximum number of virtual processors */ + ent->eax = KVM_MAX_VCPUS; + /* + * Maximum number of logical processors, matches + * HyperV 2016. + */ + ent->ebx = 64; + + break; + + case HYPERV_CPUID_NESTED_FEATURES: + ent->eax = evmcs_ver; + + break; + + default: + break; + } + } + + if (copy_to_user(entries, cpuid_entries, + nent * sizeof(struct kvm_cpuid_entry2))) + return -EFAULT; + + return 0; +} diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 9c21c3479899..fd7cf13a2144 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -97,5 +97,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, void kvm_hv_init_vm(struct kvm *kvm); void kvm_hv_destroy_vm(struct kvm *kvm); int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); +int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54ef79421c6b..18b7817af2bc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2997,6 +2997,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_TLBFLUSH: case KVM_CAP_HYPERV_SEND_IPI: case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: + case KVM_CAP_HYPERV_CPUID: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -4191,6 +4192,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); break; } + case KVM_GET_SUPPORTED_HV_CPUID: { + struct kvm_cpuid2 __user *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) + goto out; + + r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid, + cpuid_arg->entries); + if (r) + goto out; + + r = -EFAULT; + if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) + goto out; + r = 0; + break; + } default: r = -EINVAL; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 9fe35f1ac938..6d4ea4b6c922 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -987,6 +987,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_EXCEPTION_PAYLOAD 164 #define KVM_CAP_ARM_VM_IPA_SIZE 165 #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 +#define KVM_CAP_HYPERV_CPUID 167 #ifdef KVM_CAP_IRQ_ROUTING @@ -1436,6 +1437,9 @@ struct kvm_enc_region { /* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT */ #define KVM_CLEAR_DIRTY_LOG _IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log) +/* Available with KVM_CAP_HYPERV_CPUID */ +#define KVM_GET_SUPPORTED_HV_CPUID _IOWR(KVMIO, 0xc1, struct kvm_cpuid2) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ -- cgit v1.2.3 From 6a058a1eadc3882eff6efaa757f2c71a31fe9906 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 26 Nov 2018 16:47:30 +0100 Subject: x86/kvm/hyper-v: use stimer config definition from hyperv-tlfs.h As a preparation to implementing Direct Mode for Hyper-V synthetic timers switch to using stimer config definition from hyperv-tlfs.h. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 6 ------ arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/hyperv.c | 35 ++++++++++++++++++----------------- 3 files changed, 19 insertions(+), 24 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index ce438d49a192..a2fa7ab2a6a8 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -723,12 +723,6 @@ struct hv_enlightened_vmcs { #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF -#define HV_STIMER_ENABLE (1ULL << 0) -#define HV_STIMER_PERIODIC (1ULL << 1) -#define HV_STIMER_LAZY (1ULL << 2) -#define HV_STIMER_AUTOENABLE (1ULL << 3) -#define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F) - /* Define synthetic interrupt controller flag constants. */ #define HV_EVENT_FLAGS_COUNT (256 * 8) #define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long)) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f5c2ce4f01e8..ddeddf2a992c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -497,7 +497,7 @@ struct kvm_mtrr { struct kvm_vcpu_hv_stimer { struct hrtimer timer; int index; - u64 config; + union hv_stimer_config config; u64 count; u64 exp_time; struct hv_message msg; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index e3c076020d4e..15849d266078 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -172,9 +172,8 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) stimers_pending = 0; for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { stimer = &hv_vcpu->stimer[idx]; - if (stimer->msg_pending && - (stimer->config & HV_STIMER_ENABLE) && - HV_STIMER_SINT(stimer->config) == sint) { + if (stimer->msg_pending && stimer->config.enable && + stimer->config.sintx == sint) { set_bit(stimer->index, hv_vcpu->stimer_pending_bitmap); stimers_pending++; @@ -468,7 +467,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm); ktime_now = ktime_get(); - if (stimer->config & HV_STIMER_PERIODIC) { + if (stimer->config.periodic) { if (stimer->exp_time) { if (time_now >= stimer->exp_time) { u64 remainder; @@ -517,13 +516,15 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, bool host) { + union hv_stimer_config new_config = {.as_uint64 = config}; + trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); stimer_cleanup(stimer); - if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0) - config &= ~HV_STIMER_ENABLE; - stimer->config = config; + if (stimer->config.enable && new_config.sintx == 0) + new_config.enable = 0; + stimer->config.as_uint64 = new_config.as_uint64; stimer_mark_pending(stimer, false); return 0; } @@ -537,16 +538,16 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, stimer_cleanup(stimer); stimer->count = count; if (stimer->count == 0) - stimer->config &= ~HV_STIMER_ENABLE; - else if (stimer->config & HV_STIMER_AUTOENABLE) - stimer->config |= HV_STIMER_ENABLE; + stimer->config.enable = 0; + else if (stimer->config.auto_enable) + stimer->config.enable = 1; stimer_mark_pending(stimer, false); return 0; } static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig) { - *pconfig = stimer->config; + *pconfig = stimer->config.as_uint64; return 0; } @@ -624,12 +625,12 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) * To avoid piling up periodic ticks, don't retry message * delivery for them (within "lazy" lost ticks policy). */ - bool no_retry = stimer->config & HV_STIMER_PERIODIC; + bool no_retry = stimer->config.periodic; payload->expiration_time = stimer->exp_time; payload->delivery_time = get_time_ref_counter(vcpu->kvm); return synic_deliver_msg(vcpu_to_synic(vcpu), - HV_STIMER_SINT(stimer->config), msg, + stimer->config.sintx, msg, no_retry); } @@ -643,8 +644,8 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) stimer->index, r); if (!r) { stimer->msg_pending = false; - if (!(stimer->config & HV_STIMER_PERIODIC)) - stimer->config &= ~HV_STIMER_ENABLE; + if (!(stimer->config.periodic)) + stimer->config.enable = 0; } } @@ -658,7 +659,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { stimer = &hv_vcpu->stimer[i]; - if (stimer->config & HV_STIMER_ENABLE) { + if (stimer->config.enable) { exp_time = stimer->exp_time; if (exp_time) { @@ -668,7 +669,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) stimer_expiration(stimer); } - if ((stimer->config & HV_STIMER_ENABLE) && + if ((stimer->config.enable) && stimer->count) { if (!stimer->msg_pending) stimer_start(stimer); -- cgit v1.2.3 From 8644f771e07c52617627dffa4c67ba0ea120fd13 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 26 Nov 2018 16:47:31 +0100 Subject: x86/kvm/hyper-v: direct mode for synthetic timers Turns out Hyper-V on KVM (as of 2016) will only use synthetic timers if direct mode is available. With direct mode we notify the guest by asserting APIC irq instead of sending a SynIC message. The implementation uses existing vec_bitmap for letting lapic code know that we're interested in the particular IRQ's EOI request. We assume that the same APIC irq won't be used by the guest for both direct mode stimer and as sint source (especially with AutoEOI semantics). It is unclear how things should be handled if that's not true. Direct mode is also somewhat less expensive; in my testing stimer_send_msg() takes not less than 1500 cpu cycles and stimer_notify_direct() can usually be done in 300-400. WS2016 without Hyper-V, however, always sticks to non-direct version. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 67 +++++++++++++++++++++++++++++++++++++++++++++------ arch/x86/kvm/trace.h | 10 +++++--- 2 files changed, 65 insertions(+), 12 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 15849d266078..e8f6ccc22014 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -38,6 +38,9 @@ #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) +static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, + bool vcpu_kick); + static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) { return atomic64_read(&synic->sint[sint]); @@ -53,8 +56,21 @@ static inline int synic_get_sint_vector(u64 sint_value) static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic, int vector) { + struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv_stimer *stimer; int i; + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) { + stimer = &hv_vcpu->stimer[i]; + if (stimer->config.enable && stimer->config.direct_mode && + stimer->config.apic_vector == vector) + return true; + } + + if (vector < HV_SYNIC_FIRST_VALID_VECTOR) + return false; + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) return true; @@ -80,14 +96,14 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, int vector) { - if (vector < HV_SYNIC_FIRST_VALID_VECTOR) - return; - if (synic_has_vector_connected(synic, vector)) __set_bit(vector, synic->vec_bitmap); else __clear_bit(vector, synic->vec_bitmap); + if (vector < HV_SYNIC_FIRST_VALID_VECTOR) + return; + if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else @@ -173,6 +189,7 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { stimer = &hv_vcpu->stimer[idx]; if (stimer->msg_pending && stimer->config.enable && + !stimer->config.direct_mode && stimer->config.sintx == sint) { set_bit(stimer->index, hv_vcpu->stimer_pending_bitmap); @@ -342,7 +359,9 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) { + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + struct kvm_vcpu_hv_stimer *stimer; int i; trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector); @@ -350,6 +369,14 @@ void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) for (i = 0; i < ARRAY_SIZE(synic->sint); i++) if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) kvm_hv_notify_acked_sint(vcpu, i); + + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) { + stimer = &hv_vcpu->stimer[i]; + if (stimer->msg_pending && stimer->config.enable && + stimer->config.direct_mode && + stimer->config.apic_vector == vector) + stimer_mark_pending(stimer, false); + } } static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi) @@ -516,15 +543,25 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, bool host) { - union hv_stimer_config new_config = {.as_uint64 = config}; + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + union hv_stimer_config new_config = {.as_uint64 = config}, + old_config = {.as_uint64 = stimer->config.as_uint64}; trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); stimer_cleanup(stimer); - if (stimer->config.enable && new_config.sintx == 0) + if (old_config.enable && + !new_config.direct_mode && new_config.sintx == 0) new_config.enable = 0; stimer->config.as_uint64 = new_config.as_uint64; + + if (old_config.direct_mode) + synic_update_vector(&hv_vcpu->synic, old_config.apic_vector); + if (new_config.direct_mode) + synic_update_vector(&hv_vcpu->synic, new_config.apic_vector); + stimer_mark_pending(stimer, false); return 0; } @@ -634,14 +671,28 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) no_retry); } +static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) +{ + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_lapic_irq irq = { + .delivery_mode = APIC_DM_FIXED, + .vector = stimer->config.apic_vector + }; + + return !kvm_apic_set_irq(vcpu, &irq, NULL); +} + static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) { - int r; + int r, direct = stimer->config.direct_mode; stimer->msg_pending = true; - r = stimer_send_msg(stimer); + if (!direct) + r = stimer_send_msg(stimer); + else + r = stimer_notify_direct(stimer); trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index, r); + stimer->index, direct, r); if (!r) { stimer->msg_pending = false; if (!(stimer->config.periodic)) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 0659465a745c..705f40ae2532 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1254,24 +1254,26 @@ TRACE_EVENT(kvm_hv_stimer_callback, * Tracepoint for stimer_expiration. */ TRACE_EVENT(kvm_hv_stimer_expiration, - TP_PROTO(int vcpu_id, int timer_index, int msg_send_result), - TP_ARGS(vcpu_id, timer_index, msg_send_result), + TP_PROTO(int vcpu_id, int timer_index, int direct, int msg_send_result), + TP_ARGS(vcpu_id, timer_index, direct, msg_send_result), TP_STRUCT__entry( __field(int, vcpu_id) __field(int, timer_index) + __field(int, direct) __field(int, msg_send_result) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->timer_index = timer_index; + __entry->direct = direct; __entry->msg_send_result = msg_send_result; ), - TP_printk("vcpu_id %d timer %d msg send result %d", + TP_printk("vcpu_id %d timer %d direct %d send result %d", __entry->vcpu_id, __entry->timer_index, - __entry->msg_send_result) + __entry->direct, __entry->msg_send_result) ); /* -- cgit v1.2.3 From 08a800ac257a2eeed1fad34b55ed55b2578e9b4e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 26 Nov 2018 16:47:32 +0100 Subject: x86/kvm/hyper-v: avoid open-coding stimer_mark_pending() in kvm_hv_notify_acked_sint() stimers_pending optimization only helps us to avoid multiple kvm_make_request() calls. This doesn't happen very often and these calls are very cheap in the first place, remove open-coded version of stimer_mark_pending() from kvm_hv_notify_acked_sint(). Suggested-by: Paolo Bonzini Signed-off-by: Vitaly Kuznetsov Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index e8f6ccc22014..209ca485a2c7 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -180,24 +180,18 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; - int gsi, idx, stimers_pending; + int gsi, idx; trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); /* Try to deliver pending Hyper-V SynIC timers messages */ - stimers_pending = 0; for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { stimer = &hv_vcpu->stimer[idx]; if (stimer->msg_pending && stimer->config.enable && !stimer->config.direct_mode && - stimer->config.sintx == sint) { - set_bit(stimer->index, - hv_vcpu->stimer_pending_bitmap); - stimers_pending++; - } + stimer->config.sintx == sint) + stimer_mark_pending(stimer, false); } - if (stimers_pending) - kvm_make_request(KVM_REQ_HV_STIMER, vcpu); idx = srcu_read_lock(&kvm->irq_srcu); gsi = atomic_read(&synic->sint_to_gsi[sint]); -- cgit v1.2.3 From 87a8d795b2f1d926b4e831d27144d3e6ab7e33dc Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 5 Dec 2018 16:36:21 +0100 Subject: x86/hyper-v: Stop caring about EOI for direct stimers Turns out we over-engineered Direct Mode for stimers a bit: unlike traditional stimers where we may want to try to re-inject the message upon EOI, Direct Mode stimers just set the irq in APIC and kvm_apic_set_irq() fails only when APIC is disabled (see APIC_DM_FIXED case in __apic_accept_irq()). Remove the redundant part. Suggested-by: Roman Kagan Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 36 +++--------------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 209ca485a2c7..c90a5352d158 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -56,21 +56,8 @@ static inline int synic_get_sint_vector(u64 sint_value) static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic, int vector) { - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); - struct kvm_vcpu_hv_stimer *stimer; int i; - for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) { - stimer = &hv_vcpu->stimer[i]; - if (stimer->config.enable && stimer->config.direct_mode && - stimer->config.apic_vector == vector) - return true; - } - - if (vector < HV_SYNIC_FIRST_VALID_VECTOR) - return false; - for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) return true; @@ -96,14 +83,14 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, int vector) { + if (vector < HV_SYNIC_FIRST_VALID_VECTOR) + return; + if (synic_has_vector_connected(synic, vector)) __set_bit(vector, synic->vec_bitmap); else __clear_bit(vector, synic->vec_bitmap); - if (vector < HV_SYNIC_FIRST_VALID_VECTOR) - return; - if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else @@ -353,9 +340,7 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) { - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); - struct kvm_vcpu_hv_stimer *stimer; int i; trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector); @@ -363,14 +348,6 @@ void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) for (i = 0; i < ARRAY_SIZE(synic->sint); i++) if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) kvm_hv_notify_acked_sint(vcpu, i); - - for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) { - stimer = &hv_vcpu->stimer[i]; - if (stimer->msg_pending && stimer->config.enable && - stimer->config.direct_mode && - stimer->config.apic_vector == vector) - stimer_mark_pending(stimer, false); - } } static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi) @@ -537,8 +514,6 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, bool host) { - struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); - struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; @@ -551,11 +526,6 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, new_config.enable = 0; stimer->config.as_uint64 = new_config.as_uint64; - if (old_config.direct_mode) - synic_update_vector(&hv_vcpu->synic, old_config.apic_vector); - if (new_config.direct_mode) - synic_update_vector(&hv_vcpu->synic, new_config.apic_vector); - stimer_mark_pending(stimer, false); return 0; } -- cgit v1.2.3 From d7b09c827a6cf291f66637a36f46928dd1423184 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 5 Dec 2018 17:19:56 -0200 Subject: kvm: x86: Report STIBP on GET_SUPPORTED_CPUID Months ago, we have added code to allow direct access to MSR_IA32_SPEC_CTRL to the guest, which makes STIBP available to guests. This was implemented by commits d28b387fb74d ("KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL") and b2ac58f90540 ("KVM/SVM: Allow direct access to MSR_IA32_SPEC_CTRL"). However, we never updated GET_SUPPORTED_CPUID to let userspace know that STIBP can be enabled in CPUID. Fix that by updating kvm_cpuid_8000_0008_ebx_x86_features and kvm_cpuid_7_0_edx_x86_features. Signed-off-by: Eduardo Habkost Reviewed-by: Jim Mattson Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7bcfa61375c0..cc6dd65828a4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -381,7 +381,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO); + F(AMD_SSB_NO) | F(AMD_STIBP); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = @@ -411,7 +411,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES); + F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.2.3 From 3d82c565a7a237b9dd4666f17c84dcc028d2bed5 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Mon, 3 Dec 2018 14:13:32 -0600 Subject: kvm: vmx: add cpu into VMX preemption timer bug list This patch adds Intel "Xeon CPU E3-1220 V2", with CPUID.01H.EAX=0x000306A8, into the list of known broken CPUs which fail to support VMX preemption timer. This bug was found while running the APIC timer test of kvm-unit-test on this specific CPU, even though the errata info can't be located in the public domain for this CPU. Signed-off-by: Wei Huang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 579f4c4144d2..ddaae1d4cd49 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -469,6 +469,8 @@ static u32 vmx_preemption_cpu_tfms[] = { /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ 0x000106A5, + /* Xeon E3-1220 V2 */ +0x000306A8, }; static inline bool cpu_has_broken_vmx_preemption_timer(void) -- cgit v1.2.3 From e53d88af63ab4104e1226b8f9959f1e9903da10b Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 30 Oct 2018 12:20:21 -0700 Subject: kvm: x86: Don't modify MSR_PLATFORM_INFO on vCPU reset If userspace has provided a different value for this MSR (e.g with the turbo bits set), the userspace-provided value should survive a vCPU reset. For backwards compatibility, MSR_PLATFORM_INFO is initialized in kvm_arch_vcpu_setup. Signed-off-by: Jim Mattson Reviewed-by: Drew Schmitt Cc: Abhiroop Dabral Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 18b7817af2bc..1a10e564174d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8658,6 +8658,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { + vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); kvm_vcpu_reset(vcpu, false); @@ -8760,7 +8761,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; - vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; vcpu->arch.msr_misc_features_enables = 0; vcpu->arch.xcr0 = XFEATURE_MASK_FP; -- cgit v1.2.3 From a0d4f8034446df402a879dac04cab61f71c4fafc Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Tue, 4 Dec 2018 19:00:13 -0500 Subject: KVM nVMX: MSRs should not be stored if VM-entry fails during or after loading guest state According to section "VM-entry Failures During or After Loading Guest State" in Intel SDM vol 3C, "No MSRs are saved into the VM-exit MSR-store area." when bit 31 of the exit reason is set. Reported-by: Krish Sadhukhan Suggested-by: Jim Mattson Signed-off-by: Krish Sadhukhan Reviewed-by: Darren Kenny Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bb3a7030eb39..b347d975589d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3504,6 +3504,18 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * L2 to IDT_VECTORING_INFO_FIELD. */ vmcs12_save_pending_event(vcpu, vmcs12); + + /* + * According to spec, there's no need to store the guest's + * MSRs if the exit is due to a VM-entry failure that occurs + * during or after loading the guest state. Since this exit + * does not fall in that category, we need to save the MSRs. + */ + if (nested_vmx_store_msr(vcpu, + vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, + VMX_ABORT_SAVE_GUEST_MSR_FAIL); } /* @@ -3835,10 +3847,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, * immutable. */ nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); - - if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, - vmcs12->vm_exit_msr_store_count)) - nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); } else { /* * The only expected VM-instruction error is "VM entry with -- cgit v1.2.3 From 53963a70ac268c8e813e6d5cd24cbd1b03f56059 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Thu, 6 Dec 2018 15:34:36 +0800 Subject: KVM/VMX: Check ept_pointer before flushing ept tlb This patch is to initialize ept_pointer to INVALID_PAGE and check it before flushing ept tlb. If ept_pointer is invalid, bypass the flush request. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ddaae1d4cd49..c3461f835445 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -423,11 +423,18 @@ static int vmx_hv_remote_flush_tlb(struct kvm *kvm) /* * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the * base of EPT PML4 table, strip off EPT configuration information. + * If ept_pointer is invalid pointer, bypass the flush request. */ if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { - kvm_for_each_vcpu(i, vcpu, kvm) + kvm_for_each_vcpu(i, vcpu, kvm) { + u64 ept_pointer = to_vmx(vcpu)->ept_pointer; + + if (!VALID_PAGE(ept_pointer)) + continue; + ret |= hyperv_flush_guest_mapping( - to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer & PAGE_MASK); + ept_pointer & PAGE_MASK); + } } else { ret = hyperv_flush_guest_mapping( to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK); @@ -6433,6 +6440,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->pi_desc.nv = POSTED_INTR_VECTOR; vmx->pi_desc.sn = 1; + vmx->ept_pointer = INVALID_PAGE; + return &vmx->vcpu; free_vmcs: -- cgit v1.2.3 From 16322a3b5e7ce8250025361c092903a7259c1677 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Wed, 12 Dec 2018 13:30:06 -0500 Subject: KVM: nVMX: Prepend "nested_vmx_" to check_vmentry_{pre,post}reqs() .. as they are used only in nested vmx context. Signed-off-by: Krish Sadhukhan Reviewed-by: Liran Alon Reviewed-by: Mihai Carabas Reviewed-by: Mark Kanda Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b347d975589d..f1ba4871f4ac 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2444,7 +2444,8 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) return true; } -static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool ia32e; @@ -2636,8 +2637,8 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, return r; } -static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 *exit_qual) +static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, u32 *exit_qual) { bool ia32e; @@ -2951,7 +2952,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) return -1; } - if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + if (nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) goto vmentry_fail_vmexit; } @@ -3087,7 +3088,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - ret = check_vmentry_prereqs(vcpu, vmcs12); + ret = nested_vmx_check_vmentry_prereqs(vcpu, vmcs12); if (ret) return nested_vmx_failValid(vcpu, ret); @@ -5371,8 +5372,8 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; } - if (check_vmentry_prereqs(vcpu, vmcs12) || - check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + if (nested_vmx_check_vmentry_prereqs(vcpu, vmcs12) || + nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) return -EINVAL; vmx->nested.dirty_vmcs12 = true; -- cgit v1.2.3 From 461b4ba4c7ad79137171de2887e5d4d05a0ec8c1 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Wed, 12 Dec 2018 13:30:07 -0500 Subject: KVM: nVMX: Move the checks for VM-Execution Control Fields to a separate helper function .. to improve readability and maintainability, and to align the code as per the layout of the checks in chapter "VM Entries" in Intel SDM vol 3C. Signed-off-by: Krish Sadhukhan Reviewed-by: Mihai Carabas Reviewed-by: Mark Kanda Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 131 ++++++++++++++++++++++------------------------ 1 file changed, 62 insertions(+), 69 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f1ba4871f4ac..211e5ad96ae2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2444,94 +2444,90 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) return true; } -static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +/* + * Checks related to VM-Execution Control Fields + */ +static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); - bool ia32e; - - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_apic_access_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (!nested_cpu_has_preemption_timer(vmcs12) && - nested_cpu_has_save_preemption_timer(vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_pml_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - vmx->nested.msrs.procbased_ctls_low, - vmx->nested.msrs.procbased_ctls_high) || - (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && - !vmx_control_verify(vmcs12->secondary_vm_exec_control, - vmx->nested.msrs.secondary_ctls_low, - vmx->nested.msrs.secondary_ctls_high)) || - !vmx_control_verify(vmcs12->pin_based_vm_exec_control, + if (!vmx_control_verify(vmcs12->pin_based_vm_exec_control, vmx->nested.msrs.pinbased_ctls_low, vmx->nested.msrs.pinbased_ctls_high) || - !vmx_control_verify(vmcs12->vm_exit_controls, - vmx->nested.msrs.exit_ctls_low, - vmx->nested.msrs.exit_ctls_high) || - !vmx_control_verify(vmcs12->vm_entry_controls, - vmx->nested.msrs.entry_ctls_low, - vmx->nested.msrs.entry_ctls_high)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + !vmx_control_verify(vmcs12->cpu_based_vm_exec_control, + vmx->nested.msrs.procbased_ctls_low, + vmx->nested.msrs.procbased_ctls_high)) + return -EINVAL; - if (nested_vmx_check_nmi_controls(vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && + !vmx_control_verify(vmcs12->secondary_vm_exec_control, + vmx->nested.msrs.secondary_ctls_low, + vmx->nested.msrs.secondary_ctls_high)) + return -EINVAL; + + if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu) || + nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || + nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || + nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || + nested_vmx_check_apic_access_controls(vcpu, vmcs12) || + nested_vmx_check_apicv_controls(vcpu, vmcs12) || + nested_vmx_check_nmi_controls(vmcs12) || + nested_vmx_check_pml_controls(vcpu, vmcs12) || + nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || + nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || + nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || + (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) + return -EINVAL; + + if (nested_cpu_has_ept(vmcs12) && + !valid_ept_address(vcpu, vmcs12->ept_pointer)) + return -EINVAL; if (nested_cpu_has_vmfunc(vmcs12)) { if (vmcs12->vm_function_control & ~vmx->nested.msrs.vmfunc_controls) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + return -EINVAL; if (nested_cpu_has_eptp_switching(vmcs12)) { if (!nested_cpu_has_ept(vmcs12) || !page_address_valid(vcpu, vmcs12->eptp_list_address)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + return -EINVAL; } } - if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) + return 0; +} + +static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool ia32e; + + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_check_vm_execution_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || !nested_cr3_valid(vcpu, vmcs12->host_cr3)) return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + if (!vmx_control_verify(vmcs12->vm_exit_controls, + vmx->nested.msrs.exit_ctls_low, + vmx->nested.msrs.exit_ctls_high) || + !vmx_control_verify(vmcs12->vm_entry_controls, + vmx->nested.msrs.entry_ctls_low, + vmx->nested.msrs.entry_ctls_high)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + /* * If the load IA32_EFER VM-exit control is 1, bits reserved in the * IA32_EFER MSR must be 0 in the field for that register. In addition, @@ -2603,10 +2599,6 @@ static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, } } - if (nested_cpu_has_ept(vmcs12) && - !valid_ept_address(vcpu, vmcs12->ept_pointer)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - return 0; } @@ -2638,7 +2630,8 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, } static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, u32 *exit_qual) + struct vmcs12 *vmcs12, + u32 *exit_qual) { bool ia32e; -- cgit v1.2.3 From f9b245e182ac9beb40f48faa0fd04ed3f3830d50 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 12 Dec 2018 13:30:08 -0500 Subject: KVM: nVMX: Remove param indirection from nested_vmx_check_msr_switch() Passing the enum and doing an indirect lookup is silly when we can simply pass the field directly. Remove the "fast path" code in nested_vmx_check_msr_switch_controls() as it's now nothing more than a redundant check. Remove the debug message rather than continue passing the enum for the address field. Having debug messages for the MSRs themselves is useful as MSR legality is a huge space, whereas messing up a physical address means the VMM is fundamentally broken. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 211e5ad96ae2..f594ddce7280 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -700,45 +700,31 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, } static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, - unsigned long count_field, - unsigned long addr_field) + u32 count, u64 addr) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int maxphyaddr; - u64 count, addr; - if (vmcs12_read_any(vmcs12, count_field, &count) || - vmcs12_read_any(vmcs12, addr_field, &addr)) { - WARN_ON(1); - return -EINVAL; - } if (count == 0) return 0; maxphyaddr = cpuid_maxphyaddr(vcpu); if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || - (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { - pr_debug_ratelimited( - "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", - addr_field, maxphyaddr, count, addr); + (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) return -EINVAL; - } + return 0; } static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - if (vmcs12->vm_exit_msr_load_count == 0 && - vmcs12->vm_exit_msr_store_count == 0 && - vmcs12->vm_entry_msr_load_count == 0) - return 0; /* Fast path */ - if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, - VM_EXIT_MSR_LOAD_ADDR) || - nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, - VM_EXIT_MSR_STORE_ADDR) || - nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, - VM_ENTRY_MSR_LOAD_ADDR)) + if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count, + vmcs12->vm_exit_msr_load_addr) || + nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count, + vmcs12->vm_exit_msr_store_addr) || + nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count, + vmcs12->vm_entry_msr_load_addr)) return -EINVAL; + return 0; } -- cgit v1.2.3 From 61446ba75e9ad95446e057ac59e23e257493a3b3 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Wed, 12 Dec 2018 13:30:09 -0500 Subject: KVM: nVMX: Move the checks for VM-Exit Control Fields to a separate helper function .. to improve readability and maintainability, and to align the code as per the layout of the checks in chapter "VM Entries" in Intel SDM vol 3C. Signed-off-by: Krish Sadhukhan Reviewed-by: Mihai Carabas Reviewed-by: Mark Kanda Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f594ddce7280..8e6b4248b3d4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -714,20 +714,28 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, return 0; } -static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count, vmcs12->vm_exit_msr_load_addr) || nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count, - vmcs12->vm_exit_msr_store_addr) || - nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count, - vmcs12->vm_entry_msr_load_addr)) + vmcs12->vm_exit_msr_store_addr)) return -EINVAL; return 0; } +static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count, + vmcs12->vm_entry_msr_load_addr)) + return -EINVAL; + + return 0; +} + static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -2485,6 +2493,23 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, return 0; } +/* + * Checks related to VM-Exit Control Fields + */ +static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!vmx_control_verify(vmcs12->vm_exit_controls, + vmx->nested.msrs.exit_ctls_low, + vmx->nested.msrs.exit_ctls_high) || + nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)) + return -EINVAL; + + return 0; +} + static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -2495,7 +2520,8 @@ static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (nested_check_vm_execution_controls(vcpu, vmcs12)) + if (nested_check_vm_execution_controls(vcpu, vmcs12) || + nested_check_vm_exit_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) @@ -2506,10 +2532,7 @@ static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, !nested_cr3_valid(vcpu, vmcs12->host_cr3)) return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; - if (!vmx_control_verify(vmcs12->vm_exit_controls, - vmx->nested.msrs.exit_ctls_low, - vmx->nested.msrs.exit_ctls_high) || - !vmx_control_verify(vmcs12->vm_entry_controls, + if (!vmx_control_verify(vmcs12->vm_entry_controls, vmx->nested.msrs.entry_ctls_low, vmx->nested.msrs.entry_ctls_high)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; -- cgit v1.2.3 From 5fbf963400a39919e53d20fe43bd3be072622944 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Wed, 12 Dec 2018 13:30:10 -0500 Subject: KVM: nVMX: Move the checks for VM-Entry Control Fields to a separate helper function .. to improve readability and maintainability, and to align the code as per the layout of the checks in chapter "VM Entries" in Intel SDM vol 3C. Signed-off-by: Krish Sadhukhan Reviewed-by: Mihai Carabas Reviewed-by: Mark Kanda Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 97 ++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 43 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8e6b4248b3d4..b7fb40a19836 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -726,8 +726,8 @@ static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, return 0; } -static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count, vmcs12->vm_entry_msr_load_addr)) @@ -2510,47 +2510,18 @@ static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, return 0; } -static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +/* + * Checks related to VM-Entry Control Fields + */ +static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); - bool ia32e; - - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_check_vm_execution_controls(vcpu, vmcs12) || - nested_check_vm_exit_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || - !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || - !nested_cr3_valid(vcpu, vmcs12->host_cr3)) - return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; if (!vmx_control_verify(vmcs12->vm_entry_controls, vmx->nested.msrs.entry_ctls_low, vmx->nested.msrs.entry_ctls_high)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - /* - * If the load IA32_EFER VM-exit control is 1, bits reserved in the - * IA32_EFER MSR must be 0 in the field for that register. In addition, - * the values of the LMA and LME bits in the field must each be that of - * the host address-space size VM-exit control. - */ - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { - ia32e = (vmcs12->vm_exit_controls & - VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; - if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) - return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; - } + return -EINVAL; /* * From the Intel SDM, volume 3: @@ -2572,29 +2543,29 @@ static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, if (intr_type == INTR_TYPE_RESERVED || (intr_type == INTR_TYPE_OTHER_EVENT && !nested_cpu_supports_monitor_trap_flag(vcpu))) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + return -EINVAL; /* VM-entry interruption-info field: vector */ if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + return -EINVAL; /* VM-entry interruption-info field: deliver error code */ should_have_error_code = intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && x86_exception_has_error_code(vector); if (has_error_code != should_have_error_code) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + return -EINVAL; /* VM-entry exception error code */ if (has_error_code && vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + return -EINVAL; /* VM-entry interruption-info field: reserved bits */ if (intr_info & INTR_INFO_RESVD_BITS_MASK) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + return -EINVAL; /* VM-entry instruction length */ switch (intr_type) { @@ -2604,10 +2575,50 @@ static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, if ((vmcs12->vm_entry_instruction_len > 15) || (vmcs12->vm_entry_instruction_len == 0 && !nested_cpu_has_zero_length_injection(vcpu))) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + return -EINVAL; } } + if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + bool ia32e; + + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_check_vm_execution_controls(vcpu, vmcs12) || + nested_check_vm_exit_controls(vcpu, vmcs12) || + nested_check_vm_entry_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || + !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || + !nested_cr3_valid(vcpu, vmcs12->host_cr3)) + return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + + /* + * If the load IA32_EFER VM-exit control is 1, bits reserved in the + * IA32_EFER MSR must be 0 in the field for that register. In addition, + * the values of the LMA and LME bits in the field must each be that of + * the host address-space size VM-exit control. + */ + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { + ia32e = (vmcs12->vm_exit_controls & + VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; + if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) + return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + } + return 0; } -- cgit v1.2.3 From 254b2f3b0f7b45c2f2232a2e0da77577233750e2 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Wed, 12 Dec 2018 13:30:11 -0500 Subject: KVM: nVMX: Move the checks for Host Control Registers and MSRs to a separate helper function .. to improve readability and maintainability, and to align the code as per the layout of the checks in chapter "VM Entries" in Intel SDM vol 3C. Signed-off-by: Krish Sadhukhan Reviewed-by: Mihai Carabas Reviewed-by: Mark Kanda Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b7fb40a19836..c1bfc3d18f38 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2585,25 +2585,18 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, return 0; } -static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +/* + * Checks related to Host Control Registers and MSRs + */ +static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { bool ia32e; - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - - if (nested_check_vm_execution_controls(vcpu, vmcs12) || - nested_check_vm_exit_controls(vcpu, vmcs12) || - nested_check_vm_entry_controls(vcpu, vmcs12)) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; - if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || !nested_cr3_valid(vcpu, vmcs12->host_cr3)) - return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; - + return -EINVAL; /* * If the load IA32_EFER VM-exit control is 1, bits reserved in the * IA32_EFER MSR must be 0 in the field for that register. In addition, @@ -2616,12 +2609,30 @@ static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) - return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + return -EINVAL; } return 0; } +static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_check_vm_execution_controls(vcpu, vmcs12) || + nested_check_vm_exit_controls(vcpu, vmcs12) || + nested_check_vm_entry_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_check_host_control_regs(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + + return 0; +} + static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { -- cgit v1.2.3 From 4e445aee9654dc4aee61919d4f1d379b77cf3435 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Wed, 12 Dec 2018 13:30:12 -0500 Subject: KVM: nVMX: Move the checks for Guest Non-Register States to a separate helper function .. to improve readability and maintainability, and to align the code as per the layout of the checks in chapter "VM Entries" in Intel SDM vol 3C. Signed-off-by: Krish Sadhukhan Reviewed-by: Mihai Carabas Reviewed-by: Mark Kanda Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c1bfc3d18f38..3f019aa63341 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2615,13 +2615,21 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, return 0; } -static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +/* + * Checks related to Guest Non-register State + */ +static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) { if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) - return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + return -EINVAL; + + return 0; +} +static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ if (nested_check_vm_execution_controls(vcpu, vmcs12) || nested_check_vm_exit_controls(vcpu, vmcs12) || nested_check_vm_entry_controls(vcpu, vmcs12)) @@ -2630,6 +2638,9 @@ static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu, if (nested_check_host_control_regs(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + if (nested_check_guest_non_reg_state(vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + return 0; } -- cgit v1.2.3 From 240c35a3783ab9b3a0afaba0dde7291295680a6b Mon Sep 17 00:00:00 2001 From: Marc Orr Date: Tue, 6 Nov 2018 14:53:55 -0800 Subject: kvm: x86: Use task structs fpu field for user Previously, x86's instantiation of 'struct kvm_vcpu_arch' added an fpu field to save/restore fpu-related architectural state, which will differ from kvm's fpu state. However, this is redundant to the 'struct fpu' field, called fpu, embedded in the task struct, via the thread field. Thus, this patch removes the user_fpu field from the kvm_vcpu_arch struct and replaces it with the task struct's fpu field. This change is significant because the fpu struct is actually quite large. For example, on the system used to develop this patch, this change reduces the size of the vcpu_vmx struct from 23680 bytes down to 19520 bytes, when building the kernel with kvmconfig. This reduction in the size of the vcpu_vmx struct moves us closer to being able to allocate the struct at order 2, rather than order 3. Suggested-by: Dave Hansen Signed-off-by: Marc Orr Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 7 +++---- arch/x86/kvm/x86.c | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ddeddf2a992c..c143dfe75869 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -601,16 +601,15 @@ struct kvm_vcpu_arch { /* * QEMU userspace and the guest each have their own FPU state. - * In vcpu_run, we switch between the user and guest FPU contexts. - * While running a VCPU, the VCPU thread will have the guest FPU - * context. + * In vcpu_run, we switch between the user, maintained in the + * task_struct struct, and guest FPU contexts. While running a VCPU, + * the VCPU thread will have the guest FPU context. * * Note that while the PKRU state lives inside the fpu registers, * it is switched out separately at VMENTER and VMEXIT time. The * "guest_fpu" state here contains the guest FPU context, with the * host PRKU bits. */ - struct fpu user_fpu; struct fpu guest_fpu; u64 xcr0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1a10e564174d..c1a00e9b2eaa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8035,7 +8035,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { preempt_disable(); - copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); + copy_fpregs_to_fpstate(¤t->thread.fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, ~XFEATURE_MASK_PKRU); @@ -8048,7 +8048,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { preempt_disable(); copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); + copy_kernel_to_fpregs(¤t->thread.fpu.state); preempt_enable(); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); -- cgit v1.2.3 From b666a4b697397f8492dc11a2a1877557d3e0af56 Mon Sep 17 00:00:00 2001 From: Marc Orr Date: Tue, 6 Nov 2018 14:53:56 -0800 Subject: kvm: x86: Dynamically allocate guest_fpu Previously, the guest_fpu field was embedded in the kvm_vcpu_arch struct. Unfortunately, the field is quite large, (e.g., 4352 bytes on my current setup). This bloats the kvm_vcpu_arch struct for x86 into an order 3 memory allocation, which can become a problem on overcommitted machines. Thus, this patch moves the fpu state outside of the kvm_vcpu_arch struct. With this patch applied, the kvm_vcpu_arch struct is reduced to 15168 bytes for vmx on my setup when building the kernel with kvmconfig. Suggested-by: Dave Hansen Signed-off-by: Marc Orr Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/svm.c | 10 ++++++++ arch/x86/kvm/vmx/vmx.c | 10 ++++++++ arch/x86/kvm/x86.c | 51 ++++++++++++++++++++++++++++++----------- 4 files changed, 60 insertions(+), 14 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c143dfe75869..bca77c25a19a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -610,7 +610,7 @@ struct kvm_vcpu_arch { * "guest_fpu" state here contains the guest FPU context, with the * host PRKU bits. */ - struct fpu guest_fpu; + struct fpu *guest_fpu; u64 xcr0; u64 guest_supported_xcr0; @@ -1196,6 +1196,7 @@ struct kvm_arch_async_pf { }; extern struct kvm_x86_ops *kvm_x86_ops; +extern struct kmem_cache *x86_fpu_cache; #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0762e81e0498..e4f18a305ef6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2125,6 +2125,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } + svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); + if (!svm->vcpu.arch.guest_fpu) { + printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); + err = -ENOMEM; + goto free_partial_svm; + } + err = kvm_vcpu_init(&svm->vcpu, kvm, id); if (err) goto free_svm; @@ -2184,6 +2191,8 @@ free_page1: uninit: kvm_vcpu_uninit(&svm->vcpu); free_svm: + kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); +free_partial_svm: kmem_cache_free(kvm_vcpu_cache, svm); out: return ERR_PTR(err); @@ -2213,6 +2222,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); + kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, svm); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c3461f835445..b8fa74ce3af2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6349,6 +6349,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); + kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, vmx); } @@ -6362,6 +6363,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); + vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); + if (!vmx->vcpu.arch.guest_fpu) { + printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); + err = -ENOMEM; + goto free_partial_vcpu; + } + vmx->vpid = allocate_vpid(); err = kvm_vcpu_init(&vmx->vcpu, kvm, id); @@ -6454,6 +6462,8 @@ uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: free_vpid(vmx->vpid); + kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); +free_partial_vcpu: kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1a00e9b2eaa..4f786fcc620e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -213,6 +213,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; +struct kmem_cache *x86_fpu_cache; +EXPORT_SYMBOL_GPL(x86_fpu_cache); + static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) @@ -3630,7 +3633,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; + struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; u64 xstate_bv = xsave->header.xfeatures; u64 valid; @@ -3672,7 +3675,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; + struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); u64 valid; @@ -3720,7 +3723,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, fill_xsave((u8 *) guest_xsave->region, vcpu); } else { memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state.fxsave, + &vcpu->arch.guest_fpu->state.fxsave, sizeof(struct fxregs_state)); *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = XFEATURE_MASK_FPSSE; @@ -3750,7 +3753,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, if (xstate_bv & ~XFEATURE_MASK_FPSSE || mxcsr & ~mxcsr_feature_mask) return -EINVAL; - memcpy(&vcpu->arch.guest_fpu.state.fxsave, + memcpy(&vcpu->arch.guest_fpu->state.fxsave, guest_xsave->region, sizeof(struct fxregs_state)); } return 0; @@ -6852,11 +6855,30 @@ int kvm_arch_init(void *opaque) goto out; } + /* + * KVM explicitly assumes that the guest has an FPU and + * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the + * vCPU's FPU state as a fxregs_state struct. + */ + if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { + printk(KERN_ERR "kvm: inadequate fpu\n"); + r = -EOPNOTSUPP; + goto out; + } + r = -ENOMEM; + x86_fpu_cache = kmem_cache_create("x86_fpu", fpu_kernel_xstate_size, + __alignof__(struct fpu), SLAB_ACCOUNT, + NULL); + if (!x86_fpu_cache) { + printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n"); + goto out; + } + shared_msrs = alloc_percpu(struct kvm_shared_msrs); if (!shared_msrs) { printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); - goto out; + goto out_free_x86_fpu_cache; } r = kvm_mmu_module_init(); @@ -6889,6 +6911,8 @@ int kvm_arch_init(void *opaque) out_free_percpu: free_percpu(shared_msrs); +out_free_x86_fpu_cache: + kmem_cache_destroy(x86_fpu_cache); out: return r; } @@ -6912,6 +6936,7 @@ void kvm_arch_exit(void) kvm_x86_ops = NULL; kvm_mmu_module_exit(); free_percpu(shared_msrs); + kmem_cache_destroy(x86_fpu_cache); } int kvm_vcpu_halt(struct kvm_vcpu *vcpu) @@ -8037,7 +8062,7 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) preempt_disable(); copy_fpregs_to_fpstate(¤t->thread.fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); preempt_enable(); trace_kvm_fpu(1); @@ -8047,7 +8072,7 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { preempt_disable(); - copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); + copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); copy_kernel_to_fpregs(¤t->thread.fpu.state); preempt_enable(); ++vcpu->stat.fpu_reload; @@ -8542,7 +8567,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu.state.fxsave; + fxsave = &vcpu->arch.guest_fpu->state.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -8562,7 +8587,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu.state.fxsave; + fxsave = &vcpu->arch.guest_fpu->state.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -8618,9 +8643,9 @@ static int sync_regs(struct kvm_vcpu *vcpu) static void fx_init(struct kvm_vcpu *vcpu) { - fpstate_init(&vcpu->arch.guest_fpu.state); + fpstate_init(&vcpu->arch.guest_fpu->state); if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = + vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* @@ -8745,11 +8770,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (init_event) kvm_put_guest_fpu(vcpu); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, + mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, XFEATURE_MASK_BNDREGS); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, + mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, XFEATURE_MASK_BNDCSR); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); -- cgit v1.2.3 From ed8e48122728acfaf7bfa904cba034d83905b43a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 21 Dec 2018 11:25:59 +0100 Subject: KVM: x86: fix size of x86_fpu_cache objects The memory allocation in b666a4b69739 ("kvm: x86: Dynamically allocate guest_fpu", 2018-11-06) is wrong, there are other members in struct fpu before the fpregs_state union and the patch should be doing something similar to the code in fpu__init_task_struct_size. It's enough to run a guest and then rmmod kvm to see slub errors which are actually caused by memory corruption. For now let's revert it to sizeof(struct fpu), which is conservative. I have plans to move fsave/fxsave/xsave directly in KVM, without using the kernel FPU helpers, and once it's done, the size of the object in the cache will be something like kvm_xstate_size. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4f786fcc620e..70faa3cdc4dc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6867,7 +6867,7 @@ int kvm_arch_init(void *opaque) } r = -ENOMEM; - x86_fpu_cache = kmem_cache_create("x86_fpu", fpu_kernel_xstate_size, + x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), __alignof__(struct fpu), SLAB_ACCOUNT, NULL); if (!x86_fpu_cache) { -- cgit v1.2.3 From e87555e550cef4941579cd879759a7c0dee24e68 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 19 Dec 2018 12:06:13 +0100 Subject: KVM: x86: svm: report MSR_IA32_MCG_EXT_CTL as unsupported AMD doesn't seem to implement MSR_IA32_MCG_EXT_CTL and svm code in kvm knows nothing about it, however, this MSR is among emulated_msrs and thus returned with KVM_GET_MSR_INDEX_LIST. The consequent KVM_GET_MSRS, of course, fails. Report the MSR as unsupported to not confuse userspace. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e4f18a305ef6..c4377f02a33b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5840,6 +5840,13 @@ static bool svm_cpu_has_accelerated_tpr(void) static bool svm_has_emulated_msr(int index) { + switch (index) { + case MSR_IA32_MCG_EXT_CTL: + return false; + default: + break; + } + return true; } -- cgit v1.2.3 From ba7424b200d347fbbf9fadecaffb2a0feb740039 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 18 Dec 2018 01:01:49 +0000 Subject: KVM: VMX: Remove duplicated include from vmx.c Remove duplicated include. Signed-off-by: YueHaibing Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx/vmx.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b8fa74ce3af2..d5e81485fc15 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -62,7 +62,6 @@ #include "vmcs12.h" #include "vmx.h" #include "x86.h" -#include "vmx.h" MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From e081354d6aa7b67c6d0ef51ff8c428b6c261a6fe Mon Sep 17 00:00:00 2001 From: Tambe, William Date: Tue, 13 Nov 2018 16:51:20 +0000 Subject: KVM: nSVM: Fix nested guest support for PAUSE filtering. Currently, the nested guest's PAUSE intercept intentions are not being honored. Instead, since the L0 hypervisor's pause_filter_count and pause_filter_thresh values are still in place, these values are used instead of those programmed in the VMCB by the L1 hypervisor. To honor the desired PAUSE intercept support of the L1 hypervisor, the L0 hypervisor must use the PAUSE filtering fields of the L1 hypervisor. This requires saving and restoring of both the L0 and L1 hypervisor's PAUSE filtering fields. Signed-off-by: William Tambe Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c4377f02a33b..1a64844dcdfb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3281,6 +3281,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr dst->event_inj_err = from->event_inj_err; dst->nested_cr3 = from->nested_cr3; dst->virt_ext = from->virt_ext; + dst->pause_filter_count = from->pause_filter_count; + dst->pause_filter_thresh = from->pause_filter_thresh; } static int nested_svm_vmexit(struct vcpu_svm *svm) @@ -3359,6 +3361,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.event_inj = 0; nested_vmcb->control.event_inj_err = 0; + nested_vmcb->control.pause_filter_count = + svm->vmcb->control.pause_filter_count; + nested_vmcb->control.pause_filter_thresh = + svm->vmcb->control.pause_filter_thresh; + /* We always set V_INTR_MASKING and remember the old value in hflags */ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; @@ -3536,6 +3543,11 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; + svm->vmcb->control.pause_filter_count = + nested_vmcb->control.pause_filter_count; + svm->vmcb->control.pause_filter_thresh = + nested_vmcb->control.pause_filter_thresh; + nested_svm_unmap(page); /* Enter Guest-Mode */ -- cgit v1.2.3 From 9ebdfe5230f2e50e3ba05c57723a06e90946815a Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 26 Nov 2018 11:22:32 -0800 Subject: kvm: nVMX: NMI-window and interrupt-window exiting should wake L2 from HLT According to the SDM, "NMI-window exiting" VM-exits wake a logical processor from the same inactive states as would an NMI and "interrupt-window exiting" VM-exits wake a logical processor from the same inactive states as would an external interrupt. Specifically, they wake a logical processor from the shutdown state and from the states entered using the HLT and MWAIT instructions. Fixes: 6dfacadd5858 ("KVM: nVMX: Add support for activity state HLT") Signed-off-by: Jim Mattson Reviewed-by: Peter Shier Suggested-by: Sean Christopherson [Squashed comments of two Jim's patches and used the simplified code hunk provided by Sean. - Radim] Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx/nested.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 3f019aa63341..adc8493132ee 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3156,11 +3156,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) nested_cache_shadow_vmcs12(vcpu, vmcs12); /* - * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken - * by event injection, halt vcpu. + * If we're entering a halted L2 vcpu and the L2 vcpu won't be + * awakened by event injection or by an NMI-window VM-exit or + * by an interrupt-window VM-exit, halt the vcpu. */ if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && - !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) { + !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && + !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) && + !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) && + (vmcs12->guest_rflags & X86_EFLAGS_IF))) { vmx->nested.nested_run_pending = 0; return kvm_vcpu_halt(vcpu); } -- cgit v1.2.3 From 788fc1e9ad8e1d332d3a18692028d1faee6fabb0 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 9 Nov 2018 09:35:11 -0800 Subject: kvm: vmx: Allow guest read access to IA32_TSC Let the guest read the IA32_TSC MSR with the generic RDMSR instruction as well as the specific RDTSC(P) instructions. Note that the hardware applies the TSC multiplier and offset (when applicable) to the result of RDMSR(IA32_TSC), just as it does to the result of RDTSC(P). Signed-off-by: Jim Mattson Reviewed-by: Peter Shier Reviewed-by: Marc Orr Reviewed-by: Liran Alon Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx/vmx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d5e81485fc15..9b58921f5cd3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6401,6 +6401,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_msrs; msr_bitmap = vmx->vmcs01.msr_bitmap; + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R); vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); -- cgit v1.2.3 From 9b7ebff23cb8a8e126b3852ad0b1bfb284f25273 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 5 Nov 2018 10:44:32 -0800 Subject: KVM: x86: Remove KF() macro placeholder Although well-intentioned, keeping the KF() definition as a hint for handling scattered CPUID features may be counter-productive. Simply redefining the bit position only works for directly manipulating the guest's CPUID leafs, e.g. it doesn't make guest_cpuid_has() magically work. Taking an alternative approach, e.g. ensuring the bit position is identical between the Linux-defined and hardware-defined features, may be a simpler and/or more effective method of exposing scattered features to the guest. Signed-off-by: Sean Christopherson Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index cc6dd65828a4..be5e64c61d8f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -67,9 +67,6 @@ u64 kvm_supported_xcr0(void) #define F(x) bit(X86_FEATURE_##x) -/* For scattered features from cpufeatures.h; we currently expose none */ -#define KF(x) bit(KVM_CPUID_BIT_##x) - int kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; -- cgit v1.2.3 From bdd303cb1bdb24e71eef8e4510b27166bfadf286 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 5 Nov 2018 14:45:03 +0800 Subject: KVM: fix some typos Signed-off-by: Wei Yang [Preserved the iff and a probably intentional weird bracket notation. Also dropped the style change to make a single-purpose patch. - Radim] Signed-off-by: Radim Krčmář --- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/x86.c | 2 +- virt/kvm/async_pf.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7c03c0f35444..1fba9f356218 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5638,7 +5638,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, * spte from present to present (changing the spte from present * to nonpresent will flush all the TLBs immediately), in other * words, the only case we care is mmu_spte_update() where we - * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE + * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE * instead of PT_WRITABLE_MASK, that means it does not depend * on PT_WRITABLE_MASK anymore. */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 70faa3cdc4dc..99e9b6964fce 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9344,7 +9344,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * with dirty logging disabled in order to eliminate unnecessary GPA * logging in PML buffer (and potential PML buffer full VMEXT). This * guarantees leaving PML enabled during guest's lifetime won't have - * any additonal overhead from PML when guest is running with dirty + * any additional overhead from PML when guest is running with dirty * logging disabled for memory slots. * * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 23c2519c5b32..110cbe3f74f8 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -82,7 +82,7 @@ static void async_pf_execute(struct work_struct *work) might_sleep(); /* - * This work is run asynchromously to the task which owns + * This work is run asynchronously to the task which owns * mm and might be done in another context, so we must * access remotely. */ -- cgit v1.2.3 From f99e3daf94ff35dd4a878d32ff66e1fd35223ad6 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Wed, 24 Oct 2018 16:05:10 +0800 Subject: KVM: x86: Add Intel PT virtualization work mode Intel Processor Trace virtualization can be work in one of 2 possible modes: a. System-Wide mode (default): When the host configures Intel PT to collect trace packets of the entire system, it can leave the relevant VMX controls clear to allow VMX-specific packets to provide information across VMX transitions. KVM guest will not aware this feature in this mode and both host and KVM guest trace will output to host buffer. b. Host-Guest mode: Host can configure trace-packet generation while in VMX non-root operation for guests and root operation for native executing normally. Intel PT will be exposed to KVM guest in this mode, and the trace output to respective buffer of host and guest. In this mode, tht status of PT will be saved and disabled before VM-entry and restored after VM-exit if trace a virtual machine. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/asm/vmx.h | 8 ++++++++ arch/x86/kvm/vmx/capabilities.h | 15 +++++++++++++++ arch/x86/kvm/vmx/vmx.c | 21 +++++++++++++++++++-- arch/x86/kvm/vmx/vmx.h | 9 ++++++++- 5 files changed, 51 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6a900150184b..8f95297371af 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -807,6 +807,7 @@ #define VMX_BASIC_INOUT 0x0040000000000000LLU /* MSR_IA32_VMX_MISC bits */ +#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F /* AMD-V MSRs */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 898c443eeed1..4e4133e86484 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -77,7 +77,9 @@ #define SECONDARY_EXEC_ENCLS_EXITING 0x00008000 #define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 +#define SECONDARY_EXEC_PT_CONCEAL_VMX 0x00080000 #define SECONDARY_EXEC_XSAVES 0x00100000 +#define SECONDARY_EXEC_PT_USE_GPA 0x01000000 #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 @@ -99,6 +101,8 @@ #define VM_EXIT_LOAD_IA32_EFER 0x00200000 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 #define VM_EXIT_CLEAR_BNDCFGS 0x00800000 +#define VM_EXIT_PT_CONCEAL_PIP 0x01000000 +#define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -110,6 +114,8 @@ #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 #define VM_ENTRY_LOAD_IA32_EFER 0x00008000 #define VM_ENTRY_LOAD_BNDCFGS 0x00010000 +#define VM_ENTRY_PT_CONCEAL_PIP 0x00020000 +#define VM_ENTRY_LOAD_IA32_RTIT_CTL 0x00040000 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff @@ -241,6 +247,8 @@ enum vmcs_field { GUEST_PDPTR3_HIGH = 0x00002811, GUEST_BNDCFGS = 0x00002812, GUEST_BNDCFGS_HIGH = 0x00002813, + GUEST_IA32_RTIT_CTL = 0x00002814, + GUEST_IA32_RTIT_CTL_HIGH = 0x00002815, HOST_IA32_PAT = 0x00002c00, HOST_IA32_PAT_HIGH = 0x00002c01, HOST_IA32_EFER = 0x00002c02, diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 366b9dd2e4ae..854e144131c6 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -10,6 +10,10 @@ extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; +extern int __read_mostly pt_mode; + +#define PT_MODE_SYSTEM 0 +#define PT_MODE_HOST_GUEST 1 struct nested_vmx_msrs { /* @@ -325,4 +329,15 @@ static inline bool cpu_has_vmx_invvpid_global(void) return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; } +static inline bool cpu_has_vmx_intel_pt(void) +{ + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + return (vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT) && + (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && + (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_IA32_RTIT_CTL) && + (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9b58921f5cd3..338977e6f552 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -169,6 +169,10 @@ module_param(ple_window_shrink, uint, 0444); static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); +/* Default is SYSTEM mode, 1 for host-guest mode */ +int __read_mostly pt_mode = PT_MODE_SYSTEM; +module_param(pt_mode, int, S_IRUGO); + static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); static DEFINE_MUTEX(vmx_l1d_flush_mutex); @@ -1975,6 +1979,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_PT_USE_GPA | + SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, @@ -2023,7 +2029,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER | - VM_EXIT_CLEAR_BNDCFGS; + VM_EXIT_CLEAR_BNDCFGS | + VM_EXIT_PT_CONCEAL_PIP | + VM_EXIT_CLEAR_IA32_RTIT_CTL; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -2045,7 +2053,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER | - VM_ENTRY_LOAD_BNDCFGS; + VM_ENTRY_LOAD_BNDCFGS | + VM_ENTRY_PT_CONCEAL_PIP | + VM_ENTRY_LOAD_IA32_RTIT_CTL; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -3567,6 +3577,8 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; + if (pt_mode == PT_MODE_SYSTEM) + exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); if (!cpu_need_virtualize_apic_accesses(vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) @@ -7248,6 +7260,11 @@ static __init int hardware_setup(void) kvm_mce_cap_supported |= MCG_LMCE_P; + if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) + return -EINVAL; + if (!enable_ept || !cpu_has_vmx_intel_pt()) + pt_mode = PT_MODE_SYSTEM; + if (nested) { nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept, enable_apicv); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f932d7c971e9..86eb9c887386 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -5,6 +5,7 @@ #include #include +#include #include "capabilities.h" #include "ops.h" @@ -421,13 +422,19 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) static inline u32 vmx_vmentry_ctrl(void) { + u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; + if (pt_mode == PT_MODE_SYSTEM) + vmentry_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmcs_config.vmentry_ctrl & + return vmentry_ctrl & ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); } static inline u32 vmx_vmexit_ctrl(void) { + u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; + if (pt_mode == PT_MODE_SYSTEM) + vmexit_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmcs_config.vmexit_ctrl & ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); -- cgit v1.2.3 From 86f5201df0d3e3efc78d3eac7fc5a59b813287cd Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Wed, 24 Oct 2018 16:05:11 +0800 Subject: KVM: x86: Add Intel Processor Trace cpuid emulation Expose Intel Processor Trace to guest only when the PT works in Host-Guest mode. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 22 ++++++++++++++++++++-- arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx/vmx.c | 6 ++++++ 4 files changed, 33 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bca77c25a19a..ffb8a853e0d4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1105,6 +1105,7 @@ struct kvm_x86_ops { bool (*mpx_supported)(void); bool (*xsaves_supported)(void); bool (*umip_emulated)(void); + bool (*pt_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index be5e64c61d8f..c731e87baef5 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -334,6 +334,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; + unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -392,7 +393,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL); + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = @@ -423,7 +424,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xd); + entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd)); break; case 1: entry->edx &= kvm_cpuid_1_edx_x86_features; @@ -600,6 +601,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + /* Intel PT */ + case 0x14: { + int t, times = entry->eax; + + if (!f_intel_pt) + break; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (t = 1; t <= times; ++t) { + if (*nent >= maxnent) + goto out; + do_cpuid_1_ent(&entry[t], function, t); + entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } case KVM_CPUID_SIGNATURE: { static const char signature[12] = "KVMKVMKVM\0\0"; const u32 *sigptr = (const u32 *)signature; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1a64844dcdfb..47a9a45a2f70 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5947,6 +5947,11 @@ static bool svm_umip_emulated(void) return false; } +static bool svm_pt_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -7188,6 +7193,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .mpx_supported = svm_mpx_supported, .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, + .pt_supported = svm_pt_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 338977e6f552..f5b88b578bbe 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5886,6 +5886,11 @@ static bool vmx_has_emulated_msr(int index) } } +static bool vmx_pt_supported(void) +{ + return pt_mode == PT_MODE_HOST_GUEST; +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -7399,6 +7404,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .mpx_supported = vmx_mpx_supported, .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, + .pt_supported = vmx_pt_supported, .request_immediate_exit = vmx_request_immediate_exit, -- cgit v1.2.3 From 2ef444f1600bfc2d8522df0f537aafef79befa7e Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Wed, 24 Oct 2018 16:05:12 +0800 Subject: KVM: x86: Add Intel PT context switch for each vcpu Load/Store Intel Processor Trace register in context switch. MSR IA32_RTIT_CTL is loaded/stored automatically from VMCS. In Host-Guest mode, we need load/resore PT MSRs only when PT is enabled in guest. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.h | 21 ++++++++++++++ 2 files changed, 95 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f5b88b578bbe..462c716d46ac 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -936,6 +936,69 @@ static unsigned long segment_base(u16 selector) } #endif +static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static void pt_guest_enter(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + /* Save host state before VM entry */ + rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + + /* + * Set guest state of MSR_IA32_RTIT_CTL MSR (PT will be disabled + * on VM entry when it has been disabled in guest before). + */ + vmcs_write64(GUEST_IA32_RTIT_CTL, vmx->pt_desc.guest.ctl); + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + wrmsrl(MSR_IA32_RTIT_CTL, 0); + pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); + pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); + } +} + +static void pt_guest_exit(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); + pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); + } + + /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */ + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); +} + void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3814,6 +3877,13 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + + if (pt_mode == PT_MODE_HOST_GUEST) { + memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); + /* Bit[6~0] are forced to 1, writes are ignored. */ + vmx->pt_desc.guest.output_mask = 0x7F; + vmcs_write64(GUEST_IA32_RTIT_CTL, 0); + } } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -6115,6 +6185,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.pkru != vmx->host_pkru) __write_pkru(vcpu->arch.pkru); + pt_guest_enter(vmx); + atomic_switch_perf_msrs(vmx); vmx_update_hv_timer(vcpu); @@ -6314,6 +6386,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; + pt_guest_exit(vmx); + /* * eager fpu is enabled if PKEY is supported and CR4 is switched * back on host, so it is safe to read guest PKRU from current diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 86eb9c887386..dd3b9ab90556 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -66,6 +66,25 @@ struct pi_desc { u32 rsvd[6]; } __aligned(64); +#define RTIT_ADDR_RANGE 4 + +struct pt_ctx { + u64 ctl; + u64 status; + u64 output_base; + u64 output_mask; + u64 cr3_match; + u64 addr_a[RTIT_ADDR_RANGE]; + u64 addr_b[RTIT_ADDR_RANGE]; +}; + +struct pt_desc { + u64 ctl_bitmask; + u32 addr_range; + u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; + struct pt_ctx host; + struct pt_ctx guest; +}; /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need @@ -249,6 +268,8 @@ struct vcpu_vmx { u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; u64 ept_pointer; + + struct pt_desc pt_desc; }; enum ept_pointers_status { -- cgit v1.2.3 From 6c0f0bba85a0de83b32ff8ccf3e5965815cc069b Mon Sep 17 00:00:00 2001 From: Luwei Kang Date: Wed, 24 Oct 2018 16:05:13 +0800 Subject: KVM: x86: Introduce a function to initialize the PT configuration Initialize the Intel PT configuration when cpuid update. Include cpuid inforamtion, rtit_ctl bit mask and the number of address ranges. Signed-off-by: Luwei Kang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 462c716d46ac..966abf10f324 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6744,6 +6744,75 @@ static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) } } +static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *best = NULL; + int i; + + for (i = 0; i < PT_CPUID_LEAVES; i++) { + best = kvm_find_cpuid_entry(vcpu, 0x14, i); + if (!best) + return; + vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; + vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; + vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; + vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; + } + + /* Get the number of configurable Address Ranges for filtering */ + vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_num_address_ranges); + + /* Initialize and clear the no dependency bits */ + vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | + RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC); + + /* + * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise + * will inject an #GP + */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; + + /* + * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and + * PSBFreq can be set + */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | + RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); + + /* + * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and + * MTCFreq can be set + */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | + RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE); + + /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | + RTIT_CTL_PTW_EN); + + /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; + + /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; + + /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; + + /* unmask address range configure area */ + for (i = 0; i < vmx->pt_desc.addr_range; i++) + vmx->pt_desc.ctl_bitmask &= ~(0xf << (32 + i * 4)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6764,6 +6833,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) nested_vmx_cr_fixed1_bits_update(vcpu); nested_vmx_entry_exit_ctls_update(vcpu); } + + if (boot_cpu_has(X86_FEATURE_INTEL_PT) && + guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) + update_intel_pt_cfg(vcpu); } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -- cgit v1.2.3 From bf8c55d8dc094c85a3f98cd302a4dddb720dd63f Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Wed, 24 Oct 2018 16:05:14 +0800 Subject: KVM: x86: Implement Intel PT MSRs read/write emulation This patch implement Intel Processor Trace MSRs read/write emulation. Intel PT MSRs read/write need to be emulated when Intel PT MSRs is intercepted in guest and during live migration. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 33 ++++++++- 2 files changed, 216 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 966abf10f324..a4ec5369f25e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -140,6 +140,14 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) +#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ + RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ + RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ + RTIT_STATUS_BYTECNT)) + +#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ + (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -1354,6 +1362,79 @@ void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); } +static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long value; + + /* + * Any MSR write that attempts to change bits marked reserved will + * case a #GP fault. + */ + if (data & vmx->pt_desc.ctl_bitmask) + return 1; + + /* + * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will + * result in a #GP unless the same write also clears TraceEn. + */ + if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && + ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) + return 1; + + /* + * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit + * and FabricEn would cause #GP, if + * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 + */ + if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && + !(data & RTIT_CTL_FABRIC_EN) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output)) + return 1; + + /* + * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that + * utilize encodings marked reserved will casue a #GP fault. + */ + value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && + !test_bit((data & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET, &value)) + return 1; + value = intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_cycle_thresholds); + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET, &value)) + return 1; + value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET, &value)) + return 1; + + /* + * If ADDRx_CFG is reserved or the encodings is >2 will + * cause a #GP fault. + */ + value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2)) + return 1; + + return 0; +} + + static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; @@ -1555,6 +1636,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; + u32 index; switch (msr_info->index) { #ifdef CONFIG_X86_64 @@ -1619,6 +1701,52 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.ia32_xss; break; + case MSR_IA32_RTIT_CTL: + if (pt_mode != PT_MODE_HOST_GUEST) + return 1; + msr_info->data = vmx->pt_desc.guest.ctl; + break; + case MSR_IA32_RTIT_STATUS: + if (pt_mode != PT_MODE_HOST_GUEST) + return 1; + msr_info->data = vmx->pt_desc.guest.status; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if ((pt_mode != PT_MODE_HOST_GUEST) || + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_cr3_filtering)) + return 1; + msr_info->data = vmx->pt_desc.guest.cr3_match; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + if ((pt_mode != PT_MODE_HOST_GUEST) || + (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output))) + return 1; + msr_info->data = vmx->pt_desc.guest.output_base; + break; + case MSR_IA32_RTIT_OUTPUT_MASK: + if ((pt_mode != PT_MODE_HOST_GUEST) || + (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output))) + return 1; + msr_info->data = vmx->pt_desc.guest.output_mask; + break; + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; + if ((pt_mode != PT_MODE_HOST_GUEST) || + (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_num_address_ranges))) + return 1; + if (index % 2) + msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; + else + msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; + break; case MSR_TSC_AUX: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) @@ -1648,6 +1776,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) int ret = 0; u32 msr_index = msr_info->index; u64 data = msr_info->data; + u32 index; switch (msr_index) { case MSR_EFER: @@ -1799,6 +1928,61 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; + case MSR_IA32_RTIT_CTL: + if ((pt_mode != PT_MODE_HOST_GUEST) || + vmx_rtit_ctl_check(vcpu, data)) + return 1; + vmcs_write64(GUEST_IA32_RTIT_CTL, data); + vmx->pt_desc.guest.ctl = data; + break; + case MSR_IA32_RTIT_STATUS: + if ((pt_mode != PT_MODE_HOST_GUEST) || + (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || + (data & MSR_IA32_RTIT_STATUS_MASK)) + return 1; + vmx->pt_desc.guest.status = data; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if ((pt_mode != PT_MODE_HOST_GUEST) || + (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_cr3_filtering)) + return 1; + vmx->pt_desc.guest.cr3_match = data; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + if ((pt_mode != PT_MODE_HOST_GUEST) || + (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || + (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output)) || + (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)) + return 1; + vmx->pt_desc.guest.output_base = data; + break; + case MSR_IA32_RTIT_OUTPUT_MASK: + if ((pt_mode != PT_MODE_HOST_GUEST) || + (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || + (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output))) + return 1; + vmx->pt_desc.guest.output_mask = data; + break; + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; + if ((pt_mode != PT_MODE_HOST_GUEST) || + (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || + (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_num_address_ranges))) + return 1; + if (index % 2) + vmx->pt_desc.guest.addr_b[index / 2] = data; + else + vmx->pt_desc.guest.addr_a[index / 2] = data; + break; case MSR_TSC_AUX: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 99e9b6964fce..9ce1b8e2d3b9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -69,6 +69,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -1124,7 +1125,13 @@ static u32 msrs_to_save[] = { #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, - MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES + MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, + MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, + MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, + MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, + MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, }; static unsigned num_msrs_to_save; @@ -4884,6 +4891,30 @@ static void kvm_init_msr_list(void) if (!kvm_x86_ops->rdtscp_supported()) continue; break; + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + if (!kvm_x86_ops->pt_supported()) + continue; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if (!kvm_x86_ops->pt_supported() || + !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) + continue; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + if (!kvm_x86_ops->pt_supported() || + (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && + !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) + continue; + break; + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: { + if (!kvm_x86_ops->pt_supported() || + msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >= + intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) + continue; + break; + } default: break; } -- cgit v1.2.3 From b08c28960f254bd246af8e30a468dfc7dd56e03b Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Wed, 24 Oct 2018 16:05:15 +0800 Subject: KVM: x86: Set intercept for Intel PT MSRs read/write To save performance overhead, disable intercept Intel PT MSRs read/write when Intel PT is enabled in guest. MSR_IA32_RTIT_CTL is an exception that will always be intercepted. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 33 ++++++++++++++++++++++++++------- arch/x86/kvm/vmx/vmx.h | 1 + 2 files changed, 27 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a4ec5369f25e..338c65091241 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -977,15 +977,11 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) if (pt_mode == PT_MODE_SYSTEM) return; - /* Save host state before VM entry */ - rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); - /* - * Set guest state of MSR_IA32_RTIT_CTL MSR (PT will be disabled - * on VM entry when it has been disabled in guest before). + * GUEST_IA32_RTIT_CTL is already set in the VMCS. + * Save host state before VM entry. */ - vmcs_write64(GUEST_IA32_RTIT_CTL, vmx->pt_desc.guest.ctl); - + rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { wrmsrl(MSR_IA32_RTIT_CTL, 0); pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); @@ -1934,6 +1930,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); vmx->pt_desc.guest.ctl = data; + pt_update_intercept_for_msr(vmx); break; case MSR_IA32_RTIT_STATUS: if ((pt_mode != PT_MODE_HOST_GUEST) || @@ -3567,6 +3564,28 @@ void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) vmx->msr_bitmap_mode = mode; } +void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) +{ + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); + u32 i; + + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, + MSR_TYPE_RW, flag); + for (i = 0; i < vmx->pt_desc.addr_range; i++) { + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); + } +} + static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) { return enable_apicv; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index dd3b9ab90556..20172c11d5f8 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -316,6 +316,7 @@ bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); +void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 -- cgit v1.2.3 From ee85dec2fe9c860c40f6e2e1d53052b80f36cd58 Mon Sep 17 00:00:00 2001 From: Luwei Kang Date: Wed, 24 Oct 2018 16:05:16 +0800 Subject: KVM: x86: Disable Intel PT when VMXON in L1 guest Currently, Intel Processor Trace do not support tracing in L1 guest VMX operation(IA32_VMX_MISC[bit 14] is 0). As mentioned in SDM, on these type of processors, execution of the VMXON instruction will clears IA32_RTIT_CTL.TraceEn and any attempt to write IA32_RTIT_CTL causes a general-protection exception (#GP). Signed-off-by: Luwei Kang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 ++++++ arch/x86/kvm/vmx/vmx.c | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index adc8493132ee..d839864aa8b0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4167,6 +4167,12 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) vmx->nested.vmcs02_initialized = false; vmx->nested.vmxon = true; + + if (pt_mode == PT_MODE_HOST_GUEST) { + vmx->pt_desc.guest.ctl = 0; + pt_update_intercept_for_msr(vmx); + } + return 0; out_shadow_vmcs: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 338c65091241..3ef444de0c5e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1926,7 +1926,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || - vmx_rtit_ctl_check(vcpu, data)) + vmx_rtit_ctl_check(vcpu, data) || + vmx->nested.vmxon) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); vmx->pt_desc.guest.ctl = data; -- cgit v1.2.3 From 1f3a3e46cc49e81ab84d32710d3dae30697753a5 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Thu, 6 Dec 2018 21:21:07 +0800 Subject: KVM/VMX: Add hv tlb range flush support This patch is to register tlb_remote_flush_with_range callback with hv tlb range flush interface. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 61 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 17 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3ef444de0c5e..d1c818a877df 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -421,7 +421,34 @@ static void check_ept_pointer_match(struct kvm *kvm) to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; } -static int vmx_hv_remote_flush_tlb(struct kvm *kvm) +int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, + void *data) +{ + struct kvm_tlb_range *range = data; + + return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn, + range->pages); +} + +static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm, + struct kvm_vcpu *vcpu, struct kvm_tlb_range *range) +{ + u64 ept_pointer = to_vmx(vcpu)->ept_pointer; + + /* + * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address + * of the base of EPT PML4 table, strip off EPT configuration + * information. + */ + if (range) + return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK, + kvm_fill_hv_flush_list_func, (void *)range); + else + return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK); +} + +static int hv_remote_flush_tlb_with_range(struct kvm *kvm, + struct kvm_tlb_range *range) { struct kvm_vcpu *vcpu; int ret = -ENOTSUPP, i; @@ -431,29 +458,26 @@ static int vmx_hv_remote_flush_tlb(struct kvm *kvm) if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) check_ept_pointer_match(kvm); - /* - * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the - * base of EPT PML4 table, strip off EPT configuration information. - * If ept_pointer is invalid pointer, bypass the flush request. - */ if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { kvm_for_each_vcpu(i, vcpu, kvm) { - u64 ept_pointer = to_vmx(vcpu)->ept_pointer; - - if (!VALID_PAGE(ept_pointer)) - continue; - - ret |= hyperv_flush_guest_mapping( - ept_pointer & PAGE_MASK); + /* If ept_pointer is invalid pointer, bypass flush request. */ + if (VALID_PAGE(to_vmx(vcpu)->ept_pointer)) + ret |= __hv_remote_flush_tlb_with_range( + kvm, vcpu, range); } } else { - ret = hyperv_flush_guest_mapping( - to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK); + ret = __hv_remote_flush_tlb_with_range(kvm, + kvm_get_vcpu(kvm, 0), range); } spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); return ret; } +static int hv_remote_flush_tlb(struct kvm *kvm) +{ + return hv_remote_flush_tlb_with_range(kvm, NULL); +} + #endif /* IS_ENABLED(CONFIG_HYPERV) */ /* @@ -7554,8 +7578,11 @@ static __init int hardware_setup(void) #if IS_ENABLED(CONFIG_HYPERV) if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH - && enable_ept) - kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; + && enable_ept) { + kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb; + kvm_x86_ops->tlb_remote_flush_with_range = + hv_remote_flush_tlb_with_range; + } #endif if (!cpu_has_vmx_ple()) { -- cgit v1.2.3 From 40ef75a758b291dc2cc88ecef36ffbb0eb095e00 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Thu, 6 Dec 2018 21:21:08 +0800 Subject: KVM/MMU: Add tlb flush with range helper function This patch is to add wrapper functions for tlb_remote_flush_with_range callback and flush tlb directly in kvm_mmu_zap_collapsible_spte(). kvm_mmu_zap_collapsible_spte() returns flush request to the slot_handle_leaf() and the latter does flush on demand. When range flush is available, make kvm_mmu_zap_collapsible_spte() to flush tlb with range directly to avoid returning range back to slot_handle_leaf(). Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1fba9f356218..f105f17fb4c7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -264,6 +264,35 @@ static void mmu_spte_set(u64 *sptep, u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); + +static inline bool kvm_available_flush_tlb_with_range(void) +{ + return kvm_x86_ops->tlb_remote_flush_with_range; +} + +static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, + struct kvm_tlb_range *range) +{ + int ret = -ENOTSUPP; + + if (range && kvm_x86_ops->tlb_remote_flush_with_range) + ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range); + + if (ret) + kvm_flush_remote_tlbs(kvm); +} + +static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, + u64 start_gfn, u64 pages) +{ + struct kvm_tlb_range range; + + range.start_gfn = start_gfn; + range.pages = pages; + + kvm_flush_remote_tlbs_with_range(kvm, &range); +} + void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { BUG_ON((mmio_mask & mmio_value) != mmio_value); @@ -5671,7 +5700,13 @@ restart: !kvm_is_reserved_pfn(pfn) && PageTransCompoundMap(pfn_to_page(pfn))) { pte_list_remove(rmap_head, sptep); - need_tlb_flush = 1; + + if (kvm_available_flush_tlb_with_range()) + kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, + KVM_PAGES_PER_HPAGE(sp->role.level)); + else + need_tlb_flush = 1; + goto restart; } } -- cgit v1.2.3 From c3134ce240eed2eaa9703579ef99c3099665236e Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Thu, 6 Dec 2018 21:21:09 +0800 Subject: KVM: Replace old tlb flush function with new one to flush a specified range. This patch is to replace kvm_flush_remote_tlbs() with kvm_flush_ remote_tlbs_with_address() in some functions without logic change. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 31 +++++++++++++++++++++---------- arch/x86/kvm/paging_tmpl.h | 3 ++- 2 files changed, 23 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f105f17fb4c7..711a41a85dfb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1485,8 +1485,12 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) { - if (__drop_large_spte(vcpu->kvm, sptep)) - kvm_flush_remote_tlbs(vcpu->kvm); + if (__drop_large_spte(vcpu->kvm, sptep)) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + + kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, + KVM_PAGES_PER_HPAGE(sp->role.level)); + } } /* @@ -1954,7 +1958,8 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, + KVM_PAGES_PER_HPAGE(sp->role.level)); } int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) @@ -2470,7 +2475,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, account_shadowed(vcpu->kvm, sp); if (level == PT_PAGE_TABLE_LEVEL && rmap_write_protect(vcpu, gfn)) - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); if (level > PT_PAGE_TABLE_LEVEL && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); @@ -2590,7 +2595,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, return; drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1); } } @@ -3014,8 +3019,10 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, ret = RET_PF_EMULATE; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, + KVM_PAGES_PER_HPAGE(level)); if (unlikely(is_mmio_spte(*sptep))) ret = RET_PF_EMULATE; @@ -5672,7 +5679,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, * on PT_WRITABLE_MASK anymore. */ if (flush) - kvm_flush_remote_tlbs(kvm); + kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, + memslot->npages); } static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, @@ -5742,7 +5750,8 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, * dirty_bitmap. */ if (flush) - kvm_flush_remote_tlbs(kvm); + kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, + memslot->npages); } EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); @@ -5760,7 +5769,8 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, lockdep_assert_held(&kvm->slots_lock); if (flush) - kvm_flush_remote_tlbs(kvm); + kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, + memslot->npages); } EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); @@ -5777,7 +5787,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, /* see kvm_mmu_slot_leaf_clear_dirty */ if (flush) - kvm_flush_remote_tlbs(kvm); + kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, + memslot->npages); } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 7cf2185b7eb5..6bdca39829bc 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -894,7 +894,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs_with_address(vcpu->kvm, + sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); if (!rmap_can_add(vcpu)) break; -- cgit v1.2.3 From 748c0e312fce983bd7854b369b192e24dce90878 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Thu, 6 Dec 2018 21:21:10 +0800 Subject: KVM: Make kvm_set_spte_hva() return int The patch is to make kvm_set_spte_hva() return int and caller can check return value to determine flush tlb or not. Signed-off-by: Lan Tianyu Acked-by: Paul Mackerras Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 2 +- arch/arm64/include/asm/kvm_host.h | 2 +- arch/mips/include/asm/kvm_host.h | 2 +- arch/mips/kvm/mmu.c | 3 ++- arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/kvm/book3s.c | 3 ++- arch/powerpc/kvm/e500_mmu_host.c | 3 ++- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 3 ++- virt/kvm/arm/mmu.c | 6 ++++-- 10 files changed, 17 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index c5634c6ffcea..4f3400a74a17 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -225,7 +225,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c22e50ba3473..063886be25ad 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -360,7 +360,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 2c1c53d12179..71c3f21d80d5 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -933,7 +933,7 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index d8dcdb350405..97e538a8c1be 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -551,7 +551,7 @@ static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, (pte_dirty(old_pte) && !pte_dirty(hva_pte)); } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { unsigned long end = hva + PAGE_SIZE; int ret; @@ -559,6 +559,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); if (ret) kvm_mips_callbacks->flush_shadow_all(kvm); + return 0; } static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7a2483a139cf..0f98f00da2ea 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -72,7 +72,7 @@ extern int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index a35fb4099094..bd1a677dd9e4 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -851,9 +851,10 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) return kvm->arch.kvm_ops->test_age_hva(kvm, hva); } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte); + return 0; } void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 8f2985e46f6f..c3f312b2bcb3 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -757,10 +757,11 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) return 0; } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { /* The page will get remapped properly on its next fault */ kvm_unmap_hva(kvm, hva); + return 0; } /*****************************************/ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3639077d285b..35638ace7387 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1512,7 +1512,7 @@ asmlinkage void kvm_spurious_fault(void); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 711a41a85dfb..dd71864ded39 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1913,9 +1913,10 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + return 0; } static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index dee3dbd98712..3053bf2584f8 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -2049,14 +2049,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { unsigned long end = hva + PAGE_SIZE; kvm_pfn_t pfn = pte_pfn(pte); pte_t stage2_pte; if (!kvm->arch.pgd) - return; + return 0; trace_kvm_set_spte_hva(hva); @@ -2067,6 +2067,8 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) clean_dcache_guest_page(pfn, PAGE_SIZE); stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); + + return 0; } static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -- cgit v1.2.3 From 0cf853c5e238edf503ebda2fe541e6f4a3d5bd40 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Thu, 6 Dec 2018 21:21:11 +0800 Subject: KVM/MMU: Move tlb flush in kvm_set_pte_rmapp() to kvm_mmu_notifier_change_pte() This patch is to move tlb flush in kvm_set_pte_rmapp() to kvm_mmu_notifier_change_pte() in order to avoid redundant tlb flush. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 ++------ virt/kvm/kvm_main.c | 5 ++++- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd71864ded39..5a046f93a41a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1776,10 +1776,7 @@ restart: } } - if (need_flush) - kvm_flush_remote_tlbs(kvm); - - return 0; + return need_flush; } struct slot_rmap_walk_iterator { @@ -1915,8 +1912,7 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { - kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); - return 0; + return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); } static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f90ceab3840e..cf7cc0554094 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -354,7 +354,10 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; - kvm_set_spte_hva(kvm, address, pte); + + if (kvm_set_spte_hva(kvm, address, pte)) + kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } -- cgit v1.2.3 From 3cc5ea94de5f71dff0d3353f610e729308583e6b Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Thu, 6 Dec 2018 21:21:12 +0800 Subject: KVM/MMU: Flush tlb directly in kvm_set_pte_rmapp() This patch is to flush tlb directly in kvm_set_pte_rmapp() function when Hyper-V remote TLB flush is available, returning 0 so that kvm_mmu_notifier_change_pte() does not flush again. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5a046f93a41a..fd61d715ebd6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1776,6 +1776,11 @@ restart: } } + if (need_flush && kvm_available_flush_tlb_with_range()) { + kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + return 0; + } + return need_flush; } -- cgit v1.2.3 From 71883a62fcd6c70639fa12cda733378b4d997409 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Thu, 6 Dec 2018 21:21:13 +0800 Subject: KVM/MMU: Flush tlb directly in the kvm_zap_gfn_range() Originally, flush tlb is done by slot_handle_level_range(). This patch moves the flush directly to kvm_zap_gfn_range() when range flush is available, so that only the requested range can be flushed. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fd61d715ebd6..ce770b446238 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5624,8 +5624,13 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + bool flush_tlb = true; + bool flush = false; int i; + if (kvm_available_flush_tlb_with_range()) + flush_tlb = false; + spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); @@ -5637,12 +5642,17 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, - start, end - 1, true); + flush |= slot_handle_level_range(kvm, memslot, + kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, start, + end - 1, flush_tlb); } } + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end - gfn_start + 1); + spin_unlock(&kvm->mmu_lock); } -- cgit v1.2.3 From ac5ffda2447f033ddc44a87882dfcfed38d944dc Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 26 Nov 2018 17:00:08 +0100 Subject: KVM/x86: Use SVM assembly instruction mnemonics instead of .byte streams Recently the minimum required version of binutils was changed to 2.20, which supports all SVM instruction mnemonics. The patch removes all .byte #defines and uses real instruction mnemonics instead. Signed-off-by: Uros Bizjak Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 7 ------- arch/x86/kvm/svm.c | 12 ++++++------ 2 files changed, 6 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 93b462e48067..dec9c1e84c78 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -290,11 +290,4 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) -#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" -#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" -#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb" -#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd" -#define SVM_STGI ".byte 0x0f, 0x01, 0xdc" -#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf" - #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 47a9a45a2f70..00594332bba5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -706,17 +706,17 @@ static u32 svm_msrpm_offset(u32 msr) static inline void clgi(void) { - asm volatile (__ex(SVM_CLGI)); + asm volatile (__ex("clgi")); } static inline void stgi(void) { - asm volatile (__ex(SVM_STGI)); + asm volatile (__ex("stgi")); } static inline void invlpga(unsigned long addr, u32 asid) { - asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); + asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr)); } static int get_npt_level(struct kvm_vcpu *vcpu) @@ -5652,9 +5652,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) /* Enter guest mode */ "push %%" _ASM_AX " \n\t" "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" - __ex(SVM_VMLOAD) "\n\t" - __ex(SVM_VMRUN) "\n\t" - __ex(SVM_VMSAVE) "\n\t" + __ex("vmload %%" _ASM_AX) "\n\t" + __ex("vmrun %%" _ASM_AX) "\n\t" + __ex("vmsave %%" _ASM_AX) "\n\t" "pop %%" _ASM_AX " \n\t" /* Save guest registers, load host registers */ -- cgit v1.2.3 From 051a2d3e59e51ae49fd56aef34e472832897ce46 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 20 Dec 2018 12:25:16 -0800 Subject: KVM: VMX: Explicitly reference RCX as the vmx_vcpu pointer in asm blobs Use '%% " _ASM_CX"' instead of '%0' to dereference RCX, i.e. the 'struct vcpu_vmx' pointer, in the VM-Enter asm blobs of vmx_vcpu_run() and nested_vmx_check_vmentry_hw(). Using the symbolic name means that adding/removing an output parameter(s) requires "rewriting" almost all of the asm blob, which makes it nearly impossible to understand what's being changed in even the most minor patches. Opportunistically improve the code comments. Signed-off-by: Sean Christopherson Reviewed-by: Andi Kleen Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 ++-- arch/x86/kvm/vmx/vmx.c | 86 ++++++++++++++++++++++++++--------------------- 2 files changed, 50 insertions(+), 42 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d839864aa8b0..da16f022a529 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2759,17 +2759,17 @@ static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) asm( /* Set HOST_RSP */ __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%% " _ASM_CX")\n\t" /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%0)\n\t" + "cmpl $0, %c[launched](%% " _ASM_CX")\n\t" "jne 1f\n\t" __ex("vmlaunch") "\n\t" "jmp 2f\n\t" "1: " __ex("vmresume") "\n\t" "2: " /* Set vmx->fail accordingly */ - "setbe %c[fail](%0)\n\t" + "setbe %c[fail](%% " _ASM_CX")\n\t" ".pushsection .rodata\n\t" ".global vmx_early_consistency_check_return\n\t" diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d1c818a877df..e82319fbdceb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6440,9 +6440,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "push %%" _ASM_DX "; push %%" _ASM_BP ";" "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ "push %%" _ASM_CX " \n\t" - "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" + "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" "je 1f \n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" /* Avoid VMWRITE when Enlightened VMCS is in use */ "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" "jz 2f \n\t" @@ -6452,32 +6452,33 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" "1: \n\t" /* Reload cr2 if changed */ - "mov %c[cr2](%0), %%" _ASM_AX " \n\t" + "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t" "mov %%cr2, %%" _ASM_DX " \n\t" "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" "je 3f \n\t" "mov %%" _ASM_AX", %%cr2 \n\t" "3: \n\t" /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%0) \n\t" + "cmpl $0, %c[launched](%%" _ASM_CX ") \n\t" /* Load guest registers. Don't clobber flags. */ - "mov %c[rax](%0), %%" _ASM_AX " \n\t" - "mov %c[rbx](%0), %%" _ASM_BX " \n\t" - "mov %c[rdx](%0), %%" _ASM_DX " \n\t" - "mov %c[rsi](%0), %%" _ASM_SI " \n\t" - "mov %c[rdi](%0), %%" _ASM_DI " \n\t" - "mov %c[rbp](%0), %%" _ASM_BP " \n\t" + "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t" + "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t" + "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t" + "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t" + "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t" + "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t" #ifdef CONFIG_X86_64 - "mov %c[r8](%0), %%r8 \n\t" - "mov %c[r9](%0), %%r9 \n\t" - "mov %c[r10](%0), %%r10 \n\t" - "mov %c[r11](%0), %%r11 \n\t" - "mov %c[r12](%0), %%r12 \n\t" - "mov %c[r13](%0), %%r13 \n\t" - "mov %c[r14](%0), %%r14 \n\t" - "mov %c[r15](%0), %%r15 \n\t" + "mov %c[r8](%%" _ASM_CX "), %%r8 \n\t" + "mov %c[r9](%%" _ASM_CX "), %%r9 \n\t" + "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t" + "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t" + "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t" + "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t" + "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t" + "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t" #endif - "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ + /* Load guest RCX. This kills the vmx_vcpu pointer! */ + "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t" /* Enter guest mode */ "jne 1f \n\t" @@ -6485,26 +6486,33 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "jmp 2f \n\t" "1: " __ex("vmresume") "\n\t" "2: " - /* Save guest registers, load host registers, keep flags */ - "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" - "pop %0 \n\t" - "setbe %c[fail](%0)\n\t" - "mov %%" _ASM_AX ", %c[rax](%0) \n\t" - "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" - __ASM_SIZE(pop) " %c[rcx](%0) \n\t" - "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" - "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" - "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" - "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" + + /* Save guest's RCX to the stack placeholder (see above) */ + "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t" + + /* Load host's RCX, i.e. the vmx_vcpu pointer */ + "pop %%" _ASM_CX " \n\t" + + /* Set vmx->fail based on EFLAGS.{CF,ZF} */ + "setbe %c[fail](%%" _ASM_CX ")\n\t" + + /* Save all guest registers, including RCX from the stack */ + "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t" + "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t" + __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t" + "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t" + "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t" + "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t" + "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t" #ifdef CONFIG_X86_64 - "mov %%r8, %c[r8](%0) \n\t" - "mov %%r9, %c[r9](%0) \n\t" - "mov %%r10, %c[r10](%0) \n\t" - "mov %%r11, %c[r11](%0) \n\t" - "mov %%r12, %c[r12](%0) \n\t" - "mov %%r13, %c[r13](%0) \n\t" - "mov %%r14, %c[r14](%0) \n\t" - "mov %%r15, %c[r15](%0) \n\t" + "mov %%r8, %c[r8](%%" _ASM_CX ") \n\t" + "mov %%r9, %c[r9](%%" _ASM_CX ") \n\t" + "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t" + "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t" + "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t" + "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t" + "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t" + "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t" /* * Clear host registers marked as clobbered to prevent * speculative use. @@ -6519,7 +6527,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "xor %%r15d, %%r15d \n\t" #endif "mov %%cr2, %%" _ASM_AX " \n\t" - "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" + "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t" "xor %%eax, %%eax \n\t" "xor %%ebx, %%ebx \n\t" -- cgit v1.2.3 From 453eafbe65f72c04fc7c74c5c95c04e78e907dfb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 20 Dec 2018 12:25:17 -0800 Subject: KVM: VMX: Move VM-Enter + VM-Exit handling to non-inline sub-routines Transitioning to/from a VMX guest requires KVM to manually save/load the bulk of CPU state that the guest is allowed to direclty access, e.g. XSAVE state, CR2, GPRs, etc... For obvious reasons, loading the guest's GPR snapshot prior to VM-Enter and saving the snapshot after VM-Exit is done via handcoded assembly. The assembly blob is written as inline asm so that it can easily access KVM-defined structs that are used to hold guest state, e.g. moving the blob to a standalone assembly file would require generating defines for struct offsets. The other relevant aspect of VMX transitions in KVM is the handling of VM-Exits. KVM doesn't employ a separate VM-Exit handler per se, but rather treats the VMX transition as a mega instruction (with many side effects), i.e. sets the VMCS.HOST_RIP to a label immediately following VMLAUNCH/VMRESUME. The label is then exposed to C code via a global variable definition in the inline assembly. Because of the global variable, KVM takes steps to (attempt to) ensure only a single instance of the owning C function, e.g. vmx_vcpu_run, is generated by the compiler. The earliest approach placed the inline assembly in a separate noinline function[1]. Later, the assembly was folded back into vmx_vcpu_run() and tagged with __noclone[2][3], which is still used today. After moving to __noclone, an edge case was encountered where GCC's -ftracer optimization resulted in the inline assembly blob being duplicated. This was "fixed" by explicitly disabling -ftracer in the __noclone definition[4]. Recently, it was found that disabling -ftracer causes build warnings for unsuspecting users of __noclone[5], and more importantly for KVM, prevents the compiler for properly optimizing vmx_vcpu_run()[6]. And perhaps most importantly of all, it was pointed out that there is no way to prevent duplication of a function with 100% reliability[7], i.e. more edge cases may be encountered in the future. So to summarize, the only way to prevent the compiler from duplicating the global variable definition is to move the variable out of inline assembly, which has been suggested several times over[1][7][8]. Resolve the aforementioned issues by moving the VMLAUNCH+VRESUME and VM-Exit "handler" to standalone assembly sub-routines. Moving only the core VMX transition codes allows the struct indexing to remain as inline assembly and also allows the sub-routines to be used by nested_vmx_check_vmentry_hw(). Reusing the sub-routines has a happy side-effect of eliminating two VMWRITEs in the nested_early_check path as there is no longer a need to dynamically change VMCS.HOST_RIP. Note that callers to vmx_vmenter() must account for the CALL modifying RSP, e.g. must subtract op-size from RSP when synchronizing RSP with VMCS.HOST_RSP and "restore" RSP prior to the CALL. There are no great alternatives to fudging RSP. Saving RSP in vmx_enter() is difficult because doing so requires a second register (VMWRITE does not provide an immediate encoding for the VMCS field and KVM supports Hyper-V's memory-based eVMCS ABI). The other more drastic alternative would be to use eschew VMCS.HOST_RSP and manually save/load RSP using a per-cpu variable (which can be encoded as e.g. gs:[imm]). But because a valid stack is needed at the time of VM-Exit (NMIs aren't blocked and a user could theoretically insert INT3/INT1ICEBRK at the VM-Exit handler), a dedicated per-cpu VM-Exit stack would be required. A dedicated stack isn't difficult to implement, but it would require at least one page per CPU and knowledge of the stack in the dumpstack routines. And in most cases there is essentially zero overhead in dynamically updating VMCS.HOST_RSP, e.g. the VMWRITE can be avoided for all but the first VMLAUNCH unless nested_early_check=1, which is not a fast path. In other words, avoiding the VMCS.HOST_RSP by using a dedicated stack would only make the code marginally less ugly while requiring at least one page per CPU and forcing the kernel to be aware (and approve) of the VM-Exit stack shenanigans. [1] cea15c24ca39 ("KVM: Move KVM context switch into own function") [2] a3b5ba49a8c5 ("KVM: VMX: add the __noclone attribute to vmx_vcpu_run") [3] 104f226bfd0a ("KVM: VMX: Fold __vmx_vcpu_run() into vmx_vcpu_run()") [4] 95272c29378e ("compiler-gcc: disable -ftracer for __noclone functions") [5] https://lkml.kernel.org/r/20181218140105.ajuiglkpvstt3qxs@treble [6] https://patchwork.kernel.org/patch/8707981/#21817015 [7] https://lkml.kernel.org/r/ri6y38lo23g.fsf@suse.cz [8] https://lkml.kernel.org/r/20181218212042.GE25620@tassilo.jf.intel.com Suggested-by: Andi Kleen Suggested-by: Martin Jambor Cc: Paolo Bonzini Cc: Nadav Amit Cc: Andi Kleen Cc: Josh Poimboeuf Cc: Martin Jambor Cc: Arnd Bergmann Cc: Steven Rostedt Cc: Miroslav Benes Signed-off-by: Sean Christopherson Reviewed-by: Andi Kleen Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/vmx/nested.c | 30 ++++++++---------------- arch/x86/kvm/vmx/vmenter.S | 57 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 22 ++++++++---------- arch/x86/kvm/vmx/vmx.h | 1 - 5 files changed, 78 insertions(+), 34 deletions(-) create mode 100644 arch/x86/kvm/vmx/vmenter.S (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 83dc7d6a0294..69b3a7c30013 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -16,7 +16,7 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-intel-y += vmx/vmx.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o +kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o kvm-amd-y += svm.o pmu_amd.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index da16f022a529..abbad93ed26a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -19,8 +19,6 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); static bool __read_mostly nested_early_check = 0; module_param(nested_early_check, bool, S_IRUGO); -extern const ulong vmx_early_consistency_check_return; - /* * Hyper-V requires all of these, so mark them as supported even though * they are just treated the same as all-context. @@ -2715,7 +2713,7 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, return 0; } -static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) +static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; @@ -2740,8 +2738,6 @@ static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) */ vmcs_writel(GUEST_RFLAGS, 0); - vmcs_writel(HOST_RIP, vmx_early_consistency_check_return); - cr3 = __get_current_cr3_fast(); if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { vmcs_writel(HOST_CR3, cr3); @@ -2758,33 +2754,27 @@ static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) asm( /* Set HOST_RSP */ + "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%% " _ASM_CX")\n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t" + "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ "cmpl $0, %c[launched](%% " _ASM_CX")\n\t" - "jne 1f\n\t" - __ex("vmlaunch") "\n\t" - "jmp 2f\n\t" - "1: " __ex("vmresume") "\n\t" - "2: " + + "call vmx_vmenter\n\t" + /* Set vmx->fail accordingly */ "setbe %c[fail](%% " _ASM_CX")\n\t" - - ".pushsection .rodata\n\t" - ".global vmx_early_consistency_check_return\n\t" - "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t" - ".popsection" - : + : ASM_CALL_CONSTRAINT : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)) + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + [wordsize]"i"(sizeof(ulong)) : "rax", "cc", "memory" ); - vmcs_writel(HOST_RIP, vmx_return); - preempt_enable(); if (vmx->msr_autoload.host.nr) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S new file mode 100644 index 000000000000..bcef2c7e9bc4 --- /dev/null +++ b/arch/x86/kvm/vmx/vmenter.S @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include + + .text + +/** + * vmx_vmenter - VM-Enter the current loaded VMCS + * + * %RFLAGS.ZF: !VMCS.LAUNCHED, i.e. controls VMLAUNCH vs. VMRESUME + * + * Returns: + * %RFLAGS.CF is set on VM-Fail Invalid + * %RFLAGS.ZF is set on VM-Fail Valid + * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit + * + * Note that VMRESUME/VMLAUNCH fall-through and return directly if + * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump + * to vmx_vmexit. + */ +ENTRY(vmx_vmenter) + /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */ + je 2f + +1: vmresume + ret + +2: vmlaunch + ret + +3: cmpb $0, kvm_rebooting + jne 4f + call kvm_spurious_fault +4: ret + + .pushsection .fixup, "ax" +5: jmp 3b + .popsection + + _ASM_EXTABLE(1b, 5b) + _ASM_EXTABLE(2b, 5b) + +ENDPROC(vmx_vmenter) + +/** + * vmx_vmexit - Handle a VMX VM-Exit + * + * Returns: + * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit + * + * This is vmx_vmenter's partner in crime. On a VM-Exit, control will jump + * here after hardware loads the host's state, i.e. this is the destination + * referred to by VMCS.HOST_RIP. + */ +ENTRY(vmx_vmexit) + ret +ENDPROC(vmx_vmexit) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e82319fbdceb..41d6f7081ff7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -336,6 +336,8 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type); +void vmx_vmexit(void); + static DEFINE_PER_CPU(struct vmcs *, vmxarea); DEFINE_PER_CPU(struct vmcs *, current_vmcs); /* @@ -3773,7 +3775,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ vmx->host_idt_base = dt.address; - vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ + vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); vmcs_write32(HOST_IA32_SYSENTER_CS, low32); @@ -6360,7 +6362,7 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->hv_timer_armed = false; } -static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) +static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4, evmcs_rsp; @@ -6440,6 +6442,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "push %%" _ASM_DX "; push %%" _ASM_BP ";" "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ "push %%" _ASM_CX " \n\t" + "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" "je 1f \n\t" "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" @@ -6451,6 +6454,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "2: \n\t" __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" "1: \n\t" + "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ + /* Reload cr2 if changed */ "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t" "mov %%cr2, %%" _ASM_DX " \n\t" @@ -6481,11 +6486,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t" /* Enter guest mode */ - "jne 1f \n\t" - __ex("vmlaunch") "\n\t" - "jmp 2f \n\t" - "1: " __ex("vmresume") "\n\t" - "2: " + "call vmx_vmenter\n\t" /* Save guest's RCX to the stack placeholder (see above) */ "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t" @@ -6534,11 +6535,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "xor %%esi, %%esi \n\t" "xor %%edi, %%edi \n\t" "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" - ".pushsection .rodata \n\t" - ".global vmx_return \n\t" - "vmx_return: " _ASM_PTR " 2b \n\t" - ".popsection" - : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), + : ASM_CALL_CONSTRAINT + : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 20172c11d5f8..99328954c2fc 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -12,7 +12,6 @@ #include "vmcs.h" extern const u32 vmx_msr_index[]; -extern const ulong vmx_return; extern u64 host_efer; #define MSR_TYPE_R 1 -- cgit v1.2.3 From a0aea130afebcd091d5396d13f25b9da24c9144a Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Wed, 19 Dec 2018 21:51:43 +0800 Subject: KVM: x86: Add CPUID support for new instruction WBNOINVD Signed-off-by: Robert Hoo Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kvm/cpuid.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 28c4a502b419..39a48f06d39d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -281,6 +281,7 @@ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ +#define X86_FEATURE_WBNOINVD (13*32+ 9) /* WBNOINVD instruction */ #define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ #define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ #define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c731e87baef5..bbffa6c54697 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -378,7 +378,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | + F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | F(AMD_SSB_NO) | F(AMD_STIBP); /* cpuid 0xC0000001.edx */ -- cgit v1.2.3