aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorLinus Torvalds2018-06-12 11:34:04 -0700
committerLinus Torvalds2018-06-12 11:34:04 -0700
commitb357bf6023a948cf6a9472f07a1b0caac0e4f8e8 (patch)
tree1471a2691cd56e8640cf6ad51e255b54903a164b /arch/x86
parent0725d4e1b8b08a60838db3a6e65c23ea8824a048 (diff)
parent766d3571d8e50d3a73b77043dc632226f9e6b389 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini: "Small update for KVM: ARM: - lazy context-switching of FPSIMD registers on arm64 - "split" regions for vGIC redistributor s390: - cleanups for nested - clock handling - crypto - storage keys - control register bits x86: - many bugfixes - implement more Hyper-V super powers - implement lapic_timer_advance_ns even when the LAPIC timer is emulated using the processor's VMX preemption timer. - two security-related bugfixes at the top of the branch" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (79 commits) kvm: fix typo in flag name kvm: x86: use correct privilege level for sgdt/sidt/fxsave/fxrstor access KVM: x86: pass kvm_vcpu to kvm_read_guest_virt and kvm_write_guest_virt_system KVM: x86: introduce linear_{read,write}_system kvm: nVMX: Enforce cpl=0 for VMX instructions kvm: nVMX: Add support for "VMWRITE to any supported field" kvm: nVMX: Restrict VMX capability MSR changes KVM: VMX: Optimize tscdeadline timer latency KVM: docs: nVMX: Remove known limitations as they do not exist now KVM: docs: mmu: KVM support exposing SLAT to guests kvm: no need to check return value of debugfs_create functions kvm: Make VM ioctl do valloc for some archs kvm: Change return type to vm_fault_t KVM: docs: mmu: Fix link to NPT presentation from KVM Forum 2008 kvm: x86: Amend the KVM_GET_SUPPORTED_CPUID API documentation KVM: x86: hyperv: declare KVM_CAP_HYPERV_TLBFLUSH capability KVM: x86: hyperv: simplistic HVCALL_FLUSH_VIRTUAL_ADDRESS_{LIST,SPACE}_EX implementation KVM: x86: hyperv: simplistic HVCALL_FLUSH_VIRTUAL_ADDRESS_{LIST,SPACE} implementation KVM: introduce kvm_make_vcpus_request_mask() API KVM: x86: hyperv: do rep check for each hypercall separately ...
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/hyperv/mmu.c28
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h25
-rw-r--r--arch/x86/include/asm/kvm_emulate.h6
-rw-r--r--arch/x86/include/asm/kvm_host.h7
-rw-r--r--arch/x86/include/asm/mshyperv.h2
-rw-r--r--arch/x86/include/asm/vmx.h2
-rw-r--r--arch/x86/kvm/cpuid.c3
-rw-r--r--arch/x86/kvm/emulate.c76
-rw-r--r--arch/x86/kvm/hyperv.c171
-rw-r--r--arch/x86/kvm/lapic.c12
-rw-r--r--arch/x86/kvm/lapic.h14
-rw-r--r--arch/x86/kvm/mmu.c78
-rw-r--r--arch/x86/kvm/svm.c16
-rw-r--r--arch/x86/kvm/trace.h51
-rw-r--r--arch/x86/kvm/vmx.c489
-rw-r--r--arch/x86/kvm/x86.c97
-rw-r--r--arch/x86/kvm/x86.h4
17 files changed, 813 insertions, 268 deletions
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 5f053d7d1bd9..de27615c51ea 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -13,22 +13,6 @@
#define CREATE_TRACE_POINTS
#include <asm/trace/hyperv.h>
-/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */
-struct hv_flush_pcpu {
- u64 address_space;
- u64 flags;
- u64 processor_mask;
- u64 gva_list[];
-};
-
-/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */
-struct hv_flush_pcpu_ex {
- u64 address_space;
- u64 flags;
- struct hv_vpset hv_vp_set;
- u64 gva_list[];
-};
-
/* Each gva in gva_list encodes up to 4096 pages to flush */
#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
@@ -67,8 +51,8 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
int cpu, vcpu, gva_n, max_gvas;
- struct hv_flush_pcpu **flush_pcpu;
- struct hv_flush_pcpu *flush;
+ struct hv_tlb_flush **flush_pcpu;
+ struct hv_tlb_flush *flush;
u64 status = U64_MAX;
unsigned long flags;
@@ -82,7 +66,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
local_irq_save(flags);
- flush_pcpu = (struct hv_flush_pcpu **)
+ flush_pcpu = (struct hv_tlb_flush **)
this_cpu_ptr(hyperv_pcpu_input_arg);
flush = *flush_pcpu;
@@ -152,8 +136,8 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
int nr_bank = 0, max_gvas, gva_n;
- struct hv_flush_pcpu_ex **flush_pcpu;
- struct hv_flush_pcpu_ex *flush;
+ struct hv_tlb_flush_ex **flush_pcpu;
+ struct hv_tlb_flush_ex *flush;
u64 status = U64_MAX;
unsigned long flags;
@@ -167,7 +151,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
local_irq_save(flags);
- flush_pcpu = (struct hv_flush_pcpu_ex **)
+ flush_pcpu = (struct hv_tlb_flush_ex **)
this_cpu_ptr(hyperv_pcpu_input_arg);
flush = *flush_pcpu;
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 3bfa92c2793c..b8c89265baf0 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -308,6 +308,9 @@ struct ms_hyperv_tsc_page {
/* TSC emulation after migration */
#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
+/* Nested features (CPUID 0x4000000A) EAX */
+#define HV_X64_NESTED_MSR_BITMAP BIT(19)
+
struct hv_reenlightenment_control {
__u64 vector:8;
__u64 reserved1:8;
@@ -678,7 +681,11 @@ struct hv_enlightened_vmcs {
u32 hv_clean_fields;
u32 hv_padding_32;
u32 hv_synthetic_controls;
- u32 hv_enlightenments_control;
+ struct {
+ u32 nested_flush_hypercall:1;
+ u32 msr_bitmap:1;
+ u32 reserved:30;
+ } hv_enlightenments_control;
u32 hv_vp_id;
u64 hv_vm_id;
@@ -734,4 +741,20 @@ struct ipi_arg_ex {
struct hv_vpset vp_set;
};
+/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */
+struct hv_tlb_flush {
+ u64 address_space;
+ u64 flags;
+ u64 processor_mask;
+ u64 gva_list[];
+};
+
+/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */
+struct hv_tlb_flush_ex {
+ u64 address_space;
+ u64 flags;
+ struct hv_vpset hv_vp_set;
+ u64 gva_list[];
+};
+
#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b24b1c8b3979..0f82cd91cd3c 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -107,11 +107,12 @@ struct x86_emulate_ops {
* @addr: [IN ] Linear address from which to read.
* @val: [OUT] Value read from memory, zero-extended to 'u_long'.
* @bytes: [IN ] Number of bytes to read from memory.
+ * @system:[IN ] Whether the access is forced to be at CPL0.
*/
int (*read_std)(struct x86_emulate_ctxt *ctxt,
unsigned long addr, void *val,
unsigned int bytes,
- struct x86_exception *fault);
+ struct x86_exception *fault, bool system);
/*
* read_phys: Read bytes of standard (non-emulated/special) memory.
@@ -129,10 +130,11 @@ struct x86_emulate_ops {
* @addr: [IN ] Linear address to which to write.
* @val: [OUT] Value write to memory, zero-extended to 'u_long'.
* @bytes: [IN ] Number of bytes to write to memory.
+ * @system:[IN ] Whether the access is forced to be at CPL0.
*/
int (*write_std)(struct x86_emulate_ctxt *ctxt,
unsigned long addr, void *val, unsigned int bytes,
- struct x86_exception *fault);
+ struct x86_exception *fault, bool system);
/*
* fetch: Read bytes of standard (non-emulated/special) memory.
* Used for instruction fetch.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f4b2588865e9..c13cd28d9d1b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -258,7 +258,8 @@ union kvm_mmu_page_role {
unsigned smep_andnot_wp:1;
unsigned smap_andnot_wp:1;
unsigned ad_disabled:1;
- unsigned :7;
+ unsigned guest_mode:1;
+ unsigned :6;
/*
* This is left at the top of the word so that
@@ -476,6 +477,7 @@ struct kvm_vcpu_hv {
struct kvm_hyperv_exit exit;
struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
+ cpumask_t tlb_lush;
};
struct kvm_vcpu_arch {
@@ -995,7 +997,7 @@ struct kvm_x86_ops {
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
- void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
+ void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
@@ -1277,6 +1279,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu);
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 997192131b7b..3cd14311edfa 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -269,7 +269,7 @@ static inline int cpumask_to_vpset(struct hv_vpset *vpset,
return 0;
/*
- * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
+ * Clear all banks up to the maximum possible bank as hv_tlb_flush_ex
* structs are not cleared between calls, we risk flushing unneeded
* vCPUs otherwise.
*/
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 5db8b0b10766..425e6b8b9547 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -207,7 +207,9 @@ enum vmcs_field {
EPTP_LIST_ADDRESS = 0x00002024,
EPTP_LIST_ADDRESS_HIGH = 0x00002025,
VMREAD_BITMAP = 0x00002026,
+ VMREAD_BITMAP_HIGH = 0x00002027,
VMWRITE_BITMAP = 0x00002028,
+ VMWRITE_BITMAP_HIGH = 0x00002029,
XSS_EXIT_BITMAP = 0x0000202C,
XSS_EXIT_BITMAP_HIGH = 0x0000202D,
TSC_MULTIPLIER = 0x00002032,
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index f4f30d0c25c4..5720e78b2f7b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -404,7 +404,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
const u32 kvm_cpuid_7_0_ecx_x86_features =
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
- F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG);
+ F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
+ F(CLDEMOTE);
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b3705ae52824..4c4f4263420c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -812,6 +812,19 @@ static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
return assign_eip_near(ctxt, ctxt->_eip + rel);
}
+static int linear_read_system(struct x86_emulate_ctxt *ctxt, ulong linear,
+ void *data, unsigned size)
+{
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, true);
+}
+
+static int linear_write_system(struct x86_emulate_ctxt *ctxt,
+ ulong linear, void *data,
+ unsigned int size)
+{
+ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, true);
+}
+
static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
struct segmented_address addr,
void *data,
@@ -823,7 +836,7 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
rc = linearize(ctxt, addr, size, false, &linear);
if (rc != X86EMUL_CONTINUE)
return rc;
- return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, false);
}
static int segmented_write_std(struct x86_emulate_ctxt *ctxt,
@@ -837,7 +850,7 @@ static int segmented_write_std(struct x86_emulate_ctxt *ctxt,
rc = linearize(ctxt, addr, size, true, &linear);
if (rc != X86EMUL_CONTINUE)
return rc;
- return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception);
+ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, false);
}
/*
@@ -1496,8 +1509,7 @@ static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
return emulate_gp(ctxt, index << 3 | 0x2);
addr = dt.address + index * 8;
- return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
- &ctxt->exception);
+ return linear_read_system(ctxt, addr, desc, sizeof *desc);
}
static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
@@ -1560,8 +1572,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (rc != X86EMUL_CONTINUE)
return rc;
- return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc),
- &ctxt->exception);
+ return linear_read_system(ctxt, *desc_addr_p, desc, sizeof(*desc));
}
/* allowed just for 8 bytes segments */
@@ -1575,8 +1586,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (rc != X86EMUL_CONTINUE)
return rc;
- return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
- &ctxt->exception);
+ return linear_write_system(ctxt, addr, desc, sizeof *desc);
}
static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
@@ -1737,8 +1747,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
return ret;
}
} else if (ctxt->mode == X86EMUL_MODE_PROT64) {
- ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3,
- sizeof(base3), &ctxt->exception);
+ ret = linear_read_system(ctxt, desc_addr+8, &base3, sizeof(base3));
if (ret != X86EMUL_CONTINUE)
return ret;
if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
@@ -2051,11 +2060,11 @@ static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
eip_addr = dt.address + (irq << 2);
cs_addr = dt.address + (irq << 2) + 2;
- rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception);
+ rc = linear_read_system(ctxt, cs_addr, &cs, 2);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception);
+ rc = linear_read_system(ctxt, eip_addr, &eip, 2);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -2919,12 +2928,12 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
#ifdef CONFIG_X86_64
base |= ((u64)base3) << 32;
#endif
- r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL);
+ r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL, true);
if (r != X86EMUL_CONTINUE)
return false;
if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
return false;
- r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL);
+ r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL, true);
if (r != X86EMUL_CONTINUE)
return false;
if ((perm >> bit_idx) & mask)
@@ -3053,35 +3062,30 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
u16 tss_selector, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
- const struct x86_emulate_ops *ops = ctxt->ops;
struct tss_segment_16 tss_seg;
int ret;
u32 new_tss_base = get_desc_base(new_desc);
- ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
- &ctxt->exception);
+ ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg);
if (ret != X86EMUL_CONTINUE)
return ret;
save_state_to_tss16(ctxt, &tss_seg);
- ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
- &ctxt->exception);
+ ret = linear_write_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
- &ctxt->exception);
+ ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof tss_seg);
if (ret != X86EMUL_CONTINUE)
return ret;
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
- ret = ops->write_std(ctxt, new_tss_base,
- &tss_seg.prev_task_link,
- sizeof tss_seg.prev_task_link,
- &ctxt->exception);
+ ret = linear_write_system(ctxt, new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof tss_seg.prev_task_link);
if (ret != X86EMUL_CONTINUE)
return ret;
}
@@ -3197,38 +3201,34 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
u16 tss_selector, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
- const struct x86_emulate_ops *ops = ctxt->ops;
struct tss_segment_32 tss_seg;
int ret;
u32 new_tss_base = get_desc_base(new_desc);
u32 eip_offset = offsetof(struct tss_segment_32, eip);
u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector);
- ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
- &ctxt->exception);
+ ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg);
if (ret != X86EMUL_CONTINUE)
return ret;
save_state_to_tss32(ctxt, &tss_seg);
/* Only GP registers and segment selectors are saved */
- ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
- ldt_sel_offset - eip_offset, &ctxt->exception);
+ ret = linear_write_system(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
+ ldt_sel_offset - eip_offset);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
- &ctxt->exception);
+ ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof tss_seg);
if (ret != X86EMUL_CONTINUE)
return ret;
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
- ret = ops->write_std(ctxt, new_tss_base,
- &tss_seg.prev_task_link,
- sizeof tss_seg.prev_task_link,
- &ctxt->exception);
+ ret = linear_write_system(ctxt, new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof tss_seg.prev_task_link);
if (ret != X86EMUL_CONTINUE)
return ret;
}
@@ -4189,7 +4189,9 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
maxphyaddr = eax & 0xff;
else
maxphyaddr = 36;
- rsvd = rsvd_bits(maxphyaddr, 62);
+ rsvd = rsvd_bits(maxphyaddr, 63);
+ if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE)
+ rsvd &= ~CR3_PCID_INVD;
}
if (new_val & rsvd)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 46ff64da44ca..af8caf965baa 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1242,6 +1242,121 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return kvm_hv_get_msr(vcpu, msr, pdata);
}
+static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)
+{
+ int i = 0, j;
+
+ if (!(valid_bank_mask & BIT_ULL(bank_no)))
+ return -1;
+
+ for (j = 0; j < bank_no; j++)
+ if (valid_bank_mask & BIT_ULL(j))
+ i++;
+
+ return i;
+}
+
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
+ u16 rep_cnt, bool ex)
+{
+ struct kvm *kvm = current_vcpu->kvm;
+ struct kvm_vcpu_hv *hv_current = &current_vcpu->arch.hyperv;
+ struct hv_tlb_flush_ex flush_ex;
+ struct hv_tlb_flush flush;
+ struct kvm_vcpu *vcpu;
+ unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0};
+ unsigned long valid_bank_mask = 0;
+ u64 sparse_banks[64];
+ int sparse_banks_len, i;
+ bool all_cpus;
+
+ if (!ex) {
+ if (unlikely(kvm_read_guest(kvm, ingpa, &flush, sizeof(flush))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ trace_kvm_hv_flush_tlb(flush.processor_mask,
+ flush.address_space, flush.flags);
+
+ sparse_banks[0] = flush.processor_mask;
+ all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
+ } else {
+ if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex,
+ sizeof(flush_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
+ flush_ex.hv_vp_set.format,
+ flush_ex.address_space,
+ flush_ex.flags);
+
+ valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask;
+ all_cpus = flush_ex.hv_vp_set.format !=
+ HV_GENERIC_SET_SPARSE_4K;
+
+ sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
+ sizeof(sparse_banks[0]);
+
+ if (!sparse_banks_len && !all_cpus)
+ goto ret_success;
+
+ if (!all_cpus &&
+ kvm_read_guest(kvm,
+ ingpa + offsetof(struct hv_tlb_flush_ex,
+ hv_vp_set.bank_contents),
+ sparse_banks,
+ sparse_banks_len))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+ cpumask_clear(&hv_current->tlb_lush);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
+ int bank = hv->vp_index / 64, sbank = 0;
+
+ if (!all_cpus) {
+ /* Banks >64 can't be represented */
+ if (bank >= 64)
+ continue;
+
+ /* Non-ex hypercalls can only address first 64 vCPUs */
+ if (!ex && bank)
+ continue;
+
+ if (ex) {
+ /*
+ * Check is the bank of this vCPU is in sparse
+ * set and get the sparse bank number.
+ */
+ sbank = get_sparse_bank_no(valid_bank_mask,
+ bank);
+
+ if (sbank < 0)
+ continue;
+ }
+
+ if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64)))
+ continue;
+ }
+
+ /*
+ * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we
+ * can't analyze it here, flush TLB regardless of the specified
+ * address space.
+ */
+ __set_bit(i, vcpu_bitmap);
+ }
+
+ kvm_make_vcpus_request_mask(kvm,
+ KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
+ vcpu_bitmap, &hv_current->tlb_lush);
+
+ret_success:
+ /* We always do full TLB flush, set rep_done = rep_cnt. */
+ return (u64)HV_STATUS_SUCCESS |
+ ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
+}
+
bool kvm_hv_hypercall_enabled(struct kvm *kvm)
{
return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE;
@@ -1315,7 +1430,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
{
u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS;
uint16_t code, rep_idx, rep_cnt;
- bool fast, longmode;
+ bool fast, longmode, rep;
/*
* hypercall generates UD from non zero cpl and real mode
@@ -1345,31 +1460,34 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
#endif
code = param & 0xffff;
- fast = (param >> 16) & 0x1;
- rep_cnt = (param >> 32) & 0xfff;
- rep_idx = (param >> 48) & 0xfff;
+ fast = !!(param & HV_HYPERCALL_FAST_BIT);
+ rep_cnt = (param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff;
+ rep_idx = (param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
+ rep = !!(rep_cnt || rep_idx);
trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
- /* Hypercall continuation is not supported yet */
- if (rep_cnt || rep_idx) {
- ret = HV_STATUS_INVALID_HYPERCALL_CODE;
- goto out;
- }
-
switch (code) {
case HVCALL_NOTIFY_LONG_SPIN_WAIT:
+ if (unlikely(rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
kvm_vcpu_on_spin(vcpu, true);
break;
case HVCALL_SIGNAL_EVENT:
+ if (unlikely(rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
ret = kvm_hvcall_signal_event(vcpu, fast, ingpa);
if (ret != HV_STATUS_INVALID_PORT_ID)
break;
/* maybe userspace knows this conn_id: fall through */
case HVCALL_POST_MESSAGE:
/* don't bother userspace if it has no way to handle it */
- if (!vcpu_to_synic(vcpu)->active) {
- ret = HV_STATUS_INVALID_HYPERCALL_CODE;
+ if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
vcpu->run->exit_reason = KVM_EXIT_HYPERV;
@@ -1380,12 +1498,39 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
vcpu->arch.complete_userspace_io =
kvm_hv_hypercall_complete_userspace;
return 0;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ if (unlikely(fast || !rep_cnt || rep_idx)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false);
+ break;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ if (unlikely(fast || rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false);
+ break;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ if (unlikely(fast || !rep_cnt || rep_idx)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
+ break;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ if (unlikely(fast || rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
+ break;
default:
ret = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
}
-out:
return kvm_hv_hypercall_complete(vcpu, ret);
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3773c4625114..b5cd8465d44f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2002,13 +2002,11 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
}
}
- if ((old_value ^ value) & X2APIC_ENABLE) {
- if (value & X2APIC_ENABLE) {
- kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
- kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
- } else
- kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
- }
+ if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
+ kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+
+ if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
+ kvm_x86_ops->set_virtual_apic_mode(vcpu);
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index edce055e9fd7..ed0ed39abd36 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -16,6 +16,13 @@
#define APIC_BUS_CYCLE_NS 1
#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)
+enum lapic_mode {
+ LAPIC_MODE_DISABLED = 0,
+ LAPIC_MODE_INVALID = X2APIC_ENABLE,
+ LAPIC_MODE_XAPIC = MSR_IA32_APICBASE_ENABLE,
+ LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE,
+};
+
struct kvm_timer {
struct hrtimer timer;
s64 period; /* unit: ns */
@@ -89,6 +96,7 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
@@ -220,4 +228,10 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
+
+static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
+{
+ return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
+}
+
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d634f0332c0f..d594690d8b95 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -222,7 +222,6 @@ static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
static void mmu_spte_set(u64 *sptep, u64 spte);
-static void mmu_free_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
{
@@ -3343,51 +3342,48 @@ out_unlock:
return RET_PF_RETRY;
}
-
-static void mmu_free_roots(struct kvm_vcpu *vcpu)
+static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
+ struct list_head *invalid_list)
{
- int i;
struct kvm_mmu_page *sp;
- LIST_HEAD(invalid_list);
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ if (!VALID_PAGE(*root_hpa))
return;
- if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
- (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL ||
- vcpu->arch.mmu.direct_map)) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
+ sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
+ --sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
- spin_lock(&vcpu->kvm->mmu_lock);
- sp = page_header(root);
- --sp->root_count;
- if (!sp->root_count && sp->role.invalid) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- }
- spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ *root_hpa = INVALID_PAGE;
+}
+
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ LIST_HEAD(invalid_list);
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+ if (!VALID_PAGE(mmu->root_hpa))
return;
- }
spin_lock(&vcpu->kvm->mmu_lock);
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
- if (root) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- --sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
- &invalid_list);
- }
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
+ (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
+ mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list);
+ } else {
+ for (i = 0; i < 4; ++i)
+ if (mmu->pae_root[i] != 0)
+ mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i],
+ &invalid_list);
+ mmu->root_hpa = INVALID_PAGE;
}
+
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
{
@@ -3720,7 +3716,6 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
*/
return RET_PF_RETRY;
}
-EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
u32 error_code, gfn_t gfn)
@@ -3812,6 +3807,14 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
struct kvm_memory_slot *slot;
bool async;
+ /*
+ * Don't expose private memslots to L2.
+ */
+ if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+ *pfn = KVM_PFN_NOSLOT;
+ return false;
+ }
+
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
async = false;
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
@@ -3951,7 +3954,7 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
{
- mmu_free_roots(vcpu);
+ kvm_mmu_free_roots(vcpu);
}
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
@@ -4473,6 +4476,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
struct kvm_mmu *context = &vcpu->arch.mmu;
context->base_role.word = 0;
+ context->base_role.guest_mode = is_guest_mode(vcpu);
context->base_role.smm = is_smm(vcpu);
context->base_role.ad_disabled = (shadow_accessed_mask == 0);
context->page_fault = tdp_page_fault;
@@ -4539,6 +4543,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
= smep && !is_write_protection(vcpu);
context->base_role.smap_andnot_wp
= smap && !is_write_protection(vcpu);
+ context->base_role.guest_mode = is_guest_mode(vcpu);
context->base_role.smm = is_smm(vcpu);
reset_shadow_zero_bits_mask(vcpu, context);
}
@@ -4564,7 +4569,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->root_hpa = INVALID_PAGE;
context->direct_map = false;
context->base_role.ad_disabled = !accessed_dirty;
-
+ context->base_role.guest_mode = 1;
update_permission_bitmask(vcpu, context, true);
update_pkru_bitmask(vcpu, context, true);
update_last_nonleaf_level(vcpu, context);
@@ -4664,7 +4669,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
- mmu_free_roots(vcpu);
+ kvm_mmu_free_roots(vcpu);
WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);
@@ -4825,6 +4830,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mask.smep_andnot_wp = 1;
mask.smap_andnot_wp = 1;
mask.smm = 1;
+ mask.guest_mode = 1;
mask.ad_disabled = 1;
/*
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 950ec50f77c3..695b0bd02220 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1768,7 +1768,10 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
unsigned long npages, npinned, size;
unsigned long locked, lock_limit;
struct page **pages;
- int first, last;
+ unsigned long first, last;
+
+ if (ulen == 0 || uaddr + ulen < uaddr)
+ return NULL;
/* Calculate number of pages. */
first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
@@ -1855,13 +1858,13 @@ static void __unregister_enc_region_locked(struct kvm *kvm,
static struct kvm *svm_vm_alloc(void)
{
- struct kvm_svm *kvm_svm = kzalloc(sizeof(struct kvm_svm), GFP_KERNEL);
+ struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm));
return &kvm_svm->kvm;
}
static void svm_vm_free(struct kvm *kvm)
{
- kfree(to_kvm_svm(kvm));
+ vfree(to_kvm_svm(kvm));
}
static void sev_vm_destroy(struct kvm *kvm)
@@ -5062,7 +5065,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
}
-static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
return;
}
@@ -6949,6 +6952,9 @@ static int svm_register_enc_region(struct kvm *kvm,
if (!sev_guest(kvm))
return -ENOTTY;
+ if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
+ return -EINVAL;
+
region = kzalloc(sizeof(*region), GFP_KERNEL);
if (!region)
return -ENOMEM;
@@ -7100,7 +7106,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.enable_nmi_window = enable_nmi_window,
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
- .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
+ .set_virtual_apic_mode = svm_set_virtual_apic_mode,
.get_enable_apicv = svm_get_enable_apicv,
.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = svm_load_eoi_exitmap,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 9807c314c478..0f997683404f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1367,6 +1367,57 @@ TRACE_EVENT(kvm_hv_timer_state,
__entry->vcpu_id,
__entry->hv_timer_in_use)
);
+
+/*
+ * Tracepoint for kvm_hv_flush_tlb.
+ */
+TRACE_EVENT(kvm_hv_flush_tlb,
+ TP_PROTO(u64 processor_mask, u64 address_space, u64 flags),
+ TP_ARGS(processor_mask, address_space, flags),
+
+ TP_STRUCT__entry(
+ __field(u64, processor_mask)
+ __field(u64, address_space)
+ __field(u64, flags)
+ ),
+
+ TP_fast_assign(
+ __entry->processor_mask = processor_mask;
+ __entry->address_space = address_space;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx",
+ __entry->processor_mask, __entry->address_space,
+ __entry->flags)
+);
+
+/*
+ * Tracepoint for kvm_hv_flush_tlb_ex.
+ */
+TRACE_EVENT(kvm_hv_flush_tlb_ex,
+ TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags),
+ TP_ARGS(valid_bank_mask, format, address_space, flags),
+
+ TP_STRUCT__entry(
+ __field(u64, valid_bank_mask)
+ __field(u64, format)
+ __field(u64, address_space)
+ __field(u64, flags)
+ ),
+
+ TP_fast_assign(
+ __entry->valid_bank_mask = valid_bank_mask;
+ __entry->format = format;
+ __entry->address_space = address_space;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("valid_bank_mask 0x%llx format 0x%llx "
+ "address_space 0x%llx flags 0x%llx",
+ __entry->valid_bank_mask, __entry->format,
+ __entry->address_space, __entry->flags)
+);
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 40aa29204baf..fc61e25966e4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -242,7 +242,11 @@ struct shared_msr_entry {
* underlying hardware which will be used to run L2.
* This structure is packed to ensure that its layout is identical across
* machines (necessary for live migration).
- * If there are changes in this struct, VMCS12_REVISION must be changed.
+ *
+ * IMPORTANT: Changing the layout of existing fields in this structure
+ * will break save/restore compatibility with older kvm releases. When
+ * adding new fields, either use space in the reserved padding* arrays
+ * or add the new fields to the end of the structure.
*/
typedef u64 natural_width;
struct __packed vmcs12 {
@@ -265,17 +269,14 @@ struct __packed vmcs12 {
u64 virtual_apic_page_addr;
u64 apic_access_addr;
u64 posted_intr_desc_addr;
- u64 vm_function_control;
u64 ept_pointer;
u64 eoi_exit_bitmap0;
u64 eoi_exit_bitmap1;
u64 eoi_exit_bitmap2;
u64 eoi_exit_bitmap3;
- u64 eptp_list_address;
u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
- u64 pml_address;
u64 guest_ia32_debugctl;
u64 guest_ia32_pat;
u64 guest_ia32_efer;
@@ -288,7 +289,12 @@ struct __packed vmcs12 {
u64 host_ia32_pat;
u64 host_ia32_efer;
u64 host_ia32_perf_global_ctrl;
- u64 padding64[8]; /* room for future expansion */
+ u64 vmread_bitmap;
+ u64 vmwrite_bitmap;
+ u64 vm_function_control;
+ u64 eptp_list_address;
+ u64 pml_address;
+ u64 padding64[3]; /* room for future expansion */
/*
* To allow migration of L1 (complete with its L2 guests) between
* machines of different natural widths (32 or 64 bit), we cannot have
@@ -397,7 +403,6 @@ struct __packed vmcs12 {
u16 guest_ldtr_selector;
u16 guest_tr_selector;
u16 guest_intr_status;
- u16 guest_pml_index;
u16 host_es_selector;
u16 host_cs_selector;
u16 host_ss_selector;
@@ -405,12 +410,172 @@ struct __packed vmcs12 {
u16 host_fs_selector;
u16 host_gs_selector;
u16 host_tr_selector;
+ u16 guest_pml_index;
};
/*
+ * For save/restore compatibility, the vmcs12 field offsets must not change.
+ */
+#define CHECK_OFFSET(field, loc) \
+ BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \
+ "Offset of " #field " in struct vmcs12 has changed.")
+
+static inline void vmx_check_vmcs12_offsets(void) {
+ CHECK_OFFSET(revision_id, 0);
+ CHECK_OFFSET(abort, 4);
+ CHECK_OFFSET(launch_state, 8);
+ CHECK_OFFSET(io_bitmap_a, 40);
+ CHECK_OFFSET(io_bitmap_b, 48);
+ CHECK_OFFSET(msr_bitmap, 56);
+ CHECK_OFFSET(vm_exit_msr_store_addr, 64);
+ CHECK_OFFSET(vm_exit_msr_load_addr, 72);
+ CHECK_OFFSET(vm_entry_msr_load_addr, 80);
+ CHECK_OFFSET(tsc_offset, 88);
+ CHECK_OFFSET(virtual_apic_page_addr, 96);
+ CHECK_OFFSET(apic_access_addr, 104);
+ CHECK_OFFSET(posted_intr_desc_addr, 112);
+ CHECK_OFFSET(ept_pointer, 120);
+ CHECK_OFFSET(eoi_exit_bitmap0, 128);
+ CHECK_OFFSET(eoi_exit_bitmap1, 136);
+ CHECK_OFFSET(eoi_exit_bitmap2, 144);
+ CHECK_OFFSET(eoi_exit_bitmap3, 152);
+ CHECK_OFFSET(xss_exit_bitmap, 160);
+ CHECK_OFFSET(guest_physical_address, 168);
+ CHECK_OFFSET(vmcs_link_pointer, 176);
+ CHECK_OFFSET(guest_ia32_debugctl, 184);
+ CHECK_OFFSET(guest_ia32_pat, 192);
+ CHECK_OFFSET(guest_ia32_efer, 200);
+ CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
+ CHECK_OFFSET(guest_pdptr0, 216);
+ CHECK_OFFSET(guest_pdptr1, 224);
+ CHECK_OFFSET(guest_pdptr2, 232);
+ CHECK_OFFSET(guest_pdptr3, 240);
+ CHECK_OFFSET(guest_bndcfgs, 248);
+ CHECK_OFFSET(host_ia32_pat, 256);
+ CHECK_OFFSET(host_ia32_efer, 264);
+ CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
+ CHECK_OFFSET(vmread_bitmap, 280);
+ CHECK_OFFSET(vmwrite_bitmap, 288);
+ CHECK_OFFSET(vm_function_control, 296);
+ CHECK_OFFSET(eptp_list_address, 304);
+ CHECK_OFFSET(pml_address, 312);
+ CHECK_OFFSET(cr0_guest_host_mask, 344);
+ CHECK_OFFSET(cr4_guest_host_mask, 352);
+ CHECK_OFFSET(cr0_read_shadow, 360);
+ CHECK_OFFSET(cr4_read_shadow, 368);
+ CHECK_OFFSET(cr3_target_value0, 376);
+ CHECK_OFFSET(cr3_target_value1, 384);
+ CHECK_OFFSET(cr3_target_value2, 392);
+ CHECK_OFFSET(cr3_target_value3, 400);
+ CHECK_OFFSET(exit_qualification, 408);
+ CHECK_OFFSET(guest_linear_address, 416);
+ CHECK_OFFSET(guest_cr0, 424);
+ CHECK_OFFSET(guest_cr3, 432);
+ CHECK_OFFSET(guest_cr4, 440);
+ CHECK_OFFSET(guest_es_base, 448);
+ CHECK_OFFSET(guest_cs_base, 456);
+ CHECK_OFFSET(guest_ss_base, 464);
+ CHECK_OFFSET(guest_ds_base, 472);
+ CHECK_OFFSET(guest_fs_base, 480);
+ CHECK_OFFSET(guest_gs_base, 488);
+ CHECK_OFFSET(guest_ldtr_base, 496);
+ CHECK_OFFSET(guest_tr_base, 504);
+ CHECK_OFFSET(guest_gdtr_base, 512);
+ CHECK_OFFSET(guest_idtr_base, 520);
+ CHECK_OFFSET(guest_dr7, 528);
+ CHECK_OFFSET(guest_rsp, 536);
+ CHECK_OFFSET(guest_rip, 544);
+ CHECK_OFFSET(guest_rflags, 552);
+ CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
+ CHECK_OFFSET(guest_sysenter_esp, 568);
+ CHECK_OFFSET(guest_sysenter_eip, 576);
+ CHECK_OFFSET(host_cr0, 584);
+ CHECK_OFFSET(host_cr3, 592);
+ CHECK_OFFSET(host_cr4, 600);
+ CHECK_OFFSET(host_fs_base, 608);
+ CHECK_OFFSET(host_gs_base, 616);
+ CHECK_OFFSET(host_tr_base, 624);
+ CHECK_OFFSET(host_gdtr_base, 632);
+ CHECK_OFFSET(host_idtr_base, 640);
+ CHECK_OFFSET(host_ia32_sysenter_esp, 648);
+ CHECK_OFFSET(host_ia32_sysenter_eip, 656);
+ CHECK_OFFSET(host_rsp, 664);
+ CHECK_OFFSET(host_rip, 672);
+ CHECK_OFFSET(pin_based_vm_exec_control, 744);
+ CHECK_OFFSET(cpu_based_vm_exec_control, 748);
+ CHECK_OFFSET(exception_bitmap, 752);
+ CHECK_OFFSET(page_fault_error_code_mask, 756);
+ CHECK_OFFSET(page_fault_error_code_match, 760);
+ CHECK_OFFSET(cr3_target_count, 764);
+ CHECK_OFFSET(vm_exit_controls, 768);
+ CHECK_OFFSET(vm_exit_msr_store_count, 772);
+ CHECK_OFFSET(vm_exit_msr_load_count, 776);
+ CHECK_OFFSET(vm_entry_controls, 780);
+ CHECK_OFFSET(vm_entry_msr_load_count, 784);
+ CHECK_OFFSET(vm_entry_intr_info_field, 788);
+ CHECK_OFFSET(vm_entry_exception_error_code, 792);
+ CHECK_OFFSET(vm_entry_instruction_len, 796);
+ CHECK_OFFSET(tpr_threshold, 800);
+ CHECK_OFFSET(secondary_vm_exec_control, 804);
+ CHECK_OFFSET(vm_instruction_error, 808);
+ CHECK_OFFSET(vm_exit_reason, 812);
+ CHECK_OFFSET(vm_exit_intr_info, 816);
+ CHECK_OFFSET(vm_exit_intr_error_code, 820);
+ CHECK_OFFSET(idt_vectoring_info_field, 824);
+ CHECK_OFFSET(idt_vectoring_error_code, 828);
+ CHECK_OFFSET(vm_exit_instruction_len, 832);
+ CHECK_OFFSET(vmx_instruction_info, 836);
+ CHECK_OFFSET(guest_es_limit, 840);
+ CHECK_OFFSET(guest_cs_limit, 844);
+ CHECK_OFFSET(guest_ss_limit, 848);
+ CHECK_OFFSET(guest_ds_limit, 852);
+ CHECK_OFFSET(guest_fs_limit, 856);
+ CHECK_OFFSET(guest_gs_limit, 860);
+ CHECK_OFFSET(guest_ldtr_limit, 864);
+ CHECK_OFFSET(guest_tr_limit, 868);
+ CHECK_OFFSET(guest_gdtr_limit, 872);
+ CHECK_OFFSET(guest_idtr_limit, 876);
+ CHECK_OFFSET(guest_es_ar_bytes, 880);
+ CHECK_OFFSET(guest_cs_ar_bytes, 884);
+ CHECK_OFFSET(guest_ss_ar_bytes, 888);
+ CHECK_OFFSET(guest_ds_ar_bytes, 892);
+ CHECK_OFFSET(guest_fs_ar_bytes, 896);
+ CHECK_OFFSET(guest_gs_ar_bytes, 900);
+ CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
+ CHECK_OFFSET(guest_tr_ar_bytes, 908);
+ CHECK_OFFSET(guest_interruptibility_info, 912);
+ CHECK_OFFSET(guest_activity_state, 916);
+ CHECK_OFFSET(guest_sysenter_cs, 920);
+ CHECK_OFFSET(host_ia32_sysenter_cs, 924);
+ CHECK_OFFSET(vmx_preemption_timer_value, 928);
+ CHECK_OFFSET(virtual_processor_id, 960);
+ CHECK_OFFSET(posted_intr_nv, 962);
+ CHECK_OFFSET(guest_es_selector, 964);
+ CHECK_OFFSET(guest_cs_selector, 966);
+ CHECK_OFFSET(guest_ss_selector, 968);
+ CHECK_OFFSET(guest_ds_selector, 970);
+ CHECK_OFFSET(guest_fs_selector, 972);
+ CHECK_OFFSET(guest_gs_selector, 974);
+ CHECK_OFFSET(guest_ldtr_selector, 976);
+ CHECK_OFFSET(guest_tr_selector, 978);
+ CHECK_OFFSET(guest_intr_status, 980);
+ CHECK_OFFSET(host_es_selector, 982);
+ CHECK_OFFSET(host_cs_selector, 984);
+ CHECK_OFFSET(host_ss_selector, 986);
+ CHECK_OFFSET(host_ds_selector, 988);
+ CHECK_OFFSET(host_fs_selector, 990);
+ CHECK_OFFSET(host_gs_selector, 992);
+ CHECK_OFFSET(host_tr_selector, 994);
+ CHECK_OFFSET(guest_pml_index, 996);
+}
+
+/*
* VMCS12_REVISION is an arbitrary id that should be changed if the content or
* layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
* VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ *
+ * IMPORTANT: Changing this value will break save/restore compatibility with
+ * older kvm releases.
*/
#define VMCS12_REVISION 0x11e57ed0
@@ -481,7 +646,8 @@ struct nested_vmx {
bool sync_shadow_vmcs;
bool dirty_vmcs12;
- bool change_vmcs01_virtual_x2apic_mode;
+ bool change_vmcs01_virtual_apic_mode;
+
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
@@ -761,6 +927,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
+ FIELD64(PML_ADDRESS, pml_address),
FIELD64(TSC_OFFSET, tsc_offset),
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
@@ -772,10 +939,11 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
+ FIELD64(VMREAD_BITMAP, vmread_bitmap),
+ FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
- FIELD64(PML_ADDRESS, pml_address),
FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
@@ -1089,6 +1257,16 @@ static inline u16 evmcs_read16(unsigned long field)
return *(u16 *)((char *)current_evmcs + offset);
}
+static inline void evmcs_touch_msr_bitmap(void)
+{
+ if (unlikely(!current_evmcs))
+ return;
+
+ if (current_evmcs->hv_enlightenments_control.msr_bitmap)
+ current_evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+}
+
static void evmcs_load(u64 phys_addr)
{
struct hv_vp_assist_page *vp_ap =
@@ -1173,6 +1351,7 @@ static inline u32 evmcs_read32(unsigned long field) { return 0; }
static inline u16 evmcs_read16(unsigned long field) { return 0; }
static inline void evmcs_load(u64 phys_addr) {}
static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
+static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
static inline bool is_exception_n(u32 intr_info, u8 vector)
@@ -1393,6 +1572,11 @@ static inline bool cpu_has_vmx_invept_global(void)
return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}
+static inline bool cpu_has_vmx_invvpid_individual_addr(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
+}
+
static inline bool cpu_has_vmx_invvpid_single(void)
{
return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
@@ -1510,6 +1694,17 @@ static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
}
+/*
+ * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE
+ * to modify any valid field of the VMCS, or are the VM-exit
+ * information fields read-only?
+ */
+static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.misc_low &
+ MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
+}
+
static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
{
return vmcs12->cpu_based_vm_exec_control & bit;
@@ -3127,6 +3322,7 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
msrs->misc_high);
msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
msrs->misc_low |=
+ MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
VMX_MISC_ACTIVITY_HLT;
msrs->misc_high = 0;
@@ -3300,6 +3496,15 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
vmx->nested.msrs.misc_low = data;
vmx->nested.msrs.misc_high = data >> 32;
+
+ /*
+ * If L1 has read-only VM-exit information fields, use the
+ * less permissive vmx_vmwrite_bitmap to specify write
+ * permissions for the shadow VMCS.
+ */
+ if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
+ vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
+
return 0;
}
@@ -3354,6 +3559,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * Don't allow changes to the VMX capability MSRs while the vCPU
+ * is in VMX operation.
+ */
+ if (vmx->nested.vmxon)
+ return -EBUSY;
+
switch (msr_index) {
case MSR_IA32_VMX_BASIC:
return vmx_restore_vmx_basic(vmx, data);
@@ -4216,6 +4428,14 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
+
+ if (static_branch_unlikely(&enable_evmcs) &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs =
+ (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
}
return 0;
@@ -5329,6 +5549,9 @@ static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bit
if (!cpu_has_vmx_msr_bitmap())
return;
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_touch_msr_bitmap();
+
/*
* See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
* have the write-low and read-high bitmap offsets the wrong way round.
@@ -5364,6 +5587,9 @@ static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitm
if (!cpu_has_vmx_msr_bitmap())
return;
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_touch_msr_bitmap();
+
/*
* See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
* have the write-low and read-high bitmap offsets the wrong way round.
@@ -5946,8 +6172,14 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
int i;
if (enable_shadow_vmcs) {
+ /*
+ * At vCPU creation, "VMWRITE to any supported field
+ * in the VMCS" is supported, so use the more
+ * permissive vmx_vmread_bitmap to specify both read
+ * and write permissions for the shadow VMCS.
+ */
vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
- vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
+ vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
}
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
@@ -7588,8 +7820,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
return 1;
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
- sizeof(*vmpointer), &e)) {
+ if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
@@ -7670,6 +7901,12 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
+ /* CPL=0 must be checked manually. */
+ if (vmx_get_cpl(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
if (vmx->nested.vmxon) {
nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
return kvm_skip_emulated_instruction(vcpu);
@@ -7729,6 +7966,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
*/
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{
+ if (vmx_get_cpl(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
if (!to_vmx(vcpu)->nested.vmxon) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 0;
@@ -7928,23 +8170,42 @@ static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
}
+/*
+ * Copy the writable VMCS shadow fields back to the VMCS12, in case
+ * they have been modified by the L1 guest. Note that the "read-only"
+ * VM-exit information fields are actually writable if the vCPU is
+ * configured to support "VMWRITE to any supported field in the VMCS."
+ */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
- int i;
+ const u16 *fields[] = {
+ shadow_read_write_fields,
+ shadow_read_only_fields
+ };
+ const int max_fields[] = {
+ max_shadow_read_write_fields,
+ max_shadow_read_only_fields
+ };
+ int i, q;
unsigned long field;
u64 field_value;
struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
- const u16 *fields = shadow_read_write_fields;
- const int num_fields = max_shadow_read_write_fields;
preempt_disable();
vmcs_load(shadow_vmcs);
- for (i = 0; i < num_fields; i++) {
- field = fields[i];
- field_value = __vmcs_readl(field);
- vmcs12_write_any(&vmx->vcpu, field, field_value);
+ for (q = 0; q < ARRAY_SIZE(fields); q++) {
+ for (i = 0; i < max_fields[q]; i++) {
+ field = fields[q][i];
+ field_value = __vmcs_readl(field);
+ vmcs12_write_any(&vmx->vcpu, field, field_value);
+ }
+ /*
+ * Skip the VM-exit information fields if they are read-only.
+ */
+ if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
+ break;
}
vmcs_clear(shadow_vmcs);
@@ -8029,9 +8290,9 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, exit_qualification,
vmx_instruction_info, true, &gva))
return 1;
- /* _system ok, as hardware has verified cpl=0 */
- kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
- &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
+ /* _system ok, nested_vmx_check_permission has verified cpl=0 */
+ kvm_write_guest_virt_system(vcpu, gva, &field_value,
+ (is_long_mode(vcpu) ? 8 : 4), NULL);
}
nested_vmx_succeed(vcpu);
@@ -8069,8 +8330,8 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, exit_qualification,
vmx_instruction_info, false, &gva))
return 1;
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
- &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
+ if (kvm_read_guest_virt(vcpu, gva, &field_value,
+ (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
@@ -8078,7 +8339,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
- if (vmcs_field_readonly(field)) {
+ /*
+ * If the vCPU supports "VMWRITE to any supported field in the
+ * VMCS," then the "read-only" fields are actually read/write.
+ */
+ if (vmcs_field_readonly(field) &&
+ !nested_cpu_has_vmwrite_any_field(vcpu)) {
nested_vmx_failValid(vcpu,
VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
return kvm_skip_emulated_instruction(vcpu);
@@ -8189,10 +8455,10 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, exit_qualification,
vmx_instruction_info, true, &vmcs_gva))
return 1;
- /* ok to use *_system, as hardware has verified cpl=0 */
- if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
- (void *)&to_vmx(vcpu)->nested.current_vmptr,
- sizeof(u64), &e)) {
+ /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
+ if (kvm_write_guest_virt_system(vcpu, vmcs_gva,
+ (void *)&to_vmx(vcpu)->nested.current_vmptr,
+ sizeof(u64), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
@@ -8239,8 +8505,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
vmx_instruction_info, false, &gva))
return 1;
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
- sizeof(operand), &e)) {
+ if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
@@ -8304,8 +8569,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
vmx_instruction_info, false, &gva))
return 1;
- if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
- sizeof(operand), &e)) {
+ if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
@@ -8317,12 +8581,19 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
- if (is_noncanonical_address(operand.gla, vcpu)) {
+ if (!operand.vpid ||
+ is_noncanonical_address(operand.gla, vcpu)) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
return kvm_skip_emulated_instruction(vcpu);
}
- /* fall through */
+ if (cpu_has_vmx_invvpid_individual_addr() &&
+ vmx->nested.vpid02) {
+ __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
+ vmx->nested.vpid02, operand.gla);
+ } else
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ break;
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
if (!operand.vpid) {
@@ -8330,15 +8601,16 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
return kvm_skip_emulated_instruction(vcpu);
}
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
break;
case VMX_VPID_EXTENT_ALL_CONTEXT:
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
break;
default:
WARN_ON_ONCE(1);
return kvm_skip_emulated_instruction(vcpu);
}
- __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
@@ -8842,11 +9114,13 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
case EXIT_REASON_TPR_BELOW_THRESHOLD:
return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
case EXIT_REASON_APIC_ACCESS:
- return nested_cpu_has2(vmcs12,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
case EXIT_REASON_APIC_WRITE:
case EXIT_REASON_EOI_INDUCED:
- /* apic_write and eoi_induced should exit unconditionally. */
+ /*
+ * The controls for "virtualize APIC accesses," "APIC-
+ * register virtualization," and "virtual-interrupt
+ * delivery" only come from vmcs12.
+ */
return true;
case EXIT_REASON_EPT_VIOLATION:
/*
@@ -9253,31 +9527,43 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
vmcs_write32(TPR_THRESHOLD, irr);
}
-static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
u32 sec_exec_control;
+ if (!lapic_in_kernel(vcpu))
+ return;
+
/* Postpone execution until vmcs01 is the current VMCS. */
if (is_guest_mode(vcpu)) {
- to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
+ to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
return;
}
- if (!cpu_has_vmx_virtualize_x2apic_mode())
- return;
-
if (!cpu_need_tpr_shadow(vcpu))
return;
sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
- if (set) {
- sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
- } else {
- sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
- sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- vmx_flush_tlb(vcpu, true);
+ switch (kvm_get_apic_mode(vcpu)) {
+ case LAPIC_MODE_INVALID:
+ WARN_ONCE(true, "Invalid local APIC state");
+ case LAPIC_MODE_DISABLED:
+ break;
+ case LAPIC_MODE_XAPIC:
+ if (flexpriority_enabled) {
+ sec_exec_control |=
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ vmx_flush_tlb(vcpu, true);
+ }
+ break;
+ case LAPIC_MODE_X2APIC:
+ if (cpu_has_vmx_virtualize_x2apic_mode())
+ sec_exec_control |=
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ break;
}
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
@@ -9286,24 +9572,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- /*
- * Currently we do not handle the nested case where L2 has an
- * APIC access page of its own; that page is still pinned.
- * Hence, we skip the case where the VCPU is in guest mode _and_
- * L1 prepared an APIC access page for L2.
- *
- * For the case where L1 and L2 share the same APIC access page
- * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear
- * in the vmcs12), this function will only update either the vmcs01
- * or the vmcs02. If the former, the vmcs02 will be updated by
- * prepare_vmcs02. If the latter, the vmcs01 will be updated in
- * the next L2->L1 exit.
- */
- if (!is_guest_mode(vcpu) ||
- !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ if (!is_guest_mode(vcpu)) {
vmcs_write64(APIC_ACCESS_ADDR, hpa);
vmx_flush_tlb(vcpu, true);
}
@@ -9943,13 +10212,13 @@ STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
static struct kvm *vmx_vm_alloc(void)
{
- struct kvm_vmx *kvm_vmx = kzalloc(sizeof(struct kvm_vmx), GFP_KERNEL);
+ struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
return &kvm_vmx->kvm;
}
static void vmx_vm_free(struct kvm *kvm)
{
- kfree(to_kvm_vmx(kvm));
+ vfree(to_kvm_vmx(kvm));
}
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
@@ -10387,11 +10656,6 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
}
- } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
- cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
- vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
- kvm_vcpu_reload_apic_access_page(vcpu);
}
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
@@ -10871,8 +11135,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
return 0;
}
-static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
- bool from_vmentry)
+static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -11006,13 +11269,13 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* is assigned to entry_failure_code on failure.
*/
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
- bool from_vmentry, u32 *entry_failure_code)
+ u32 *entry_failure_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exec_control, vmcs12_exec_ctrl;
if (vmx->nested.dirty_vmcs12) {
- prepare_vmcs02_full(vcpu, vmcs12, from_vmentry);
+ prepare_vmcs02_full(vcpu, vmcs12);
vmx->nested.dirty_vmcs12 = false;
}
@@ -11032,7 +11295,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* HOST_FS_BASE, HOST_GS_BASE.
*/
- if (from_vmentry &&
+ if (vmx->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
@@ -11040,7 +11303,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
}
- if (from_vmentry) {
+ if (vmx->nested.nested_run_pending) {
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
vmcs12->vm_entry_intr_info_field);
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
@@ -11172,7 +11435,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
~VM_ENTRY_IA32E_MODE) |
(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
- if (from_vmentry &&
+ if (vmx->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
vcpu->arch.pat = vmcs12->guest_ia32_pat;
@@ -11197,7 +11460,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
- __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true);
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
}
} else {
vmx_flush_tlb(vcpu, true);
@@ -11240,7 +11503,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmx_set_cr4(vcpu, vmcs12->guest_cr4);
vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
- if (from_vmentry &&
+ if (vmx->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
vcpu->arch.efer = vmcs12->guest_ia32_efer;
else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -11418,7 +11681,7 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
return 0;
}
-static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
+static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -11438,7 +11701,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
vcpu->arch.tsc_offset += vmcs12->tsc_offset;
r = EXIT_REASON_INVALID_STATE;
- if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual))
+ if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
goto fail;
nested_get_vmcs12_pages(vcpu, vmcs12);
@@ -11540,20 +11803,22 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* the nested entry.
*/
- ret = enter_vmx_non_root_mode(vcpu, true);
- if (ret)
+ vmx->nested.nested_run_pending = 1;
+ ret = enter_vmx_non_root_mode(vcpu);
+ if (ret) {
+ vmx->nested.nested_run_pending = 0;
return ret;
+ }
/*
* If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
* by event injection, halt vcpu.
*/
if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
- !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK))
+ !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) {
+ vmx->nested.nested_run_pending = 0;
return kvm_vcpu_halt(vcpu);
-
- vmx->nested.nested_run_pending = 1;
-
+ }
return 1;
out:
@@ -11925,12 +12190,20 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
load_vmcs12_mmu_host_state(vcpu, vmcs12);
- if (enable_vpid) {
- /*
- * Trivially support vpid by letting L2s share their parent
- * L1's vpid. TODO: move to a more elaborate solution, giving
- * each L2 its own vpid and exposing the vpid feature to L1.
- */
+ /*
+ * If vmcs01 don't use VPID, CPU flushes TLB on every
+ * VMEntry/VMExit. Thus, no need to flush TLB.
+ *
+ * If vmcs12 uses VPID, TLB entries populated by L2 are
+ * tagged with vmx->nested.vpid02 while L1 entries are tagged
+ * with vmx->vpid. Thus, no need to flush TLB.
+ *
+ * Therefore, flush TLB only in case vmcs01 uses VPID and
+ * vmcs12 don't use VPID as in this case L1 & L2 TLB entries
+ * are both tagged with vmx->vpid.
+ */
+ if (enable_vpid &&
+ !(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02)) {
vmx_flush_tlb(vcpu, true);
}
@@ -12069,10 +12342,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx);
- if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
- vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
- vmx_set_virtual_x2apic_mode(vcpu,
- vcpu->arch.apic_base & X2APIC_ENABLE);
+ if (vmx->nested.change_vmcs01_virtual_apic_mode) {
+ vmx->nested.change_vmcs01_virtual_apic_mode = false;
+ vmx_set_virtual_apic_mode(vcpu);
} else if (!nested_cpu_has_ept(vmcs12) &&
nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -12236,7 +12508,7 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
{
struct vcpu_vmx *vmx;
- u64 tscl, guest_tscl, delta_tsc;
+ u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
if (kvm_mwait_in_guest(vcpu->kvm))
return -EOPNOTSUPP;
@@ -12245,6 +12517,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
tscl = rdtsc();
guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+ lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
+
+ if (delta_tsc > lapic_timer_advance_cycles)
+ delta_tsc -= lapic_timer_advance_cycles;
+ else
+ delta_tsc = 0;
/* Convert to host delta tsc if tsc scaling is enabled */
if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
@@ -12615,7 +12893,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
if (vmx->nested.smm.guest_mode) {
vcpu->arch.hflags &= ~HF_SMM_MASK;
- ret = enter_vmx_non_root_mode(vcpu, false);
+ ret = enter_vmx_non_root_mode(vcpu);
vcpu->arch.hflags |= HF_SMM_MASK;
if (ret)
return ret;
@@ -12700,7 +12978,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.enable_nmi_window = enable_nmi_window,
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
- .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
+ .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
.get_enable_apicv = vmx_get_enable_apicv,
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
@@ -12812,6 +13090,7 @@ static int __init vmx_init(void)
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
#endif
+ vmx_check_vmcs12_offsets();
return 0;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 71e7cda6d014..cc8c8be1e92d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -138,6 +138,7 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
/* lapic timer advance (tscdeadline mode only) in nanoseconds */
unsigned int __read_mostly lapic_timer_advance_ns = 0;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);
@@ -318,23 +319,27 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);
+enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_mode(kvm_get_apic_base(vcpu));
+}
+EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
+
int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- u64 old_state = vcpu->arch.apic_base &
- (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
- u64 new_state = msr_info->data &
- (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
+ enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
+ enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
- if ((msr_info->data & reserved_bits) || new_state == X2APIC_ENABLE)
- return 1;
- if (!msr_info->host_initiated &&
- ((new_state == MSR_IA32_APICBASE_ENABLE &&
- old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
- (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
- old_state == 0)))
+ if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
return 1;
+ if (!msr_info->host_initiated) {
+ if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
+ return 1;
+ if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
+ return 1;
+ }
kvm_lapic_set_base(vcpu, msr_info->data);
return 0;
@@ -856,7 +861,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
}
if (is_long_mode(vcpu) &&
- (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 62)))
+ (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
return 1;
else if (is_pae(vcpu) && is_paging(vcpu) &&
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
@@ -1761,7 +1766,7 @@ static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
return mode;
}
-static int do_realtime(struct timespec *ts, u64 *tsc_timestamp)
+static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
@@ -1794,7 +1799,7 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
}
/* returns true if host is using TSC based clocksource */
-static bool kvm_get_walltime_and_clockread(struct timespec *ts,
+static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
u64 *tsc_timestamp)
{
/* checked again under seqlock below */
@@ -2868,6 +2873,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_SYNIC2:
case KVM_CAP_HYPERV_VP_INDEX:
case KVM_CAP_HYPERV_EVENTFD:
+ case KVM_CAP_HYPERV_TLBFLUSH:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -2894,7 +2900,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_CLOCK_TSC_STABLE;
break;
case KVM_CAP_X86_DISABLE_EXITS:
- r |= KVM_X86_DISABLE_EXITS_HTL | KVM_X86_DISABLE_EXITS_PAUSE;
+ r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
if(kvm_can_mwait_in_guest())
r |= KVM_X86_DISABLE_EXITS_MWAIT;
break;
@@ -3962,7 +3968,7 @@ out_nofree:
return r;
}
-int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
{
return VM_FAULT_SIGBUS;
}
@@ -4248,7 +4254,7 @@ split_irqchip_unlock:
if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
kvm_can_mwait_in_guest())
kvm->arch.mwait_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_HTL)
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
kvm->arch.hlt_in_guest = true;
if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
kvm->arch.pause_in_guest = true;
@@ -4787,11 +4793,10 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
-int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
+int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
@@ -4799,12 +4804,17 @@ int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
}
EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
-static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
- gva_t addr, void *val, unsigned int bytes,
- struct x86_exception *exception)
+static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception, bool system)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
+ u32 access = 0;
+
+ if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
+ access |= PFERR_USER_MASK;
+
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
}
static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
@@ -4816,18 +4826,16 @@ static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
}
-int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
- gva_t addr, void *val,
- unsigned int bytes,
- struct x86_exception *exception)
+static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 access,
+ struct x86_exception *exception)
{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
- PFERR_WRITE_MASK,
+ access,
exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
@@ -4848,6 +4856,27 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
out:
return r;
}
+
+static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
+ unsigned int bytes, struct x86_exception *exception,
+ bool system)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u32 access = PFERR_WRITE_MASK;
+
+ if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
+ access |= PFERR_USER_MASK;
+
+ return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+ access, exception);
+}
+
+int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
+ unsigned int bytes, struct x86_exception *exception)
+{
+ return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+ PFERR_WRITE_MASK, exception);
+}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
int handle_ud(struct kvm_vcpu *vcpu)
@@ -4858,8 +4887,8 @@ int handle_ud(struct kvm_vcpu *vcpu)
struct x86_exception e;
if (force_emulation_prefix &&
- kvm_read_guest_virt(&vcpu->arch.emulate_ctxt,
- kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 &&
+ kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
+ sig, sizeof(sig), &e) == 0 &&
memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
emul_type = 0;
@@ -5600,8 +5629,8 @@ static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
static const struct x86_emulate_ops emulate_ops = {
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
- .read_std = kvm_read_guest_virt_system,
- .write_std = kvm_write_guest_virt_system,
+ .read_std = emulator_read_std,
+ .write_std = emulator_write_std,
.read_phys = kvm_read_guest_phys_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
@@ -6617,7 +6646,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
unsigned long clock_type)
{
struct kvm_clock_pairing clock_pairing;
- struct timespec ts;
+ struct timespec64 ts;
u64 cycle;
int ret;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c9492f764902..331993c49dae 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -247,11 +247,11 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 get_kvmclock_ns(struct kvm *kvm);
-int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
+int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception);
-int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception);